Diffstat (limited to 'contrib/llvm/lib/Target/ARM')
97 files changed, 108263 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp new file mode 100644 index 0000000..7a1865c --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp @@ -0,0 +1,720 @@ +//=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The Cortex-A15 processor employs a tracking scheme in its register renaming +// in order to process each instruction's micro-ops speculatively and +// out-of-order with appropriate forwarding. The ARM architecture allows VFP +// instructions to read and write 32-bit S-registers. Each S-register +// corresponds to one half (upper or lower) of an overlaid 64-bit D-register. +// +// There are several instruction patterns which can provide this capability +// with higher performance than other, potentially more direct, patterns, +// specifically when one micro-op reads a D-register operand that has recently +// been written as one or more S-register results. +// +// This file defines a pre-regalloc pass which looks for SPR producers which +// are going to be used by a DPR (or QPR) consumer and creates the more +// optimized access pattern. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMSubtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <map> +#include <set> + +using namespace llvm; + +#define DEBUG_TYPE "a15-sd-optimizer" + +namespace { + struct A15SDOptimizer : public MachineFunctionPass { + static char ID; + A15SDOptimizer() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "ARM A15 S->D optimizer"; + } + + private: + const ARMBaseInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + + bool runOnInstruction(MachineInstr *MI); + + // + // Instruction builder helpers + // + unsigned createDupLane(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + DebugLoc DL, + unsigned Reg, unsigned Lane, + bool QPR=false); + + unsigned createExtractSubreg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + DebugLoc DL, + unsigned DReg, unsigned Lane, + const TargetRegisterClass *TRC); + + unsigned createVExt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + DebugLoc DL, + unsigned Ssub0, unsigned Ssub1); + + unsigned createRegSequence(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + DebugLoc DL, + unsigned Reg1, unsigned Reg2); + + unsigned createInsertSubreg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + DebugLoc DL, unsigned DReg, unsigned Lane, + unsigned ToInsert); + + unsigned createImplicitDef(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + DebugLoc
DL); + + // + // Various property checkers + // + bool usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC); + bool hasPartialWrite(MachineInstr *MI); + SmallVector<unsigned, 8> getReadDPRs(MachineInstr *MI); + unsigned getDPRLaneFromSPR(unsigned SReg); + + // + // Methods used for getting the definitions of partial registers + // + + MachineInstr *elideCopies(MachineInstr *MI); + void elideCopiesAndPHIs(MachineInstr *MI, + SmallVectorImpl<MachineInstr*> &Outs); + + // + // Pattern optimization methods + // + unsigned optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg); + unsigned optimizeSDPattern(MachineInstr *MI); + unsigned getPrefSPRLane(unsigned SReg); + + // + // Sanitizing method - used to make sure we don't leave dead code around. + // + void eraseInstrWithNoUses(MachineInstr *MI); + + // + // A map used to track the changes done by this pass. + // + std::map<MachineInstr*, unsigned> Replacements; + std::set<MachineInstr *> DeadInstr; + }; + char A15SDOptimizer::ID = 0; +} // end anonymous namespace + +// Returns true if this operand is a register belonging to the given +// register class. +bool A15SDOptimizer::usesRegClass(MachineOperand &MO, + const TargetRegisterClass *TRC) { + if (!MO.isReg()) + return false; + unsigned Reg = MO.getReg(); + + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI->getRegClass(Reg)->hasSuperClassEq(TRC); + else + return TRC->contains(Reg); +} + +unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) { + unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, + &ARM::DPRRegClass); + if (DReg != ARM::NoRegister) return ARM::ssub_1; + return ARM::ssub_0; +} + +// Get the subreg type that is most likely to be coalesced +// for an SPR register that will be used in VDUP32d pseudo. +unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) { + if (!TRI->isVirtualRegister(SReg)) + return getDPRLaneFromSPR(SReg); + + MachineInstr *MI = MRI->getVRegDef(SReg); + if (!MI) return ARM::ssub_0; + MachineOperand *MO = MI->findRegisterDefOperand(SReg); + + if (!MO) return ARM::ssub_0; + assert(MO->isReg() && "Non-register operand found!"); + + if (MI->isCopy() && usesRegClass(MI->getOperand(1), + &ARM::SPRRegClass)) { + SReg = MI->getOperand(1).getReg(); + } + + if (TargetRegisterInfo::isVirtualRegister(SReg)) { + if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1; + return ARM::ssub_0; + } + return getDPRLaneFromSPR(SReg); +} + +// MI is known to be dead. Figure out what instructions +// are also made dead by this and mark them for removal. +void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) { + SmallVector<MachineInstr *, 8> Front; + DeadInstr.insert(MI); + + DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n"); + Front.push_back(MI); + + while (Front.size() != 0) { + MI = Front.back(); + Front.pop_back(); + + // MI is already known to be dead. We need to see + // if other instructions can also be removed. + for (unsigned int i = 0; i < MI->getNumOperands(); ++i) { + MachineOperand &MO = MI->getOperand(i); + if ((!MO.isReg()) || (!MO.isUse())) + continue; + unsigned Reg = MO.getReg(); + if (!TRI->isVirtualRegister(Reg)) + continue; + MachineOperand *Op = MI->findRegisterDefOperand(Reg); + + if (!Op) + continue; + + MachineInstr *Def = Op->getParent(); + + // We don't need to do anything if we have already marked + // this instruction as being dead. + if (DeadInstr.find(Def) != DeadInstr.end()) + continue; + + // Check if all the uses of this instruction are marked as + // dead. If so, we can also mark this instruction as being + // dead.
+ bool IsDead = true; + for (unsigned int j = 0; j < Def->getNumOperands(); ++j) { + MachineOperand &MODef = Def->getOperand(j); + if ((!MODef.isReg()) || (!MODef.isDef())) + continue; + unsigned DefReg = MODef.getReg(); + if (!TRI->isVirtualRegister(DefReg)) { + IsDead = false; + break; + } + for (MachineRegisterInfo::use_instr_iterator + II = MRI->use_instr_begin(Reg), EE = MRI->use_instr_end(); + II != EE; ++II) { + // We don't care about self references. + if (&*II == Def) + continue; + if (DeadInstr.find(&*II) == DeadInstr.end()) { + IsDead = false; + break; + } + } + } + + if (!IsDead) continue; + + DEBUG(dbgs() << "Deleting instruction " << *Def << "\n"); + DeadInstr.insert(Def); + } + } +} + +// Creates the more optimized patterns and generally does all the code +// transformations in this pass. +unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { + if (MI->isCopy()) { + return optimizeAllLanesPattern(MI, MI->getOperand(1).getReg()); + } + + if (MI->isInsertSubreg()) { + unsigned DPRReg = MI->getOperand(1).getReg(); + unsigned SPRReg = MI->getOperand(2).getReg(); + + if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) { + MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg()); + MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg()); + + if (DPRMI && SPRMI) { + // See if the first operand of this insert_subreg is IMPLICIT_DEF + MachineInstr *ECDef = elideCopies(DPRMI); + if (ECDef && ECDef->isImplicitDef()) { + // Another corner case - if we're inserting something that is purely + // a subreg copy of a DPR, just use that DPR. + + MachineInstr *EC = elideCopies(SPRMI); + // Is it a subreg copy of ssub_0? + if (EC && EC->isCopy() && + EC->getOperand(1).getSubReg() == ARM::ssub_0) { + DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI); + + // Find the thing we're subreg copying out of - is it of the same + // regclass as DPRMI? (i.e. a DPR or QPR). + unsigned FullReg = SPRMI->getOperand(1).getReg(); + const TargetRegisterClass *TRC = + MRI->getRegClass(MI->getOperand(1).getReg()); + if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) { + DEBUG(dbgs() << "Subreg copy is compatible - returning "); + DEBUG(dbgs() << PrintReg(FullReg) << "\n"); + eraseInstrWithNoUses(MI); + return FullReg; + } + } + + return optimizeAllLanesPattern(MI, MI->getOperand(2).getReg()); + } + } + } + return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg()); + } + + if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), + &ARM::SPRRegClass)) { + // See if all bar one of the operands are IMPLICIT_DEF and insert the + // optimizer pattern accordingly. + unsigned NumImplicit = 0, NumTotal = 0; + unsigned NonImplicitReg = ~0U; + + for (unsigned I = 1; I < MI->getNumExplicitOperands(); ++I) { + if (!MI->getOperand(I).isReg()) + continue; + ++NumTotal; + unsigned OpReg = MI->getOperand(I).getReg(); + + if (!TRI->isVirtualRegister(OpReg)) + break; + + MachineInstr *Def = MRI->getVRegDef(OpReg); + if (!Def) + break; + if (Def->isImplicitDef()) + ++NumImplicit; + else + NonImplicitReg = MI->getOperand(I).getReg(); + } + + if (NumImplicit == NumTotal - 1) + return optimizeAllLanesPattern(MI, NonImplicitReg); + else + return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg()); + } + + llvm_unreachable("Unhandled update pattern!"); +} + +// Return true if this MachineInstr inserts a scalar (SPR) value into +// a D or Q register. 
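+// For example (illustrative pseudo-MIR, not part of this commit): +//   %d1 = INSERT_SUBREG %d0, %s0, ssub_0 +// writes only one half of the D-register, which is exactly the partial +// write this pass looks for.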
+bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) { + // The only way we can do a partial register update is through a COPY, + // INSERT_SUBREG or REG_SEQUENCE. + if (MI->isCopy() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass)) + return true; + + if (MI->isInsertSubreg() && usesRegClass(MI->getOperand(2), + &ARM::SPRRegClass)) + return true; + + if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass)) + return true; + + return false; +} + +// Looks through full copies to get the instruction that defines the input +// operand for MI. +MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) { + if (!MI->isFullCopy()) + return MI; + if (!TRI->isVirtualRegister(MI->getOperand(1).getReg())) + return nullptr; + MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg()); + if (!Def) + return nullptr; + return elideCopies(Def); +} + +// Look through full copies and PHIs to get the set of non-copy MachineInstrs +// that can produce MI. +void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI, + SmallVectorImpl<MachineInstr*> &Outs) { + // Looking through PHIs may create loops so we need to track what + // instructions we have visited before. + std::set<MachineInstr *> Reached; + SmallVector<MachineInstr *, 8> Front; + Front.push_back(MI); + while (Front.size() != 0) { + MI = Front.back(); + Front.pop_back(); + + // If we have already explored this MachineInstr, ignore it. + if (Reached.find(MI) != Reached.end()) + continue; + Reached.insert(MI); + if (MI->isPHI()) { + for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { + unsigned Reg = MI->getOperand(I).getReg(); + if (!TRI->isVirtualRegister(Reg)) { + continue; + } + MachineInstr *NewMI = MRI->getVRegDef(Reg); + if (!NewMI) + continue; + Front.push_back(NewMI); + } + } else if (MI->isFullCopy()) { + if (!TRI->isVirtualRegister(MI->getOperand(1).getReg())) + continue; + MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg()); + if (!NewMI) + continue; + Front.push_back(NewMI); + } else { + DEBUG(dbgs() << "Found partial copy" << *MI <<"\n"); + Outs.push_back(MI); + } + } +} + +// Return the DPR virtual registers that are read by this machine instruction +// (if any). +SmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) { + if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() || + MI->isKill()) + return SmallVector<unsigned, 8>(); + + SmallVector<unsigned, 8> Defs; + for (unsigned i = 0; i < MI->getNumOperands(); ++i) { + MachineOperand &MO = MI->getOperand(i); + + if (!MO.isReg() || !MO.isUse()) + continue; + if (!usesRegClass(MO, &ARM::DPRRegClass) && + !usesRegClass(MO, &ARM::QPRRegClass) && + !usesRegClass(MO, &ARM::DPairRegClass)) // Treat DPair as QPR + continue; + + Defs.push_back(MO.getReg()); + } + return Defs; +} + +// Creates a DPR register from an SPR one by using a VDUP. +unsigned +A15SDOptimizer::createDupLane(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + DebugLoc DL, + unsigned Reg, unsigned Lane, bool QPR) { + unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : + &ARM::DPRRegClass); + AddDefaultPred(BuildMI(MBB, + InsertBefore, + DL, + TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d), + Out) + .addReg(Reg) + .addImm(Lane)); + + return Out; +} + +// Creates a SPR register from a DPR by copying the value in lane 0. 
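+// (More precisely, the Lane argument selects which subregister index is +// copied out; lane 0 is simply the common case.)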
+unsigned +A15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + DebugLoc DL, + unsigned DReg, unsigned Lane, + const TargetRegisterClass *TRC) { + unsigned Out = MRI->createVirtualRegister(TRC); + BuildMI(MBB, + InsertBefore, + DL, + TII->get(TargetOpcode::COPY), Out) + .addReg(DReg, 0, Lane); + + return Out; +} + +// Takes two DPR registers and combines them into a QPR register by using a +// REG_SEQUENCE. +unsigned +A15SDOptimizer::createRegSequence(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + DebugLoc DL, + unsigned Reg1, unsigned Reg2) { + unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass); + BuildMI(MBB, + InsertBefore, + DL, + TII->get(TargetOpcode::REG_SEQUENCE), Out) + .addReg(Reg1) + .addImm(ARM::dsub_0) + .addReg(Reg2) + .addImm(ARM::dsub_1); + return Out; +} + +// Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1) +// and merges them into one DPR register. +unsigned +A15SDOptimizer::createVExt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + DebugLoc DL, + unsigned Ssub0, unsigned Ssub1) { + unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); + AddDefaultPred(BuildMI(MBB, + InsertBefore, + DL, + TII->get(ARM::VEXTd32), Out) + .addReg(Ssub0) + .addReg(Ssub1) + .addImm(1)); + return Out; +} + +unsigned +A15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + DebugLoc DL, unsigned DReg, unsigned Lane, + unsigned ToInsert) { + unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass); + BuildMI(MBB, + InsertBefore, + DL, + TII->get(TargetOpcode::INSERT_SUBREG), Out) + .addReg(DReg) + .addReg(ToInsert) + .addImm(Lane); + + return Out; +} + +unsigned +A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, + DebugLoc DL) { + unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass); + BuildMI(MBB, + InsertBefore, + DL, + TII->get(TargetOpcode::IMPLICIT_DEF), Out); + return Out; +} + +// This function inserts instructions in order to optimize interactions between +// SPR registers and DPR/QPR registers. It does so by performing VDUPs on all +// lanes, and then using VEXT instructions to recompose the result. +unsigned +A15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) { + MachineBasicBlock::iterator InsertPt(MI); + DebugLoc DL = MI->getDebugLoc(); + MachineBasicBlock &MBB = *MI->getParent(); + InsertPt++; + unsigned Out; + + // DPair has the same length as QPR and also has two DPRs as subreg. + // Treat DPair as QPR.
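+  // Illustrative expansion for a plain DPR input (register names invented): +  //   vdup.32 d1, d0[0]       @ broadcast lane 0 +  //   vdup.32 d2, d0[1]       @ broadcast lane 1 +  //   vext.32 d3, d1, d2, #1  @ d3 = { d0[0], d0[1] }, i.e. d0 rebuilt +  // Every read of the result is then a full-lane operation.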
+ if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass) || + MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPairRegClass)) { + unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg, + ARM::dsub_0, &ARM::DPRRegClass); + unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg, + ARM::dsub_1, &ARM::DPRRegClass); + + unsigned Out1 = createDupLane(MBB, InsertPt, DL, DSub0, 0); + unsigned Out2 = createDupLane(MBB, InsertPt, DL, DSub0, 1); + Out = createVExt(MBB, InsertPt, DL, Out1, Out2); + + unsigned Out3 = createDupLane(MBB, InsertPt, DL, DSub1, 0); + unsigned Out4 = createDupLane(MBB, InsertPt, DL, DSub1, 1); + Out2 = createVExt(MBB, InsertPt, DL, Out3, Out4); + + Out = createRegSequence(MBB, InsertPt, DL, Out, Out2); + + } else if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass)) { + unsigned Out1 = createDupLane(MBB, InsertPt, DL, Reg, 0); + unsigned Out2 = createDupLane(MBB, InsertPt, DL, Reg, 1); + Out = createVExt(MBB, InsertPt, DL, Out1, Out2); + + } else { + assert(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::SPRRegClass) && + "Found unexpected regclass!"); + + unsigned PrefLane = getPrefSPRLane(Reg); + unsigned Lane; + switch (PrefLane) { + case ARM::ssub_0: Lane = 0; break; + case ARM::ssub_1: Lane = 1; break; + default: llvm_unreachable("Unknown preferred lane!"); + } + + // Treat DPair as QPR + bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass) || + usesRegClass(MI->getOperand(0), &ARM::DPairRegClass); + + Out = createImplicitDef(MBB, InsertPt, DL); + Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg); + Out = createDupLane(MBB, InsertPt, DL, Out, Lane, UsesQPR); + eraseInstrWithNoUses(MI); + } + return Out; +} + +bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { + // We look for instructions that write S registers that are then read as + // D/Q registers. These can only be caused by COPY, INSERT_SUBREG and + // REG_SEQUENCE pseudos that insert an SPR value into a DPR register or + // merge two SPR values to form a DPR register. In order to avoid false + // positives we make sure that there is an SPR producer so we look past + // COPY and PHI nodes to find it. + // + // The best code pattern for when an SPR producer is going to be used by a + // DPR or QPR consumer depends on whether the other lanes of the + // corresponding DPR/QPR are currently defined. + // + // We can handle these efficiently, depending on the type of + // pseudo-instruction that is producing the pattern: + // + // * COPY: * VDUP all lanes and merge the results together + // using VEXTs. + // + // * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR + // lane, and the other lane(s) of the DPR/QPR register + // that we are inserting in are undefined, use the + // original DPR/QPR value. + // * Otherwise, fall back on the same strategy as COPY. + // + // * REG_SEQUENCE: * If all except one of the input operands are + // IMPLICIT_DEFs, insert the VDUP pattern for just the + // defined input operand + // * Otherwise, fall back on the same strategy as COPY. + // + + // First, get all the reads of D-registers done by this instruction. + SmallVector<unsigned, 8> Defs = getReadDPRs(MI); + bool Modified = false; + + for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end(); + I != E; ++I) { + // Follow the def-use chain for this DPR through COPYs, and also through + // PHIs (which are essentially multi-way COPYs). It is because of PHIs that + // we can end up with multiple defs of this DPR.
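+    // e.g. a DPR defined by "%d2 = PHI %d0, <bb1>, %d1, <bb2>" yields two +    // candidate producers here, one per incoming value (illustrative).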
+ + SmallVector<MachineInstr *, 8> DefSrcs; + if (!TRI->isVirtualRegister(*I)) + continue; + MachineInstr *Def = MRI->getVRegDef(*I); + if (!Def) + continue; + + elideCopiesAndPHIs(Def, DefSrcs); + + for (SmallVectorImpl<MachineInstr *>::iterator II = DefSrcs.begin(), + EE = DefSrcs.end(); II != EE; ++II) { + MachineInstr *MI = *II; + + // If we've already analyzed and replaced this operand, don't do + // anything. + if (Replacements.find(MI) != Replacements.end()) + continue; + + // Now, work out if the instruction causes a SPR->DPR dependency. + if (!hasPartialWrite(MI)) + continue; + + // Collect all the uses of this MI's DPR def for updating later. + SmallVector<MachineOperand*, 8> Uses; + unsigned DPRDefReg = MI->getOperand(0).getReg(); + for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg), + E = MRI->use_end(); I != E; ++I) + Uses.push_back(&*I); + + // We can optimize this. + unsigned NewReg = optimizeSDPattern(MI); + + if (NewReg != 0) { + Modified = true; + for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(), + E = Uses.end(); I != E; ++I) { + // Make sure to constrain the register class of the new register to + // match what we're replacing. Otherwise we can optimize a DPR_VFP2 + // reference into a plain DPR, and that will end poorly. NewReg is + // always virtual here, so there will always be a matching subclass + // to find. + MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg())); + + DEBUG(dbgs() << "Replacing operand " + << **I << " with " + << PrintReg(NewReg) << "\n"); + (*I)->substVirtReg(NewReg, 0, *TRI); + } + } + Replacements[MI] = NewReg; + } + } + return Modified; +} + +bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) { + const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>(); + // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be + // enabled when NEON is available. + if (!(STI.isCortexA15() && STI.hasNEON())) + return false; + TII = STI.getInstrInfo(); + TRI = STI.getRegisterInfo(); + MRI = &Fn.getRegInfo(); + bool Modified = false; + + DEBUG(dbgs() << "Running on function " << Fn.getName()<< "\n"); + + DeadInstr.clear(); + Replacements.clear(); + + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + + for (MachineBasicBlock::iterator MI = MFI->begin(), ME = MFI->end(); + MI != ME;) { + Modified |= runOnInstruction(MI++); + } + + } + + for (std::set<MachineInstr *>::iterator I = DeadInstr.begin(), + E = DeadInstr.end(); + I != E; ++I) { + (*I)->eraseFromParent(); + } + + return Modified; +} + +FunctionPass *llvm::createA15SDOptimizerPass() { + return new A15SDOptimizer(); +} diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h new file mode 100644 index 0000000..cd7540e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARM.h @@ -0,0 +1,50 @@ +//===-- ARM.h - Top-level interface for ARM representation ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// ARM back-end. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARM_H +#define LLVM_LIB_TARGET_ARM_ARM_H + +#include "llvm/Support/CodeGen.h" +#include <functional> + +namespace llvm { + +class ARMAsmPrinter; +class ARMBaseTargetMachine; +class Function; +class FunctionPass; +class ImmutablePass; +class MachineInstr; +class MCInst; +class TargetLowering; +class TargetMachine; + +FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, + CodeGenOpt::Level OptLevel); +FunctionPass *createA15SDOptimizerPass(); +FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); +FunctionPass *createARMExpandPseudoPass(); +FunctionPass *createARMConstantIslandPass(); +FunctionPass *createMLxExpansionPass(); +FunctionPass *createThumb2ITBlockPass(); +FunctionPass *createARMOptimizeBarriersPass(); +FunctionPass *createThumb2SizeReductionPass( + std::function<bool(const Function &)> Ftor = nullptr); + +void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, + ARMAsmPrinter &AP); + +} // end namespace llvm; + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td new file mode 100644 index 0000000..c171656 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARM.td @@ -0,0 +1,699 @@ +//===-- ARM.td - Describe the ARM Target Machine -----------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// ARM Helper classes. +// + +class ProcNoItin<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; + +class Architecture<string fname, string aname, list<SubtargetFeature> features > + : SubtargetFeature<fname, "ARMArch", aname, + !strconcat(aname, " architecture"), features>; + +//===----------------------------------------------------------------------===// +// ARM Subtarget state. +// + +def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", "true", + "Thumb mode">; + +def ModeSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", + "Use software floating point features.">; + +//===----------------------------------------------------------------------===// +// ARM Subtarget features. 
+// + +def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", + "Enable VFP2 instructions">; +def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true", + "Enable VFP3 instructions", + [FeatureVFP2]>; +def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", + "Enable NEON instructions", + [FeatureVFP3]>; +def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", + "Enable Thumb2 instructions">; +def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", + "Does not support ARM mode execution", + [ModeThumb]>; +def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", + "Enable half-precision floating point">; +def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", + "Enable VFP4 instructions", + [FeatureVFP3, FeatureFP16]>; +def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", + "true", "Enable ARMv8 FP", + [FeatureVFP4]>; +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Enable full half-precision floating point", + [FeatureFPARMv8]>; +def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", + "Restrict FP to 16 double registers">; +def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", + "Enable divide instructions">; +def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", + "HasHardwareDivideInARM", "true", + "Enable divide instructions in ARM mode">; +def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true", + "Enable Thumb2 extract and pack instructions">; +def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", + "Has data barrier (dmb / dsb) instructions">; +def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", + "FP compare + branch is slow">; +def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", + "Floating point unit supports single precision only">; +def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", + "Enable support for Performance Monitor extensions">; +def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true", + "Enable support for TrustZone security extensions">; +def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", + "Enable support for Cryptography extensions", + [FeatureNEON]>; +def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", + "Enable support for CRC instructions">; + +// Cyclone has preferred instructions for zeroing VFP registers, which can +// execute in 0 cycles. +def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", + "Has zero-cycle zeroing instructions">; + +// Some processors have FP multiply-accumulate instructions that don't +// play nicely with other VFP / NEON instructions, and it's generally better +// to just not use them. +def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", + "Disable VFP / NEON MAC instructions">; + +// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding. +def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", + "HasVMLxForwarding", "true", + "Has multiplier accumulator forwarding">; + +// Some processors benefit from using NEON instructions for scalar +// single-precision FP operations. +def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", + "true", + "Use NEON for single precision FP">; + +// Disable 32-bit to 16-bit narrowing for experimentation. 
+def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", + "Prefer 32-bit Thumb instrs">; + +/// Some instructions update CPSR partially, which can add false dependency for +/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is +/// mapped to a separate physical register. Avoid partial CPSR update for these +/// processors. +def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr", + "AvoidCPSRPartialUpdate", "true", + "Avoid CPSR partial update for OOO execution">; + +def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", + "AvoidMOVsShifterOperand", "true", + "Avoid movs instructions with shifter operand">; + +// Some processors perform return stack prediction. CodeGen should avoid +// issuing "normal" call instructions to callees which do not return. +def FeatureHasRAS : SubtargetFeature<"ras", "HasRAS", "true", + "Has return address stack">; + +/// DSP extension. +def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", + "Supports DSP instructions in ARM and/or Thumb2">; + +// Multiprocessing extension. +def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", + "Supports Multiprocessing extension">; + +// Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8). +def FeatureVirtualization : SubtargetFeature<"virtualization", + "HasVirtualization", "true", + "Supports Virtualization extension", + [FeatureHWDiv, FeatureHWDivARM]>; + +// M-series ISA +def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass", + "Is microcontroller profile ('M' series)">; + +// R-series ISA +def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass", + "Is realtime profile ('R' series)">; + +// A-series ISA +def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass", + "Is application profile ('A' series)">; + +// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. +// See ARMInstrInfo.td for details. +def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", + "NaCl trap">; + +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + +def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", + "Generate calls via indirect call " + "instructions">; + +def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", + "Reserve R9, making it unavailable as " + "GPR">; + +def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", + "Don't use movt/movw pairs for 32-bit " + "imms">; + + +//===----------------------------------------------------------------------===// +// ARM ISAs.
+// + +def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true", + "Support ARM v4T instructions">; +def HasV5TOps : SubtargetFeature<"v5t", "HasV5TOps", "true", + "Support ARM v5T instructions", + [HasV4TOps]>; +def HasV5TEOps : SubtargetFeature<"v5te", "HasV5TEOps", "true", + "Support ARM v5TE, v5TEj, and v5TExp instructions", + [HasV5TOps]>; +def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true", + "Support ARM v6 instructions", + [HasV5TEOps]>; +def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true", + "Support ARM v6M instructions", + [HasV6Ops]>; +def HasV6KOps : SubtargetFeature<"v6k", "HasV6KOps", "true", + "Support ARM v6k instructions", + [HasV6Ops]>; +def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true", + "Support ARM v6t2 instructions", + [HasV6MOps, HasV6KOps, FeatureThumb2]>; +def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true", + "Support ARM v7 instructions", + [HasV6T2Ops, FeaturePerfMon]>; +def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", + "Support ARM v8 instructions", + [HasV7Ops]>; +def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", + "Support ARM v8.1a instructions", + [HasV8Ops]>; +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + "Support ARM v8.2a instructions", + [HasV8_1aOps]>; + + +//===----------------------------------------------------------------------===// +// ARM Processor subtarget features. +// + +def ProcA5 : SubtargetFeature<"a5", "ARMProcFamily", "CortexA5", + "Cortex-A5 ARM processors", []>; +def ProcA7 : SubtargetFeature<"a7", "ARMProcFamily", "CortexA7", + "Cortex-A7 ARM processors", []>; +def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8", + "Cortex-A8 ARM processors", []>; +def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9", + "Cortex-A9 ARM processors", []>; +def ProcA12 : SubtargetFeature<"a12", "ARMProcFamily", "CortexA12", + "Cortex-A12 ARM processors", []>; +def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", + "Cortex-A15 ARM processors", []>; +def ProcA17 : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17", + "Cortex-A17 ARM processors", []>; +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", []>; +def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", + "Cortex-A53 ARM processors", []>; +def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", + "Cortex-A57 ARM processors", []>; +def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", + "Cortex-A72 ARM processors", []>; + +def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait", + "Qualcomm ARM processors", []>; +def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift", + "Swift ARM processors", []>; + +def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", + "Samsung Exynos-M1 processors", []>; + +def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4", + "Cortex-R4 ARM processors", []>; +def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5", + "Cortex-R5 ARM processors", []>; +def ProcR7 : SubtargetFeature<"r7", "ARMProcFamily", "CortexR7", + "Cortex-R7 ARM processors", []>; + + +//===----------------------------------------------------------------------===// +// ARM schedules. 
+// + +include "ARMSchedule.td" + + +//===----------------------------------------------------------------------===// +// ARM architectures +// + +def ARMv2 : Architecture<"armv2", "ARMv2", []>; + +def ARMv2a : Architecture<"armv2a", "ARMv2a", []>; + +def ARMv3 : Architecture<"armv3", "ARMv3", []>; + +def ARMv3m : Architecture<"armv3m", "ARMv3m", []>; + +def ARMv4 : Architecture<"armv4", "ARMv4", []>; + +def ARMv4t : Architecture<"armv4t", "ARMv4t", [HasV4TOps]>; + +def ARMv5t : Architecture<"armv5t", "ARMv5t", [HasV5TOps]>; + +def ARMv5te : Architecture<"armv5te", "ARMv5te", [HasV5TEOps]>; + +def ARMv5tej : Architecture<"armv5tej", "ARMv5tej", [HasV5TEOps]>; + +def ARMv6 : Architecture<"armv6", "ARMv6", [HasV6Ops]>; + +def ARMv6t2 : Architecture<"armv6t2", "ARMv6t2", [HasV6T2Ops, + FeatureDSP]>; + +def ARMv6k : Architecture<"armv6k", "ARMv6k", [HasV6KOps]>; + +def ARMv6kz : Architecture<"armv6kz", "ARMv6kz", [HasV6KOps, + FeatureTrustZone]>; + +def ARMv6m : Architecture<"armv6-m", "ARMv6m", [HasV6MOps, + FeatureNoARM, + FeatureDB, + FeatureMClass]>; + +def ARMv6sm : Architecture<"armv6s-m", "ARMv6sm", [HasV6MOps, + FeatureNoARM, + FeatureDB, + FeatureMClass]>; + +def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops, + FeatureNEON, + FeatureDB, + FeatureDSP, + FeatureAClass]>; + +def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops, + FeatureDB, + FeatureDSP, + FeatureHWDiv, + FeatureRClass]>; + +def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops, + FeatureThumb2, + FeatureNoARM, + FeatureDB, + FeatureHWDiv, + FeatureMClass]>; + +def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops, + FeatureThumb2, + FeatureNoARM, + FeatureDB, + FeatureHWDiv, + FeatureMClass, + FeatureDSP, + FeatureT2XtPk]>; + +def ARMv8a : Architecture<"armv8-a", "ARMv8a", [HasV8Ops, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + +def ARMv81a : Architecture<"armv8.1-a", "ARMv81a", [HasV8_1aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + +def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + +// Aliases +def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>; +def IWMMXT2 : Architecture<"iwmmxt2", "ARMv5te", [ARMv5te]>; +def XScale : Architecture<"xscale", "ARMv5te", [ARMv5te]>; +def ARMv6j : Architecture<"armv6j", "ARMv7a", [ARMv6]>; +def ARMv7k : Architecture<"armv7k", "ARMv7a", [ARMv7a]>; +def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; + + +//===----------------------------------------------------------------------===// +// ARM processors +// + +// Dummy CPU, used to target architectures +def : ProcNoItin<"generic", []>; + +def : ProcNoItin<"arm8", [ARMv4]>; +def : ProcNoItin<"arm810", [ARMv4]>; +def : ProcNoItin<"strongarm", [ARMv4]>; +def : ProcNoItin<"strongarm110", [ARMv4]>; +def : ProcNoItin<"strongarm1100", [ARMv4]>; +def : ProcNoItin<"strongarm1110", [ARMv4]>; + +def : ProcNoItin<"arm7tdmi", [ARMv4t]>; +def : ProcNoItin<"arm7tdmi-s", [ARMv4t]>; +def : ProcNoItin<"arm710t", [ARMv4t]>; +def : ProcNoItin<"arm720t", [ARMv4t]>; +def : ProcNoItin<"arm9", [ARMv4t]>; +def : ProcNoItin<"arm9tdmi", [ARMv4t]>; +def : ProcNoItin<"arm920", [ARMv4t]>; +def : 
ProcNoItin<"arm920t", [ARMv4t]>; +def : ProcNoItin<"arm922t", [ARMv4t]>; +def : ProcNoItin<"arm940t", [ARMv4t]>; +def : ProcNoItin<"ep9312", [ARMv4t]>; + +def : ProcNoItin<"arm10tdmi", [ARMv5t]>; +def : ProcNoItin<"arm1020t", [ARMv5t]>; + +def : ProcNoItin<"arm9e", [ARMv5te]>; +def : ProcNoItin<"arm926ej-s", [ARMv5te]>; +def : ProcNoItin<"arm946e-s", [ARMv5te]>; +def : ProcNoItin<"arm966e-s", [ARMv5te]>; +def : ProcNoItin<"arm968e-s", [ARMv5te]>; +def : ProcNoItin<"arm10e", [ARMv5te]>; +def : ProcNoItin<"arm1020e", [ARMv5te]>; +def : ProcNoItin<"arm1022e", [ARMv5te]>; +def : ProcNoItin<"xscale", [ARMv5te]>; +def : ProcNoItin<"iwmmxt", [ARMv5te]>; + +def : Processor<"arm1136j-s", ARMV6Itineraries, [ARMv6]>; +def : Processor<"arm1136jf-s", ARMV6Itineraries, [ARMv6, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +def : Processor<"cortex-m0", ARMV6Itineraries, [ARMv6m]>; +def : Processor<"cortex-m0plus", ARMV6Itineraries, [ARMv6m]>; +def : Processor<"cortex-m1", ARMV6Itineraries, [ARMv6m]>; +def : Processor<"sc000", ARMV6Itineraries, [ARMv6m]>; + +def : Processor<"arm1176jz-s", ARMV6Itineraries, [ARMv6kz]>; +def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ARMv6kz, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +def : Processor<"mpcorenovfp", ARMV6Itineraries, [ARMv6k]>; +def : Processor<"mpcore", ARMV6Itineraries, [ARMv6k, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +def : Processor<"arm1156t2-s", ARMV6Itineraries, [ARMv6t2]>; +def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +// FIXME: A5 has currently the same Schedule model as A8 +def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5, + FeatureHasRAS, + FeatureTrustZone, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureMP, + FeatureVFP4]>; + +def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7, + FeatureHasRAS, + FeatureTrustZone, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureMP, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM, + FeatureVirtualization]>; + +def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8, + FeatureHasRAS, + FeatureTrustZone, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, + FeatureT2XtPk]>; + +def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9, + FeatureHasRAS, + FeatureTrustZone, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureFP16, + FeatureAvoidPartialCPSR, + FeatureMP]>; + +// FIXME: A12 has currently the same Schedule model as A9 +def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, + FeatureHasRAS, + FeatureTrustZone, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureVirtualization, + FeatureMP]>; + +// FIXME: A15 has currently the same Schedule model as A9. 
+def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, + FeatureHasRAS, + FeatureTrustZone, + FeatureT2XtPk, + FeatureVFP4, + FeatureMP, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureVirtualization]>; + +// FIXME: A17 has currently the same Schedule model as A9 +def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, + FeatureHasRAS, + FeatureTrustZone, + FeatureMP, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureVirtualization]>; + +// FIXME: krait has currently the same Schedule model as A9 +// FIXME: krait has currently the same features as A9 plus VFP4 and hardware +// division features. +def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait, + FeatureHasRAS, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureFP16, + FeatureAvoidPartialCPSR, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM]>; + +def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, + FeatureHasRAS, + FeatureNEONForFP, + FeatureT2XtPk, + FeatureVFP4, + FeatureMP, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureAvoidMOVsShOp, + FeatureHasSlowFPVMLx]>; + +// FIXME: R4 has currently the same ProcessorModel as A8. +def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4, + FeatureHasRAS, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; + +// FIXME: R4F has currently the same ProcessorModel as A8. +def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, + FeatureHasRAS, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVFP3, + FeatureD16, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; + +// FIXME: R5 has currently the same ProcessorModel as A8. +def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, + FeatureHasRAS, + FeatureVFP3, + FeatureD16, + FeatureSlowFPBrcc, + FeatureHWDivARM, + FeatureHasSlowFPVMLx, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; + +// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5. 
+def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, + FeatureHasRAS, + FeatureVFP3, + FeatureVFPOnlySP, + FeatureD16, + FeatureFP16, + FeatureMP, + FeatureSlowFPBrcc, + FeatureHWDivARM, + FeatureHasSlowFPVMLx, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; + +def : ProcNoItin<"cortex-m3", [ARMv7m]>; +def : ProcNoItin<"sc300", [ARMv7m]>; + +def : ProcNoItin<"cortex-m4", [ARMv7em, + FeatureVFP4, + FeatureVFPOnlySP, + FeatureD16]>; + +def : ProcNoItin<"cortex-m7", [ARMv7em, + FeatureFPARMv8, + FeatureD16]>; + + +def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +def : ProcNoItin<"cortex-a57", [ARMv8a, ProcA57, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +// Cyclone is very similar to swift +def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, + FeatureHasRAS, + FeatureNEONForFP, + FeatureT2XtPk, + FeatureVFP4, + FeatureMP, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureAvoidMOVsShOp, + FeatureHasSlowFPVMLx, + FeatureCrypto, + FeatureZCZeroing]>; + +def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynosM1, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "ARMRegisterInfo.td" + +include "ARMCallingConv.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "ARMInstrInfo.td" + +def ARMInstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// Declare the target which we are implementing +//===----------------------------------------------------------------------===// + +def ARMAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + int PassSubtarget = 1; + int Variant = 0; + bit isMCAsmWriter = 1; +} + +def ARMAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "ARM"; + string BreakCharacters = "."; +} + +def ARM : Target { + // Pull in Instruction Info: + let InstructionSet = ARMInstrInfo; + let AssemblyWriters = [ARMAsmWriter]; + let AssemblyParserVariants = [ARMAsmParserVariant]; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp new file mode 100644 index 0000000..206db96 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -0,0 +1,1906 @@ +//===-- ARMAsmPrinter.cpp - Print machine code to an ARM .s file ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to GAS-format ARM assembly language. 
+// +//===----------------------------------------------------------------------===// + +#include "ARMAsmPrinter.h" +#include "ARM.h" +#include "ARMConstantPoolValue.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMTargetMachine.h" +#include "ARMTargetObjectFile.h" +#include "InstPrinter/ARMInstPrinter.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "MCTargetDesc/ARMMCExpr.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/Mangler.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ARMBuildAttributes.h" +#include "llvm/Support/TargetParser.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include <cctype> +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), AFI(nullptr), MCP(nullptr), + InConstantPool(false), OptimizationGoals(-1) {} + +void ARMAsmPrinter::EmitFunctionBodyEnd() { + // Make sure to terminate any constant pools that were at the end + // of the function. + if (!InConstantPool) + return; + InConstantPool = false; + OutStreamer->EmitDataRegion(MCDR_DataRegionEnd); +} + +void ARMAsmPrinter::EmitFunctionEntryLabel() { + if (AFI->isThumbFunction()) { + OutStreamer->EmitAssemblerFlag(MCAF_Code16); + OutStreamer->EmitThumbFunc(CurrentFnSym); + } + + OutStreamer->EmitLabel(CurrentFnSym); +} + +void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) { + uint64_t Size = getDataLayout().getTypeAllocSize(CV->getType()); + assert(Size && "C++ constructor pointer had zero size!"); + + const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts()); + assert(GV && "C++ constructor pointer was not a GlobalValue!"); + + const MCExpr *E = MCSymbolRefExpr::create(GetARMGVSymbol(GV, + ARMII::MO_NO_FLAG), + (Subtarget->isTargetELF() + ? MCSymbolRefExpr::VK_ARM_TARGET1 + : MCSymbolRefExpr::VK_None), + OutContext); + + OutStreamer->EmitValue(E, Size); +} + +/// runOnMachineFunction - This uses the EmitInstruction() +/// method to print assembly for each instruction. +/// +bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + AFI = MF.getInfo<ARMFunctionInfo>(); + MCP = MF.getConstantPool(); + Subtarget = &MF.getSubtarget<ARMSubtarget>(); + + SetupMachineFunction(MF); + const Function* F = MF.getFunction(); + const TargetMachine& TM = MF.getTarget(); + + // Calculate this function's optimization goal. 
+ unsigned OptimizationGoal; + if (F->hasFnAttribute(Attribute::OptimizeNone)) + // For best debugging illusion, speed and small size sacrificed + OptimizationGoal = 6; + else if (F->optForMinSize()) + // Aggressively for small size, speed and debug illusion sacrificed + OptimizationGoal = 4; + else if (F->optForSize()) + // For small size, but speed and debugging illusion preserved + OptimizationGoal = 3; + else if (TM.getOptLevel() == CodeGenOpt::Aggressive) + // Aggressively for speed, small size and debug illusion sacrificed + OptimizationGoal = 2; + else if (TM.getOptLevel() > CodeGenOpt::None) + // For speed, but small size and good debug illusion preserved + OptimizationGoal = 1; + else // TM.getOptLevel() == CodeGenOpt::None + // For good debugging, but speed and small size preserved + OptimizationGoal = 5; + + // Combine a new optimization goal with existing ones. + if (OptimizationGoals == -1) // uninitialized goals + OptimizationGoals = OptimizationGoal; + else if (OptimizationGoals != (int)OptimizationGoal) // conflicting goals + OptimizationGoals = 0; + + if (Subtarget->isTargetCOFF()) { + bool Internal = F->hasInternalLinkage(); + COFF::SymbolStorageClass Scl = Internal ? COFF::IMAGE_SYM_CLASS_STATIC + : COFF::IMAGE_SYM_CLASS_EXTERNAL; + int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; + + OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); + OutStreamer->EmitCOFFSymbolStorageClass(Scl); + OutStreamer->EmitCOFFSymbolType(Type); + OutStreamer->EndCOFFSymbolDef(); + } + + // Emit the rest of the function body. + EmitFunctionBody(); + + // If we need V4T thumb mode Register Indirect Jump pads, emit them. + // These are created per function, rather than per TU, since it's + // relatively easy to exceed the thumb branch range within a TU. + if (! ThumbIndirectPads.empty()) { + OutStreamer->EmitAssemblerFlag(MCAF_Code16); + EmitAlignment(1); + for (unsigned i = 0, e = ThumbIndirectPads.size(); i < e; i++) { + OutStreamer->EmitLabel(ThumbIndirectPads[i].second); + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX) + .addReg(ThumbIndirectPads[i].first) + // Add predicate operands. + .addImm(ARMCC::AL) + .addReg(0)); + } + ThumbIndirectPads.clear(); + } + + // We didn't modify anything. 
+ return false; +} + +void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNum); + unsigned TF = MO.getTargetFlags(); + + switch (MO.getType()) { + default: llvm_unreachable("<unknown operand type>"); + case MachineOperand::MO_Register: { + unsigned Reg = MO.getReg(); + assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + if(ARM::GPRPairRegClass.contains(Reg)) { + const MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + Reg = TRI->getSubReg(Reg, ARM::gsub_0); + } + O << ARMInstPrinter::getRegisterName(Reg); + break; + } + case MachineOperand::MO_Immediate: { + int64_t Imm = MO.getImm(); + O << '#'; + if (TF == ARMII::MO_LO16) + O << ":lower16:"; + else if (TF == ARMII::MO_HI16) + O << ":upper16:"; + O << Imm; + break; + } + case MachineOperand::MO_MachineBasicBlock: + MO.getMBB()->getSymbol()->print(O, MAI); + return; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + if (TF & ARMII::MO_LO16) + O << ":lower16:"; + else if (TF & ARMII::MO_HI16) + O << ":upper16:"; + GetARMGVSymbol(GV, TF)->print(O, MAI); + + printOffset(MO.getOffset(), O); + if (TF == ARMII::MO_PLT) + O << "(PLT)"; + break; + } + case MachineOperand::MO_ConstantPoolIndex: + GetCPISymbol(MO.getIndex())->print(O, MAI); + break; + } +} + +//===--------------------------------------------------------------------===// + +MCSymbol *ARMAsmPrinter:: +GetARMJTIPICJumpTableLabel(unsigned uid) const { + const DataLayout &DL = getDataLayout(); + SmallString<60> Name; + raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "JTI" + << getFunctionNumber() << '_' << uid; + return OutContext.getOrCreateSymbol(Name); +} + +bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O); + case 'a': // Print as a memory address. + if (MI->getOperand(OpNum).isReg()) { + O << "[" + << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg()) + << "]"; + return false; + } + // Fallthrough + case 'c': // Don't print "#" before an immediate operand. + if (!MI->getOperand(OpNum).isImm()) + return true; + O << MI->getOperand(OpNum).getImm(); + return false; + case 'P': // Print a VFP double precision register. + case 'q': // Print a NEON quad precision register. + printOperand(MI, OpNum, O); + return false; + case 'y': // Print a VFP single precision register as indexed double. + if (MI->getOperand(OpNum).isReg()) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + // Find the 'd' register that has this 's' register as a sub-register, + // and determine the lane number. + for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) { + if (!ARM::DPRRegClass.contains(*SR)) + continue; + bool Lane0 = TRI->getSubReg(*SR, ARM::ssub_0) == Reg; + O << ARMInstPrinter::getRegisterName(*SR) << (Lane0 ? "[0]" : "[1]"); + return false; + } + } + return true; + case 'B': // Bitwise inverse of integer or symbol without a preceding #. 
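+      // e.g. for an immediate operand of 1, "%B0" prints -2 (~1, with no '#').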
+ if (!MI->getOperand(OpNum).isImm()) + return true; + O << ~(MI->getOperand(OpNum).getImm()); + return false; + case 'L': // The low 16 bits of an immediate constant. + if (!MI->getOperand(OpNum).isImm()) + return true; + O << (MI->getOperand(OpNum).getImm() & 0xffff); + return false; + case 'M': { // A register range suitable for LDM/STM. + if (!MI->getOperand(OpNum).isReg()) + return true; + const MachineOperand &MO = MI->getOperand(OpNum); + unsigned RegBegin = MO.getReg(); + // This takes advantage of the 2 operand-ness of ldm/stm and that we've + // already got the operands in registers that are operands to the + // inline asm statement. + O << "{"; + if (ARM::GPRPairRegClass.contains(RegBegin)) { + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0); + O << ARMInstPrinter::getRegisterName(Reg0) << ", "; + RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1); + } + O << ARMInstPrinter::getRegisterName(RegBegin); + + // FIXME: The register allocator not only may not have given us the + // registers in sequence, but may not be in ascending registers. This + // will require changes in the register allocator that'll need to be + // propagated down here if the operands change. + unsigned RegOps = OpNum + 1; + while (MI->getOperand(RegOps).isReg()) { + O << ", " + << ARMInstPrinter::getRegisterName(MI->getOperand(RegOps).getReg()); + RegOps++; + } + + O << "}"; + + return false; + } + case 'R': // The most significant register of a pair. + case 'Q': { // The least significant register of a pair. + if (OpNum == 0) + return true; + const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1); + if (!FlagsOP.isImm()) + return true; + unsigned Flags = FlagsOP.getImm(); + + // This operand may not be the one that actually provides the register. If + // it's tied to a previous one then we should refer instead to that one + // for registers and their classes. + unsigned TiedIdx; + if (InlineAsm::isUseOperandTiedToDef(Flags, TiedIdx)) { + for (OpNum = InlineAsm::MIOp_FirstOperand; TiedIdx; --TiedIdx) { + unsigned OpFlags = MI->getOperand(OpNum).getImm(); + OpNum += InlineAsm::getNumOperandRegisters(OpFlags) + 1; + } + Flags = MI->getOperand(OpNum).getImm(); + + // Later code expects OpNum to be pointing at the register rather than + // the flags. + OpNum += 1; + } + + unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); + unsigned RC; + InlineAsm::hasRegClassConstraint(Flags, RC); + if (RC == ARM::GPRPairRegClassID) { + if (NumVals != 1) + return true; + const MachineOperand &MO = MI->getOperand(OpNum); + if (!MO.isReg()) + return true; + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + unsigned Reg = TRI->getSubReg(MO.getReg(), ExtraCode[0] == 'Q' ? + ARM::gsub_0 : ARM::gsub_1); + O << ARMInstPrinter::getRegisterName(Reg); + return false; + } + if (NumVals != 2) + return true; + unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1; + if (RegOp >= MI->getNumOperands()) + return true; + const MachineOperand &MO = MI->getOperand(RegOp); + if (!MO.isReg()) + return true; + unsigned Reg = MO.getReg(); + O << ARMInstPrinter::getRegisterName(Reg); + return false; + } + + case 'e': // The low doubleword register of a NEON quad register. + case 'f': { // The high doubleword register of a NEON quad register. 
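+      // e.g. if the operand is allocated q1, "%e1" prints d2 and "%f1" prints +      // d3 (the low and high D halves of q1).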
+ if (!MI->getOperand(OpNum).isReg()) + return true; + unsigned Reg = MI->getOperand(OpNum).getReg(); + if (!ARM::QPRRegClass.contains(Reg)) + return true; + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + unsigned SubReg = TRI->getSubReg(Reg, ExtraCode[0] == 'e' ? + ARM::dsub_0 : ARM::dsub_1); + O << ARMInstPrinter::getRegisterName(SubReg); + return false; + } + + // This modifier is not yet supported. + case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1. + return true; + case 'H': { // The highest-numbered register of a pair. + const MachineOperand &MO = MI->getOperand(OpNum); + if (!MO.isReg()) + return true; + const MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + unsigned Reg = MO.getReg(); + if(!ARM::GPRPairRegClass.contains(Reg)) + return false; + Reg = TRI->getSubReg(Reg, ARM::gsub_1); + O << ARMInstPrinter::getRegisterName(Reg); + return false; + } + } + } + + printOperand(MI, OpNum, O); + return false; +} + +bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNum, unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + case 'A': // A memory operand for a VLD1/VST1 instruction. + default: return true; // Unknown modifier. + case 'm': // The base register of a memory operand. + if (!MI->getOperand(OpNum).isReg()) + return true; + O << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg()); + return false; + } + } + + const MachineOperand &MO = MI->getOperand(OpNum); + assert(MO.isReg() && "unexpected inline asm memory operand"); + O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]"; + return false; +} + +static bool isThumb(const MCSubtargetInfo& STI) { + return STI.getFeatureBits()[ARM::ModeThumb]; +} + +void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo, + const MCSubtargetInfo *EndInfo) const { + // If either end mode is unknown (EndInfo == NULL) or different than + // the start mode, then restore the start mode. + const bool WasThumb = isThumb(StartInfo); + if (!EndInfo || WasThumb != isThumb(*EndInfo)) { + OutStreamer->EmitAssemblerFlag(WasThumb ? MCAF_Code16 : MCAF_Code32); + } +} + +void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { + const Triple &TT = TM.getTargetTriple(); + // Use unified assembler syntax. + OutStreamer->EmitAssemblerFlag(MCAF_SyntaxUnified); + + // Emit ARM Build Attributes + if (TT.isOSBinFormatELF()) + emitAttributes(); + + // Use the triple's architecture and subarchitecture to determine + // if we're thumb for the purposes of the top level code16 assembler + // flag. + bool isThumb = TT.getArch() == Triple::thumb || + TT.getArch() == Triple::thumbeb || + TT.getSubArch() == Triple::ARMSubArch_v7m || + TT.getSubArch() == Triple::ARMSubArch_v6m; + if (!M.getModuleInlineAsm().empty() && isThumb) + OutStreamer->EmitAssemblerFlag(MCAF_Code16); +} + +static void +emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, + MachineModuleInfoImpl::StubValueTy &MCSym) { + // L_foo$stub: + OutStreamer.EmitLabel(StubLabel); + // .indirect_symbol _foo + OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol); + + if (MCSym.getInt()) + // External to current translation unit. 
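+    // (The static linker fills in the pointer via the .indirect_symbol
+    // reference above, so a zero placeholder suffices here.)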
+ OutStreamer.EmitIntValue(0, 4/*size*/); + else + // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs; however, sometimes the types are local to the file. + // We need to fill in the value for the NLP in those cases. + OutStreamer.EmitValue( + MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()), + 4 /*size*/); +} + + +void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { + const Triple &TT = TM.getTargetTriple(); + if (TT.isOSBinFormatMachO()) { + // All darwin targets use mach-o. + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + MachineModuleInfoMachO &MMIMacho = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + + // Output non-lazy-pointers for external and common global variables. + MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList(); + + if (!Stubs.empty()) { + // Switch with ".non_lazy_symbol_pointer" directive. + OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); + EmitAlignment(2); + + for (auto &Stub : Stubs) + emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); + + Stubs.clear(); + OutStreamer->AddBlankLine(); + } + + Stubs = MMIMacho.GetHiddenGVStubList(); + if (!Stubs.empty()) { + OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); + EmitAlignment(2); + + for (auto &Stub : Stubs) + emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); + + Stubs.clear(); + OutStreamer->AddBlankLine(); + } + + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never + // generates code that does this, it is always safe to set. + OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + } + + // The last attribute to be emitted is ABI_optimization_goals + MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); + + if (OptimizationGoals > 0 && + (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI())) + ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals); + OptimizationGoals = -1; + + ATS.finishAttributeSection(); +} + +//===----------------------------------------------------------------------===// +// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile() +// FIXME: +// The following seem like one-off assembler flags, but they actually need +// to appear in the .ARM.attributes section in ELF. +// Instead of subclassing the MCELFStreamer, we do the work here. 
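+//
+// In assembly output the target streamer renders these as .eabi_attribute
+// directives; an illustrative (not exhaustive) sequence for a Cortex-A9 ELF
+// target might look like:
+//   .cpu    cortex-a9
+//   .eabi_attribute 6, 10   @ Tag_CPU_arch = v7
+//   .eabi_attribute 7, 65   @ Tag_CPU_arch_profile = 'A' (application)
+//   .eabi_attribute 8, 1    @ Tag_ARM_ISA_use = Allowed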
+ +static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU, + const ARMSubtarget *Subtarget) { + if (CPU == "xscale") + return ARMBuildAttrs::v5TEJ; + + if (Subtarget->hasV8Ops()) + return ARMBuildAttrs::v8; + else if (Subtarget->hasV7Ops()) { + if (Subtarget->isMClass() && Subtarget->hasDSP()) + return ARMBuildAttrs::v7E_M; + return ARMBuildAttrs::v7; + } else if (Subtarget->hasV6T2Ops()) + return ARMBuildAttrs::v6T2; + else if (Subtarget->hasV6MOps()) + return ARMBuildAttrs::v6S_M; + else if (Subtarget->hasV6Ops()) + return ARMBuildAttrs::v6; + else if (Subtarget->hasV5TEOps()) + return ARMBuildAttrs::v5TE; + else if (Subtarget->hasV5TOps()) + return ARMBuildAttrs::v5T; + else if (Subtarget->hasV4TOps()) + return ARMBuildAttrs::v4T; + else + return ARMBuildAttrs::v4; +} + +void ARMAsmPrinter::emitAttributes() { + MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); + + ATS.emitTextAttribute(ARMBuildAttrs::conformance, "2.09"); + + ATS.switchVendor("aeabi"); + + // Compute ARM ELF Attributes based on the default subtarget that + // we'd have constructed. The existing ARM behavior isn't LTO clean + // anyhow. + // FIXME: For ifunc related functions we could iterate over and look + // for a feature string that doesn't match the default one. + const Triple &TT = TM.getTargetTriple(); + StringRef CPU = TM.getTargetCPU(); + StringRef FS = TM.getTargetFeatureString(); + std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU); + if (!FS.empty()) { + if (!ArchFS.empty()) + ArchFS = (Twine(ArchFS) + "," + FS).str(); + else + ArchFS = FS; + } + const ARMBaseTargetMachine &ATM = + static_cast<const ARMBaseTargetMachine &>(TM); + const ARMSubtarget STI(TT, CPU, ArchFS, ATM, ATM.isLittleEndian()); + + std::string CPUString = STI.getCPUString(); + + if (CPUString.find("generic") != 0) { //CPUString doesn't start with "generic" + // FIXME: remove krait check when GNU tools support krait cpu + if (STI.isKrait()) { + ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9"); + // We consider krait as a "cortex-a9" + hwdiv CPU + // Enable hwdiv through ".arch_extension idiv" + if (STI.hasDivide() || STI.hasDivideInARMMode()) + ATS.emitArchExtension(ARM::AEK_HWDIV | ARM::AEK_HWDIVARM); + } else + ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString); + } + + ATS.emitAttribute(ARMBuildAttrs::CPU_arch, getArchForCPU(CPUString, &STI)); + + // Tag_CPU_arch_profile must have the default value of 0 when "Architecture + // profile is not applicable (e.g. pre v7, or cross-profile code)". + if (STI.hasV7Ops()) { + if (STI.isAClass()) { + ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile, + ARMBuildAttrs::ApplicationProfile); + } else if (STI.isRClass()) { + ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile, + ARMBuildAttrs::RealTimeProfile); + } else if (STI.isMClass()) { + ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile, + ARMBuildAttrs::MicroControllerProfile); + } + } + + ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use, + STI.hasARMOps() ? 
ARMBuildAttrs::Allowed + : ARMBuildAttrs::Not_Allowed); + if (STI.isThumb1Only()) { + ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, ARMBuildAttrs::Allowed); + } else if (STI.hasThumb2()) { + ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, + ARMBuildAttrs::AllowThumb32); + } + + if (STI.hasNEON()) { + /* NEON is not exactly a VFP architecture, but GAS emit one of + * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */ + if (STI.hasFPARMv8()) { + if (STI.hasCrypto()) + ATS.emitFPU(ARM::FK_CRYPTO_NEON_FP_ARMV8); + else + ATS.emitFPU(ARM::FK_NEON_FP_ARMV8); + } else if (STI.hasVFP4()) + ATS.emitFPU(ARM::FK_NEON_VFPV4); + else + ATS.emitFPU(STI.hasFP16() ? ARM::FK_NEON_FP16 : ARM::FK_NEON); + // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture + if (STI.hasV8Ops()) + ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch, + STI.hasV8_1aOps() ? ARMBuildAttrs::AllowNeonARMv8_1a: + ARMBuildAttrs::AllowNeonARMv8); + } else { + if (STI.hasFPARMv8()) + // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one + // FPU, but there are two different names for it depending on the CPU. + ATS.emitFPU(STI.hasD16() + ? (STI.isFPOnlySP() ? ARM::FK_FPV5_SP_D16 : ARM::FK_FPV5_D16) + : ARM::FK_FP_ARMV8); + else if (STI.hasVFP4()) + ATS.emitFPU(STI.hasD16() + ? (STI.isFPOnlySP() ? ARM::FK_FPV4_SP_D16 : ARM::FK_VFPV4_D16) + : ARM::FK_VFPV4); + else if (STI.hasVFP3()) + ATS.emitFPU(STI.hasD16() + // +d16 + ? (STI.isFPOnlySP() + ? (STI.hasFP16() ? ARM::FK_VFPV3XD_FP16 : ARM::FK_VFPV3XD) + : (STI.hasFP16() ? ARM::FK_VFPV3_D16_FP16 : ARM::FK_VFPV3_D16)) + // -d16 + : (STI.hasFP16() ? ARM::FK_VFPV3_FP16 : ARM::FK_VFPV3)); + else if (STI.hasVFP2()) + ATS.emitFPU(ARM::FK_VFPV2); + } + + if (TM.getRelocationModel() == Reloc::PIC_) { + // PIC specific attributes. + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RW_data, + ARMBuildAttrs::AddressRWPCRel); + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RO_data, + ARMBuildAttrs::AddressROPCRel); + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use, + ARMBuildAttrs::AddressGOT); + } else { + // Allow direct addressing of imported data for all other relocation models. + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use, + ARMBuildAttrs::AddressDirect); + } + + // Signal various FP modes. + if (!TM.Options.UnsafeFPMath) { + ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, + ARMBuildAttrs::IEEEDenormals); + ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions, ARMBuildAttrs::Allowed); + + // If the user has permitted this code to choose the IEEE 754 + // rounding at run-time, emit the rounding attribute. + if (TM.Options.HonorSignDependentRoundingFPMathOption) + ATS.emitAttribute(ARMBuildAttrs::ABI_FP_rounding, ARMBuildAttrs::Allowed); + } else { + if (!STI.hasVFP2()) { + // When the target doesn't have an FPU (by design or + // intention), the assumptions made on the software support + // mirror that of the equivalent hardware support *if it + // existed*. For v7 and better we indicate that denormals are + // flushed preserving sign, and for V6 we indicate that + // denormals are flushed to positive zero. + if (STI.hasV7Ops()) + ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, + ARMBuildAttrs::PreserveFPSign); + } else if (STI.hasVFP3()) { + // In VFPv4, VFPv4U, VFPv3, or VFPv3U, it is preserved. That is, + // the sign bit of the zero matches the sign bit of the input or + // result that is being flushed to zero. 
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, + ARMBuildAttrs::PreserveFPSign); + } + // For VFPv2 implementations it is implementation defined as + // to whether denormals are flushed to positive zero or to + // whatever the sign of zero is (ARM v7AR ARM 2.7.5). Historically + // LLVM has chosen to flush this to positive zero (most likely for + // GCC compatibility), so that's the chosen value here (the + // absence of its emission implies zero). + } + + // TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath is the + // equivalent of GCC's -ffinite-math-only flag. + if (TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath) + ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model, + ARMBuildAttrs::Allowed); + else + ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model, + ARMBuildAttrs::AllowIEE754); + + if (STI.allowsUnalignedMem()) + ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access, + ARMBuildAttrs::Allowed); + else + ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access, + ARMBuildAttrs::Not_Allowed); + + // FIXME: add more flags to ARMBuildAttributes.h + // 8-bytes alignment stuff. + ATS.emitAttribute(ARMBuildAttrs::ABI_align_needed, 1); + ATS.emitAttribute(ARMBuildAttrs::ABI_align_preserved, 1); + + // ABI_HardFP_use attribute to indicate single precision FP. + if (STI.isFPOnlySP()) + ATS.emitAttribute(ARMBuildAttrs::ABI_HardFP_use, + ARMBuildAttrs::HardFPSinglePrecision); + + // Hard float. Use both S and D registers and conform to AAPCS-VFP. + if (STI.isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard) + ATS.emitAttribute(ARMBuildAttrs::ABI_VFP_args, ARMBuildAttrs::HardFPAAPCS); + + // FIXME: Should we signal R9 usage? + + if (STI.hasFP16()) + ATS.emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP); + + // FIXME: To support emitting this build attribute as GCC does, the + // -mfp16-format option and associated plumbing must be + // supported. For now the __fp16 type is exposed by default, so this + // attribute should be emitted with value 1. + ATS.emitAttribute(ARMBuildAttrs::ABI_FP_16bit_format, + ARMBuildAttrs::FP16FormatIEEE); + + if (STI.hasMPExtension()) + ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP); + + // Hardware divide in ARM mode is part of base arch, starting from ARMv8. + // If only Thumb hwdiv is present, it must also be in base arch (ARMv7-R/M). + // It is not possible to produce DisallowDIV: if hwdiv is present in the base + // arch, supplying -hwdiv downgrades the effective arch, via ClearImpliedBits. + // AllowDIVExt is only emitted if hwdiv isn't available in the base arch; + // otherwise, the default value (AllowDIVIfExists) applies. + if (STI.hasDivideInARMMode() && !STI.hasV8Ops()) + ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt); + + if (MMI) { + if (const Module *SourceModule = MMI->getModule()) { + // ABI_PCS_wchar_t to indicate wchar_t width + // FIXME: There is no way to emit value 0 (wchar_t prohibited). + if (auto WCharWidthValue = mdconst::extract_or_null<ConstantInt>( + SourceModule->getModuleFlag("wchar_size"))) { + int WCharWidth = WCharWidthValue->getZExtValue(); + assert((WCharWidth == 2 || WCharWidth == 4) && + "wchar_t width must be 2 or 4 bytes"); + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_wchar_t, WCharWidth); + } + + // ABI_enum_size to indicate enum width + // FIXME: There is no way to emit value 0 (enums prohibited) or value 3 + // (all enums contain a value needing 32 bits to encode). 
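+      // Both values reach us as IR module flags emitted by the frontend,
+      // e.g. (illustrative): !{i32 1, !"wchar_size", i32 4} and
+      // !{i32 1, !"min_enum_size", i32 4}.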
+ if (auto EnumWidthValue = mdconst::extract_or_null<ConstantInt>( + SourceModule->getModuleFlag("min_enum_size"))) { + int EnumWidth = EnumWidthValue->getZExtValue(); + assert((EnumWidth == 1 || EnumWidth == 4) && + "Minimum enum width must be 1 or 4 bytes"); + int EnumBuildAttr = EnumWidth == 1 ? 1 : 2; + ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr); + } + } + } + + // TODO: We currently only support either reserving the register, or treating + // it as another callee-saved register, but not as SB or a TLS pointer; It + // would instead be nicer to push this from the frontend as metadata, as we do + // for the wchar and enum size tags + if (STI.isR9Reserved()) + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use, ARMBuildAttrs::R9Reserved); + else + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use, ARMBuildAttrs::R9IsGPR); + + if (STI.hasTrustZone() && STI.hasVirtualization()) + ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, + ARMBuildAttrs::AllowTZVirtualization); + else if (STI.hasTrustZone()) + ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, + ARMBuildAttrs::AllowTZ); + else if (STI.hasVirtualization()) + ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, + ARMBuildAttrs::AllowVirtualization); +} + +//===----------------------------------------------------------------------===// + +static MCSymbol *getPICLabel(const char *Prefix, unsigned FunctionNumber, + unsigned LabelId, MCContext &Ctx) { + + MCSymbol *Label = Ctx.getOrCreateSymbol(Twine(Prefix) + + "PC" + Twine(FunctionNumber) + "_" + Twine(LabelId)); + return Label; +} + +static MCSymbolRefExpr::VariantKind +getModifierVariantKind(ARMCP::ARMCPModifier Modifier) { + switch (Modifier) { + case ARMCP::no_modifier: return MCSymbolRefExpr::VK_None; + case ARMCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD; + case ARMCP::TPOFF: return MCSymbolRefExpr::VK_TPOFF; + case ARMCP::GOTTPOFF: return MCSymbolRefExpr::VK_GOTTPOFF; + case ARMCP::GOT_PREL: return MCSymbolRefExpr::VK_ARM_GOT_PREL; + } + llvm_unreachable("Invalid ARMCPModifier!"); +} + +MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, + unsigned char TargetFlags) { + if (Subtarget->isTargetMachO()) { + bool IsIndirect = (TargetFlags & ARMII::MO_NONLAZY) && + Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()); + + if (!IsIndirect) + return getSymbol(GV); + + // FIXME: Remove this when Darwin transition to @GOT like syntax. + MCSymbol *MCSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoMachO &MMIMachO = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + MachineModuleInfoImpl::StubValueTy &StubSym = + GV->hasHiddenVisibility() ? 
MMIMachO.getHiddenGVStubEntry(MCSym) + : MMIMachO.getGVStubEntry(MCSym); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV), + !GV->hasInternalLinkage()); + return MCSym; + } else if (Subtarget->isTargetCOFF()) { + assert(Subtarget->isTargetWindows() && + "Windows is the only supported COFF target"); + + bool IsIndirect = (TargetFlags & ARMII::MO_DLLIMPORT); + if (!IsIndirect) + return getSymbol(GV); + + SmallString<128> Name; + Name = "__imp_"; + getNameWithPrefix(Name, GV); + + return OutContext.getOrCreateSymbol(Name); + } else if (Subtarget->isTargetELF()) { + return getSymbol(GV); + } + llvm_unreachable("unexpected target"); +} + +void ARMAsmPrinter:: +EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { + const DataLayout &DL = getDataLayout(); + int Size = DL.getTypeAllocSize(MCPV->getType()); + + ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV); + + MCSymbol *MCSym; + if (ACPV->isLSDA()) { + MCSym = getCurExceptionSym(); + } else if (ACPV->isBlockAddress()) { + const BlockAddress *BA = + cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress(); + MCSym = GetBlockAddressSymbol(BA); + } else if (ACPV->isGlobalValue()) { + const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV(); + + // On Darwin, const-pool entries may get the "FOO$non_lazy_ptr" mangling, so + // flag the global as MO_NONLAZY. + unsigned char TF = Subtarget->isTargetMachO() ? ARMII::MO_NONLAZY : 0; + MCSym = GetARMGVSymbol(GV, TF); + } else if (ACPV->isMachineBasicBlock()) { + const MachineBasicBlock *MBB = cast<ARMConstantPoolMBB>(ACPV)->getMBB(); + MCSym = MBB->getSymbol(); + } else { + assert(ACPV->isExtSymbol() && "unrecognized constant pool value"); + const char *Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol(); + MCSym = GetExternalSymbolSymbol(Sym); + } + + // Create an MCSymbol for the reference. + const MCExpr *Expr = + MCSymbolRefExpr::create(MCSym, getModifierVariantKind(ACPV->getModifier()), + OutContext); + + if (ACPV->getPCAdjustment()) { + MCSymbol *PCLabel = + getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + ACPV->getLabelId(), OutContext); + const MCExpr *PCRelExpr = MCSymbolRefExpr::create(PCLabel, OutContext); + PCRelExpr = + MCBinaryExpr::createAdd(PCRelExpr, + MCConstantExpr::create(ACPV->getPCAdjustment(), + OutContext), + OutContext); + if (ACPV->mustAddCurrentAddress()) { + // We want "(<expr> - .)", but MC doesn't have a concept of the '.' + // label, so just emit a local label end reference that instead. + MCSymbol *DotSym = OutContext.createTempSymbol(); + OutStreamer->EmitLabel(DotSym); + const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext); + PCRelExpr = MCBinaryExpr::createSub(PCRelExpr, DotExpr, OutContext); + } + Expr = MCBinaryExpr::createSub(Expr, PCRelExpr, OutContext); + } + OutStreamer->EmitValue(Expr, Size); +} + +void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) { + const MachineOperand &MO1 = MI->getOperand(1); + unsigned JTI = MO1.getIndex(); + + // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for + // ARM mode tables. + EmitAlignment(2); + + // Emit a label for the jump table. + MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); + OutStreamer->EmitLabel(JTISymbol); + + // Mark the jump table as data-in-code. + OutStreamer->EmitDataRegion(MCDR_DataRegionJT32); + + // Emit each entry of the table. 
+ const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + + for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { + MachineBasicBlock *MBB = JTBBs[i]; + // Construct an MCExpr for the entry. We want a value of the form: + // (BasicBlockAddr - TableBeginAddr) + // + // For example, a table with entries jumping to basic blocks BB0 and BB1 + // would look like: + // LJTI_0_0: + // .word (LBB0 - LJTI_0_0) + // .word (LBB1 - LJTI_0_0) + const MCExpr *Expr = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); + + if (TM.getRelocationModel() == Reloc::PIC_) + Expr = MCBinaryExpr::createSub(Expr, MCSymbolRefExpr::create(JTISymbol, + OutContext), + OutContext); + // If we're generating a table of Thumb addresses in static relocation + // model, we need to add one to keep interworking correctly. + else if (AFI->isThumbFunction()) + Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(1,OutContext), + OutContext); + OutStreamer->EmitValue(Expr, 4); + } + // Mark the end of jump table data-in-code region. + OutStreamer->EmitDataRegion(MCDR_DataRegionEnd); +} + +void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) { + const MachineOperand &MO1 = MI->getOperand(1); + unsigned JTI = MO1.getIndex(); + + MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); + OutStreamer->EmitLabel(JTISymbol); + + // Emit each entry of the table. + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + + for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { + MachineBasicBlock *MBB = JTBBs[i]; + const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::create(MBB->getSymbol(), + OutContext); + // If this isn't a TBB or TBH, the entries are direct branch instructions. + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2B) + .addExpr(MBBSymbolExpr) + .addImm(ARMCC::AL) + .addReg(0)); + } +} + +void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI, + unsigned OffsetWidth) { + assert((OffsetWidth == 1 || OffsetWidth == 2) && "invalid tbb/tbh width"); + const MachineOperand &MO1 = MI->getOperand(1); + unsigned JTI = MO1.getIndex(); + + MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); + OutStreamer->EmitLabel(JTISymbol); + + // Emit each entry of the table. + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + + // Mark the jump table as data-in-code. + OutStreamer->EmitDataRegion(OffsetWidth == 1 ? MCDR_DataRegionJT8 + : MCDR_DataRegionJT16); + + for (auto MBB : JTBBs) { + const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::create(MBB->getSymbol(), + OutContext); + // Otherwise it's an offset from the dispatch instruction. Construct an + // MCExpr for the entry. We want a value of the form: + // (BasicBlockAddr - TBBInstAddr + 4) / 2 + // + // For example, a TBB table with entries jumping to basic blocks BB0 and BB1 + // would look like: + // LJTI_0_0: + // .byte (LBB0 - (LCPI0_0 + 4)) / 2 + // .byte (LBB1 - (LCPI0_0 + 4)) / 2 + // where LCPI0_0 is a label defined just before the TBB instruction using + // this table. 
+    MCSymbol *TBInstPC = GetCPISymbol(MI->getOperand(0).getImm());
+    const MCExpr *Expr = MCBinaryExpr::createAdd(
+        MCSymbolRefExpr::create(TBInstPC, OutContext),
+        MCConstantExpr::create(4, OutContext), OutContext);
+    Expr = MCBinaryExpr::createSub(MBBSymbolExpr, Expr, OutContext);
+    Expr = MCBinaryExpr::createDiv(Expr, MCConstantExpr::create(2, OutContext),
+                                   OutContext);
+    OutStreamer->EmitValue(Expr, OffsetWidth);
+  }
+  // Mark the end of the jump table data-in-code region. 32-bit offsets use
+  // actual branch instructions here, so we don't mark those as a data region
+  // at all.
+  OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+
+  // Make sure the next instruction is 2-byte aligned.
+  EmitAlignment(1);
+}
+
+void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
+  assert(MI->getFlag(MachineInstr::FrameSetup) &&
+         "Only instructions involved in frame setup code are allowed");
+
+  MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+  const MachineFunction &MF = *MI->getParent()->getParent();
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+  const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>();
+
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned Opc = MI->getOpcode();
+  unsigned SrcReg, DstReg;
+
+  if (Opc == ARM::tPUSH || Opc == ARM::tLDRpci) {
+    // Two special cases:
+    // 1) tPUSH does not have src/dst regs.
+    // 2) for Thumb1 code we sometimes materialize the constant via a
+    //    constpool load. Yes, this is pretty fragile, but for now I don't
+    //    see a better way... :(
+    SrcReg = DstReg = ARM::SP;
+  } else {
+    SrcReg = MI->getOperand(1).getReg();
+    DstReg = MI->getOperand(0).getReg();
+  }
+
+  // Try to figure out the unwinding opcode from the src/dst regs.
+  if (MI->mayStore()) {
+    // Register saves.
+    assert(DstReg == ARM::SP &&
+           "Only stack pointer as a destination reg is supported");
+
+    SmallVector<unsigned, 4> RegList;
+    // Skip src & dst reg, and pred ops.
+    unsigned StartOp = 2 + 2;
+    // Use all the operands.
+    unsigned NumOffset = 0;
+
+    switch (Opc) {
+    default:
+      MI->dump();
+      llvm_unreachable("Unsupported opcode for unwinding information");
+    case ARM::tPUSH:
+      // Special case here: no src & dst reg, but two extra imp ops.
+      StartOp = 2; NumOffset = 2;
+      // Fall through to the multi-register store handling below.
+    case ARM::STMDB_UPD:
+    case ARM::t2STMDB_UPD:
+    case ARM::VSTMDDB_UPD:
+      assert(SrcReg == ARM::SP &&
+             "Only stack pointer as a source reg is supported");
+      for (unsigned i = StartOp, NumOps = MI->getNumOperands() - NumOffset;
+           i != NumOps; ++i) {
+        const MachineOperand &MO = MI->getOperand(i);
+        // Actually, there should never be any impdef stuff here. Skip it
+        // temporarily to work around PR11902.
+        if (MO.isImplicit())
+          continue;
+        RegList.push_back(MO.getReg());
+      }
+      break;
+    case ARM::STR_PRE_IMM:
+    case ARM::STR_PRE_REG:
+    case ARM::t2STR_PRE:
+      assert(MI->getOperand(2).getReg() == ARM::SP &&
+             "Only stack pointer as a source reg is supported");
+      RegList.push_back(SrcReg);
+      break;
+    }
+    if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM)
+      ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
+  } else {
+    // Changes of stack / frame pointer.
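+    // For example (assuming EHABI assembly output):
+    //   sub sp, sp, #16    -> .pad   #16
+    //   mov r11, sp        -> .setfp r11, sp
+    //   add r11, sp, #8    -> .setfp r11, sp, #8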
+    if (SrcReg == ARM::SP) {
+      int64_t Offset = 0;
+      switch (Opc) {
+      default:
+        MI->dump();
+        llvm_unreachable("Unsupported opcode for unwinding information");
+      case ARM::MOVr:
+      case ARM::tMOVr:
+        Offset = 0;
+        break;
+      case ARM::ADDri:
+      case ARM::t2ADDri:
+        Offset = -MI->getOperand(2).getImm();
+        break;
+      case ARM::SUBri:
+      case ARM::t2SUBri:
+        Offset = MI->getOperand(2).getImm();
+        break;
+      case ARM::tSUBspi:
+        Offset = MI->getOperand(2).getImm() * 4;
+        break;
+      case ARM::tADDspi:
+      case ARM::tADDrSPi:
+        Offset = -MI->getOperand(2).getImm() * 4;
+        break;
+      case ARM::tLDRpci: {
+        // Grab the constpool index and check whether it corresponds to an
+        // original or a cloned constpool entry.
+        unsigned CPI = MI->getOperand(1).getIndex();
+        const MachineConstantPool *MCP = MF.getConstantPool();
+        if (CPI >= MCP->getConstants().size())
+          CPI = AFI.getOriginalCPIdx(CPI);
+        assert(CPI != -1U && "Invalid constpool index");
+
+        // Derive the actual offset.
+        const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI];
+        assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry");
+        // FIXME: Check the user; it should be an "add" instruction!
+        Offset = -cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue();
+        break;
+      }
+      }
+
+      if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) {
+        if (DstReg == FramePtr && FramePtr != ARM::SP)
+          // Set-up of the frame pointer. Positive values correspond to "add"
+          // instruction.
+          ATS.emitSetFP(FramePtr, ARM::SP, -Offset);
+        else if (DstReg == ARM::SP) {
+          // Change of SP by an offset. Positive values correspond to "sub"
+          // instruction.
+          ATS.emitPad(Offset);
+        } else {
+          // Move of SP to a register. Positive values correspond to an "add"
+          // instruction.
+          ATS.emitMovSP(DstReg, -Offset);
+        }
+      }
+    } else {
+      // Any other source register is unexpected in frame setup code,
+      // regardless of the destination.
+      MI->dump();
+      llvm_unreachable("Unsupported opcode for unwinding information");
+    }
+  }
+}
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
+#include "ARMGenMCPseudoLowering.inc"
+
+void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  const DataLayout &DL = getDataLayout();
+
+  // If we just ended a constant pool, mark it as such.
+  if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
+    OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+    InConstantPool = false;
+  }
+
+  // Emit unwinding info for frame-related instructions.
+  if (Subtarget->isTargetEHABICompatible() &&
+      MI->getFlag(MachineInstr::FrameSetup))
+    EmitUnwindingInstruction(MI);
+
+  // Do any auto-generated pseudo lowerings.
+  if (emitPseudoExpansionLowering(*OutStreamer, MI))
+    return;
+
+  assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
+         "Pseudo flag setting opcode should be expanded early");
+
+  // Check for manual lowerings.
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  case ARM::t2MOVi32imm: llvm_unreachable("Should be lowered by thumb2it pass");
+  case ARM::DBG_VALUE: llvm_unreachable("Should be handled by generic printing");
+  case ARM::LEApcrel:
+  case ARM::tLEApcrel:
+  case ARM::t2LEApcrel: {
+    // FIXME: Need to also handle globals and externals.
+    MCSymbol *CPISymbol = GetCPISymbol(MI->getOperand(1).getIndex());
+    EmitToStreamer(*OutStreamer, MCInstBuilder(MI->getOpcode() ==
+                                               ARM::t2LEApcrel ? ARM::t2ADR
+                   : (MI->getOpcode() == ARM::tLEApcrel ?
ARM::tADR + : ARM::ADR)) + .addReg(MI->getOperand(0).getReg()) + .addExpr(MCSymbolRefExpr::create(CPISymbol, OutContext)) + // Add predicate operands. + .addImm(MI->getOperand(2).getImm()) + .addReg(MI->getOperand(3).getReg())); + return; + } + case ARM::LEApcrelJT: + case ARM::tLEApcrelJT: + case ARM::t2LEApcrelJT: { + MCSymbol *JTIPICSymbol = + GetARMJTIPICJumpTableLabel(MI->getOperand(1).getIndex()); + EmitToStreamer(*OutStreamer, MCInstBuilder(MI->getOpcode() == + ARM::t2LEApcrelJT ? ARM::t2ADR + : (MI->getOpcode() == ARM::tLEApcrelJT ? ARM::tADR + : ARM::ADR)) + .addReg(MI->getOperand(0).getReg()) + .addExpr(MCSymbolRefExpr::create(JTIPICSymbol, OutContext)) + // Add predicate operands. + .addImm(MI->getOperand(2).getImm()) + .addReg(MI->getOperand(3).getReg())); + return; + } + // Darwin call instructions are just normal call instructions with different + // clobber semantics (they clobber R9). + case ARM::BX_CALL: { + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVr) + .addReg(ARM::LR) + .addReg(ARM::PC) + // Add predicate operands. + .addImm(ARMCC::AL) + .addReg(0) + // Add 's' bit operand (always reg0 for this) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::BX) + .addReg(MI->getOperand(0).getReg())); + return; + } + case ARM::tBX_CALL: { + if (Subtarget->hasV5TOps()) + llvm_unreachable("Expected BLX to be selected for v5t+"); + + // On ARM v4t, when doing a call from thumb mode, we need to ensure + // that the saved lr has its LSB set correctly (the arch doesn't + // have blx). + // So here we generate a bl to a small jump pad that does bx rN. + // The jump pads are emitted after the function body. + + unsigned TReg = MI->getOperand(0).getReg(); + MCSymbol *TRegSym = nullptr; + for (unsigned i = 0, e = ThumbIndirectPads.size(); i < e; i++) { + if (ThumbIndirectPads[i].first == TReg) { + TRegSym = ThumbIndirectPads[i].second; + break; + } + } + + if (!TRegSym) { + TRegSym = OutContext.createTempSymbol(); + ThumbIndirectPads.push_back(std::make_pair(TReg, TRegSym)); + } + + // Create a link-saving branch to the Reg Indirect Jump Pad. + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBL) + // Predicate comes first here. + .addImm(ARMCC::AL).addReg(0) + .addExpr(MCSymbolRefExpr::create(TRegSym, OutContext))); + return; + } + case ARM::BMOVPCRX_CALL: { + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVr) + .addReg(ARM::LR) + .addReg(ARM::PC) + // Add predicate operands. + .addImm(ARMCC::AL) + .addReg(0) + // Add 's' bit operand (always reg0 for this) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVr) + .addReg(ARM::PC) + .addReg(MI->getOperand(0).getReg()) + // Add predicate operands. + .addImm(ARMCC::AL) + .addReg(0) + // Add 's' bit operand (always reg0 for this) + .addReg(0)); + return; + } + case ARM::BMOVPCB_CALL: { + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVr) + .addReg(ARM::LR) + .addReg(ARM::PC) + // Add predicate operands. + .addImm(ARMCC::AL) + .addReg(0) + // Add 's' bit operand (always reg0 for this) + .addReg(0)); + + const MachineOperand &Op = MI->getOperand(0); + const GlobalValue *GV = Op.getGlobal(); + const unsigned TF = Op.getTargetFlags(); + MCSymbol *GVSym = GetARMGVSymbol(GV, TF); + const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::Bcc) + .addExpr(GVSymExpr) + // Add predicate operands. 
+ .addImm(ARMCC::AL) + .addReg(0)); + return; + } + case ARM::MOVi16_ga_pcrel: + case ARM::t2MOVi16_ga_pcrel: { + MCInst TmpInst; + TmpInst.setOpcode(Opc == ARM::MOVi16_ga_pcrel? ARM::MOVi16 : ARM::t2MOVi16); + TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); + + unsigned TF = MI->getOperand(1).getTargetFlags(); + const GlobalValue *GV = MI->getOperand(1).getGlobal(); + MCSymbol *GVSym = GetARMGVSymbol(GV, TF); + const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext); + + MCSymbol *LabelSym = + getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + MI->getOperand(2).getImm(), OutContext); + const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext); + unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4; + const MCExpr *PCRelExpr = + ARMMCExpr::createLower16(MCBinaryExpr::createSub(GVSymExpr, + MCBinaryExpr::createAdd(LabelSymExpr, + MCConstantExpr::create(PCAdj, OutContext), + OutContext), OutContext), OutContext); + TmpInst.addOperand(MCOperand::createExpr(PCRelExpr)); + + // Add predicate operands. + TmpInst.addOperand(MCOperand::createImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::createReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::createReg(0)); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case ARM::MOVTi16_ga_pcrel: + case ARM::t2MOVTi16_ga_pcrel: { + MCInst TmpInst; + TmpInst.setOpcode(Opc == ARM::MOVTi16_ga_pcrel + ? ARM::MOVTi16 : ARM::t2MOVTi16); + TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::createReg(MI->getOperand(1).getReg())); + + unsigned TF = MI->getOperand(2).getTargetFlags(); + const GlobalValue *GV = MI->getOperand(2).getGlobal(); + MCSymbol *GVSym = GetARMGVSymbol(GV, TF); + const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext); + + MCSymbol *LabelSym = + getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + MI->getOperand(3).getImm(), OutContext); + const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext); + unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4; + const MCExpr *PCRelExpr = + ARMMCExpr::createUpper16(MCBinaryExpr::createSub(GVSymExpr, + MCBinaryExpr::createAdd(LabelSymExpr, + MCConstantExpr::create(PCAdj, OutContext), + OutContext), OutContext), OutContext); + TmpInst.addOperand(MCOperand::createExpr(PCRelExpr)); + // Add predicate operands. + TmpInst.addOperand(MCOperand::createImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::createReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::createReg(0)); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case ARM::tPICADD: { + // This is a pseudo op for a label + instruction sequence, which looks like: + // LPC0: + // add r0, pc + // This adds the address of LPC0 to r0. + + // Emit the label. + OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), + getFunctionNumber(), + MI->getOperand(2).getImm(), OutContext)); + + // Form and emit the add. + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(0).getReg()) + .addReg(ARM::PC) + // Add predicate operands. + .addImm(ARMCC::AL) + .addReg(0)); + return; + } + case ARM::PICADD: { + // This is a pseudo op for a label + instruction sequence, which looks like: + // LPC0: + // add r0, pc, r0 + // This adds the address of LPC0 to r0. + + // Emit the label. 
+ OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), + getFunctionNumber(), + MI->getOperand(2).getImm(), OutContext)); + + // Form and emit the add. + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDrr) + .addReg(MI->getOperand(0).getReg()) + .addReg(ARM::PC) + .addReg(MI->getOperand(1).getReg()) + // Add predicate operands. + .addImm(MI->getOperand(3).getImm()) + .addReg(MI->getOperand(4).getReg()) + // Add 's' bit operand (always reg0 for this) + .addReg(0)); + return; + } + case ARM::PICSTR: + case ARM::PICSTRB: + case ARM::PICSTRH: + case ARM::PICLDR: + case ARM::PICLDRB: + case ARM::PICLDRH: + case ARM::PICLDRSB: + case ARM::PICLDRSH: { + // This is a pseudo op for a label + instruction sequence, which looks like: + // LPC0: + // OP r0, [pc, r0] + // The LCP0 label is referenced by a constant pool entry in order to get + // a PC-relative address at the ldr instruction. + + // Emit the label. + OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), + getFunctionNumber(), + MI->getOperand(2).getImm(), OutContext)); + + // Form and emit the load + unsigned Opcode; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + case ARM::PICSTR: Opcode = ARM::STRrs; break; + case ARM::PICSTRB: Opcode = ARM::STRBrs; break; + case ARM::PICSTRH: Opcode = ARM::STRH; break; + case ARM::PICLDR: Opcode = ARM::LDRrs; break; + case ARM::PICLDRB: Opcode = ARM::LDRBrs; break; + case ARM::PICLDRH: Opcode = ARM::LDRH; break; + case ARM::PICLDRSB: Opcode = ARM::LDRSB; break; + case ARM::PICLDRSH: Opcode = ARM::LDRSH; break; + } + EmitToStreamer(*OutStreamer, MCInstBuilder(Opcode) + .addReg(MI->getOperand(0).getReg()) + .addReg(ARM::PC) + .addReg(MI->getOperand(1).getReg()) + .addImm(0) + // Add predicate operands. + .addImm(MI->getOperand(3).getImm()) + .addReg(MI->getOperand(4).getReg())); + + return; + } + case ARM::CONSTPOOL_ENTRY: { + /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool + /// in the function. The first operand is the ID# for this instruction, the + /// second is the index into the MachineConstantPool that this is, the third + /// is the size in bytes of this constant pool entry. + /// The required alignment is specified on the basic block holding this MI. + unsigned LabelId = (unsigned)MI->getOperand(0).getImm(); + unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex(); + + // If this is the first entry of the pool, mark it. + if (!InConstantPool) { + OutStreamer->EmitDataRegion(MCDR_DataRegion); + InConstantPool = true; + } + + OutStreamer->EmitLabel(GetCPISymbol(LabelId)); + + const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx]; + if (MCPE.isMachineConstantPoolEntry()) + EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); + else + EmitGlobalConstant(DL, MCPE.Val.ConstVal); + return; + } + case ARM::JUMPTABLE_ADDRS: + EmitJumpTableAddrs(MI); + return; + case ARM::JUMPTABLE_INSTS: + EmitJumpTableInsts(MI); + return; + case ARM::JUMPTABLE_TBB: + case ARM::JUMPTABLE_TBH: + EmitJumpTableTBInst(MI, MI->getOpcode() == ARM::JUMPTABLE_TBB ? 1 : 2); + return; + case ARM::t2BR_JT: { + // Lower and emit the instruction itself, then the jump table following it. + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr) + .addReg(ARM::PC) + .addReg(MI->getOperand(0).getReg()) + // Add predicate operands. + .addImm(ARMCC::AL) + .addReg(0)); + return; + } + case ARM::t2TBB_JT: + case ARM::t2TBH_JT: { + unsigned Opc = MI->getOpcode() == ARM::t2TBB_JT ? 
ARM::t2TBB : ARM::t2TBH; + // Lower and emit the PC label, then the instruction itself. + OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm())); + EmitToStreamer(*OutStreamer, MCInstBuilder(Opc) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + // Add predicate operands. + .addImm(ARMCC::AL) + .addReg(0)); + return; + } + case ARM::tBR_JTr: + case ARM::BR_JTr: { + // Lower and emit the instruction itself, then the jump table following it. + // mov pc, target + MCInst TmpInst; + unsigned Opc = MI->getOpcode() == ARM::BR_JTr ? + ARM::MOVr : ARM::tMOVr; + TmpInst.setOpcode(Opc); + TmpInst.addOperand(MCOperand::createReg(ARM::PC)); + TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::createImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::createReg(0)); + // Add 's' bit operand (always reg0 for this) + if (Opc == ARM::MOVr) + TmpInst.addOperand(MCOperand::createReg(0)); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case ARM::BR_JTm: { + // Lower and emit the instruction itself, then the jump table following it. + // ldr pc, target + MCInst TmpInst; + if (MI->getOperand(1).getReg() == 0) { + // literal offset + TmpInst.setOpcode(ARM::LDRi12); + TmpInst.addOperand(MCOperand::createReg(ARM::PC)); + TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::createImm(MI->getOperand(2).getImm())); + } else { + TmpInst.setOpcode(ARM::LDRrs); + TmpInst.addOperand(MCOperand::createReg(ARM::PC)); + TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::createReg(MI->getOperand(1).getReg())); + TmpInst.addOperand(MCOperand::createImm(0)); + } + // Add predicate operands. + TmpInst.addOperand(MCOperand::createImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::createReg(0)); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case ARM::BR_JTadd: { + // Lower and emit the instruction itself, then the jump table following it. + // add pc, target, idx + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDrr) + .addReg(ARM::PC) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + // Add predicate operands. + .addImm(ARMCC::AL) + .addReg(0) + // Add 's' bit operand (always reg0 for this) + .addReg(0)); + return; + } + case ARM::SPACE: + OutStreamer->EmitZeros(MI->getOperand(1).getImm()); + return; + case ARM::TRAP: { + // Non-Darwin binutils don't yet support the "trap" mnemonic. + // FIXME: Remove this special case when they do. + if (!Subtarget->isTargetMachO()) { + //.long 0xe7ffdefe @ trap + uint32_t Val = 0xe7ffdefeUL; + OutStreamer->AddComment("trap"); + OutStreamer->EmitIntValue(Val, 4); + return; + } + break; + } + case ARM::TRAPNaCl: { + //.long 0xe7fedef0 @ trap + uint32_t Val = 0xe7fedef0UL; + OutStreamer->AddComment("trap"); + OutStreamer->EmitIntValue(Val, 4); + return; + } + case ARM::tTRAP: { + // Non-Darwin binutils don't yet support the "trap" mnemonic. + // FIXME: Remove this special case when they do. 
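+    // Note: 0xdefe sits in the Thumb permanently-undefined (UDF) encoding
+    // space, and matches the low halfword of the A32 trap word used above.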
+ if (!Subtarget->isTargetMachO()) { + //.short 57086 @ trap + uint16_t Val = 0xdefe; + OutStreamer->AddComment("trap"); + OutStreamer->EmitIntValue(Val, 2); + return; + } + break; + } + case ARM::t2Int_eh_sjlj_setjmp: + case ARM::t2Int_eh_sjlj_setjmp_nofp: + case ARM::tInt_eh_sjlj_setjmp: { + // Two incoming args: GPR:$src, GPR:$val + // mov $val, pc + // adds $val, #7 + // str $val, [$src, #4] + // movs r0, #0 + // b LSJLJEH + // movs r0, #1 + // LSJLJEH: + unsigned SrcReg = MI->getOperand(0).getReg(); + unsigned ValReg = MI->getOperand(1).getReg(); + MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true); + OutStreamer->AddComment("eh_setjmp begin"); + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr) + .addReg(ValReg) + .addReg(ARM::PC) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDi3) + .addReg(ValReg) + // 's' bit operand + .addReg(ARM::CPSR) + .addReg(ValReg) + .addImm(7) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tSTRi) + .addReg(ValReg) + .addReg(SrcReg) + // The offset immediate is #4. The operand value is scaled by 4 for the + // tSTR instruction. + .addImm(1) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVi8) + .addReg(ARM::R0) + .addReg(ARM::CPSR) + .addImm(0) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + const MCExpr *SymbolExpr = MCSymbolRefExpr::create(Label, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tB) + .addExpr(SymbolExpr) + .addImm(ARMCC::AL) + .addReg(0)); + + OutStreamer->AddComment("eh_setjmp end"); + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVi8) + .addReg(ARM::R0) + .addReg(ARM::CPSR) + .addImm(1) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + OutStreamer->EmitLabel(Label); + return; + } + + case ARM::Int_eh_sjlj_setjmp_nofp: + case ARM::Int_eh_sjlj_setjmp: { + // Two incoming args: GPR:$src, GPR:$val + // add $val, pc, #8 + // str $val, [$src, #+4] + // mov r0, #0 + // add pc, pc, #0 + // mov r0, #1 + unsigned SrcReg = MI->getOperand(0).getReg(); + unsigned ValReg = MI->getOperand(1).getReg(); + + OutStreamer->AddComment("eh_setjmp begin"); + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDri) + .addReg(ValReg) + .addReg(ARM::PC) + .addImm(8) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0) + // 's' bit operand (always reg0 for this). + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::STRi12) + .addReg(ValReg) + .addReg(SrcReg) + .addImm(4) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVi) + .addReg(ARM::R0) + .addImm(0) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0) + // 's' bit operand (always reg0 for this). + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDri) + .addReg(ARM::PC) + .addReg(ARM::PC) + .addImm(0) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0) + // 's' bit operand (always reg0 for this). + .addReg(0)); + + OutStreamer->AddComment("eh_setjmp end"); + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVi) + .addReg(ARM::R0) + .addImm(1) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0) + // 's' bit operand (always reg0 for this). 
+ .addReg(0)); + return; + } + case ARM::Int_eh_sjlj_longjmp: { + // ldr sp, [$src, #8] + // ldr $scratch, [$src, #4] + // ldr r7, [$src] + // bx $scratch + unsigned SrcReg = MI->getOperand(0).getReg(); + unsigned ScratchReg = MI->getOperand(1).getReg(); + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12) + .addReg(ARM::SP) + .addReg(SrcReg) + .addImm(8) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12) + .addReg(ScratchReg) + .addReg(SrcReg) + .addImm(4) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12) + .addReg(ARM::R7) + .addReg(SrcReg) + .addImm(0) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::BX) + .addReg(ScratchReg) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + return; + } + case ARM::tInt_eh_sjlj_longjmp: { + // ldr $scratch, [$src, #8] + // mov sp, $scratch + // ldr $scratch, [$src, #4] + // ldr r7, [$src] + // bx $scratch + unsigned SrcReg = MI->getOperand(0).getReg(); + unsigned ScratchReg = MI->getOperand(1).getReg(); + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi) + .addReg(ScratchReg) + .addReg(SrcReg) + // The offset immediate is #8. The operand value is scaled by 4 for the + // tLDR instruction. + .addImm(2) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr) + .addReg(ARM::SP) + .addReg(ScratchReg) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi) + .addReg(ScratchReg) + .addReg(SrcReg) + .addImm(1) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi) + .addReg(ARM::R7) + .addReg(SrcReg) + .addImm(0) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + + EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX) + .addReg(ScratchReg) + // Predicate. + .addImm(ARMCC::AL) + .addReg(0)); + return; + } + } + + MCInst TmpInst; + LowerARMMachineInstrToMCInst(MI, TmpInst, *this); + + EmitToStreamer(*OutStreamer, TmpInst); +} + +//===----------------------------------------------------------------------===// +// Target Registry Stuff +//===----------------------------------------------------------------------===// + +// Force static initialization. +extern "C" void LLVMInitializeARMAsmPrinter() { + RegisterAsmPrinter<ARMAsmPrinter> X(TheARMLETarget); + RegisterAsmPrinter<ARMAsmPrinter> Y(TheARMBETarget); + RegisterAsmPrinter<ARMAsmPrinter> A(TheThumbLETarget); + RegisterAsmPrinter<ARMAsmPrinter> B(TheThumbBETarget); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h new file mode 100644 index 0000000..ed7be2d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h @@ -0,0 +1,136 @@ +//===-- ARMAsmPrinter.h - ARM implementation of AsmPrinter ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMASMPRINTER_H +#define LLVM_LIB_TARGET_ARM_ARMASMPRINTER_H + +#include "ARMSubtarget.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class ARMFunctionInfo; +class MCOperand; +class MachineConstantPool; +class MachineOperand; +class MCSymbol; + +namespace ARM { + enum DW_ISA { + DW_ISA_ARM_thumb = 1, + DW_ISA_ARM_arm = 2 + }; +} + +class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter { + + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when printing asm code for different targets. + const ARMSubtarget *Subtarget; + + /// AFI - Keep a pointer to ARMFunctionInfo for the current + /// MachineFunction. + ARMFunctionInfo *AFI; + + /// MCP - Keep a pointer to constantpool entries of the current + /// MachineFunction. + const MachineConstantPool *MCP; + + /// InConstantPool - Maintain state when emitting a sequence of constant + /// pool entries so we can properly mark them as data regions. + bool InConstantPool; + + /// ThumbIndirectPads - These maintain a per-function list of jump pad + /// labels used for ARMv4t thumb code to make register indirect calls. + SmallVector<std::pair<unsigned, MCSymbol*>, 4> ThumbIndirectPads; + + /// OptimizationGoals - Maintain a combined optimization goal for all + /// functions in a module: one of Tag_ABI_optimization_goals values, + /// -1 if uninitialized, 0 if conflicting goals + int OptimizationGoals; + +public: + explicit ARMAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer); + + const char *getPassName() const override { + return "ARM Assembly / Object Emitter"; + } + + void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + + void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo, + const MCSubtargetInfo *EndInfo) const override; + + void EmitJumpTableAddrs(const MachineInstr *MI); + void EmitJumpTableInsts(const MachineInstr *MI); + void EmitJumpTableTBInst(const MachineInstr *MI, unsigned OffsetWidth); + void EmitInstruction(const MachineInstr *MI) override; + bool runOnMachineFunction(MachineFunction &F) override; + + void EmitConstantPool() override { + // we emit constant pools customly! + } + void EmitFunctionBodyEnd() override; + void EmitFunctionEntryLabel() override; + void EmitStartOfAsmFile(Module &M) override; + void EmitEndOfAsmFile(Module &M) override; + void EmitXXStructor(const DataLayout &DL, const Constant *CV) override; + + // lowerOperand - Convert a MachineOperand into the equivalent MCOperand. + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp); + +private: + // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile() + void emitAttributes(); + + // Generic helper used to emit e.g. ARMv5 mul pseudos + void EmitPatchedInstruction(const MachineInstr *MI, unsigned TargetOpc); + + void EmitUnwindingInstruction(const MachineInstr *MI); + + // emitPseudoExpansionLowering - tblgen'erated. 
+ bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, + const MachineInstr *MI); + +public: + unsigned getISAEncoding() override { + // ARM/Darwin adds ISA to the DWARF info for each function. + const Triple &TT = TM.getTargetTriple(); + if (!TT.isOSBinFormatMachO()) + return 0; + bool isThumb = TT.getArch() == Triple::thumb || + TT.getArch() == Triple::thumbeb || + TT.getSubArch() == Triple::ARMSubArch_v7m || + TT.getSubArch() == Triple::ARMSubArch_v6m; + return isThumb ? ARM::DW_ISA_ARM_thumb : ARM::DW_ISA_ARM_arm; + } + +private: + MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol); + MCSymbol *GetARMJTIPICJumpTableLabel(unsigned uid) const; + + MCSymbol *GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags); + +public: + /// EmitMachineConstantPoolValue - Print a machine constantpool value to + /// the .s file. + void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override; +}; +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp new file mode 100644 index 0000000..49f3288 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -0,0 +1,4631 @@ +//===-- ARMBaseInstrInfo.cpp - ARM Instruction Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Base ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMConstantPoolValue.h" +#include "ARMFeatures.h" +#include "ARMHazardRecognizer.h" +#include "ARMMachineFunctionInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-instrinfo" + +#define GET_INSTRINFO_CTOR_DTOR +#include "ARMGenInstrInfo.inc" + +static cl::opt<bool> +EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden, + cl::desc("Enable ARM 2-addr to 3-addr conv")); + +static cl::opt<bool> +WidenVMOVS("widen-vmovs", cl::Hidden, cl::init(true), + cl::desc("Widen ARM vmovs to vmovd when possible")); + +static cl::opt<unsigned> +SwiftPartialUpdateClearance("swift-partial-update-clearance", + cl::Hidden, cl::init(12), + cl::desc("Clearance before partial register updates")); + +/// ARM_MLxEntry - Record information about MLA / MLS instructions. 
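+/// For example, the MLxExpansion pass may use this table to rewrite a VMLAS
+/// that would stall on its accumulator operand into the discrete pair
+/// VMULS + VADDS.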
+struct ARM_MLxEntry { + uint16_t MLxOpc; // MLA / MLS opcode + uint16_t MulOpc; // Expanded multiplication opcode + uint16_t AddSubOpc; // Expanded add / sub opcode + bool NegAcc; // True if the acc is negated before the add / sub. + bool HasLane; // True if instruction has an extra "lane" operand. +}; + +static const ARM_MLxEntry ARM_MLxTable[] = { + // MLxOpc, MulOpc, AddSubOpc, NegAcc, HasLane + // fp scalar ops + { ARM::VMLAS, ARM::VMULS, ARM::VADDS, false, false }, + { ARM::VMLSS, ARM::VMULS, ARM::VSUBS, false, false }, + { ARM::VMLAD, ARM::VMULD, ARM::VADDD, false, false }, + { ARM::VMLSD, ARM::VMULD, ARM::VSUBD, false, false }, + { ARM::VNMLAS, ARM::VNMULS, ARM::VSUBS, true, false }, + { ARM::VNMLSS, ARM::VMULS, ARM::VSUBS, true, false }, + { ARM::VNMLAD, ARM::VNMULD, ARM::VSUBD, true, false }, + { ARM::VNMLSD, ARM::VMULD, ARM::VSUBD, true, false }, + + // fp SIMD ops + { ARM::VMLAfd, ARM::VMULfd, ARM::VADDfd, false, false }, + { ARM::VMLSfd, ARM::VMULfd, ARM::VSUBfd, false, false }, + { ARM::VMLAfq, ARM::VMULfq, ARM::VADDfq, false, false }, + { ARM::VMLSfq, ARM::VMULfq, ARM::VSUBfq, false, false }, + { ARM::VMLAslfd, ARM::VMULslfd, ARM::VADDfd, false, true }, + { ARM::VMLSslfd, ARM::VMULslfd, ARM::VSUBfd, false, true }, + { ARM::VMLAslfq, ARM::VMULslfq, ARM::VADDfq, false, true }, + { ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true }, +}; + +ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI) + : ARMGenInstrInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), + Subtarget(STI) { + for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) { + if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) + llvm_unreachable("Duplicated entries?"); + MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc); + MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc); + } +} + +// Use a ScoreboardHazardRecognizer for prepass ARM scheduling. TargetInstrImpl +// currently defaults to no prepass hazard recognizer. +ScheduleHazardRecognizer * +ARMBaseInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, + const ScheduleDAG *DAG) const { + if (usePreRAHazardRecognizer()) { + const InstrItineraryData *II = + static_cast<const ARMSubtarget *>(STI)->getInstrItineraryData(); + return new ScoreboardHazardRecognizer(II, DAG, "pre-RA-sched"); + } + return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG); +} + +ScheduleHazardRecognizer *ARMBaseInstrInfo:: +CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const { + if (Subtarget.isThumb2() || Subtarget.hasVFP2()) + return (ScheduleHazardRecognizer *)new ARMHazardRecognizer(II, DAG); + return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG); +} + +MachineInstr * +ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { + // FIXME: Thumb2 support. + + if (!EnableARM3Addr) + return nullptr; + + MachineInstr *MI = MBBI; + MachineFunction &MF = *MI->getParent()->getParent(); + uint64_t TSFlags = MI->getDesc().TSFlags; + bool isPre = false; + switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) { + default: return nullptr; + case ARMII::IndexModePre: + isPre = true; + break; + case ARMII::IndexModePost: + break; + } + + // Try splitting an indexed load/store to an un-indexed one plus an add/sub + // operation. 
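+  // For example, a pre-indexed "ldr r0, [r1, #4]!" becomes
+  // "add r1, r1, #4" followed by "ldr r0, [r1]".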
+ unsigned MemOpc = getUnindexedOpcode(MI->getOpcode()); + if (MemOpc == 0) + return nullptr; + + MachineInstr *UpdateMI = nullptr; + MachineInstr *MemMI = nullptr; + unsigned AddrMode = (TSFlags & ARMII::AddrModeMask); + const MCInstrDesc &MCID = MI->getDesc(); + unsigned NumOps = MCID.getNumOperands(); + bool isLoad = !MI->mayStore(); + const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0); + const MachineOperand &Base = MI->getOperand(2); + const MachineOperand &Offset = MI->getOperand(NumOps-3); + unsigned WBReg = WB.getReg(); + unsigned BaseReg = Base.getReg(); + unsigned OffReg = Offset.getReg(); + unsigned OffImm = MI->getOperand(NumOps-2).getImm(); + ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm(); + switch (AddrMode) { + default: llvm_unreachable("Unknown indexed op!"); + case ARMII::AddrMode2: { + bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub; + unsigned Amt = ARM_AM::getAM2Offset(OffImm); + if (OffReg == 0) { + if (ARM_AM::getSOImmVal(Amt) == -1) + // Can't encode it in a so_imm operand. This transformation will + // add more than 1 instruction. Abandon! + return nullptr; + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBri : ARM::ADDri), WBReg) + .addReg(BaseReg).addImm(Amt) + .addImm(Pred).addReg(0).addReg(0); + } else if (Amt != 0) { + ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm); + unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt); + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBrsi : ARM::ADDrsi), WBReg) + .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc) + .addImm(Pred).addReg(0).addReg(0); + } else + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg) + .addReg(BaseReg).addReg(OffReg) + .addImm(Pred).addReg(0).addReg(0); + break; + } + case ARMII::AddrMode3 : { + bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub; + unsigned Amt = ARM_AM::getAM3Offset(OffImm); + if (OffReg == 0) + // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand. + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBri : ARM::ADDri), WBReg) + .addReg(BaseReg).addImm(Amt) + .addImm(Pred).addReg(0).addReg(0); + else + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg) + .addReg(BaseReg).addReg(OffReg) + .addImm(Pred).addReg(0).addReg(0); + break; + } + } + + std::vector<MachineInstr*> NewMIs; + if (isPre) { + if (isLoad) + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc), MI->getOperand(0).getReg()) + .addReg(WBReg).addImm(0).addImm(Pred); + else + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc)).addReg(MI->getOperand(1).getReg()) + .addReg(WBReg).addReg(0).addImm(0).addImm(Pred); + NewMIs.push_back(MemMI); + NewMIs.push_back(UpdateMI); + } else { + if (isLoad) + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc), MI->getOperand(0).getReg()) + .addReg(BaseReg).addImm(0).addImm(Pred); + else + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc)).addReg(MI->getOperand(1).getReg()) + .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred); + if (WB.isDead()) + UpdateMI->getOperand(0).setIsDead(); + NewMIs.push_back(UpdateMI); + NewMIs.push_back(MemMI); + } + + // Transfer LiveVariables states, kill / dead info. 
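+  // Each kill/dead flag on the original instruction must be reassigned to
+  // whichever of the two replacement instructions now defines or last reads
+  // that virtual register.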
+  if (LV) {
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+        unsigned Reg = MO.getReg();
+
+        LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+        if (MO.isDef()) {
+          MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
+          if (MO.isDead())
+            LV->addVirtualRegisterDead(Reg, NewMI);
+        }
+        if (MO.isUse() && MO.isKill()) {
+          for (unsigned j = 0; j < 2; ++j) {
+            // Look at the two new MI's in reverse order.
+            MachineInstr *NewMI = NewMIs[j];
+            if (!NewMI->readsRegister(Reg))
+              continue;
+            LV->addVirtualRegisterKilled(Reg, NewMI);
+            if (VI.removeKill(MI))
+              VI.Kills.push_back(NewMI);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  MFI->insert(MBBI, NewMIs[1]);
+  MFI->insert(MBBI, NewMIs[0]);
+  return NewMIs[0];
+}
+
+// Branch analysis.
+bool
+ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  TBB = nullptr;
+  FBB = nullptr;
+
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false; // Empty blocks are easy.
+  --I;
+
+  // Walk backwards from the end of the basic block until the branch is
+  // analyzed or we give up.
+  while (isPredicated(I) || I->isTerminator() || I->isDebugValue()) {
+
+    // Flag to be raised on unanalyzable instructions. This is useful in cases
+    // where we want to clean up the end of the basic block before we bail
+    // out.
+    bool CantAnalyze = false;
+
+    // Skip over DEBUG values and predicated nonterminators.
+    while (I->isDebugValue() || !I->isTerminator()) {
+      if (I == MBB.begin())
+        return false;
+      --I;
+    }
+
+    if (isIndirectBranchOpcode(I->getOpcode()) ||
+        isJumpTableBranchOpcode(I->getOpcode())) {
+      // Indirect branches and jump tables can't be analyzed, but we still want
+      // to clean up any instructions at the tail of the basic block.
+      CantAnalyze = true;
+    } else if (isUncondBranchOpcode(I->getOpcode())) {
+      TBB = I->getOperand(0).getMBB();
+    } else if (isCondBranchOpcode(I->getOpcode())) {
+      // Bail out if we encounter multiple conditional branches.
+      if (!Cond.empty())
+        return true;
+
+      assert(!FBB && "FBB should have been null.");
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(I->getOperand(1));
+      Cond.push_back(I->getOperand(2));
+    } else if (I->isReturn()) {
+      // Returns can't be analyzed, but we should run cleanup.
+      CantAnalyze = !isPredicated(I);
+    } else {
+      // We encountered an unrecognized terminator. Bail out immediately.
+      return true;
+    }
+
+    // Cleanup code - to be run for unpredicated unconditional branches and
+    // returns.
+    if (!isPredicated(I) &&
+          (isUncondBranchOpcode(I->getOpcode()) ||
+           isIndirectBranchOpcode(I->getOpcode()) ||
+           isJumpTableBranchOpcode(I->getOpcode()) ||
+           I->isReturn())) {
+      // Forget any previous conditional-branch information - it no longer
+      // applies.
+      Cond.clear();
+      FBB = nullptr;
+
+      // If we can modify the function, delete everything below this
+      // unconditional branch.
+      if (AllowModify) {
+        MachineBasicBlock::iterator DI = std::next(I);
+        while (DI != MBB.end()) {
+          MachineInstr *InstToDelete = DI;
+          ++DI;
+          InstToDelete->eraseFromParent();
+        }
+      }
+    }
+
+    if (CantAnalyze)
+      return true;
+
+    if (I == MBB.begin())
+      return false;
+
+    --I;
+  }
+
+  // We made it past the terminators without bailing out - we must have
+  // analyzed this branch successfully.
+ return false; +} + + +unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return 0; + + if (!isUncondBranchOpcode(I->getOpcode()) && + !isCondBranchOpcode(I->getOpcode())) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) return 1; + --I; + if (!isCondBranchOpcode(I->getOpcode())) + return 1; + + // Remove the branch. + I->eraseFromParent(); + return 2; +} + +unsigned +ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + DebugLoc DL) const { + ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>(); + int BOpc = !AFI->isThumbFunction() + ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB); + int BccOpc = !AFI->isThumbFunction() + ? ARM::Bcc : (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc); + bool isThumb = AFI->isThumbFunction() || AFI->isThumb2Function(); + + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 2 || Cond.size() == 0) && + "ARM branch conditions have two components!"); + + // For conditional branches, we use addOperand to preserve CPSR flags. + + if (!FBB) { + if (Cond.empty()) { // Unconditional branch? + if (isThumb) + BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).addImm(ARMCC::AL).addReg(0); + else + BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); + } else + BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB) + .addImm(Cond[0].getImm()).addOperand(Cond[1]); + return 1; + } + + // Two-way conditional branch. + BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB) + .addImm(Cond[0].getImm()).addOperand(Cond[1]); + if (isThumb) + BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).addImm(ARMCC::AL).addReg(0); + else + BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); + return 2; +} + +bool ARMBaseInstrInfo:: +ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm(); + Cond[0].setImm(ARMCC::getOppositeCondition(CC)); + return false; +} + +bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const { + if (MI->isBundle()) { + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); + MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); + while (++I != E && I->isInsideBundle()) { + int PIdx = I->findFirstPredOperandIdx(); + if (PIdx != -1 && I->getOperand(PIdx).getImm() != ARMCC::AL) + return true; + } + return false; + } + + int PIdx = MI->findFirstPredOperandIdx(); + return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL; +} + +bool ARMBaseInstrInfo:: +PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const { + unsigned Opc = MI->getOpcode(); + if (isUncondBranchOpcode(Opc)) { + MI->setDesc(get(getMatchingCondBranchOpcode(Opc))); + MachineInstrBuilder(*MI->getParent()->getParent(), MI) + .addImm(Pred[0].getImm()) + .addReg(Pred[1].getReg()); + return true; + } + + int PIdx = MI->findFirstPredOperandIdx(); + if (PIdx != -1) { + MachineOperand &PMO = MI->getOperand(PIdx); + PMO.setImm(Pred[0].getImm()); + MI->getOperand(PIdx+1).setReg(Pred[1].getReg()); + return true; + } + return false; +} + +bool ARMBaseInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const { + if (Pred1.size() > 2 || Pred2.size() > 2) + return false; + + ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm(); + 
ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm(); + if (CC1 == CC2) + return true; + + switch (CC1) { + default: + return false; + case ARMCC::AL: + return true; + case ARMCC::HS: + return CC2 == ARMCC::HI; + case ARMCC::LS: + return CC2 == ARMCC::LO || CC2 == ARMCC::EQ; + case ARMCC::GE: + return CC2 == ARMCC::GT; + case ARMCC::LE: + return CC2 == ARMCC::LT; + } +} + +bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + bool Found = false; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if ((MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) || + (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)) { + Pred.push_back(MO); + Found = true; + } + } + + return Found; +} + +static bool isCPSRDefined(const MachineInstr *MI) { + for (const auto &MO : MI->operands()) + if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef() && !MO.isDead()) + return true; + return false; +} + +static bool isEligibleForITBlock(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return true; + case ARM::tADC: // ADC (register) T1 + case ARM::tADDi3: // ADD (immediate) T1 + case ARM::tADDi8: // ADD (immediate) T2 + case ARM::tADDrr: // ADD (register) T1 + case ARM::tAND: // AND (register) T1 + case ARM::tASRri: // ASR (immediate) T1 + case ARM::tASRrr: // ASR (register) T1 + case ARM::tBIC: // BIC (register) T1 + case ARM::tEOR: // EOR (register) T1 + case ARM::tLSLri: // LSL (immediate) T1 + case ARM::tLSLrr: // LSL (register) T1 + case ARM::tLSRri: // LSR (immediate) T1 + case ARM::tLSRrr: // LSR (register) T1 + case ARM::tMUL: // MUL T1 + case ARM::tMVN: // MVN (register) T1 + case ARM::tORR: // ORR (register) T1 + case ARM::tROR: // ROR (register) T1 + case ARM::tRSB: // RSB (immediate) T1 + case ARM::tSBC: // SBC (register) T1 + case ARM::tSUBi3: // SUB (immediate) T1 + case ARM::tSUBi8: // SUB (immediate) T2 + case ARM::tSUBrr: // SUB (register) T1 + return !isCPSRDefined(MI); + } +} + +/// isPredicable - Return true if the specified instruction can be predicated. +/// By default, this returns true for every instruction with a +/// PredicateOperand. +bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const { + if (!MI->isPredicable()) + return false; + + if (!isEligibleForITBlock(MI)) + return false; + + ARMFunctionInfo *AFI = + MI->getParent()->getParent()->getInfo<ARMFunctionInfo>(); + + if (AFI->isThumb2Function()) { + if (getSubtarget().restrictIT()) + return isV8EligibleForIT(MI); + } else { // non-Thumb + if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) + return false; + } + + return true; +} + +namespace llvm { +template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || MO.isUndef() || MO.isUse()) + continue; + if (MO.getReg() != ARM::CPSR) + continue; + if (!MO.isDead()) + return false; + } + // all definitions of CPSR are dead + return true; +} +} + +/// GetInstSize - Return the size of the specified MachineInstr. +/// +unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { + const MachineBasicBlock &MBB = *MI->getParent(); + const MachineFunction *MF = MBB.getParent(); + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + + const MCInstrDesc &MCID = MI->getDesc(); + if (MCID.getSize()) + return MCID.getSize(); + + // If this machine instr is an inline asm, measure it. 
+ if (MI->getOpcode() == ARM::INLINEASM) + return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + // pseudo-instruction sizes are zero. + return 0; + case TargetOpcode::BUNDLE: + return getInstBundleLength(MI); + case ARM::MOVi16_ga_pcrel: + case ARM::MOVTi16_ga_pcrel: + case ARM::t2MOVi16_ga_pcrel: + case ARM::t2MOVTi16_ga_pcrel: + return 4; + case ARM::MOVi32imm: + case ARM::t2MOVi32imm: + return 8; + case ARM::CONSTPOOL_ENTRY: + case ARM::JUMPTABLE_INSTS: + case ARM::JUMPTABLE_ADDRS: + case ARM::JUMPTABLE_TBB: + case ARM::JUMPTABLE_TBH: + // If this machine instr is a constant pool entry, its size is recorded as + // operand #2. + return MI->getOperand(2).getImm(); + case ARM::Int_eh_sjlj_longjmp: + return 16; + case ARM::tInt_eh_sjlj_longjmp: + return 10; + case ARM::Int_eh_sjlj_setjmp: + case ARM::Int_eh_sjlj_setjmp_nofp: + return 20; + case ARM::tInt_eh_sjlj_setjmp: + case ARM::t2Int_eh_sjlj_setjmp: + case ARM::t2Int_eh_sjlj_setjmp_nofp: + return 12; + case ARM::SPACE: + return MI->getOperand(1).getImm(); + } +} + +unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr *MI) const { + unsigned Size = 0; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); + MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); + while (++I != E && I->isInsideBundle()) { + assert(!I->isBundle() && "No nested bundle!"); + Size += GetInstSizeInBytes(&*I); + } + return Size; +} + +void ARMBaseInstrInfo::copyFromCPSR(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, bool KillSrc, + const ARMSubtarget &Subtarget) const { + unsigned Opc = Subtarget.isThumb() + ? (Subtarget.isMClass() ? ARM::t2MRS_M : ARM::t2MRS_AR) + : ARM::MRS; + + MachineInstrBuilder MIB = + BuildMI(MBB, I, I->getDebugLoc(), get(Opc), DestReg); + + // There is only 1 A/R class MRS instruction, and it always refers to + // APSR. However, there are lots of other possibilities on M-class cores. + if (Subtarget.isMClass()) + MIB.addImm(0x800); + + AddDefaultPred(MIB); + + MIB.addReg(ARM::CPSR, RegState::Implicit | getKillRegState(KillSrc)); +} + +void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned SrcReg, bool KillSrc, + const ARMSubtarget &Subtarget) const { + unsigned Opc = Subtarget.isThumb() + ? (Subtarget.isMClass() ? 
ARM::t2MSR_M : ARM::t2MSR_AR) + : ARM::MSR; + + MachineInstrBuilder MIB = BuildMI(MBB, I, I->getDebugLoc(), get(Opc)); + + if (Subtarget.isMClass()) + MIB.addImm(0x800); + else + MIB.addImm(8); + + MIB.addReg(SrcReg, getKillRegState(KillSrc)); + + AddDefaultPred(MIB); + + MIB.addReg(ARM::CPSR, RegState::Implicit | RegState::Define); +} + +void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + bool GPRDest = ARM::GPRRegClass.contains(DestReg); + bool GPRSrc = ARM::GPRRegClass.contains(SrcReg); + + if (GPRDest && GPRSrc) { + AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)))); + return; + } + + bool SPRDest = ARM::SPRRegClass.contains(DestReg); + bool SPRSrc = ARM::SPRRegClass.contains(SrcReg); + + unsigned Opc = 0; + if (SPRDest && SPRSrc) + Opc = ARM::VMOVS; + else if (GPRDest && SPRSrc) + Opc = ARM::VMOVRS; + else if (SPRDest && GPRSrc) + Opc = ARM::VMOVSR; + else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && !Subtarget.isFPOnlySP()) + Opc = ARM::VMOVD; + else if (ARM::QPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VORRq; + + if (Opc) { + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg); + MIB.addReg(SrcReg, getKillRegState(KillSrc)); + if (Opc == ARM::VORRq) + MIB.addReg(SrcReg, getKillRegState(KillSrc)); + AddDefaultPred(MIB); + return; + } + + // Handle register classes that require multiple instructions. + unsigned BeginIdx = 0; + unsigned SubRegs = 0; + int Spacing = 1; + + // Use VORRq when possible. + if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VORRq; + BeginIdx = ARM::qsub_0; + SubRegs = 2; + } else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VORRq; + BeginIdx = ARM::qsub_0; + SubRegs = 4; + // Fall back to VMOVD. + } else if (ARM::DPairRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VMOVD; + BeginIdx = ARM::dsub_0; + SubRegs = 2; + } else if (ARM::DTripleRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VMOVD; + BeginIdx = ARM::dsub_0; + SubRegs = 3; + } else if (ARM::DQuadRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VMOVD; + BeginIdx = ARM::dsub_0; + SubRegs = 4; + } else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg)) { + Opc = Subtarget.isThumb2() ? ARM::tMOVr : ARM::MOVr; + BeginIdx = ARM::gsub_0; + SubRegs = 2; + } else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VMOVD; + BeginIdx = ARM::dsub_0; + SubRegs = 2; + Spacing = 2; + } else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VMOVD; + BeginIdx = ARM::dsub_0; + SubRegs = 3; + Spacing = 2; + } else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg)) { + Opc = ARM::VMOVD; + BeginIdx = ARM::dsub_0; + SubRegs = 4; + Spacing = 2; + } else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && Subtarget.isFPOnlySP()) { + Opc = ARM::VMOVS; + BeginIdx = ARM::ssub_0; + SubRegs = 2; + } else if (SrcReg == ARM::CPSR) { + copyFromCPSR(MBB, I, DestReg, KillSrc, Subtarget); + return; + } else if (DestReg == ARM::CPSR) { + copyToCPSR(MBB, I, SrcReg, KillSrc, Subtarget); + return; + } + + assert(Opc && "Impossible reg-to-reg copy"); + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MachineInstrBuilder Mov; + + // Copy register tuples backward when the first Dest reg overlaps with SrcReg. 
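+  // For example, copying q0_q1 into q1_q2 must copy q1 to q2 before copying
+  // q0 to q1; copying forward would clobber the shared q1 before it is read.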
+ if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) { + BeginIdx = BeginIdx + ((SubRegs - 1) * Spacing); + Spacing = -Spacing; + } +#ifndef NDEBUG + SmallSet<unsigned, 4> DstRegs; +#endif + for (unsigned i = 0; i != SubRegs; ++i) { + unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing); + unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing); + assert(Dst && Src && "Bad sub-register"); +#ifndef NDEBUG + assert(!DstRegs.count(Src) && "destructive vector copy"); + DstRegs.insert(Dst); +#endif + Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst).addReg(Src); + // VORR takes two source operands. + if (Opc == ARM::VORRq) + Mov.addReg(Src); + Mov = AddDefaultPred(Mov); + // MOVr can set CC. + if (Opc == ARM::MOVr) + Mov = AddDefaultCC(Mov); + } + // Add implicit super-register defs and kills to the last instruction. + Mov->addRegisterDefined(DestReg, TRI); + if (KillSrc) + Mov->addRegisterKilled(SrcReg, TRI); +} + +const MachineInstrBuilder & +ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg, + unsigned SubIdx, unsigned State, + const TargetRegisterInfo *TRI) const { + if (!SubIdx) + return MIB.addReg(Reg, State); + + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); + return MIB.addReg(Reg, State, SubIdx); +} + +void ARMBaseInstrInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), Align); + + switch (RC->getSize()) { + case 4: + if (ARM::GPRRegClass.hasSubClassEq(RC)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else if (ARM::SPRRegClass.hasSubClassEq(RC)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else + llvm_unreachable("Unknown reg class!"); + break; + case 8: + if (ARM::DPRRegClass.hasSubClassEq(RC)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) { + if (Subtarget.hasV5TEOps()) { + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STRD)); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO); + + AddDefaultPred(MIB); + } else { + // Fallback to STM instruction, which has existed since the dawn of + // time. + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STMIA)) + .addFrameIndex(FI).addMemOperand(MMO)); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + } + } else + llvm_unreachable("Unknown reg class!"); + break; + case 16: + if (ARM::DPairRegClass.hasSubClassEq(RC)) { + // Use aligned spills if the stack can be realigned. 
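+      // VST1 encodes a 16-byte alignment hint and so needs a realignable,
+      // 16-byte-aligned slot; the VSTM fallback has no alignment requirement.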
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64)) + .addFrameIndex(FI).addImm(16) + .addReg(SrcReg, getKillRegState(isKill)) + .addMemOperand(MMO)); + } else { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI) + .addMemOperand(MMO)); + } + } else + llvm_unreachable("Unknown reg class!"); + break; + case 24: + if (ARM::DTripleRegClass.hasSubClassEq(RC)) { + // Use aligned spills if the stack can be realigned. + if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64TPseudo)) + .addFrameIndex(FI).addImm(16) + .addReg(SrcReg, getKillRegState(isKill)) + .addMemOperand(MMO)); + } else { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA)) + .addFrameIndex(FI)) + .addMemOperand(MMO); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); + AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); + } + } else + llvm_unreachable("Unknown reg class!"); + break; + case 32: + if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + // FIXME: It's possible to only store part of the QQ register if the + // spilled def has a sub-register index. + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo)) + .addFrameIndex(FI).addImm(16) + .addReg(SrcReg, getKillRegState(isKill)) + .addMemOperand(MMO)); + } else { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA)) + .addFrameIndex(FI)) + .addMemOperand(MMO); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); + AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); + } + } else + llvm_unreachable("Unknown reg class!"); + break; + case 64: + if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA)) + .addFrameIndex(FI)) + .addMemOperand(MMO); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI); + AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI); + } else + llvm_unreachable("Unknown reg class!"); + break; + default: + llvm_unreachable("Unknown reg class!"); + } +} + +unsigned +ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case ARM::STRrs: + case ARM::t2STRs: // FIXME: don't use t2STRs to access frame. 
+ if (MI->getOperand(1).isFI() && + MI->getOperand(2).isReg() && + MI->getOperand(3).isImm() && + MI->getOperand(2).getReg() == 0 && + MI->getOperand(3).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::STRi12: + case ARM::t2STRi12: + case ARM::tSTRspi: + case ARM::VSTRD: + case ARM::VSTRS: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::VST1q64: + case ARM::VST1d64TPseudo: + case ARM::VST1d64QPseudo: + if (MI->getOperand(0).isFI() && + MI->getOperand(2).getSubReg() == 0) { + FrameIndex = MI->getOperand(0).getIndex(); + return MI->getOperand(2).getReg(); + } + break; + case ARM::VSTMQIA: + if (MI->getOperand(1).isFI() && + MI->getOperand(0).getSubReg() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + + return 0; +} + +unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { + const MachineMemOperand *Dummy; + return MI->mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex); +} + +void ARMBaseInstrInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), Align); + + switch (RC->getSize()) { + case 4: + if (ARM::GPRRegClass.hasSubClassEq(RC)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + + } else if (ARM::SPRRegClass.hasSubClassEq(RC)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else + llvm_unreachable("Unknown reg class!"); + break; + case 8: + if (ARM::DPRRegClass.hasSubClassEq(RC)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) { + MachineInstrBuilder MIB; + + if (Subtarget.hasV5TEOps()) { + MIB = BuildMI(MBB, I, DL, get(ARM::LDRD)); + AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); + AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO); + + AddDefaultPred(MIB); + } else { + // Fallback to LDM instruction, which has existed since the dawn of + // time. 
+ MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDMIA)) + .addFrameIndex(FI).addMemOperand(MMO)); + MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + } + + if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + MIB.addReg(DestReg, RegState::ImplicitDefine); + } else + llvm_unreachable("Unknown reg class!"); + break; + case 16: + if (ARM::DPairRegClass.hasSubClassEq(RC)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg) + .addFrameIndex(FI).addImm(16) + .addMemOperand(MMO)); + } else { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg) + .addFrameIndex(FI) + .addMemOperand(MMO)); + } + } else + llvm_unreachable("Unknown reg class!"); + break; + case 24: + if (ARM::DTripleRegClass.hasSubClassEq(RC)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg) + .addFrameIndex(FI).addImm(16) + .addMemOperand(MMO)); + } else { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA)) + .addFrameIndex(FI) + .addMemOperand(MMO)); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); + if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + MIB.addReg(DestReg, RegState::ImplicitDefine); + } + } else + llvm_unreachable("Unknown reg class!"); + break; + case 32: + if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg) + .addFrameIndex(FI).addImm(16) + .addMemOperand(MMO)); + } else { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA)) + .addFrameIndex(FI)) + .addMemOperand(MMO); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); + if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + MIB.addReg(DestReg, RegState::ImplicitDefine); + } + } else + llvm_unreachable("Unknown reg class!"); + break; + case 64: + if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA)) + .addFrameIndex(FI)) + .addMemOperand(MMO); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI); + if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + MIB.addReg(DestReg, RegState::ImplicitDefine); + } else + llvm_unreachable("Unknown reg class!"); + break; + default: + llvm_unreachable("Unknown regclass!"); + } +} + +unsigned 
+ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::LDRrs:
+  case ARM::t2LDRs:  // FIXME: don't use t2LDRs to access frame.
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isReg() &&
+        MI->getOperand(3).isImm() &&
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::LDRi12:
+  case ARM::t2LDRi12:
+  case ARM::tLDRspi:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VLD1q64:
+  case ARM::VLD1d64TPseudo:
+  case ARM::VLD1d64QPseudo:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VLDMQIA:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
+unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                                     int &FrameIndex) const {
+  const MachineMemOperand *Dummy;
+  return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+}
+
+/// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMIA_UPD
+/// depending on whether the result is used.
+void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MBBI) const {
+  bool isThumb1 = Subtarget.isThumb1Only();
+  bool isThumb2 = Subtarget.isThumb2();
+  const ARMBaseInstrInfo *TII = Subtarget.getInstrInfo();
+
+  MachineInstr *MI = MBBI;
+  DebugLoc dl = MI->getDebugLoc();
+  MachineBasicBlock *BB = MI->getParent();
+
+  MachineInstrBuilder LDM, STM;
+  if (isThumb1 || !MI->getOperand(1).isDead()) {
+    LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD
+                                                 : isThumb1 ? ARM::tLDMIA_UPD
+                                                            : ARM::LDMIA_UPD))
+              .addOperand(MI->getOperand(1));
+  } else {
+    LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA));
+  }
+
+  if (isThumb1 || !MI->getOperand(0).isDead()) {
+    STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD
+                                                 : isThumb1 ? ARM::tSTMIA_UPD
+                                                            : ARM::STMIA_UPD))
+              .addOperand(MI->getOperand(0));
+  } else {
+    STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA));
+  }
+
+  AddDefaultPred(LDM.addOperand(MI->getOperand(3)));
+  AddDefaultPred(STM.addOperand(MI->getOperand(2)));
+
+  // Sort the scratch registers into ascending order.
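+  // LDM/STM register lists must name the registers in ascending encoding
+  // order, which is why the scratch registers are sorted first.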
+ const TargetRegisterInfo &TRI = getRegisterInfo(); + llvm::SmallVector<unsigned, 6> ScratchRegs; + for(unsigned I = 5; I < MI->getNumOperands(); ++I) + ScratchRegs.push_back(MI->getOperand(I).getReg()); + std::sort(ScratchRegs.begin(), ScratchRegs.end(), + [&TRI](const unsigned &Reg1, + const unsigned &Reg2) -> bool { + return TRI.getEncodingValue(Reg1) < + TRI.getEncodingValue(Reg2); + }); + + for (const auto &Reg : ScratchRegs) { + LDM.addReg(Reg, RegState::Define); + STM.addReg(Reg, RegState::Kill); + } + + BB->erase(MBBI); +} + + +bool +ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + MachineFunction &MF = *MI->getParent()->getParent(); + Reloc::Model RM = MF.getTarget().getRelocationModel(); + + if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD) { + assert(getSubtarget().getTargetTriple().isOSBinFormatMachO() && + "LOAD_STACK_GUARD currently supported only for MachO."); + expandLoadStackGuard(MI, RM); + MI->getParent()->erase(MI); + return true; + } + + if (MI->getOpcode() == ARM::MEMCPY) { + expandMEMCPY(MI); + return true; + } + + // This hook gets to expand COPY instructions before they become + // copyPhysReg() calls. Look for VMOVS instructions that can legally be + // widened to VMOVD. We prefer the VMOVD when possible because it may be + // changed into a VORR that can go down the NEON pipeline. + if (!WidenVMOVS || !MI->isCopy() || Subtarget.isCortexA15() || + Subtarget.isFPOnlySP()) + return false; + + // Look for a copy between even S-registers. That is where we keep floats + // when using NEON v2f32 instructions for f32 arithmetic. + unsigned DstRegS = MI->getOperand(0).getReg(); + unsigned SrcRegS = MI->getOperand(1).getReg(); + if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS)) + return false; + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + unsigned DstRegD = TRI->getMatchingSuperReg(DstRegS, ARM::ssub_0, + &ARM::DPRRegClass); + unsigned SrcRegD = TRI->getMatchingSuperReg(SrcRegS, ARM::ssub_0, + &ARM::DPRRegClass); + if (!DstRegD || !SrcRegD) + return false; + + // We want to widen this into a DstRegD = VMOVD SrcRegD copy. This is only + // legal if the COPY already defines the full DstRegD, and it isn't a + // sub-register insertion. + if (!MI->definesRegister(DstRegD, TRI) || MI->readsRegister(DstRegD, TRI)) + return false; + + // A dead copy shouldn't show up here, but reject it just in case. + if (MI->getOperand(0).isDead()) + return false; + + // All clear, widen the COPY. + DEBUG(dbgs() << "widening: " << *MI); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + + // Get rid of the old <imp-def> of DstRegD. Leave it if it defines a Q-reg + // or some other super-register. + int ImpDefIdx = MI->findRegisterDefOperandIdx(DstRegD); + if (ImpDefIdx != -1) + MI->RemoveOperand(ImpDefIdx); + + // Change the opcode and operands. + MI->setDesc(get(ARM::VMOVD)); + MI->getOperand(0).setReg(DstRegD); + MI->getOperand(1).setReg(SrcRegD); + AddDefaultPred(MIB); + + // We are now reading SrcRegD instead of SrcRegS. This may upset the + // register scavenger and machine verifier, so we need to indicate that we + // are reading an undefined value from SrcRegD, but a proper value from + // SrcRegS. + MI->getOperand(1).setIsUndef(); + MIB.addReg(SrcRegS, RegState::Implicit); + + // SrcRegD may actually contain an unrelated value in the ssub_1 + // sub-register. Don't kill it. Only kill the ssub_0 sub-register. 
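+  // For example, when SrcRegS is s0 and SrcRegD is d0, the d0 copy also reads
+  // s1 (d0's ssub_1 half), which may still hold a live, unrelated value.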
+ if (MI->getOperand(1).isKill()) { + MI->getOperand(1).setIsKill(false); + MI->addRegisterKilled(SrcRegS, TRI, true); + } + + DEBUG(dbgs() << "replaced by: " << *MI); + return true; +} + +/// Create a copy of a const pool value. Update CPI to the new index and return +/// the label UID. +static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { + MachineConstantPool *MCP = MF.getConstantPool(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI]; + assert(MCPE.isMachineConstantPoolEntry() && + "Expecting a machine constantpool entry!"); + ARMConstantPoolValue *ACPV = + static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal); + + unsigned PCLabelId = AFI->createPICLabelUId(); + ARMConstantPoolValue *NewCPV = nullptr; + + // FIXME: The below assumes PIC relocation model and that the function + // is Thumb mode (t1 or t2). PCAdjustment would be 8 for ARM mode PIC, and + // zero for non-PIC in ARM or Thumb. The callers are all of thumb LDR + // instructions, so that's probably OK, but is PIC always correct when + // we get here? + if (ACPV->isGlobalValue()) + NewCPV = ARMConstantPoolConstant::Create( + cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId, ARMCP::CPValue, + 4, ACPV->getModifier(), ACPV->mustAddCurrentAddress()); + else if (ACPV->isExtSymbol()) + NewCPV = ARMConstantPoolSymbol:: + Create(MF.getFunction()->getContext(), + cast<ARMConstantPoolSymbol>(ACPV)->getSymbol(), PCLabelId, 4); + else if (ACPV->isBlockAddress()) + NewCPV = ARMConstantPoolConstant:: + Create(cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress(), PCLabelId, + ARMCP::CPBlockAddress, 4); + else if (ACPV->isLSDA()) + NewCPV = ARMConstantPoolConstant::Create(MF.getFunction(), PCLabelId, + ARMCP::CPLSDA, 4); + else if (ACPV->isMachineBasicBlock()) + NewCPV = ARMConstantPoolMBB:: + Create(MF.getFunction()->getContext(), + cast<ARMConstantPoolMBB>(ACPV)->getMBB(), PCLabelId, 4); + else + llvm_unreachable("Unexpected ARM constantpool value type!!"); + CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment()); + return PCLabelId; +} + +void ARMBaseInstrInfo:: +reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo &TRI) const { + unsigned Opcode = Orig->getOpcode(); + switch (Opcode) { + default: { + MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); + MI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); + MBB.insert(I, MI); + break; + } + case ARM::tLDRpci_pic: + case ARM::t2LDRpci_pic: { + MachineFunction &MF = *MBB.getParent(); + unsigned CPI = Orig->getOperand(1).getIndex(); + unsigned PCLabelId = duplicateCPV(MF, CPI); + MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode), + DestReg) + .addConstantPoolIndex(CPI).addImm(PCLabelId); + MIB->setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end()); + break; + } + } +} + +MachineInstr * +ARMBaseInstrInfo::duplicate(MachineInstr *Orig, MachineFunction &MF) const { + MachineInstr *MI = TargetInstrInfo::duplicate(Orig, MF); + switch(Orig->getOpcode()) { + case ARM::tLDRpci_pic: + case ARM::t2LDRpci_pic: { + unsigned CPI = Orig->getOperand(1).getIndex(); + unsigned PCLabelId = duplicateCPV(MF, CPI); + Orig->getOperand(1).setIndex(CPI); + Orig->getOperand(2).setImm(PCLabelId); + break; + } + } + return MI; +} + +bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, + const MachineInstr *MI1, + const 
MachineRegisterInfo *MRI) const {
+  unsigned Opcode = MI0->getOpcode();
+  if (Opcode == ARM::t2LDRpci ||
+      Opcode == ARM::t2LDRpci_pic ||
+      Opcode == ARM::tLDRpci ||
+      Opcode == ARM::tLDRpci_pic ||
+      Opcode == ARM::LDRLIT_ga_pcrel ||
+      Opcode == ARM::LDRLIT_ga_pcrel_ldr ||
+      Opcode == ARM::tLDRLIT_ga_pcrel ||
+      Opcode == ARM::MOV_ga_pcrel ||
+      Opcode == ARM::MOV_ga_pcrel_ldr ||
+      Opcode == ARM::t2MOV_ga_pcrel) {
+    if (MI1->getOpcode() != Opcode)
+      return false;
+    if (MI0->getNumOperands() != MI1->getNumOperands())
+      return false;
+
+    const MachineOperand &MO0 = MI0->getOperand(1);
+    const MachineOperand &MO1 = MI1->getOperand(1);
+    if (MO0.getOffset() != MO1.getOffset())
+      return false;
+
+    if (Opcode == ARM::LDRLIT_ga_pcrel ||
+        Opcode == ARM::LDRLIT_ga_pcrel_ldr ||
+        Opcode == ARM::tLDRLIT_ga_pcrel ||
+        Opcode == ARM::MOV_ga_pcrel ||
+        Opcode == ARM::MOV_ga_pcrel_ldr ||
+        Opcode == ARM::t2MOV_ga_pcrel)
+      // Ignore the PC labels.
+      return MO0.getGlobal() == MO1.getGlobal();
+
+    const MachineFunction *MF = MI0->getParent()->getParent();
+    const MachineConstantPool *MCP = MF->getConstantPool();
+    int CPI0 = MO0.getIndex();
+    int CPI1 = MO1.getIndex();
+    const MachineConstantPoolEntry &MCPE0 = MCP->getConstants()[CPI0];
+    const MachineConstantPoolEntry &MCPE1 = MCP->getConstants()[CPI1];
+    bool isARMCP0 = MCPE0.isMachineConstantPoolEntry();
+    bool isARMCP1 = MCPE1.isMachineConstantPoolEntry();
+    if (isARMCP0 && isARMCP1) {
+      ARMConstantPoolValue *ACPV0 =
+        static_cast<ARMConstantPoolValue*>(MCPE0.Val.MachineCPVal);
+      ARMConstantPoolValue *ACPV1 =
+        static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal);
+      return ACPV0->hasSameValue(ACPV1);
+    } else if (!isARMCP0 && !isARMCP1) {
+      return MCPE0.Val.ConstVal == MCPE1.Val.ConstVal;
+    }
+    return false;
+  } else if (Opcode == ARM::PICLDR) {
+    if (MI1->getOpcode() != Opcode)
+      return false;
+    if (MI0->getNumOperands() != MI1->getNumOperands())
+      return false;
+
+    unsigned Addr0 = MI0->getOperand(1).getReg();
+    unsigned Addr1 = MI1->getOperand(1).getReg();
+    if (Addr0 != Addr1) {
+      if (!MRI ||
+          !TargetRegisterInfo::isVirtualRegister(Addr0) ||
+          !TargetRegisterInfo::isVirtualRegister(Addr1))
+        return false;
+
+      // This assumes SSA form.
+      MachineInstr *Def0 = MRI->getVRegDef(Addr0);
+      MachineInstr *Def1 = MRI->getVRegDef(Addr1);
+      // Check if the loaded values, e.g. a constant-pool entry for a global
+      // address, are the same.
+      if (!produceSameValue(Def0, Def1, MRI))
+        return false;
+    }
+
+    for (unsigned i = 3, e = MI0->getNumOperands(); i != e; ++i) {
+      // %vreg12<def> = PICLDR %vreg11, 0, pred:14, pred:%noreg
+      const MachineOperand &MO0 = MI0->getOperand(i);
+      const MachineOperand &MO1 = MI1->getOperand(i);
+      if (!MO0.isIdenticalTo(MO1))
+        return false;
+    }
+    return true;
+  }
+
+  return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
+}
+
+/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
+/// determine if two loads are loading from the same base address. It should
+/// only return true if the base pointers are the same and the only
+/// difference between the two addresses is the offset. It also returns the
+/// offsets by reference.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
+bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                               int64_t &Offset1,
+                                               int64_t &Offset2) const {
+  // Don't worry about Thumb: just ARM and Thumb2.
+  if (Subtarget.isThumb1Only()) return false;
+
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+
+  switch (Load1->getMachineOpcode()) {
+  default:
+    return false;
+  case ARM::LDRi12:
+  case ARM::LDRBi12:
+  case ARM::LDRD:
+  case ARM::LDRH:
+  case ARM::LDRSB:
+  case ARM::LDRSH:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRDi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+    break;
+  }
+
+  switch (Load2->getMachineOpcode()) {
+  default:
+    return false;
+  case ARM::LDRi12:
+  case ARM::LDRBi12:
+  case ARM::LDRD:
+  case ARM::LDRH:
+  case ARM::LDRSB:
+  case ARM::LDRSH:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+    break;
+  }
+
+  // Check if base addresses and chain operands match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(4) != Load2->getOperand(4))
+    return false;
+
+  // Index should be Reg0.
+  if (Load1->getOperand(3) != Load2->getOperand(3))
+    return false;
+
+  // Determine the offsets.
+  if (isa<ConstantSDNode>(Load1->getOperand(1)) &&
+      isa<ConstantSDNode>(Load2->getOperand(1))) {
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getSExtValue();
+    Offset2 = cast<ConstantSDNode>(Load2->getOperand(1))->getSExtValue();
+    return true;
+  }
+
+  return false;
+}
+
+/// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
+/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
+/// be scheduled together. On some targets if two loads are loading from
+/// addresses in the same cache line, it's better if they are scheduled
+/// together. This function takes two integers that represent the load offsets
+/// from the common base address. It returns true if it decides it's desirable
+/// to schedule the two loads together. "NumLoads" is the number of loads that
+/// have already been scheduled after Load1.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
+bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                               int64_t Offset1, int64_t Offset2,
+                                               unsigned NumLoads) const {
+  // Don't worry about Thumb: just ARM and Thumb2.
+  if (Subtarget.isThumb1Only()) return false;
+
+  assert(Offset2 > Offset1);
+
+  if ((Offset2 - Offset1) / 8 > 64)
+    return false;
+
+  // Check if the machine opcodes are different. If they are different
+  // then we consider them to not be of the same base address,
+  // EXCEPT in the case of Thumb2 byte loads where one is LDRBi8 and
+  // the other LDRBi12. In that case they are considered the same because
+  // they are different encoding forms of the same basic instruction.
+  if ((Load1->getMachineOpcode() != Load2->getMachineOpcode()) &&
+      !((Load1->getMachineOpcode() == ARM::t2LDRBi8 &&
+         Load2->getMachineOpcode() == ARM::t2LDRBi12) ||
+        (Load1->getMachineOpcode() == ARM::t2LDRBi12 &&
+         Load2->getMachineOpcode() == ARM::t2LDRBi8)))
+    return false;  // FIXME: overly conservative?
+
+  // Four loads in a row should be sufficient.
+  if (NumLoads >= 3)
+    return false;
+
+  return true;
+}
+
+bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+                                            const MachineBasicBlock *MBB,
+                                            const MachineFunction &MF) const {
+  // Debug info is never a scheduling boundary.
It's necessary to be explicit
+  // here because of the special treatment of IT instructions below: otherwise
+  // a dbg_value followed by an IT would make the IT instruction look like a
+  // scheduling hazard, which is wrong. The boundary should be the actual
+  // instruction preceding the dbg_value instruction(s), just as it is when
+  // debug info is not present.
+  if (MI->isDebugValue())
+    return false;
+
+  // Terminators and labels can't be scheduled around.
+  if (MI->isTerminator() || MI->isPosition())
+    return true;
+
+  // Treat the start of the IT block as a scheduling boundary, but schedule
+  // t2IT along with all instructions following it.
+  // FIXME: This is a big hammer. But the alternative is to add all potential
+  // true and anti dependencies to IT block instructions as implicit operands
+  // to the t2IT instruction. The added compile time and complexity does not
+  // seem worth it.
+  MachineBasicBlock::const_iterator I = MI;
+  // Make sure to skip any dbg_value instructions.
+  while (++I != MBB->end() && I->isDebugValue())
+    ;
+  if (I != MBB->end() && I->getOpcode() == ARM::t2IT)
+    return true;
+
+  // Don't attempt to schedule around any instruction that defines
+  // a stack-oriented pointer, as it's unlikely to be profitable. This
+  // saves compile time, because it doesn't require every single
+  // stack slot reference to depend on the instruction that does the
+  // modification.
+  // Calls don't actually change the stack pointer, even if they have imp-defs.
+  // No ARM calling conventions change the stack pointer. (X86 calling
+  // conventions sometimes do.)
+  if (!MI->isCall() && MI->definesRegister(ARM::SP))
+    return true;
+
+  return false;
+}
+
+bool ARMBaseInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &MBB,
+                    unsigned NumCycles, unsigned ExtraPredCycles,
+                    BranchProbability Probability) const {
+  if (!NumCycles)
+    return false;
+
+  // If we are optimizing for size, see if the branch in the predecessor can be
+  // lowered to cbn?z by the constant island lowering pass, and return false if
+  // so. This results in a shorter instruction sequence.
+  if (MBB.getParent()->getFunction()->optForSize()) {
+    MachineBasicBlock *Pred = *MBB.pred_begin();
+    if (!Pred->empty()) {
+      MachineInstr *LastMI = &*Pred->rbegin();
+      if (LastMI->getOpcode() == ARM::t2Bcc) {
+        MachineBasicBlock::iterator CmpMI = LastMI;
+        if (CmpMI != Pred->begin()) {
+          --CmpMI;
+          if (CmpMI->getOpcode() == ARM::tCMPi8 ||
+              CmpMI->getOpcode() == ARM::t2CMPri) {
+            unsigned Reg = CmpMI->getOperand(0).getReg();
+            unsigned PredReg = 0;
+            ARMCC::CondCodes P = getInstrPredicate(CmpMI, PredReg);
+            if (P == ARMCC::AL && CmpMI->getOperand(1).getImm() == 0 &&
+                isARMLowRegister(Reg))
+              return false;
+          }
+        }
+      }
+    }
+  }
+
+  // Attempt to estimate the relative costs of predication versus branching.
+  // Here we scale up each component of UnpredCost to avoid precision issues
+  // when scaling NumCycles by Probability.
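+  // For example, with a branch probability of 1/2, NumCycles of 4, and a
+  // misprediction penalty of 10, predication is chosen when
+  // (4 + ExtraPredCycles) * 1024 <= 4 * 1024 / 2 + 1024 + 10 * 1024 / 10.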
+  const unsigned ScalingUpFactor = 1024;
+  unsigned UnpredCost = Probability.scale(NumCycles * ScalingUpFactor);
+  UnpredCost += ScalingUpFactor; // The branch itself
+  UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+
+  return (NumCycles + ExtraPredCycles) * ScalingUpFactor <= UnpredCost;
+}
+
+bool ARMBaseInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                    unsigned TCycles, unsigned TExtra,
+                    MachineBasicBlock &FMBB,
+                    unsigned FCycles, unsigned FExtra,
+                    BranchProbability Probability) const {
+  if (!TCycles || !FCycles)
+    return false;
+
+  // Attempt to estimate the relative costs of predication versus branching.
+  // Here we scale up each component of UnpredCost to avoid precision issues
+  // when scaling TCycles/FCycles by Probability.
+  const unsigned ScalingUpFactor = 1024;
+  unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
+  unsigned FUnpredCost =
+      Probability.getCompl().scale(FCycles * ScalingUpFactor);
+  unsigned UnpredCost = TUnpredCost + FUnpredCost;
+  UnpredCost += 1 * ScalingUpFactor; // The branch itself
+  UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+
+  return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost;
+}
+
+bool
+ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                            MachineBasicBlock &FMBB) const {
+  // Reduce false anti-dependencies to let Swift's out-of-order execution
+  // engine do its thing.
+  return Subtarget.isSwift();
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes
+llvm::getInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
+  int PIdx = MI->findFirstPredOperandIdx();
+  if (PIdx == -1) {
+    PredReg = 0;
+    return ARMCC::AL;
+  }
+
+  PredReg = MI->getOperand(PIdx+1).getReg();
+  return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm();
+}
+
+
+unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) {
+  if (Opc == ARM::B)
+    return ARM::Bcc;
+  if (Opc == ARM::tB)
+    return ARM::tBcc;
+  if (Opc == ARM::t2B)
+    return ARM::t2Bcc;
+
+  llvm_unreachable("Unknown unconditional branch opcode!");
+}
+
+MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr *MI,
+                                                       bool NewMI,
+                                                       unsigned OpIdx1,
+                                                       unsigned OpIdx2) const {
+  switch (MI->getOpcode()) {
+  case ARM::MOVCCr:
+  case ARM::t2MOVCCr: {
+    // MOVCC can be commuted by inverting the condition.
+    unsigned PredReg = 0;
+    ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg);
+    // MOVCC AL can't be inverted. Shouldn't happen.
+    if (CC == ARMCC::AL || PredReg != ARM::CPSR)
+      return nullptr;
+    MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+    if (!MI)
+      return nullptr;
+    // After swapping the MOVCC operands, also invert the condition.
+    MI->getOperand(MI->findFirstPredOperandIdx())
+        .setImm(ARMCC::getOppositeCondition(CC));
+    return MI;
+  }
+  }
+  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+}
+
+/// Identify instructions that can be folded into a MOVCC instruction, and
+/// return the defining instruction.
+static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
+                                      const MachineRegisterInfo &MRI,
+                                      const TargetInstrInfo *TII) {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return nullptr;
+  if (!MRI.hasOneNonDBGUse(Reg))
+    return nullptr;
+  MachineInstr *MI = MRI.getVRegDef(Reg);
+  if (!MI)
+    return nullptr;
+  // MI is folded into the MOVCC by predicating it.
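+  // For example, folding "%x = ADDri %a, 1" into "%y = MOVCCr %b, %x, pred:EQ"
+  // yields a single ADDri writing %y that executes only when EQ holds, with
+  // %b tied in as the value %y takes when the predicate is false.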
+ if (!MI->isPredicable()) + return nullptr; + // Check if MI has any non-dead defs or physreg uses. This also detects + // predicated instructions which will be reading CPSR. + for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + // Reject frame index operands, PEI can't handle the predicated pseudos. + if (MO.isFI() || MO.isCPI() || MO.isJTI()) + return nullptr; + if (!MO.isReg()) + continue; + // MI can't have any tied operands, that would conflict with predication. + if (MO.isTied()) + return nullptr; + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + return nullptr; + if (MO.isDef() && !MO.isDead()) + return nullptr; + } + bool DontMoveAcrossStores = true; + if (!MI->isSafeToMove(/* AliasAnalysis = */ nullptr, DontMoveAcrossStores)) + return nullptr; + return MI; +} + +bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI, + SmallVectorImpl<MachineOperand> &Cond, + unsigned &TrueOp, unsigned &FalseOp, + bool &Optimizable) const { + assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) && + "Unknown select instruction"); + // MOVCC operands: + // 0: Def. + // 1: True use. + // 2: False use. + // 3: Condition code. + // 4: CPSR use. + TrueOp = 1; + FalseOp = 2; + Cond.push_back(MI->getOperand(3)); + Cond.push_back(MI->getOperand(4)); + // We can always fold a def. + Optimizable = true; + return false; +} + +MachineInstr * +ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI, + SmallPtrSetImpl<MachineInstr *> &SeenMIs, + bool PreferFalse) const { + assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) && + "Unknown select instruction"); + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this); + bool Invert = !DefMI; + if (!DefMI) + DefMI = canFoldIntoMOVCC(MI->getOperand(1).getReg(), MRI, this); + if (!DefMI) + return nullptr; + + // Find new register class to use. + MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1); + unsigned DestReg = MI->getOperand(0).getReg(); + const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg()); + if (!MRI.constrainRegClass(DestReg, PreviousClass)) + return nullptr; + + // Create a new predicated version of DefMI. + // Rfalse is the first use. + MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + DefMI->getDesc(), DestReg); + + // Copy all the DefMI operands, excluding its (null) predicate. + const MCInstrDesc &DefDesc = DefMI->getDesc(); + for (unsigned i = 1, e = DefDesc.getNumOperands(); + i != e && !DefDesc.OpInfo[i].isPredicate(); ++i) + NewMI.addOperand(DefMI->getOperand(i)); + + unsigned CondCode = MI->getOperand(3).getImm(); + if (Invert) + NewMI.addImm(ARMCC::getOppositeCondition(ARMCC::CondCodes(CondCode))); + else + NewMI.addImm(CondCode); + NewMI.addOperand(MI->getOperand(4)); + + // DefMI is not the -S version that sets CPSR, so add an optional %noreg. + if (NewMI->hasOptionalDef()) + AddDefaultCC(NewMI); + + // The output register value when the predicate is false is an implicit + // register operand tied to the first def. + // The tie makes the register allocator ensure the FalseReg is allocated the + // same register as operand 0. + FalseReg.setImplicit(); + NewMI.addOperand(FalseReg); + NewMI->tieOperands(0, NewMI->getNumOperands() - 1); + + // Update SeenMIs set: register newly created MI and erase removed DefMI. 
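+  // (SeenMIs is maintained by the caller; keeping it in sync here prevents
+  // the caller from revisiting the new instruction or from touching the
+  // soon-to-be-erased DefMI.)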
+  SeenMIs.insert(NewMI);
+  SeenMIs.erase(DefMI);
+
+  // If MI is inside a loop, and DefMI is outside the loop, then kill flags on
+  // DefMI would be invalid when transferred inside the loop. Checking for a
+  // loop is expensive, but at least remove kill flags if they are in different
+  // BBs.
+  if (DefMI->getParent() != MI->getParent())
+    NewMI->clearKillInfo();
+
+  // The caller will erase MI, but not DefMI.
+  DefMI->eraseFromParent();
+  return NewMI;
+}
+
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether the
+/// instruction is encoded with an 'S' bit is determined by the optional CPSR
+/// def operand.
+///
+/// This will go away once we can teach tblgen how to set the optional CPSR def
+/// operand itself.
+struct AddSubFlagsOpcodePair {
+  uint16_t PseudoOpc;
+  uint16_t MachineOpc;
+};
+
+static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
+  {ARM::ADDSri, ARM::ADDri},
+  {ARM::ADDSrr, ARM::ADDrr},
+  {ARM::ADDSrsi, ARM::ADDrsi},
+  {ARM::ADDSrsr, ARM::ADDrsr},
+
+  {ARM::SUBSri, ARM::SUBri},
+  {ARM::SUBSrr, ARM::SUBrr},
+  {ARM::SUBSrsi, ARM::SUBrsi},
+  {ARM::SUBSrsr, ARM::SUBrsr},
+
+  {ARM::RSBSri, ARM::RSBri},
+  {ARM::RSBSrsi, ARM::RSBrsi},
+  {ARM::RSBSrsr, ARM::RSBrsr},
+
+  {ARM::t2ADDSri, ARM::t2ADDri},
+  {ARM::t2ADDSrr, ARM::t2ADDrr},
+  {ARM::t2ADDSrs, ARM::t2ADDrs},
+
+  {ARM::t2SUBSri, ARM::t2SUBri},
+  {ARM::t2SUBSrr, ARM::t2SUBrr},
+  {ARM::t2SUBSrs, ARM::t2SUBrs},
+
+  {ARM::t2RSBSri, ARM::t2RSBri},
+  {ARM::t2RSBSrs, ARM::t2RSBrs},
+};
+
+unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) {
+  for (unsigned i = 0, e = array_lengthof(AddSubFlagsOpcodeMap); i != e; ++i)
+    if (OldOpc == AddSubFlagsOpcodeMap[i].PseudoOpc)
+      return AddSubFlagsOpcodeMap[i].MachineOpc;
+  return 0;
+}
+
+void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                                   unsigned DestReg, unsigned BaseReg, int NumBytes,
+                                   ARMCC::CondCodes Pred, unsigned PredReg,
+                                   const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+  if (NumBytes == 0 && DestReg != BaseReg) {
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg)
+      .addReg(BaseReg, RegState::Kill)
+      .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+      .setMIFlags(MIFlags);
+    return;
+  }
+
+  bool isSub = NumBytes < 0;
+  if (isSub) NumBytes = -NumBytes;
+
+  while (NumBytes) {
+    unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes);
+    unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt);
+    assert(ThisVal && "Didn't extract field correctly");
+
+    // We will handle these bits from offset, clear them.
+    NumBytes &= ~ThisVal;
+
+    assert(ARM_AM::getSOImmVal(ThisVal) != -1 && "Bit extraction didn't work?");
+
+    // Build the new ADD / SUB.
+    unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri;
+    BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+      .addReg(BaseReg, RegState::Kill).addImm(ThisVal)
+      .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+      .setMIFlags(MIFlags);
+    BaseReg = DestReg;
+  }
+}
+
+bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
+                                      MachineFunction &MF, MachineInstr *MI,
+                                      unsigned NumBytes) {
+  // This optimisation potentially adds lots of load and store
+  // micro-operations, so it is really only a benefit to code size.
+  if (!MF.getFunction()->optForMinSize())
+    return false;
+
+  // If only one register is pushed/popped, LLVM can use an LDR/STR
+  // instead. We can't modify those, so make sure we're dealing with an
+  // instruction we understand.
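+  // For example (illustrative), under minsize a prologue such as
+  //   push {r4, lr}
+  //   sub  sp, sp, #8
+  // can be folded to
+  //   push {r2, r3, r4, lr}    ; r2/r3 pushed as undef scratch
+  // saving the separate sp adjustment at the cost of extra store micro-ops.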
+  bool IsPop = isPopOpcode(MI->getOpcode());
+  bool IsPush = isPushOpcode(MI->getOpcode());
+  if (!IsPush && !IsPop)
+    return false;
+
+  bool IsVFPPushPop = MI->getOpcode() == ARM::VSTMDDB_UPD ||
+                      MI->getOpcode() == ARM::VLDMDIA_UPD;
+  bool IsT1PushPop = MI->getOpcode() == ARM::tPUSH ||
+                     MI->getOpcode() == ARM::tPOP ||
+                     MI->getOpcode() == ARM::tPOP_RET;
+
+  assert((IsT1PushPop || (MI->getOperand(0).getReg() == ARM::SP &&
+                          MI->getOperand(1).getReg() == ARM::SP)) &&
+         "trying to fold sp update into non-sp-updating push/pop");
+
+  // The VFP push & pop act on D-registers, so we can only correctly fold in
+  // an adjustment that is a multiple of 8 bytes. Similarly, each rN register
+  // is 4 bytes. Don't try if this is violated.
+  if (NumBytes % (IsVFPPushPop ? 8 : 4) != 0)
+    return false;
+
+  // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
+  // pred) so the list starts at 4. Thumb1 starts after the predicate.
+  int RegListIdx = IsT1PushPop ? 2 : 4;
+
+  // Calculate the space we'll need in terms of registers.
+  unsigned FirstReg = MI->getOperand(RegListIdx).getReg();
+  unsigned RD0Reg, RegsNeeded;
+  if (IsVFPPushPop) {
+    RD0Reg = ARM::D0;
+    RegsNeeded = NumBytes / 8;
+  } else {
+    RD0Reg = ARM::R0;
+    RegsNeeded = NumBytes / 4;
+  }
+
+  // We're going to have to strip all list operands off before
+  // re-adding them since the order matters, so save the existing ones
+  // for later.
+  SmallVector<MachineOperand, 4> RegList;
+  for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
+    RegList.push_back(MI->getOperand(i));
+
+  const TargetRegisterInfo *TRI = MF.getRegInfo().getTargetRegisterInfo();
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+
+  // Now try to find enough space in the reglist to allocate NumBytes.
+  for (unsigned CurReg = FirstReg - 1; CurReg >= RD0Reg && RegsNeeded;
+       --CurReg) {
+    if (!IsPop) {
+      // Pushing any register is completely harmless; mark the
+      // register involved as undef since we don't care about it in
+      // the slightest.
+      RegList.push_back(MachineOperand::CreateReg(CurReg, false, false,
+                                                  false, false, true));
+      --RegsNeeded;
+      continue;
+    }
+
+    // However, we can only pop an extra register if it's not live. For
+    // registers live within the function we might clobber a return value
+    // register; the other way a register can be live here is if it's
+    // callee-saved.
+    if (isCalleeSavedRegister(CurReg, CSRegs) ||
+        MI->getParent()->computeRegisterLiveness(TRI, CurReg, MI) !=
+            MachineBasicBlock::LQR_Dead) {
+      // VFP pops don't allow holes in the register list, so any skip is fatal
+      // for our transformation. GPR pops do, so we should just keep looking.
+      if (IsVFPPushPop)
+        return false;
+      else
+        continue;
+    }
+
+    // Mark the unimportant registers as <def,dead> in the POP.
+    RegList.push_back(MachineOperand::CreateReg(CurReg, true, false, false,
+                                                true));
+    --RegsNeeded;
+  }
+
+  if (RegsNeeded > 0)
+    return false;
+
+  // Finally we know we can profitably perform the optimisation so go
+  // ahead: strip all existing registers off and add them back again
+  // in the right order.
+  for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
+    MI->RemoveOperand(i);
+
+  // Add the complete list back in.
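+  // (RegList was collected back-to-front and the scratch registers were
+  // appended after it, so re-adding it in reverse below restores an
+  // ascending register order, as the push/pop encodings require.)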
+  MachineInstrBuilder MIB(MF, &*MI);
+  for (int i = RegList.size() - 1; i >= 0; --i)
+    MIB.addOperand(RegList[i]);
+
+  return true;
+}
+
+bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                                unsigned FrameReg, int &Offset,
+                                const ARMBaseInstrInfo &TII) {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  bool isSub = false;
+
+  // Memory operands in inline assembly always use AddrMode2.
+  if (Opcode == ARM::INLINEASM)
+    AddrMode = ARMII::AddrMode2;
+
+  if (Opcode == ARM::ADDri) {
+    Offset += MI.getOperand(FrameRegIdx+1).getImm();
+    if (Offset == 0) {
+      // Turn it into a move.
+      MI.setDesc(TII.get(ARM::MOVr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.RemoveOperand(FrameRegIdx+1);
+      Offset = 0;
+      return true;
+    } else if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+      MI.setDesc(TII.get(ARM::SUBri));
+    }
+
+    // Common case: small offset, fits into instruction.
+    if (ARM_AM::getSOImmVal(Offset) != -1) {
+      // Replace the FrameIndex with sp / fp
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, pull as much of the immediate into this ADDri/SUBri
+    // as possible.
+    unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
+    unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);
+
+    // We will handle these bits from offset, clear them.
+    Offset &= ~ThisImmVal;
+
+    // Get the properly encoded SOImmVal field.
+    assert(ARM_AM::getSOImmVal(ThisImmVal) != -1 &&
+           "Bit extraction didn't work?");
+    MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+  } else {
+    unsigned ImmIdx = 0;
+    int InstrOffs = 0;
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    switch (AddrMode) {
+    case ARMII::AddrMode_i12: {
+      ImmIdx = FrameRegIdx + 1;
+      InstrOffs = MI.getOperand(ImmIdx).getImm();
+      NumBits = 12;
+      break;
+    }
+    case ARMII::AddrMode2: {
+      ImmIdx = FrameRegIdx+2;
+      InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 12;
+      break;
+    }
+    case ARMII::AddrMode3: {
+      ImmIdx = FrameRegIdx+2;
+      InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      break;
+    }
+    case ARMII::AddrMode4:
+    case ARMII::AddrMode6:
+      // Can't fold any offset even if it's zero.
+      return false;
+    case ARMII::AddrMode5: {
+      ImmIdx = FrameRegIdx+1;
+      InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      Scale = 4;
+      break;
+    }
+    default:
+      llvm_unreachable("Unsupported addressing mode!");
+    }
+
+    Offset += InstrOffs * Scale;
+    assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+    if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+    }
+
+    // Attempt to fold address comp. if opcode has offset bits
+    if (NumBits > 0) {
+      // Common case: small offset, fits into instruction.
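+      // E.g. (illustrative) AddrMode_i12 has NumBits == 12 and Scale == 1, so
+      // offsets in [0, 4095] are encodable directly; AddrMode5 has NumBits == 8
+      // and Scale == 4, covering word offsets up to 255 * 4 = 1020 bytes.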
+ MachineOperand &ImmOp = MI.getOperand(ImmIdx); + int ImmedOffset = Offset / Scale; + unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) { + // Replace the FrameIndex with sp + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + // FIXME: When addrmode2 goes away, this will simplify (like the + // T2 version), as the LDR.i12 versions don't need the encoding + // tricks for the offset value. + if (isSub) { + if (AddrMode == ARMII::AddrMode_i12) + ImmedOffset = -ImmedOffset; + else + ImmedOffset |= 1 << NumBits; + } + ImmOp.ChangeToImmediate(ImmedOffset); + Offset = 0; + return true; + } + + // Otherwise, it didn't fit. Pull in what we can to simplify the immed. + ImmedOffset = ImmedOffset & Mask; + if (isSub) { + if (AddrMode == ARMII::AddrMode_i12) + ImmedOffset = -ImmedOffset; + else + ImmedOffset |= 1 << NumBits; + } + ImmOp.ChangeToImmediate(ImmedOffset); + Offset &= ~(Mask*Scale); + } + } + + Offset = (isSub) ? -Offset : Offset; + return Offset == 0; +} + +/// analyzeCompare - For a comparison instruction, return the source registers +/// in SrcReg and SrcReg2 if having two register operands, and the value it +/// compares against in CmpValue. Return true if the comparison instruction +/// can be analyzed. +bool ARMBaseInstrInfo:: +analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, + int &CmpMask, int &CmpValue) const { + switch (MI->getOpcode()) { + default: break; + case ARM::CMPri: + case ARM::t2CMPri: + SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = MI->getOperand(1).getImm(); + return true; + case ARM::CMPrr: + case ARM::t2CMPrr: + SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = MI->getOperand(1).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; + case ARM::TSTri: + case ARM::t2TSTri: + SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = 0; + CmpMask = MI->getOperand(1).getImm(); + CmpValue = 0; + return true; + } + + return false; +} + +/// isSuitableForMask - Identify a suitable 'and' instruction that +/// operates on the given source register and applies the same mask +/// as a 'tst' instruction. Provide a limited look-through for copies. +/// When successful, MI will hold the found instruction. +static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg, + int CmpMask, bool CommonUse) { + switch (MI->getOpcode()) { + case ARM::ANDri: + case ARM::t2ANDri: + if (CmpMask != MI->getOperand(2).getImm()) + return false; + if (SrcReg == MI->getOperand(CommonUse ? 1 : 0).getReg()) + return true; + break; + } + + return false; +} + +/// getSwappedCondition - assume the flags are set by MI(a,b), return +/// the condition code if we modify the instructions such that flags are +/// set by MI(b,a). +inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) { + switch (CC) { + default: return ARMCC::AL; + case ARMCC::EQ: return ARMCC::EQ; + case ARMCC::NE: return ARMCC::NE; + case ARMCC::HS: return ARMCC::LS; + case ARMCC::LO: return ARMCC::HI; + case ARMCC::HI: return ARMCC::LO; + case ARMCC::LS: return ARMCC::HS; + case ARMCC::GE: return ARMCC::LE; + case ARMCC::LT: return ARMCC::GT; + case ARMCC::GT: return ARMCC::LT; + case ARMCC::LE: return ARMCC::GE; + } +} + +/// isRedundantFlagInstr - check whether the first instruction, whose only +/// purpose is to update flags, can be made redundant. +/// CMPrr can be made redundant by SUBrr if the operands are the same. +/// CMPri can be made redundant by SUBri if the operands are the same. 
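+/// For example, given 'SUBrr %2, %0, %1' followed by 'CMPrr %0, %1', the SUB
+/// can be switched to its flag-setting form and the CMP deleted.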
+/// This function can be extended later on. +inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg, + unsigned SrcReg2, int ImmValue, + MachineInstr *OI) { + if ((CmpI->getOpcode() == ARM::CMPrr || + CmpI->getOpcode() == ARM::t2CMPrr) && + (OI->getOpcode() == ARM::SUBrr || + OI->getOpcode() == ARM::t2SUBrr) && + ((OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getReg() == SrcReg2) || + (OI->getOperand(1).getReg() == SrcReg2 && + OI->getOperand(2).getReg() == SrcReg))) + return true; + + if ((CmpI->getOpcode() == ARM::CMPri || + CmpI->getOpcode() == ARM::t2CMPri) && + (OI->getOpcode() == ARM::SUBri || + OI->getOpcode() == ARM::t2SUBri) && + OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getImm() == ImmValue) + return true; + return false; +} + +/// optimizeCompareInstr - Convert the instruction supplying the argument to the +/// comparison into one that sets the zero bit in the flags register; +/// Remove a redundant Compare instruction if an earlier instruction can set the +/// flags in the same way as Compare. +/// E.g. SUBrr(r1,r2) and CMPrr(r1,r2). We also handle the case where two +/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the +/// condition code of instructions which use the flags. +bool ARMBaseInstrInfo:: +optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, + int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const { + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) return false; + + // Masked compares sometimes use the same register as the corresponding 'and'. + if (CmpMask != ~0) { + if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) { + MI = nullptr; + for (MachineRegisterInfo::use_instr_iterator + UI = MRI->use_instr_begin(SrcReg), UE = MRI->use_instr_end(); + UI != UE; ++UI) { + if (UI->getParent() != CmpInstr->getParent()) continue; + MachineInstr *PotentialAND = &*UI; + if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) || + isPredicated(PotentialAND)) + continue; + MI = PotentialAND; + break; + } + if (!MI) return false; + } + } + + // Get ready to iterate backward from CmpInstr. + MachineBasicBlock::iterator I = CmpInstr, E = MI, + B = CmpInstr->getParent()->begin(); + + // Early exit if CmpInstr is at the beginning of the BB. + if (I == B) return false; + + // There are two possible candidates which can be changed to set CPSR: + // One is MI, the other is a SUB instruction. + // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1). + // For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue). + MachineInstr *Sub = nullptr; + if (SrcReg2 != 0) + // MI is not a candidate for CMPrr. + MI = nullptr; + else if (MI->getParent() != CmpInstr->getParent() || CmpValue != 0) { + // Conservatively refuse to convert an instruction which isn't in the same + // BB as the comparison. + // For CMPri w/ CmpValue != 0, a Sub may still be a candidate. + // Thus we cannot return here. + if (CmpInstr->getOpcode() == ARM::CMPri || + CmpInstr->getOpcode() == ARM::t2CMPri) + MI = nullptr; + else + return false; + } + + // Check that CPSR isn't set between the comparison instruction and the one we + // want to change. At the same time, search for Sub. 
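+  // E.g. (illustrative) for 'SUBrr %2, %0, %1 ... CMPrr %1, %0' the operands
+  // are swapped, so the SUB can still be used, but every flag user must have
+  // its condition code swapped below (GE<->LE, GT<->LT, HS<->LS, HI<->LO).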
+ const TargetRegisterInfo *TRI = &getRegisterInfo(); + --I; + for (; I != E; --I) { + const MachineInstr &Instr = *I; + + if (Instr.modifiesRegister(ARM::CPSR, TRI) || + Instr.readsRegister(ARM::CPSR, TRI)) + // This instruction modifies or uses CPSR after the one we want to + // change. We can't do this transformation. + return false; + + // Check whether CmpInstr can be made redundant by the current instruction. + if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) { + Sub = &*I; + break; + } + + if (I == B) + // The 'and' is below the comparison instruction. + return false; + } + + // Return false if no candidates exist. + if (!MI && !Sub) + return false; + + // The single candidate is called MI. + if (!MI) MI = Sub; + + // We can't use a predicated instruction - it doesn't always write the flags. + if (isPredicated(MI)) + return false; + + switch (MI->getOpcode()) { + default: break; + case ARM::RSBrr: + case ARM::RSBri: + case ARM::RSCrr: + case ARM::RSCri: + case ARM::ADDrr: + case ARM::ADDri: + case ARM::ADCrr: + case ARM::ADCri: + case ARM::SUBrr: + case ARM::SUBri: + case ARM::SBCrr: + case ARM::SBCri: + case ARM::t2RSBri: + case ARM::t2ADDrr: + case ARM::t2ADDri: + case ARM::t2ADCrr: + case ARM::t2ADCri: + case ARM::t2SUBrr: + case ARM::t2SUBri: + case ARM::t2SBCrr: + case ARM::t2SBCri: + case ARM::ANDrr: + case ARM::ANDri: + case ARM::t2ANDrr: + case ARM::t2ANDri: + case ARM::ORRrr: + case ARM::ORRri: + case ARM::t2ORRrr: + case ARM::t2ORRri: + case ARM::EORrr: + case ARM::EORri: + case ARM::t2EORrr: + case ARM::t2EORri: { + // Scan forward for the use of CPSR + // When checking against MI: if it's a conditional code that requires + // checking of the V bit or C bit, then this is not safe to do. + // It is safe to remove CmpInstr if CPSR is redefined or killed. + // If we are done with the basic block, we need to check whether CPSR is + // live-out. + SmallVector<std::pair<MachineOperand*, ARMCC::CondCodes>, 4> + OperandsToUpdate; + bool isSafe = false; + I = CmpInstr; + E = CmpInstr->getParent()->end(); + while (!isSafe && ++I != E) { + const MachineInstr &Instr = *I; + for (unsigned IO = 0, EO = Instr.getNumOperands(); + !isSafe && IO != EO; ++IO) { + const MachineOperand &MO = Instr.getOperand(IO); + if (MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) { + isSafe = true; + break; + } + if (!MO.isReg() || MO.getReg() != ARM::CPSR) + continue; + if (MO.isDef()) { + isSafe = true; + break; + } + // Condition code is after the operand before CPSR except for VSELs. + ARMCC::CondCodes CC; + bool IsInstrVSel = true; + switch (Instr.getOpcode()) { + default: + IsInstrVSel = false; + CC = (ARMCC::CondCodes)Instr.getOperand(IO - 1).getImm(); + break; + case ARM::VSELEQD: + case ARM::VSELEQS: + CC = ARMCC::EQ; + break; + case ARM::VSELGTD: + case ARM::VSELGTS: + CC = ARMCC::GT; + break; + case ARM::VSELGED: + case ARM::VSELGES: + CC = ARMCC::GE; + break; + case ARM::VSELVSS: + case ARM::VSELVSD: + CC = ARMCC::VS; + break; + } + + if (Sub) { + ARMCC::CondCodes NewCC = getSwappedCondition(CC); + if (NewCC == ARMCC::AL) + return false; + // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based + // on CMP needs to be updated to be based on SUB. + // Push the condition code operands to OperandsToUpdate. + // If it is safe to remove CmpInstr, the condition code of these + // operands will be modified. + if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg) { + // VSel doesn't support condition code update. 
+ if (IsInstrVSel) + return false; + OperandsToUpdate.push_back( + std::make_pair(&((*I).getOperand(IO - 1)), NewCC)); + } + } else { + // No Sub, so this is x = <op> y, z; cmp x, 0. + switch (CC) { + case ARMCC::EQ: // Z + case ARMCC::NE: // Z + case ARMCC::MI: // N + case ARMCC::PL: // N + case ARMCC::AL: // none + // CPSR can be used multiple times, we should continue. + break; + case ARMCC::HS: // C + case ARMCC::LO: // C + case ARMCC::VS: // V + case ARMCC::VC: // V + case ARMCC::HI: // C Z + case ARMCC::LS: // C Z + case ARMCC::GE: // N V + case ARMCC::LT: // N V + case ARMCC::GT: // Z N V + case ARMCC::LE: // Z N V + // The instruction uses the V bit or C bit which is not safe. + return false; + } + } + } + } + + // If CPSR is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if (!isSafe) { + MachineBasicBlock *MBB = CmpInstr->getParent(); + for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), + SE = MBB->succ_end(); SI != SE; ++SI) + if ((*SI)->isLiveIn(ARM::CPSR)) + return false; + } + + // Toggle the optional operand to CPSR. + MI->getOperand(5).setReg(ARM::CPSR); + MI->getOperand(5).setIsDef(true); + assert(!isPredicated(MI) && "Can't use flags from predicated instruction"); + CmpInstr->eraseFromParent(); + + // Modify the condition code of operands in OperandsToUpdate. + // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to + // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. + for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++) + OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second); + return true; + } + } + + return false; +} + +bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI, + MachineInstr *DefMI, unsigned Reg, + MachineRegisterInfo *MRI) const { + // Fold large immediates into add, sub, or, xor. + unsigned DefOpc = DefMI->getOpcode(); + if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm) + return false; + if (!DefMI->getOperand(1).isImm()) + // Could be t2MOVi32imm <ga:xx> + return false; + + if (!MRI->hasOneNonDBGUse(Reg)) + return false; + + const MCInstrDesc &DefMCID = DefMI->getDesc(); + if (DefMCID.hasOptionalDef()) { + unsigned NumOps = DefMCID.getNumOperands(); + const MachineOperand &MO = DefMI->getOperand(NumOps-1); + if (MO.getReg() == ARM::CPSR && !MO.isDead()) + // If DefMI defines CPSR and it is not dead, it's obviously not safe + // to delete DefMI. + return false; + } + + const MCInstrDesc &UseMCID = UseMI->getDesc(); + if (UseMCID.hasOptionalDef()) { + unsigned NumOps = UseMCID.getNumOperands(); + if (UseMI->getOperand(NumOps-1).getReg() == ARM::CPSR) + // If the instruction sets the flag, do not attempt this optimization + // since it may change the semantics of the code. 
+ return false; + } + + unsigned UseOpc = UseMI->getOpcode(); + unsigned NewUseOpc = 0; + uint32_t ImmVal = (uint32_t)DefMI->getOperand(1).getImm(); + uint32_t SOImmValV1 = 0, SOImmValV2 = 0; + bool Commute = false; + switch (UseOpc) { + default: return false; + case ARM::SUBrr: + case ARM::ADDrr: + case ARM::ORRrr: + case ARM::EORrr: + case ARM::t2SUBrr: + case ARM::t2ADDrr: + case ARM::t2ORRrr: + case ARM::t2EORrr: { + Commute = UseMI->getOperand(2).getReg() != Reg; + switch (UseOpc) { + default: break; + case ARM::SUBrr: { + if (Commute) + return false; + ImmVal = -ImmVal; + NewUseOpc = ARM::SUBri; + // Fallthrough + } + case ARM::ADDrr: + case ARM::ORRrr: + case ARM::EORrr: { + if (!ARM_AM::isSOImmTwoPartVal(ImmVal)) + return false; + SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal); + SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal); + switch (UseOpc) { + default: break; + case ARM::ADDrr: NewUseOpc = ARM::ADDri; break; + case ARM::ORRrr: NewUseOpc = ARM::ORRri; break; + case ARM::EORrr: NewUseOpc = ARM::EORri; break; + } + break; + } + case ARM::t2SUBrr: { + if (Commute) + return false; + ImmVal = -ImmVal; + NewUseOpc = ARM::t2SUBri; + // Fallthrough + } + case ARM::t2ADDrr: + case ARM::t2ORRrr: + case ARM::t2EORrr: { + if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal)) + return false; + SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal); + SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal); + switch (UseOpc) { + default: break; + case ARM::t2ADDrr: NewUseOpc = ARM::t2ADDri; break; + case ARM::t2ORRrr: NewUseOpc = ARM::t2ORRri; break; + case ARM::t2EORrr: NewUseOpc = ARM::t2EORri; break; + } + break; + } + } + } + } + + unsigned OpIdx = Commute ? 2 : 1; + unsigned Reg1 = UseMI->getOperand(OpIdx).getReg(); + bool isKill = UseMI->getOperand(OpIdx).isKill(); + unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg)); + AddDefaultCC(AddDefaultPred(BuildMI(*UseMI->getParent(), + UseMI, UseMI->getDebugLoc(), + get(NewUseOpc), NewReg) + .addReg(Reg1, getKillRegState(isKill)) + .addImm(SOImmValV1))); + UseMI->setDesc(get(NewUseOpc)); + UseMI->getOperand(1).setReg(NewReg); + UseMI->getOperand(1).setIsKill(); + UseMI->getOperand(2).ChangeToImmediate(SOImmValV2); + DefMI->eraseFromParent(); + return true; +} + +static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData, + const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: { + const MCInstrDesc &Desc = MI->getDesc(); + int UOps = ItinData->getNumMicroOps(Desc.getSchedClass()); + assert(UOps >= 0 && "bad # UOps"); + return UOps; + } + + case ARM::LDRrs: + case ARM::LDRBrs: + case ARM::STRrs: + case ARM::STRBrs: { + unsigned ShOpVal = MI->getOperand(3).getImm(); + bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub; + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (!isSub && + (ShImm == 0 || + ((ShImm == 1 || ShImm == 2 || ShImm == 3) && + ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))) + return 1; + return 2; + } + + case ARM::LDRH: + case ARM::STRH: { + if (!MI->getOperand(2).getReg()) + return 1; + + unsigned ShOpVal = MI->getOperand(3).getImm(); + bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub; + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (!isSub && + (ShImm == 0 || + ((ShImm == 1 || ShImm == 2 || ShImm == 3) && + ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))) + return 1; + return 2; + } + + case ARM::LDRSB: + case ARM::LDRSH: + return (ARM_AM::getAM3Op(MI->getOperand(3).getImm()) == ARM_AM::sub) ? 
3:2; + + case ARM::LDRSB_POST: + case ARM::LDRSH_POST: { + unsigned Rt = MI->getOperand(0).getReg(); + unsigned Rm = MI->getOperand(3).getReg(); + return (Rt == Rm) ? 4 : 3; + } + + case ARM::LDR_PRE_REG: + case ARM::LDRB_PRE_REG: { + unsigned Rt = MI->getOperand(0).getReg(); + unsigned Rm = MI->getOperand(3).getReg(); + if (Rt == Rm) + return 3; + unsigned ShOpVal = MI->getOperand(4).getImm(); + bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub; + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (!isSub && + (ShImm == 0 || + ((ShImm == 1 || ShImm == 2 || ShImm == 3) && + ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))) + return 2; + return 3; + } + + case ARM::STR_PRE_REG: + case ARM::STRB_PRE_REG: { + unsigned ShOpVal = MI->getOperand(4).getImm(); + bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub; + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (!isSub && + (ShImm == 0 || + ((ShImm == 1 || ShImm == 2 || ShImm == 3) && + ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))) + return 2; + return 3; + } + + case ARM::LDRH_PRE: + case ARM::STRH_PRE: { + unsigned Rt = MI->getOperand(0).getReg(); + unsigned Rm = MI->getOperand(3).getReg(); + if (!Rm) + return 2; + if (Rt == Rm) + return 3; + return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) + ? 3 : 2; + } + + case ARM::LDR_POST_REG: + case ARM::LDRB_POST_REG: + case ARM::LDRH_POST: { + unsigned Rt = MI->getOperand(0).getReg(); + unsigned Rm = MI->getOperand(3).getReg(); + return (Rt == Rm) ? 3 : 2; + } + + case ARM::LDR_PRE_IMM: + case ARM::LDRB_PRE_IMM: + case ARM::LDR_POST_IMM: + case ARM::LDRB_POST_IMM: + case ARM::STRB_POST_IMM: + case ARM::STRB_POST_REG: + case ARM::STRB_PRE_IMM: + case ARM::STRH_POST: + case ARM::STR_POST_IMM: + case ARM::STR_POST_REG: + case ARM::STR_PRE_IMM: + return 2; + + case ARM::LDRSB_PRE: + case ARM::LDRSH_PRE: { + unsigned Rm = MI->getOperand(3).getReg(); + if (Rm == 0) + return 3; + unsigned Rt = MI->getOperand(0).getReg(); + if (Rt == Rm) + return 4; + unsigned ShOpVal = MI->getOperand(4).getImm(); + bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub; + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (!isSub && + (ShImm == 0 || + ((ShImm == 1 || ShImm == 2 || ShImm == 3) && + ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))) + return 3; + return 4; + } + + case ARM::LDRD: { + unsigned Rt = MI->getOperand(0).getReg(); + unsigned Rn = MI->getOperand(2).getReg(); + unsigned Rm = MI->getOperand(3).getReg(); + if (Rm) + return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3; + return (Rt == Rn) ? 3 : 2; + } + + case ARM::STRD: { + unsigned Rm = MI->getOperand(3).getReg(); + if (Rm) + return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3; + return 2; + } + + case ARM::LDRD_POST: + case ARM::t2LDRD_POST: + return 3; + + case ARM::STRD_POST: + case ARM::t2STRD_POST: + return 4; + + case ARM::LDRD_PRE: { + unsigned Rt = MI->getOperand(0).getReg(); + unsigned Rn = MI->getOperand(3).getReg(); + unsigned Rm = MI->getOperand(4).getReg(); + if (Rm) + return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4; + return (Rt == Rn) ? 4 : 3; + } + + case ARM::t2LDRD_PRE: { + unsigned Rt = MI->getOperand(0).getReg(); + unsigned Rn = MI->getOperand(3).getReg(); + return (Rt == Rn) ? 
4 : 3;
+  }
+
+  case ARM::STRD_PRE: {
+    unsigned Rm = MI->getOperand(4).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+    return 3;
+  }
+
+  case ARM::t2STRD_PRE:
+    return 3;
+
+  case ARM::t2LDR_POST:
+  case ARM::t2LDRB_POST:
+  case ARM::t2LDRB_PRE:
+  case ARM::t2LDRSBi12:
+  case ARM::t2LDRSBi8:
+  case ARM::t2LDRSBpci:
+  case ARM::t2LDRSBs:
+  case ARM::t2LDRH_POST:
+  case ARM::t2LDRH_PRE:
+  case ARM::t2LDRSBT:
+  case ARM::t2LDRSB_POST:
+  case ARM::t2LDRSB_PRE:
+  case ARM::t2LDRSH_POST:
+  case ARM::t2LDRSH_PRE:
+  case ARM::t2LDRSHi12:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSHpci:
+  case ARM::t2LDRSHs:
+    return 2;
+
+  case ARM::t2LDRDi8: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rn = MI->getOperand(2).getReg();
+    return (Rt == Rn) ? 3 : 2;
+  }
+
+  case ARM::t2STRB_POST:
+  case ARM::t2STRB_PRE:
+  case ARM::t2STRBs:
+  case ARM::t2STRDi8:
+  case ARM::t2STRH_POST:
+  case ARM::t2STRH_PRE:
+  case ARM::t2STRHs:
+  case ARM::t2STR_POST:
+  case ARM::t2STR_PRE:
+  case ARM::t2STRs:
+    return 2;
+  }
+}
+
+// Return the number of 32-bit words loaded by LDM or stored by STM. If this
+// can't be easily determined return 0 (missing MachineMemOperand).
+//
+// FIXME: The current MachineInstr design does not support relying on machine
+// mem operands to determine the width of a memory access. Instead, we expect
+// the target to provide this information based on the instruction opcode and
+// operands. However, using MachineMemOperand is the best solution now for
+// two reasons:
+//
+// 1) getNumMicroOps tries to infer LDM memory width from the total number of MI
+// operands. This is much more dangerous than using the MachineMemOperand
+// sizes because CodeGen passes can insert/remove optional machine operands. In
+// fact, it's totally incorrect for preRA passes and appears to be wrong for
+// postRA passes as well.
+//
+// 2) getNumLDMAddresses is only used by the scheduling machine model and any
+// machine model that calls this should handle the unknown (zero size) case.
+//
+// Long term, we should require a target hook that verifies MachineMemOperand
+// sizes during MC lowering. That target hook should be local to MC lowering
+// because we can't ensure that it is aware of other MI forms. Doing this will
+// ensure that MachineMemOperands are correctly propagated through all passes.
+unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr *MI) const {
+  unsigned Size = 0;
+  for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
+       E = MI->memoperands_end(); I != E; ++I) {
+    Size += (*I)->getSize();
+  }
+  return Size / 4;
+}
+
+unsigned
+ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+                                 const MachineInstr *MI) const {
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  const MCInstrDesc &Desc = MI->getDesc();
+  unsigned Class = Desc.getSchedClass();
+  int ItinUOps = ItinData->getNumMicroOps(Class);
+  if (ItinUOps >= 0) {
+    if (Subtarget.isSwift() && (Desc.mayLoad() || Desc.mayStore()))
+      return getNumMicroOpsSwiftLdSt(ItinData, MI);
+
+    return ItinUOps;
+  }
+
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unexpected multi-uops instruction!");
+  case ARM::VLDMQIA:
+  case ARM::VSTMQIA:
+    return 2;
+
+  // The number of uOps for load / store multiple is determined by the number
+  // of registers.
+  //
+  // On Cortex-A8, each pair of register loads / stores can be scheduled on the
+  // same cycle. The scheduling for the first load / store must be done
+  // separately by assuming the address is not 64-bit aligned.
+  //
+  // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address
+  // is not 64-bit aligned, then AGU would take an extra cycle. For VFP / NEON
+  // load / store multiple, the formula is (#reg / 2) + (#reg % 2) + 1.
+  case ARM::VLDMDIA:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMSDB_UPD:
+  case ARM::VSTMDIA:
+  case ARM::VSTMDIA_UPD:
+  case ARM::VSTMDDB_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VSTMSDB_UPD: {
+    unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands();
+    return (NumRegs / 2) + (NumRegs % 2) + 1;
+  }
+
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::STMIA_UPD:
+  case ARM::STMDA_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::STMIB_UPD:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::tPOP_RET:
+  case ARM::tPOP:
+  case ARM::tPUSH:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
+    unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
+    if (Subtarget.isSwift()) {
+      int UOps = 1 + NumRegs; // One for address computation, one for each ld / st.
+      switch (Opc) {
+      default: break;
+      case ARM::VLDMDIA_UPD:
+      case ARM::VLDMDDB_UPD:
+      case ARM::VLDMSIA_UPD:
+      case ARM::VLDMSDB_UPD:
+      case ARM::VSTMDIA_UPD:
+      case ARM::VSTMDDB_UPD:
+      case ARM::VSTMSIA_UPD:
+      case ARM::VSTMSDB_UPD:
+      case ARM::LDMIA_UPD:
+      case ARM::LDMDA_UPD:
+      case ARM::LDMDB_UPD:
+      case ARM::LDMIB_UPD:
+      case ARM::STMIA_UPD:
+      case ARM::STMDA_UPD:
+      case ARM::STMDB_UPD:
+      case ARM::STMIB_UPD:
+      case ARM::tLDMIA_UPD:
+      case ARM::tSTMIA_UPD:
+      case ARM::t2LDMIA_UPD:
+      case ARM::t2LDMDB_UPD:
+      case ARM::t2STMIA_UPD:
+      case ARM::t2STMDB_UPD:
+        ++UOps; // One for base register writeback.
+        break;
+      case ARM::LDMIA_RET:
+      case ARM::tPOP_RET:
+      case ARM::t2LDMIA_RET:
+        UOps += 2; // One for base reg wb, one for write to pc.
+        break;
+      }
+      return UOps;
+    } else if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+      if (NumRegs < 4)
+        return 2;
+      // 4 registers would be issued: 2, 2.
+      // 5 registers would be issued: 2, 2, 1.
+      int A8UOps = (NumRegs / 2);
+      if (NumRegs % 2)
+        ++A8UOps;
+      return A8UOps;
+    } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+      int A9UOps = (NumRegs / 2);
+      // If there is an odd number of registers or if it's not 64-bit aligned,
+      // then it takes an extra AGU (Address Generation Unit) cycle.
+      if ((NumRegs % 2) ||
+          !MI->hasOneMemOperand() ||
+          (*MI->memoperands_begin())->getAlignment() < 8)
+        ++A9UOps;
+      return A9UOps;
+    } else {
+      // Assume the worst.
+      return NumRegs;
+    }
+  }
+  }
+}
+
+int
+ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
+                                  const MCInstrDesc &DefMCID,
+                                  unsigned DefClass,
+                                  unsigned DefIdx, unsigned DefAlign) const {
+  int RegNo = (int)(DefIdx+1) - DefMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    // Def is the address writeback.
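+    // (RegNo <= 0 means DefIdx refers to the base-register writeback rather
+    // than one of the loaded registers; otherwise RegNo is the 1-based
+    // position within the register list. E.g. on Cortex-A8 the third register
+    // loaded gets DefCycle = 3/2 + 1 = 2, plus one more for RegNo being odd.)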
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+
+  int DefCycle;
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    // (regno / 2) + (regno % 2) + 1
+    DefCycle = RegNo / 2 + 1;
+    if (RegNo % 2)
+      ++DefCycle;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    DefCycle = RegNo;
+    bool isSLoad = false;
+
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::VLDMSIA:
+    case ARM::VLDMSIA_UPD:
+    case ARM::VLDMSDB_UPD:
+      isSLoad = true;
+      break;
+    }
+
+    // If there is an odd number of 'S' registers or if it's not 64-bit
+    // aligned, then it takes an extra cycle.
+    if ((isSLoad && (RegNo % 2)) || DefAlign < 8)
+      ++DefCycle;
+  } else {
+    // Assume the worst.
+    DefCycle = RegNo + 2;
+  }
+
+  return DefCycle;
+}
+
+int
+ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
+                                 const MCInstrDesc &DefMCID,
+                                 unsigned DefClass,
+                                 unsigned DefIdx, unsigned DefAlign) const {
+  int RegNo = (int)(DefIdx+1) - DefMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    // Def is the address writeback.
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+
+  int DefCycle;
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    // 4 registers would be issued: 1, 2, 1.
+    // 5 registers would be issued: 1, 2, 2.
+    DefCycle = RegNo / 2;
+    if (DefCycle < 1)
+      DefCycle = 1;
+    // Result latency is issue cycle + 2: E2.
+    DefCycle += 2;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    DefCycle = (RegNo / 2);
+    // If there is an odd number of registers or if it's not 64-bit aligned,
+    // then it takes an extra AGU (Address Generation Unit) cycle.
+    if ((RegNo % 2) || DefAlign < 8)
+      ++DefCycle;
+    // Result latency is AGU cycles + 2.
+    DefCycle += 2;
+  } else {
+    // Assume the worst.
+    DefCycle = RegNo + 2;
+  }
+
+  return DefCycle;
+}
+
+int
+ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
+                                  const MCInstrDesc &UseMCID,
+                                  unsigned UseClass,
+                                  unsigned UseIdx, unsigned UseAlign) const {
+  int RegNo = (int)(UseIdx+1) - UseMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    return ItinData->getOperandCycle(UseClass, UseIdx);
+
+  int UseCycle;
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    // (regno / 2) + (regno % 2) + 1
+    UseCycle = RegNo / 2 + 1;
+    if (RegNo % 2)
+      ++UseCycle;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    UseCycle = RegNo;
+    bool isSStore = false;
+
+    switch (UseMCID.getOpcode()) {
+    default: break;
+    case ARM::VSTMSIA:
+    case ARM::VSTMSIA_UPD:
+    case ARM::VSTMSDB_UPD:
+      isSStore = true;
+      break;
+    }
+
+    // If there is an odd number of 'S' registers or if it's not 64-bit
+    // aligned, then it takes an extra cycle.
+    if ((isSStore && (RegNo % 2)) || UseAlign < 8)
+      ++UseCycle;
+  } else {
+    // Assume the worst.
+    UseCycle = RegNo + 2;
+  }
+
+  return UseCycle;
+}
+
+int
+ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
+                                 const MCInstrDesc &UseMCID,
+                                 unsigned UseClass,
+                                 unsigned UseIdx, unsigned UseAlign) const {
+  int RegNo = (int)(UseIdx+1) - UseMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    return ItinData->getOperandCycle(UseClass, UseIdx);
+
+  int UseCycle;
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    UseCycle = RegNo / 2;
+    if (UseCycle < 2)
+      UseCycle = 2;
+    // Read in E3.
+    UseCycle += 2;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    UseCycle = (RegNo / 2);
+    // If there is an odd number of registers or if it's not 64-bit aligned,
+    // then it takes an extra AGU (Address Generation Unit) cycle.
+ if ((RegNo % 2) || UseAlign < 8) + ++UseCycle; + } else { + // Assume the worst. + UseCycle = 1; + } + return UseCycle; +} + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const MCInstrDesc &DefMCID, + unsigned DefIdx, unsigned DefAlign, + const MCInstrDesc &UseMCID, + unsigned UseIdx, unsigned UseAlign) const { + unsigned DefClass = DefMCID.getSchedClass(); + unsigned UseClass = UseMCID.getSchedClass(); + + if (DefIdx < DefMCID.getNumDefs() && UseIdx < UseMCID.getNumOperands()) + return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); + + // This may be a def / use of a variable_ops instruction, the operand + // latency might be determinable dynamically. Let the target try to + // figure it out. + int DefCycle = -1; + bool LdmBypass = false; + switch (DefMCID.getOpcode()) { + default: + DefCycle = ItinData->getOperandCycle(DefClass, DefIdx); + break; + + case ARM::VLDMDIA: + case ARM::VLDMDIA_UPD: + case ARM::VLDMDDB_UPD: + case ARM::VLDMSIA: + case ARM::VLDMSIA_UPD: + case ARM::VLDMSDB_UPD: + DefCycle = getVLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign); + break; + + case ARM::LDMIA_RET: + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + case ARM::LDMIA_UPD: + case ARM::LDMDA_UPD: + case ARM::LDMDB_UPD: + case ARM::LDMIB_UPD: + case ARM::tLDMIA: + case ARM::tLDMIA_UPD: + case ARM::tPUSH: + case ARM::t2LDMIA_RET: + case ARM::t2LDMIA: + case ARM::t2LDMDB: + case ARM::t2LDMIA_UPD: + case ARM::t2LDMDB_UPD: + LdmBypass = 1; + DefCycle = getLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign); + break; + } + + if (DefCycle == -1) + // We can't seem to determine the result latency of the def, assume it's 2. + DefCycle = 2; + + int UseCycle = -1; + switch (UseMCID.getOpcode()) { + default: + UseCycle = ItinData->getOperandCycle(UseClass, UseIdx); + break; + + case ARM::VSTMDIA: + case ARM::VSTMDIA_UPD: + case ARM::VSTMDDB_UPD: + case ARM::VSTMSIA: + case ARM::VSTMSIA_UPD: + case ARM::VSTMSDB_UPD: + UseCycle = getVSTMUseCycle(ItinData, UseMCID, UseClass, UseIdx, UseAlign); + break; + + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + case ARM::STMIA_UPD: + case ARM::STMDA_UPD: + case ARM::STMDB_UPD: + case ARM::STMIB_UPD: + case ARM::tSTMIA_UPD: + case ARM::tPOP_RET: + case ARM::tPOP: + case ARM::t2STMIA: + case ARM::t2STMDB: + case ARM::t2STMIA_UPD: + case ARM::t2STMDB_UPD: + UseCycle = getSTMUseCycle(ItinData, UseMCID, UseClass, UseIdx, UseAlign); + break; + } + + if (UseCycle == -1) + // Assume it's read in the first stage. + UseCycle = 1; + + UseCycle = DefCycle - UseCycle + 1; + if (UseCycle > 0) { + if (LdmBypass) { + // It's a variable_ops instruction so we can't use DefIdx here. Just use + // first def operand. 
+ if (ItinData->hasPipelineForwarding(DefClass, DefMCID.getNumOperands()-1, + UseClass, UseIdx)) + --UseCycle; + } else if (ItinData->hasPipelineForwarding(DefClass, DefIdx, + UseClass, UseIdx)) { + --UseCycle; + } + } + + return UseCycle; +} + +static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI, + const MachineInstr *MI, unsigned Reg, + unsigned &DefIdx, unsigned &Dist) { + Dist = 0; + + MachineBasicBlock::const_iterator I = MI; ++I; + MachineBasicBlock::const_instr_iterator II = std::prev(I.getInstrIterator()); + assert(II->isInsideBundle() && "Empty bundle?"); + + int Idx = -1; + while (II->isInsideBundle()) { + Idx = II->findRegisterDefOperandIdx(Reg, false, true, TRI); + if (Idx != -1) + break; + --II; + ++Dist; + } + + assert(Idx != -1 && "Cannot find bundled definition!"); + DefIdx = Idx; + return &*II; +} + +static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, + const MachineInstr *MI, unsigned Reg, + unsigned &UseIdx, unsigned &Dist) { + Dist = 0; + + MachineBasicBlock::const_instr_iterator II = ++MI->getIterator(); + assert(II->isInsideBundle() && "Empty bundle?"); + MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); + + // FIXME: This doesn't properly handle multiple uses. + int Idx = -1; + while (II != E && II->isInsideBundle()) { + Idx = II->findRegisterUseOperandIdx(Reg, false, TRI); + if (Idx != -1) + break; + if (II->getOpcode() != ARM::t2IT) + ++Dist; + ++II; + } + + if (Idx == -1) { + Dist = 0; + return nullptr; + } + + UseIdx = Idx; + return &*II; +} + +/// Return the number of cycles to add to (or subtract from) the static +/// itinerary based on the def opcode and alignment. The caller will ensure that +/// adjusted latency is at least one cycle. +static int adjustDefLatency(const ARMSubtarget &Subtarget, + const MachineInstr *DefMI, + const MCInstrDesc *DefMCID, unsigned DefAlign) { + int Adjust = 0; + if (Subtarget.isCortexA8() || Subtarget.isLikeA9() || Subtarget.isCortexA7()) { + // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2] + // variants are one cycle cheaper. + switch (DefMCID->getOpcode()) { + default: break; + case ARM::LDRrs: + case ARM::LDRBrs: { + unsigned ShOpVal = DefMI->getOperand(3).getImm(); + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (ShImm == 0 || + (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)) + --Adjust; + break; + } + case ARM::t2LDRs: + case ARM::t2LDRBs: + case ARM::t2LDRHs: + case ARM::t2LDRSHs: { + // Thumb2 mode: lsl only. + unsigned ShAmt = DefMI->getOperand(3).getImm(); + if (ShAmt == 0 || ShAmt == 2) + --Adjust; + break; + } + } + } else if (Subtarget.isSwift()) { + // FIXME: Properly handle all of the latency adjustments for address + // writeback. + switch (DefMCID->getOpcode()) { + default: break; + case ARM::LDRrs: + case ARM::LDRBrs: { + unsigned ShOpVal = DefMI->getOperand(3).getImm(); + bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub; + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (!isSub && + (ShImm == 0 || + ((ShImm == 1 || ShImm == 2 || ShImm == 3) && + ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))) + Adjust -= 2; + else if (!isSub && + ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr) + --Adjust; + break; + } + case ARM::t2LDRs: + case ARM::t2LDRBs: + case ARM::t2LDRHs: + case ARM::t2LDRSHs: { + // Thumb2 mode: lsl only. 
+      unsigned ShAmt = DefMI->getOperand(3).getImm();
+      if (ShAmt == 0 || ShAmt == 1 || ShAmt == 2 || ShAmt == 3)
+        Adjust -= 2;
+      break;
+    }
+    }
+  }
+
+  if (DefAlign < 8 && Subtarget.isLikeA9()) {
+    switch (DefMCID->getOpcode()) {
+    default: break;
+    case ARM::VLD1q8:
+    case ARM::VLD1q16:
+    case ARM::VLD1q32:
+    case ARM::VLD1q64:
+    case ARM::VLD1q8wb_fixed:
+    case ARM::VLD1q16wb_fixed:
+    case ARM::VLD1q32wb_fixed:
+    case ARM::VLD1q64wb_fixed:
+    case ARM::VLD1q8wb_register:
+    case ARM::VLD1q16wb_register:
+    case ARM::VLD1q32wb_register:
+    case ARM::VLD1q64wb_register:
+    case ARM::VLD2d8:
+    case ARM::VLD2d16:
+    case ARM::VLD2d32:
+    case ARM::VLD2q8:
+    case ARM::VLD2q16:
+    case ARM::VLD2q32:
+    case ARM::VLD2d8wb_fixed:
+    case ARM::VLD2d16wb_fixed:
+    case ARM::VLD2d32wb_fixed:
+    case ARM::VLD2q8wb_fixed:
+    case ARM::VLD2q16wb_fixed:
+    case ARM::VLD2q32wb_fixed:
+    case ARM::VLD2d8wb_register:
+    case ARM::VLD2d16wb_register:
+    case ARM::VLD2d32wb_register:
+    case ARM::VLD2q8wb_register:
+    case ARM::VLD2q16wb_register:
+    case ARM::VLD2q32wb_register:
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD1d64T:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD1d64Twb_fixed:
+    case ARM::VLD1d64Twb_register:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD1d64Q:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+    case ARM::VLD1d64Qwb_fixed:
+    case ARM::VLD1d64Qwb_register:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+    case ARM::VLD1DUPq8:
+    case ARM::VLD1DUPq16:
+    case ARM::VLD1DUPq32:
+    case ARM::VLD1DUPq8wb_fixed:
+    case ARM::VLD1DUPq16wb_fixed:
+    case ARM::VLD1DUPq32wb_fixed:
+    case ARM::VLD1DUPq8wb_register:
+    case ARM::VLD1DUPq16wb_register:
+    case ARM::VLD1DUPq32wb_register:
+    case ARM::VLD2DUPd8:
+    case ARM::VLD2DUPd16:
+    case ARM::VLD2DUPd32:
+    case ARM::VLD2DUPd8wb_fixed:
+    case ARM::VLD2DUPd16wb_fixed:
+    case ARM::VLD2DUPd32wb_fixed:
+    case ARM::VLD2DUPd8wb_register:
+    case ARM::VLD2DUPd16wb_register:
+    case ARM::VLD2DUPd32wb_register:
+    case ARM::VLD4DUPd8:
+    case ARM::VLD4DUPd16:
+    case ARM::VLD4DUPd32:
+    case ARM::VLD4DUPd8_UPD:
+    case ARM::VLD4DUPd16_UPD:
+    case ARM::VLD4DUPd32_UPD:
+    case ARM::VLD1LNd8:
+    case ARM::VLD1LNd16:
+    case ARM::VLD1LNd32:
+    case ARM::VLD1LNd8_UPD:
+    case ARM::VLD1LNd16_UPD:
+    case ARM::VLD1LNd32_UPD:
+    case ARM::VLD2LNd8:
+    case ARM::VLD2LNd16:
+    case ARM::VLD2LNd32:
+    case ARM::VLD2LNq16:
+    case ARM::VLD2LNq32:
+    case ARM::VLD2LNd8_UPD:
+    case ARM::VLD2LNd16_UPD:
+    case ARM::VLD2LNd32_UPD:
+    case ARM::VLD2LNq16_UPD:
+    case ARM::VLD2LNq32_UPD:
+    case ARM::VLD4LNd8:
+    case ARM::VLD4LNd16:
+    case ARM::VLD4LNd32:
+    case ARM::VLD4LNq16:
+    case ARM::VLD4LNq32:
+    case ARM::VLD4LNd8_UPD:
+    case ARM::VLD4LNd16_UPD:
+    case ARM::VLD4LNd32_UPD:
+    case ARM::VLD4LNq16_UPD:
+    case ARM::VLD4LNq32_UPD:
+      // If the address is not 64-bit aligned, the latencies of these
+      // instructions increase by one.
+      ++Adjust;
+      break;
+    }
+  }
+  return Adjust;
+}
+
+
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    const MachineInstr *DefMI, unsigned DefIdx,
+                                    const MachineInstr *UseMI,
+                                    unsigned UseIdx) const {
+  // No operand latency. The caller may fall back to getInstrLatency.
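+  // (A negative return value means the latency is unknown; callers such as
+  // the scheduler are expected to fall back to the coarser whole-instruction
+  // latency in that case.)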
+ if (!ItinData || ItinData->isEmpty()) + return -1; + + const MachineOperand &DefMO = DefMI->getOperand(DefIdx); + unsigned Reg = DefMO.getReg(); + const MCInstrDesc *DefMCID = &DefMI->getDesc(); + const MCInstrDesc *UseMCID = &UseMI->getDesc(); + + unsigned DefAdj = 0; + if (DefMI->isBundle()) { + DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj); + DefMCID = &DefMI->getDesc(); + } + if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || + DefMI->isRegSequence() || DefMI->isImplicitDef()) { + return 1; + } + + unsigned UseAdj = 0; + if (UseMI->isBundle()) { + unsigned NewUseIdx; + const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI, + Reg, NewUseIdx, UseAdj); + if (!NewUseMI) + return -1; + + UseMI = NewUseMI; + UseIdx = NewUseIdx; + UseMCID = &UseMI->getDesc(); + } + + if (Reg == ARM::CPSR) { + if (DefMI->getOpcode() == ARM::FMSTAT) { + // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?) + return Subtarget.isLikeA9() ? 1 : 20; + } + + // CPSR set and branch can be paired in the same cycle. + if (UseMI->isBranch()) + return 0; + + // Otherwise it takes the instruction latency (generally one). + unsigned Latency = getInstrLatency(ItinData, DefMI); + + // For Thumb2 and -Os, prefer scheduling CPSR setting instruction close to + // its uses. Instructions which are otherwise scheduled between them may + // incur a code size penalty (not able to use the CPSR setting 16-bit + // instructions). + if (Latency > 0 && Subtarget.isThumb2()) { + const MachineFunction *MF = DefMI->getParent()->getParent(); + // FIXME: Use Function::optForSize(). + if (MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize)) + --Latency; + } + return Latency; + } + + if (DefMO.isImplicit() || UseMI->getOperand(UseIdx).isImplicit()) + return -1; + + unsigned DefAlign = DefMI->hasOneMemOperand() + ? (*DefMI->memoperands_begin())->getAlignment() : 0; + unsigned UseAlign = UseMI->hasOneMemOperand() + ? (*UseMI->memoperands_begin())->getAlignment() : 0; + + // Get the itinerary's latency if possible, and handle variable_ops. + int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign, + *UseMCID, UseIdx, UseAlign); + // Unable to find operand latency. The caller may resort to getInstrLatency. + if (Latency < 0) + return Latency; + + // Adjust for IT block position. + int Adj = DefAdj + UseAdj; + + // Adjust for dynamic def-side opcode variants not captured by the itinerary. + Adj += adjustDefLatency(Subtarget, DefMI, DefMCID, DefAlign); + if (Adj >= 0 || (int)Latency > -Adj) { + return Latency + Adj; + } + // Return the itinerary latency, which may be zero but not less than zero. + return Latency; +} + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const { + if (!DefNode->isMachineOpcode()) + return 1; + + const MCInstrDesc &DefMCID = get(DefNode->getMachineOpcode()); + + if (isZeroCost(DefMCID.Opcode)) + return 0; + + if (!ItinData || ItinData->isEmpty()) + return DefMCID.mayLoad() ? 3 : 1; + + if (!UseNode->isMachineOpcode()) { + int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx); + if (Subtarget.isLikeA9() || Subtarget.isSwift()) + return Latency <= 2 ? 1 : Latency - 1; + else + return Latency <= 3 ? 1 : Latency - 2; + } + + const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode()); + const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode); + unsigned DefAlign = !DefMN->memoperands_empty() + ? 
(*DefMN->memoperands_begin())->getAlignment() : 0; + const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode); + unsigned UseAlign = !UseMN->memoperands_empty() + ? (*UseMN->memoperands_begin())->getAlignment() : 0; + int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, + UseMCID, UseIdx, UseAlign); + + if (Latency > 1 && + (Subtarget.isCortexA8() || Subtarget.isLikeA9() || + Subtarget.isCortexA7())) { + // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2] + // variants are one cycle cheaper. + switch (DefMCID.getOpcode()) { + default: break; + case ARM::LDRrs: + case ARM::LDRBrs: { + unsigned ShOpVal = + cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue(); + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (ShImm == 0 || + (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)) + --Latency; + break; + } + case ARM::t2LDRs: + case ARM::t2LDRBs: + case ARM::t2LDRHs: + case ARM::t2LDRSHs: { + // Thumb2 mode: lsl only. + unsigned ShAmt = + cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue(); + if (ShAmt == 0 || ShAmt == 2) + --Latency; + break; + } + } + } else if (DefIdx == 0 && Latency > 2 && Subtarget.isSwift()) { + // FIXME: Properly handle all of the latency adjustments for address + // writeback. + switch (DefMCID.getOpcode()) { + default: break; + case ARM::LDRrs: + case ARM::LDRBrs: { + unsigned ShOpVal = + cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue(); + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (ShImm == 0 || + ((ShImm == 1 || ShImm == 2 || ShImm == 3) && + ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)) + Latency -= 2; + else if (ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr) + --Latency; + break; + } + case ARM::t2LDRs: + case ARM::t2LDRBs: + case ARM::t2LDRHs: + case ARM::t2LDRSHs: { + // Thumb2 mode: lsl 0-3 only. 
+ Latency -= 2; + break; + } + } + } + + if (DefAlign < 8 && Subtarget.isLikeA9()) + switch (DefMCID.getOpcode()) { + default: break; + case ARM::VLD1q8: + case ARM::VLD1q16: + case ARM::VLD1q32: + case ARM::VLD1q64: + case ARM::VLD1q8wb_register: + case ARM::VLD1q16wb_register: + case ARM::VLD1q32wb_register: + case ARM::VLD1q64wb_register: + case ARM::VLD1q8wb_fixed: + case ARM::VLD1q16wb_fixed: + case ARM::VLD1q32wb_fixed: + case ARM::VLD1q64wb_fixed: + case ARM::VLD2d8: + case ARM::VLD2d16: + case ARM::VLD2d32: + case ARM::VLD2q8Pseudo: + case ARM::VLD2q16Pseudo: + case ARM::VLD2q32Pseudo: + case ARM::VLD2d8wb_fixed: + case ARM::VLD2d16wb_fixed: + case ARM::VLD2d32wb_fixed: + case ARM::VLD2q8PseudoWB_fixed: + case ARM::VLD2q16PseudoWB_fixed: + case ARM::VLD2q32PseudoWB_fixed: + case ARM::VLD2d8wb_register: + case ARM::VLD2d16wb_register: + case ARM::VLD2d32wb_register: + case ARM::VLD2q8PseudoWB_register: + case ARM::VLD2q16PseudoWB_register: + case ARM::VLD2q32PseudoWB_register: + case ARM::VLD3d8Pseudo: + case ARM::VLD3d16Pseudo: + case ARM::VLD3d32Pseudo: + case ARM::VLD1d64TPseudo: + case ARM::VLD1d64TPseudoWB_fixed: + case ARM::VLD3d8Pseudo_UPD: + case ARM::VLD3d16Pseudo_UPD: + case ARM::VLD3d32Pseudo_UPD: + case ARM::VLD3q8Pseudo_UPD: + case ARM::VLD3q16Pseudo_UPD: + case ARM::VLD3q32Pseudo_UPD: + case ARM::VLD3q8oddPseudo: + case ARM::VLD3q16oddPseudo: + case ARM::VLD3q32oddPseudo: + case ARM::VLD3q8oddPseudo_UPD: + case ARM::VLD3q16oddPseudo_UPD: + case ARM::VLD3q32oddPseudo_UPD: + case ARM::VLD4d8Pseudo: + case ARM::VLD4d16Pseudo: + case ARM::VLD4d32Pseudo: + case ARM::VLD1d64QPseudo: + case ARM::VLD1d64QPseudoWB_fixed: + case ARM::VLD4d8Pseudo_UPD: + case ARM::VLD4d16Pseudo_UPD: + case ARM::VLD4d32Pseudo_UPD: + case ARM::VLD4q8Pseudo_UPD: + case ARM::VLD4q16Pseudo_UPD: + case ARM::VLD4q32Pseudo_UPD: + case ARM::VLD4q8oddPseudo: + case ARM::VLD4q16oddPseudo: + case ARM::VLD4q32oddPseudo: + case ARM::VLD4q8oddPseudo_UPD: + case ARM::VLD4q16oddPseudo_UPD: + case ARM::VLD4q32oddPseudo_UPD: + case ARM::VLD1DUPq8: + case ARM::VLD1DUPq16: + case ARM::VLD1DUPq32: + case ARM::VLD1DUPq8wb_fixed: + case ARM::VLD1DUPq16wb_fixed: + case ARM::VLD1DUPq32wb_fixed: + case ARM::VLD1DUPq8wb_register: + case ARM::VLD1DUPq16wb_register: + case ARM::VLD1DUPq32wb_register: + case ARM::VLD2DUPd8: + case ARM::VLD2DUPd16: + case ARM::VLD2DUPd32: + case ARM::VLD2DUPd8wb_fixed: + case ARM::VLD2DUPd16wb_fixed: + case ARM::VLD2DUPd32wb_fixed: + case ARM::VLD2DUPd8wb_register: + case ARM::VLD2DUPd16wb_register: + case ARM::VLD2DUPd32wb_register: + case ARM::VLD4DUPd8Pseudo: + case ARM::VLD4DUPd16Pseudo: + case ARM::VLD4DUPd32Pseudo: + case ARM::VLD4DUPd8Pseudo_UPD: + case ARM::VLD4DUPd16Pseudo_UPD: + case ARM::VLD4DUPd32Pseudo_UPD: + case ARM::VLD1LNq8Pseudo: + case ARM::VLD1LNq16Pseudo: + case ARM::VLD1LNq32Pseudo: + case ARM::VLD1LNq8Pseudo_UPD: + case ARM::VLD1LNq16Pseudo_UPD: + case ARM::VLD1LNq32Pseudo_UPD: + case ARM::VLD2LNd8Pseudo: + case ARM::VLD2LNd16Pseudo: + case ARM::VLD2LNd32Pseudo: + case ARM::VLD2LNq16Pseudo: + case ARM::VLD2LNq32Pseudo: + case ARM::VLD2LNd8Pseudo_UPD: + case ARM::VLD2LNd16Pseudo_UPD: + case ARM::VLD2LNd32Pseudo_UPD: + case ARM::VLD2LNq16Pseudo_UPD: + case ARM::VLD2LNq32Pseudo_UPD: + case ARM::VLD4LNd8Pseudo: + case ARM::VLD4LNd16Pseudo: + case ARM::VLD4LNd32Pseudo: + case ARM::VLD4LNq16Pseudo: + case ARM::VLD4LNq32Pseudo: + case ARM::VLD4LNd8Pseudo_UPD: + case ARM::VLD4LNd16Pseudo_UPD: + case ARM::VLD4LNd32Pseudo_UPD: + case ARM::VLD4LNq16Pseudo_UPD: + case 
ARM::VLD4LNq32Pseudo_UPD:
+      // If the address is not 64-bit aligned, the latencies of these
+      // instructions increase by one.
+      ++Latency;
+      break;
+    }
+
+  return Latency;
+}
+
+unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr *MI) const {
+  if (MI->isCopyLike() || MI->isInsertSubreg() ||
+      MI->isRegSequence() || MI->isImplicitDef())
+    return 0;
+
+  if (MI->isBundle())
+    return 0;
+
+  const MCInstrDesc &MCID = MI->getDesc();
+
+  if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) {
+    // When predicated, CPSR is an additional source operand for CPSR updating
+    // instructions; this apparently increases their latencies.
+    return 1;
+  }
+  return 0;
+}
+
+unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                           const MachineInstr *MI,
+                                           unsigned *PredCost) const {
+  if (MI->isCopyLike() || MI->isInsertSubreg() ||
+      MI->isRegSequence() || MI->isImplicitDef())
+    return 1;
+
+  // An instruction scheduler typically runs on unbundled instructions;
+  // however, other passes may query the latency of a bundled instruction.
+  if (MI->isBundle()) {
+    unsigned Latency = 0;
+    MachineBasicBlock::const_instr_iterator I = MI->getIterator();
+    MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+    while (++I != E && I->isInsideBundle()) {
+      if (I->getOpcode() != ARM::t2IT)
+        Latency += getInstrLatency(ItinData, &*I, PredCost);
+    }
+    return Latency;
+  }
+
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) {
+    // When predicated, CPSR is an additional source operand for CPSR updating
+    // instructions; this apparently increases their latencies.
+    *PredCost = 1;
+  }
+  // Be sure to call getStageLatency for an empty itinerary in case it has a
+  // valid MinLatency property.
+  if (!ItinData)
+    return MI->mayLoad() ? 3 : 1;
+
+  unsigned Class = MCID.getSchedClass();
+
+  // For instructions with variable uops, use uops as latency.
+  if (!ItinData->isEmpty() && ItinData->getNumMicroOps(Class) < 0)
+    return getNumMicroOps(ItinData, MI);
+
+  // For the common case, fall back on the itinerary's latency.
+  unsigned Latency = ItinData->getStageLatency(Class);
+
+  // Adjust for dynamic def-side opcode variants not captured by the itinerary.
+  unsigned DefAlign = MI->hasOneMemOperand()
+                        ? (*MI->memoperands_begin())->getAlignment() : 0;
+  int Adj = adjustDefLatency(Subtarget, MI, &MCID, DefAlign);
+  if (Adj >= 0 || (int)Latency > -Adj) {
+    return Latency + Adj;
+  }
+  return Latency;
+}
+
+int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                      SDNode *Node) const {
+  if (!Node->isMachineOpcode())
+    return 1;
+
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  unsigned Opcode = Node->getMachineOpcode();
+  switch (Opcode) {
+  default:
+    return ItinData->getStageLatency(get(Opcode).getSchedClass());
+  case ARM::VLDMQIA:
+  case ARM::VSTMQIA:
+    return 2;
+  }
+}
+
+bool ARMBaseInstrInfo::
+hasHighOperandLatency(const TargetSchedModel &SchedModel,
+                      const MachineRegisterInfo *MRI,
+                      const MachineInstr *DefMI, unsigned DefIdx,
+                      const MachineInstr *UseMI, unsigned UseIdx) const {
+  unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
+  unsigned UDomain = UseMI->getDesc().TSFlags & ARMII::DomainMask;
+  if (Subtarget.isCortexA8() &&
+      (DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP))
+    // CortexA8 VFP instructions are not pipelined.
+    return true;
+
+  // Hoist VFP / NEON instructions with 4 or higher latency.
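+  // Only def/use pairs where at least one side is a VFP or NEON instruction
+  // qualify; plain integer operations never do.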
+ unsigned Latency + = SchedModel.computeOperandLatency(DefMI, DefIdx, UseMI, UseIdx); + if (Latency <= 3) + return false; + return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON || + UDomain == ARMII::DomainVFP || UDomain == ARMII::DomainNEON; +} + +bool ARMBaseInstrInfo:: +hasLowDefLatency(const TargetSchedModel &SchedModel, + const MachineInstr *DefMI, unsigned DefIdx) const { + const InstrItineraryData *ItinData = SchedModel.getInstrItineraries(); + if (!ItinData || ItinData->isEmpty()) + return false; + + unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask; + if (DDomain == ARMII::DomainGeneral) { + unsigned DefClass = DefMI->getDesc().getSchedClass(); + int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx); + return (DefCycle != -1 && DefCycle <= 2); + } + return false; +} + +bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr *MI, + StringRef &ErrInfo) const { + if (convertAddSubFlagsOpcode(MI->getOpcode())) { + ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG"; + return false; + } + return true; +} + +// LoadStackGuard has so far only been implemented for MachO. Different code +// sequence is needed for other targets. +void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI, + unsigned LoadImmOpc, + unsigned LoadOpc, + Reloc::Model RM) const { + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned Reg = MI->getOperand(0).getReg(); + const GlobalValue *GV = + cast<GlobalValue>((*MI->memoperands_begin())->getValue()); + MachineInstrBuilder MIB; + + BuildMI(MBB, MI, DL, get(LoadImmOpc), Reg) + .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY); + + if (Subtarget.GVIsIndirectSymbol(GV, RM)) { + MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg); + MIB.addReg(Reg, RegState::Kill).addImm(0); + unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; + MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4); + MIB.addMemOperand(MMO); + AddDefaultPred(MIB); + } + + MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg); + MIB.addReg(Reg, RegState::Kill).addImm(0); + MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + AddDefaultPred(MIB); +} + +bool +ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc, + unsigned &AddSubOpc, + bool &NegAcc, bool &HasLane) const { + DenseMap<unsigned, unsigned>::const_iterator I = MLxEntryMap.find(Opcode); + if (I == MLxEntryMap.end()) + return false; + + const ARM_MLxEntry &Entry = ARM_MLxTable[I->second]; + MulOpc = Entry.MulOpc; + AddSubOpc = Entry.AddSubOpc; + NegAcc = Entry.NegAcc; + HasLane = Entry.HasLane; + return true; +} + +//===----------------------------------------------------------------------===// +// Execution domains. +//===----------------------------------------------------------------------===// +// +// Some instructions go down the NEON pipeline, some go down the VFP pipeline, +// and some can go down both. The vmov instructions go down the VFP pipeline, +// but they can be changed to vorr equivalents that are executed by the NEON +// pipeline. 
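+// For example, a '%d0 = VMOVD %d1' copy can be rewritten as
+// '%d0 = VORRd %d1, %d1', which performs the same move entirely inside the
+// NEON pipeline.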
+// +// We use the following execution domain numbering: +// +enum ARMExeDomain { + ExeGeneric = 0, + ExeVFP = 1, + ExeNEON = 2 +}; +// +// Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h +// +std::pair<uint16_t, uint16_t> +ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const { + // If we don't have access to NEON instructions then we won't be able + // to swizzle anything to the NEON domain. Check to make sure. + if (Subtarget.hasNEON()) { + // VMOVD, VMOVRS and VMOVSR are VFP instructions, but can be changed to NEON + // if they are not predicated. + if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI)) + return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON)); + + // CortexA9 is particularly picky about mixing the two and wants these + // converted. + if (Subtarget.isCortexA9() && !isPredicated(MI) && + (MI->getOpcode() == ARM::VMOVRS || MI->getOpcode() == ARM::VMOVSR || + MI->getOpcode() == ARM::VMOVS)) + return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON)); + } + // No other instructions can be swizzled, so just determine their domain. + unsigned Domain = MI->getDesc().TSFlags & ARMII::DomainMask; + + if (Domain & ARMII::DomainNEON) + return std::make_pair(ExeNEON, 0); + + // Certain instructions can go either way on Cortex-A8. + // Treat them as NEON instructions. + if ((Domain & ARMII::DomainNEONA8) && Subtarget.isCortexA8()) + return std::make_pair(ExeNEON, 0); + + if (Domain & ARMII::DomainVFP) + return std::make_pair(ExeVFP, 0); + + return std::make_pair(ExeGeneric, 0); +} + +static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI, + unsigned SReg, unsigned &Lane) { + unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_0, &ARM::DPRRegClass); + Lane = 0; + + if (DReg != ARM::NoRegister) + return DReg; + + Lane = 1; + DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, &ARM::DPRRegClass); + + assert(DReg && "S-register with no D super-register?"); + return DReg; +} + +/// getImplicitSPRUseForDPRUse - Given a use of a DPR register and lane, +/// set ImplicitSReg to a register number that must be marked as implicit-use or +/// zero if no register needs to be defined as implicit-use. +/// +/// If the function cannot determine if an SPR should be marked implicit use or +/// not, it returns false. +/// +/// This function handles cases where an instruction is being modified from taking +/// an SPR to a DPR[Lane]. A use of the DPR is being added, which may conflict +/// with an earlier def of an SPR corresponding to DPR[Lane^1] (i.e. the other +/// lane of the DPR). +/// +/// If the other SPR is defined, an implicit-use of it should be added. Else, +/// (including the case where the DPR itself is defined), it should not. +/// +static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI, + MachineInstr *MI, + unsigned DReg, unsigned Lane, + unsigned &ImplicitSReg) { + // If the DPR is defined or used already, the other SPR lane will be chained + // correctly, so there is nothing to be done. + if (MI->definesRegister(DReg, TRI) || MI->readsRegister(DReg, TRI)) { + ImplicitSReg = 0; + return true; + } + + // Otherwise we need to go searching to see if the SPR is set explicitly. + ImplicitSReg = TRI->getSubReg(DReg, + (Lane & 1) ? 
ARM::ssub_0 : ARM::ssub_1); + MachineBasicBlock::LivenessQueryResult LQR = + MI->getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI); + + if (LQR == MachineBasicBlock::LQR_Live) + return true; + else if (LQR == MachineBasicBlock::LQR_Unknown) + return false; + + // If the register is known not to be live, there is no need to add an + // implicit-use. + ImplicitSReg = 0; + return true; +} + +void +ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { + unsigned DstReg, SrcReg, DReg; + unsigned Lane; + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + switch (MI->getOpcode()) { + default: + llvm_unreachable("cannot handle opcode!"); + break; + case ARM::VMOVD: + if (Domain != ExeNEON) + break; + + // Zap the predicate operands. + assert(!isPredicated(MI) && "Cannot predicate a VORRd"); + + // Make sure we've got NEON instructions. + assert(Subtarget.hasNEON() && "VORRd requires NEON"); + + // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + + for (unsigned i = MI->getDesc().getNumOperands(); i; --i) + MI->RemoveOperand(i-1); + + // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits) + MI->setDesc(get(ARM::VORRd)); + AddDefaultPred(MIB.addReg(DstReg, RegState::Define) + .addReg(SrcReg) + .addReg(SrcReg)); + break; + case ARM::VMOVRS: + if (Domain != ExeNEON) + break; + assert(!isPredicated(MI) && "Cannot predicate a VGETLN"); + + // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + + for (unsigned i = MI->getDesc().getNumOperands(); i; --i) + MI->RemoveOperand(i-1); + + DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane); + + // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps) + // Note that DSrc has been widened and the other lane may be undef, which + // contaminates the entire register. + MI->setDesc(get(ARM::VGETLNi32)); + AddDefaultPred(MIB.addReg(DstReg, RegState::Define) + .addReg(DReg, RegState::Undef) + .addImm(Lane)); + + // The old source should be an implicit use, otherwise we might think it + // was dead before here. + MIB.addReg(SrcReg, RegState::Implicit); + break; + case ARM::VMOVSR: { + if (Domain != ExeNEON) + break; + assert(!isPredicated(MI) && "Cannot predicate a VSETLN"); + + // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + + DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane); + + unsigned ImplicitSReg; + if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg)) + break; + + for (unsigned i = MI->getDesc().getNumOperands(); i; --i) + MI->RemoveOperand(i-1); + + // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps) + // Again DDst may be undefined at the beginning of this instruction. + MI->setDesc(get(ARM::VSETLNi32)); + MIB.addReg(DReg, RegState::Define) + .addReg(DReg, getUndefRegState(!MI->readsRegister(DReg, TRI))) + .addReg(SrcReg) + .addImm(Lane); + AddDefaultPred(MIB); + + // The narrower destination must be marked as set to keep previous chains + // in place. 
+ MIB.addReg(DstReg, RegState::Define | RegState::Implicit); + if (ImplicitSReg != 0) + MIB.addReg(ImplicitSReg, RegState::Implicit); + break; + } + case ARM::VMOVS: { + if (Domain != ExeNEON) + break; + + // Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + + unsigned DstLane = 0, SrcLane = 0, DDst, DSrc; + DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane); + DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane); + + unsigned ImplicitSReg; + if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg)) + break; + + for (unsigned i = MI->getDesc().getNumOperands(); i; --i) + MI->RemoveOperand(i-1); + + if (DSrc == DDst) { + // Destination can be: + // %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits) + MI->setDesc(get(ARM::VDUPLN32d)); + MIB.addReg(DDst, RegState::Define) + .addReg(DDst, getUndefRegState(!MI->readsRegister(DDst, TRI))) + .addImm(SrcLane); + AddDefaultPred(MIB); + + // Neither the source or the destination are naturally represented any + // more, so add them in manually. + MIB.addReg(DstReg, RegState::Implicit | RegState::Define); + MIB.addReg(SrcReg, RegState::Implicit); + if (ImplicitSReg != 0) + MIB.addReg(ImplicitSReg, RegState::Implicit); + break; + } + + // In general there's no single instruction that can perform an S <-> S + // move in NEON space, but a pair of VEXT instructions *can* do the + // job. It turns out that the VEXTs needed will only use DSrc once, with + // the position based purely on the combination of lane-0 and lane-1 + // involved. For example + // vmov s0, s2 -> vext.32 d0, d0, d1, #1 vext.32 d0, d0, d0, #1 + // vmov s1, s3 -> vext.32 d0, d1, d0, #1 vext.32 d0, d0, d0, #1 + // vmov s0, s3 -> vext.32 d0, d0, d0, #1 vext.32 d0, d1, d0, #1 + // vmov s1, s2 -> vext.32 d0, d0, d0, #1 vext.32 d0, d0, d1, #1 + // + // Pattern of the MachineInstrs is: + // %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (;implicits) + MachineInstrBuilder NewMIB; + NewMIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + get(ARM::VEXTd32), DDst); + + // On the first instruction, both DSrc and DDst may be <undef> if present. + // Specifically when the original instruction didn't have them as an + // <imp-use>. + unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst; + bool CurUndef = !MI->readsRegister(CurReg, TRI); + NewMIB.addReg(CurReg, getUndefRegState(CurUndef)); + + CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst; + CurUndef = !MI->readsRegister(CurReg, TRI); + NewMIB.addReg(CurReg, getUndefRegState(CurUndef)); + + NewMIB.addImm(1); + AddDefaultPred(NewMIB); + + if (SrcLane == DstLane) + NewMIB.addReg(SrcReg, RegState::Implicit); + + MI->setDesc(get(ARM::VEXTd32)); + MIB.addReg(DDst, RegState::Define); + + // On the second instruction, DDst has definitely been defined above, so + // it is not <undef>. DSrc, if present, can be <undef> as above. + CurReg = SrcLane == 1 && DstLane == 0 ? DSrc : DDst; + CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI); + MIB.addReg(CurReg, getUndefRegState(CurUndef)); + + CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst; + CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI); + MIB.addReg(CurReg, getUndefRegState(CurUndef)); + + MIB.addImm(1); + AddDefaultPred(MIB); + + if (SrcLane != DstLane) + MIB.addReg(SrcReg, RegState::Implicit); + + // As before, the original destination is no longer represented, add it + // implicitly. 
+    MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+    if (ImplicitSReg != 0)
+      MIB.addReg(ImplicitSReg, RegState::Implicit);
+    break;
+  }
+  }
+
+}
+
+//===----------------------------------------------------------------------===//
+// Partial register updates
+//===----------------------------------------------------------------------===//
+//
+// Swift renames NEON registers with 64-bit granularity. That means any
+// instruction writing an S-reg implicitly reads the containing D-reg. The
+// problem is mostly avoided by translating f32 operations to v2f32 operations
+// on D-registers, but f32 loads are still a problem.
+//
+// These instructions can load an f32 into a NEON register:
+//
+// VLDRS - Only writes S, partial D update.
+// VLD1LNd32 - Writes all D-regs, explicit partial D update, 2 uops.
+// VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops.
+//
+// FCONSTD can be used as a dependency-breaking instruction.
+unsigned ARMBaseInstrInfo::
+getPartialRegUpdateClearance(const MachineInstr *MI,
+                             unsigned OpNum,
+                             const TargetRegisterInfo *TRI) const {
+  if (!SwiftPartialUpdateClearance ||
+      !(Subtarget.isSwift() || Subtarget.isCortexA15()))
+    return 0;
+
+  assert(TRI && "Need TRI instance");
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  if (MO.readsReg())
+    return 0;
+  unsigned Reg = MO.getReg();
+  int UseOp = -1;
+
+  switch(MI->getOpcode()) {
+  // Normal instructions writing only an S-register.
+  case ARM::VLDRS:
+  case ARM::FCONSTS:
+  case ARM::VMOVSR:
+  case ARM::VMOVv8i8:
+  case ARM::VMOVv4i16:
+  case ARM::VMOVv2i32:
+  case ARM::VMOVv2f32:
+  case ARM::VMOVv1i64:
+    UseOp = MI->findRegisterUseOperandIdx(Reg, false, TRI);
+    break;
+
+  // Explicitly reads the dependency.
+  case ARM::VLD1LNd32:
+    UseOp = 3;
+    break;
+  default:
+    return 0;
+  }
+
+  // If this instruction actually reads a value from Reg, there is no unwanted
+  // dependency.
+  if (UseOp != -1 && MI->getOperand(UseOp).readsReg())
+    return 0;
+
+  // We must be able to clobber the whole D-reg.
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    // Virtual register must be a foo:ssub_0<def,undef> operand.
+    if (!MO.getSubReg() || MI->readsVirtualRegister(Reg))
+      return 0;
+  } else if (ARM::SPRRegClass.contains(Reg)) {
+    // Physical register: MI must define the full D-reg.
+    unsigned DReg = TRI->getMatchingSuperReg(Reg, ARM::ssub_0,
+                                             &ARM::DPRRegClass);
+    if (!DReg || !MI->definesRegister(DReg, TRI))
+      return 0;
+  }
+
+  // MI has an unwanted D-register dependency.
+  // Avoid defs in the previous N instructions.
+  return SwiftPartialUpdateClearance;
+}
+
+// Break a partial register dependency after getPartialRegUpdateClearance
+// returned non-zero.
+void ARMBaseInstrInfo::
+breakPartialRegDependency(MachineBasicBlock::iterator MI,
+                          unsigned OpNum,
+                          const TargetRegisterInfo *TRI) const {
+  assert(MI && OpNum < MI->getDesc().getNumDefs() && "OpNum is not a def");
+  assert(TRI && "Need TRI instance");
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  unsigned Reg = MO.getReg();
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         "Can't break virtual register dependencies.");
+  unsigned DReg = Reg;
+
+  // If MI defines an S-reg, find the corresponding D super-register.
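+  // S-registers map pairwise onto D-registers (S0/S1 -> D0, S2/S3 -> D1,
+  // ...), which is what makes the index arithmetic below valid.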
+ if (ARM::SPRRegClass.contains(Reg)) { + DReg = ARM::D0 + (Reg - ARM::S0) / 2; + assert(TRI->isSuperRegister(Reg, DReg) && "Register enums broken"); + } + + assert(ARM::DPRRegClass.contains(DReg) && "Can only break D-reg deps"); + assert(MI->definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg"); + + // FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines + // the full D-register by loading the same value to both lanes. The + // instruction is micro-coded with 2 uops, so don't do this until we can + // properly schedule micro-coded instructions. The dispatcher stalls cause + // too big regressions. + + // Insert the dependency-breaking FCONSTD before MI. + // 96 is the encoding of 0.5, but the actual value doesn't matter here. + AddDefaultPred(BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + get(ARM::FCONSTD), DReg).addImm(96)); + MI->addRegisterKilled(DReg, TRI, true); +} + +bool ARMBaseInstrInfo::hasNOP() const { + return Subtarget.getFeatureBits()[ARM::HasV6KOps]; +} + +bool ARMBaseInstrInfo::isSwiftFastImmShift(const MachineInstr *MI) const { + if (MI->getNumOperands() < 4) + return true; + unsigned ShOpVal = MI->getOperand(3).getImm(); + unsigned ShImm = ARM_AM::getSORegOffset(ShOpVal); + // Swift supports faster shifts for: lsl 2, lsl 1, and lsr 1. + if ((ShImm == 1 && ARM_AM::getSORegShOp(ShOpVal) == ARM_AM::lsr) || + ((ShImm == 1 || ShImm == 2) && + ARM_AM::getSORegShOp(ShOpVal) == ARM_AM::lsl)) + return true; + + return false; +} + +bool ARMBaseInstrInfo::getRegSequenceLikeInputs( + const MachineInstr &MI, unsigned DefIdx, + SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const { + assert(DefIdx < MI.getDesc().getNumDefs() && "Invalid definition index"); + assert(MI.isRegSequenceLike() && "Invalid kind of instruction"); + + switch (MI.getOpcode()) { + case ARM::VMOVDRR: + // dX = VMOVDRR rY, rZ + // is the same as: + // dX = REG_SEQUENCE rY, ssub_0, rZ, ssub_1 + // Populate the InputRegs accordingly. + // rY + const MachineOperand *MOReg = &MI.getOperand(1); + InputRegs.push_back( + RegSubRegPairAndIdx(MOReg->getReg(), MOReg->getSubReg(), ARM::ssub_0)); + // rZ + MOReg = &MI.getOperand(2); + InputRegs.push_back( + RegSubRegPairAndIdx(MOReg->getReg(), MOReg->getSubReg(), ARM::ssub_1)); + return true; + } + llvm_unreachable("Target dependent opcode missing"); +} + +bool ARMBaseInstrInfo::getExtractSubregLikeInputs( + const MachineInstr &MI, unsigned DefIdx, + RegSubRegPairAndIdx &InputReg) const { + assert(DefIdx < MI.getDesc().getNumDefs() && "Invalid definition index"); + assert(MI.isExtractSubregLike() && "Invalid kind of instruction"); + + switch (MI.getOpcode()) { + case ARM::VMOVRRD: + // rX, rY = VMOVRRD dZ + // is the same as: + // rX = EXTRACT_SUBREG dZ, ssub_0 + // rY = EXTRACT_SUBREG dZ, ssub_1 + const MachineOperand &MOReg = MI.getOperand(2); + InputReg.Reg = MOReg.getReg(); + InputReg.SubReg = MOReg.getSubReg(); + InputReg.SubIdx = DefIdx == 0 ? 
ARM::ssub_0 : ARM::ssub_1; + return true; + } + llvm_unreachable("Target dependent opcode missing"); +} + +bool ARMBaseInstrInfo::getInsertSubregLikeInputs( + const MachineInstr &MI, unsigned DefIdx, RegSubRegPair &BaseReg, + RegSubRegPairAndIdx &InsertedReg) const { + assert(DefIdx < MI.getDesc().getNumDefs() && "Invalid definition index"); + assert(MI.isInsertSubregLike() && "Invalid kind of instruction"); + + switch (MI.getOpcode()) { + case ARM::VSETLNi32: + // dX = VSETLNi32 dY, rZ, imm + const MachineOperand &MOBaseReg = MI.getOperand(1); + const MachineOperand &MOInsertedReg = MI.getOperand(2); + const MachineOperand &MOIndex = MI.getOperand(3); + BaseReg.Reg = MOBaseReg.getReg(); + BaseReg.SubReg = MOBaseReg.getSubReg(); + + InsertedReg.Reg = MOInsertedReg.getReg(); + InsertedReg.SubReg = MOInsertedReg.getSubReg(); + InsertedReg.SubIdx = MOIndex.getImm() == 0 ? ARM::ssub_0 : ARM::ssub_1; + return true; + } + llvm_unreachable("Target dependent opcode missing"); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h new file mode 100644 index 0000000..d80c494 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -0,0 +1,509 @@ +//===-- ARMBaseInstrInfo.h - ARM Base Instruction Information ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Base ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H +#define LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H + +#include "MCTargetDesc/ARMBaseInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "ARMGenInstrInfo.inc" + +namespace llvm { + class ARMSubtarget; + class ARMBaseRegisterInfo; + +class ARMBaseInstrInfo : public ARMGenInstrInfo { + const ARMSubtarget &Subtarget; + +protected: + // Can be only subclassed. + explicit ARMBaseInstrInfo(const ARMSubtarget &STI); + + void expandLoadStackGuardBase(MachineBasicBlock::iterator MI, + unsigned LoadImmOpc, unsigned LoadOpc, + Reloc::Model RM) const; + + /// Build the equivalent inputs of a REG_SEQUENCE for the given \p MI + /// and \p DefIdx. + /// \p [out] InputRegs of the equivalent REG_SEQUENCE. Each element of + /// the list is modeled as <Reg:SubReg, SubIdx>. + /// E.g., REG_SEQUENCE vreg1:sub1, sub0, vreg2, sub1 would produce + /// two elements: + /// - vreg1:sub1, sub0 + /// - vreg2<:0>, sub1 + /// + /// \returns true if it is possible to build such an input sequence + /// with the pair \p MI, \p DefIdx. False otherwise. + /// + /// \pre MI.isRegSequenceLike(). + bool getRegSequenceLikeInputs( + const MachineInstr &MI, unsigned DefIdx, + SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const override; + + /// Build the equivalent inputs of a EXTRACT_SUBREG for the given \p MI + /// and \p DefIdx. + /// \p [out] InputReg of the equivalent EXTRACT_SUBREG. + /// E.g., EXTRACT_SUBREG vreg1:sub1, sub0, sub1 would produce: + /// - vreg1:sub1, sub0 + /// + /// \returns true if it is possible to build such an input sequence + /// with the pair \p MI, \p DefIdx. False otherwise. 
+ /// + /// \pre MI.isExtractSubregLike(). + bool getExtractSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx, + RegSubRegPairAndIdx &InputReg) const override; + + /// Build the equivalent inputs of a INSERT_SUBREG for the given \p MI + /// and \p DefIdx. + /// \p [out] BaseReg and \p [out] InsertedReg contain + /// the equivalent inputs of INSERT_SUBREG. + /// E.g., INSERT_SUBREG vreg0:sub0, vreg1:sub1, sub3 would produce: + /// - BaseReg: vreg0:sub0 + /// - InsertedReg: vreg1:sub1, sub3 + /// + /// \returns true if it is possible to build such an input sequence + /// with the pair \p MI, \p DefIdx. False otherwise. + /// + /// \pre MI.isInsertSubregLike(). + bool + getInsertSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx, + RegSubRegPair &BaseReg, + RegSubRegPairAndIdx &InsertedReg) const override; + + /// Commutes the operands in the given instruction. + /// The commutable operands are specified by their indices OpIdx1 and OpIdx2. + /// + /// Do not call this method for a non-commutable instruction or for + /// non-commutable pair of operand indices OpIdx1 and OpIdx2. + /// Even though the instruction is commutable, the method may still + /// fail to commute the operands, null pointer is returned in such cases. + MachineInstr *commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const override; + +public: + // Return whether the target has an explicit NOP encoding. + bool hasNOP() const; + + // Return the non-pre/post incrementing version of 'Opc'. Return 0 + // if there is not such an opcode. + virtual unsigned getUnindexedOpcode(unsigned Opc) const =0; + + MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const override; + + virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0; + const ARMSubtarget &getSubtarget() const { return Subtarget; } + + ScheduleHazardRecognizer * + CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, + const ScheduleDAG *DAG) const override; + + ScheduleHazardRecognizer * + CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const override; + + // Branch analysis. + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify = false) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + DebugLoc DL) const override; + + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + + // Predication support. + bool isPredicated(const MachineInstr *MI) const override; + + ARMCC::CondCodes getPredicate(const MachineInstr *MI) const { + int PIdx = MI->findFirstPredOperandIdx(); + return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm() + : ARMCC::AL; + } + + bool PredicateInstruction(MachineInstr *MI, + ArrayRef<MachineOperand> Pred) const override; + + bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const override; + + bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const override; + + bool isPredicable(MachineInstr *MI) const override; + + /// GetInstSize - Returns the size of the specified MachineInstr. 
+  ///
+  virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
+
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+  unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                     int &FrameIndex) const override;
+  unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                    int &FrameIndex) const override;
+
+  void copyToCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                  unsigned SrcReg, bool KillSrc,
+                  const ARMSubtarget &Subtarget) const;
+  void copyFromCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned DestReg, bool KillSrc,
+                    const ARMSubtarget &Subtarget) const;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, unsigned SubIdx,
+                     const MachineInstr *Orig,
+                     const TargetRegisterInfo &TRI) const override;
+
+  MachineInstr *duplicate(MachineInstr *Orig,
+                          MachineFunction &MF) const override;
+
+  const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
+                                     unsigned SubIdx, unsigned State,
+                                     const TargetRegisterInfo *TRI) const;
+
+  bool produceSameValue(const MachineInstr *MI0, const MachineInstr *MI1,
+                        const MachineRegisterInfo *MRI) const override;
+
+  /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
+  /// determine if two loads are loading from the same base address. It should
+  /// only return true if the base pointers are the same and the only
+  /// difference between the two addresses is the offset. It also returns the
+  /// offsets by reference.
+  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+                               int64_t &Offset2) const override;
+
+  /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
+  /// should be scheduled together. On some targets, if two loads are loading
+  /// from addresses in the same cache line, it's better if they are scheduled
+  /// together. This function takes two integers that represent the load
+  /// offsets from the common base address. It returns true if it decides it's
+  /// desirable to schedule the two loads together. "NumLoads" is the number
+  /// of loads that have already been scheduled after Load1.
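+  /// For example, two loads at offsets 0 and 4 from the same base pointer
+  /// will often fall in the same cache line, making them good candidates
+  /// for clustering.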
+ bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const override; + + bool isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + + bool isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCycles, unsigned ExtraPredCycles, + BranchProbability Probability) const override; + + bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, + unsigned ExtraT, MachineBasicBlock &FMBB, + unsigned NumF, unsigned ExtraF, + BranchProbability Probability) const override; + + bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, + BranchProbability Probability) const override { + return NumCycles == 1; + } + + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const override; + + /// analyzeCompare - For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2 if having two register operands, and the value it + /// compares against in CmpValue. Return true if the comparison instruction + /// can be analyzed. + bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const override; + + /// optimizeCompareInstr - Convert the instruction to set the zero flag so + /// that we can remove a "comparison with zero"; Remove a redundant CMP + /// instruction if the flags can be updated in the same way by an earlier + /// instruction such as SUB. + bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const override; + + bool analyzeSelect(const MachineInstr *MI, + SmallVectorImpl<MachineOperand> &Cond, + unsigned &TrueOp, unsigned &FalseOp, + bool &Optimizable) const override; + + MachineInstr *optimizeSelect(MachineInstr *MI, + SmallPtrSetImpl<MachineInstr *> &SeenMIs, + bool) const override; + + /// FoldImmediate - 'Reg' is known to be defined by a move immediate + /// instruction, try to fold the immediate into the use instruction. + bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const override; + + unsigned getNumMicroOps(const InstrItineraryData *ItinData, + const MachineInstr *MI) const override; + + int getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const override; + int getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const override; + + /// VFP/NEON execution domains. + std::pair<uint16_t, uint16_t> + getExecutionDomain(const MachineInstr *MI) const override; + void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override; + + unsigned getPartialRegUpdateClearance(const MachineInstr*, unsigned, + const TargetRegisterInfo*) const override; + void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned, + const TargetRegisterInfo *TRI) const override; + + /// Get the number of addresses by LDM or VLDM or zero for unknown. 
+ unsigned getNumLDMAddresses(const MachineInstr *MI) const; + +private: + unsigned getInstBundleLength(const MachineInstr *MI) const; + + int getVLDMDefCycle(const InstrItineraryData *ItinData, + const MCInstrDesc &DefMCID, + unsigned DefClass, + unsigned DefIdx, unsigned DefAlign) const; + int getLDMDefCycle(const InstrItineraryData *ItinData, + const MCInstrDesc &DefMCID, + unsigned DefClass, + unsigned DefIdx, unsigned DefAlign) const; + int getVSTMUseCycle(const InstrItineraryData *ItinData, + const MCInstrDesc &UseMCID, + unsigned UseClass, + unsigned UseIdx, unsigned UseAlign) const; + int getSTMUseCycle(const InstrItineraryData *ItinData, + const MCInstrDesc &UseMCID, + unsigned UseClass, + unsigned UseIdx, unsigned UseAlign) const; + int getOperandLatency(const InstrItineraryData *ItinData, + const MCInstrDesc &DefMCID, + unsigned DefIdx, unsigned DefAlign, + const MCInstrDesc &UseMCID, + unsigned UseIdx, unsigned UseAlign) const; + + unsigned getPredicationCost(const MachineInstr *MI) const override; + + unsigned getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = nullptr) const override; + + int getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const override; + + bool hasHighOperandLatency(const TargetSchedModel &SchedModel, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const override; + bool hasLowDefLatency(const TargetSchedModel &SchedModel, + const MachineInstr *DefMI, + unsigned DefIdx) const override; + + /// verifyInstruction - Perform target specific instruction verification. + bool verifyInstruction(const MachineInstr *MI, + StringRef &ErrInfo) const override; + + virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI, + Reloc::Model RM) const = 0; + + void expandMEMCPY(MachineBasicBlock::iterator) const; + +private: + /// Modeling special VFP / NEON fp MLA / MLS hazards. + + /// MLxEntryMap - Map fp MLA / MLS to the corresponding entry in the internal + /// MLx table. + DenseMap<unsigned, unsigned> MLxEntryMap; + + /// MLxHazardOpcodes - Set of add / sub and multiply opcodes that would cause + /// stalls when scheduled together with fp MLA / MLS opcodes. + SmallSet<unsigned, 16> MLxHazardOpcodes; + +public: + /// isFpMLxInstruction - Return true if the specified opcode is a fp MLA / MLS + /// instruction. + bool isFpMLxInstruction(unsigned Opcode) const { + return MLxEntryMap.count(Opcode); + } + + /// isFpMLxInstruction - This version also returns the multiply opcode and the + /// addition / subtraction opcode to expand to. Return true for 'HasLane' for + /// the MLX instructions with an extra lane operand. + bool isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc, + unsigned &AddSubOpc, bool &NegAcc, + bool &HasLane) const; + + /// canCauseFpMLxStall - Return true if an instruction of the specified opcode + /// will cause stalls when scheduled after (within 4-cycle window) a fp + /// MLA / MLS instruction. + bool canCauseFpMLxStall(unsigned Opcode) const { + return MLxHazardOpcodes.count(Opcode); + } + + /// Returns true if the instruction has a shift by immediate that can be + /// executed in one cycle less. 
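+  /// On Swift these are 'lsl #1', 'lsl #2' and 'lsr #1'.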
+  bool isSwiftFastImmShift(const MachineInstr *MI) const;
+};
+
+static inline
+const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
+  return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
+  return MIB.addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB,
+                                          bool isDead = false) {
+  return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead));
+}
+
+static inline
+const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) {
+  return MIB.addReg(0);
+}
+
+static inline
+bool isUncondBranchOpcode(int Opc) {
+  return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B;
+}
+
+static inline
+bool isCondBranchOpcode(int Opc) {
+  return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc;
+}
+
+static inline
+bool isJumpTableBranchOpcode(int Opc) {
+  return Opc == ARM::BR_JTr || Opc == ARM::BR_JTm || Opc == ARM::BR_JTadd ||
+         Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT;
+}
+
+static inline
+bool isIndirectBranchOpcode(int Opc) {
+  return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
+}
+
+static inline bool isPopOpcode(int Opc) {
+  return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET ||
+         Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD ||
+         Opc == ARM::t2LDMIA_UPD || Opc == ARM::VLDMDIA_UPD;
+}
+
+static inline bool isPushOpcode(int Opc) {
+  return Opc == ARM::tPUSH || Opc == ARM::t2STMDB_UPD ||
+         Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD;
+}
+
+/// getInstrPredicate - If the instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+
+unsigned getMatchingCondBranchOpcode(unsigned Opc);
+
+/// Determine if MI can be folded into an ARM MOVCC instruction, and return the
+/// opcode of the SSA instruction representing the conditional MI.
+unsigned canFoldARMInstrIntoMOVCC(unsigned Reg,
+                                  MachineInstr *&MI,
+                                  const MachineRegisterInfo &MRI);
+
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether
+/// the instruction is encoded with an 'S' bit is determined by the optional
+/// CPSR def operand.
+unsigned convertAddSubFlagsOpcode(unsigned OldOpc);
+
+/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emits a series of
+/// instructions to materialize a destreg = basereg + immediate in ARM / Thumb2
+/// code.
+void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                             unsigned DestReg, unsigned BaseReg, int NumBytes,
+                             ARMCC::CondCodes Pred, unsigned PredReg,
+                             const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
+
+void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                            unsigned DestReg, unsigned BaseReg, int NumBytes,
+                            ARMCC::CondCodes Pred, unsigned PredReg,
+                            const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
+void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                               unsigned DestReg, unsigned BaseReg,
+                               int NumBytes, const TargetInstrInfo &TII,
+                               const ARMBaseRegisterInfo& MRI,
+                               unsigned MIFlags = 0);
+
+/// Tries to add registers to the reglist of a given base-updating
+/// push/pop instruction to adjust the stack by an additional
+/// NumBytes.
This can save a few bytes per function in code-size, but +/// obviously generates more memory traffic. As such, it only takes +/// effect in functions being optimised for size. +bool tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, + MachineFunction &MF, MachineInstr *MI, + unsigned NumBytes); + +/// rewriteARMFrameIndex / rewriteT2FrameIndex - +/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the +/// offset could not be handled directly in MI, and return the left-over +/// portion by reference. +bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII); + +bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII); + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp new file mode 100644 index 0000000..419717c --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -0,0 +1,810 @@ +//===-- ARMBaseRegisterInfo.cpp - ARM Register Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the base ARM implementation of TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARMBaseRegisterInfo.h" +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMFrameLowering.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMSubtarget.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" + +#define DEBUG_TYPE "arm-register-info" + +#define GET_REGINFO_TARGET_DESC +#include "ARMGenRegisterInfo.inc" + +using namespace llvm; + +ARMBaseRegisterInfo::ARMBaseRegisterInfo() + : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), BasePtr(ARM::R6) {} + +static unsigned getFramePointerReg(const ARMSubtarget &STI) { + if (STI.isTargetMachO()) { + if (STI.isTargetDarwin() || STI.isThumb1Only()) + return ARM::R7; + else + return ARM::R11; + } else if (STI.isTargetWindows()) + return ARM::R11; + else // ARM EABI + return STI.isThumb() ? ARM::R7 : ARM::R11; +} + +const MCPhysReg* +ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + const ARMSubtarget &STI = MF->getSubtarget<ARMSubtarget>(); + const MCPhysReg *RegList = + STI.isTargetDarwin() ? 
CSR_iOS_SaveList : CSR_AAPCS_SaveList;
+
+  const Function *F = MF->getFunction();
+  if (F->getCallingConv() == CallingConv::GHC) {
+    // The GHC set of callee saved regs is empty as all those regs are
+    // used for passing STG regs around.
+    return CSR_NoRegs_SaveList;
+  } else if (F->hasFnAttribute("interrupt")) {
+    if (STI.isMClass()) {
+      // M-class CPUs have hardware which saves the registers needed to allow a
+      // function conforming to the AAPCS to function as a handler.
+      return CSR_AAPCS_SaveList;
+    } else if (F->getFnAttribute("interrupt").getValueAsString() == "FIQ") {
+      // Fast interrupt mode gives the handler a private copy of R8-R14, so
+      // fewer registers need to be saved to restore user-mode state.
+      return CSR_FIQ_SaveList;
+    } else {
+      // Generally only R13-R14 (i.e. SP, LR) are automatically preserved by
+      // exception handling.
+      return CSR_GenericInt_SaveList;
+    }
+  }
+
+  return RegList;
+}
+
+const uint32_t *
+ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                          CallingConv::ID CC) const {
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  if (CC == CallingConv::GHC)
+    // This is academic because all GHC calls are (supposed to be) tail calls.
+    return CSR_NoRegs_RegMask;
+  return STI.isTargetDarwin() ? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
+}
+
+const uint32_t*
+ARMBaseRegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
+const uint32_t *
+ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
+                                                CallingConv::ID CC) const {
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  // This should return a register mask that is the same as that returned by
+  // getCallPreservedMask but that additionally preserves the register used for
+  // the first i32 argument (which must also be the register used to return a
+  // single i32 return value).
+  //
+  // If the calling convention does not use the same register for both, or
+  // otherwise does not want to enable this optimization, the function should
+  // return NULL.
+  if (CC == CallingConv::GHC)
+    // This is academic because all GHC calls are (supposed to be) tail calls.
+    return nullptr;
+  return STI.isTargetDarwin() ? CSR_iOS_ThisReturn_RegMask
+                              : CSR_AAPCS_ThisReturn_RegMask;
+}
+
+BitVector ARMBaseRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const {
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  const ARMFrameLowering *TFI = getFrameLowering(MF);
+
+  // FIXME: avoid re-calculating this every time.
+  BitVector Reserved(getNumRegs());
+  Reserved.set(ARM::SP);
+  Reserved.set(ARM::PC);
+  Reserved.set(ARM::FPSCR);
+  Reserved.set(ARM::APSR_NZCV);
+  if (TFI->hasFP(MF))
+    Reserved.set(getFramePointerReg(STI));
+  if (hasBasePointer(MF))
+    Reserved.set(BasePtr);
+  // Some targets reserve R9.
+  if (STI.isR9Reserved())
+    Reserved.set(ARM::R9);
+  // Reserve D16-D31 if the subtarget doesn't support them.
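+  // Only the full VFPv3 register file provides D16-D31; the D16 variants
+  // (and older FPUs) implement just D0-D15.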
+ if (!STI.hasVFP3() || STI.hasD16()) { + assert(ARM::D31 == ARM::D16 + 15); + for (unsigned i = 0; i != 16; ++i) + Reserved.set(ARM::D16 + i); + } + const TargetRegisterClass *RC = &ARM::GPRPairRegClass; + for(TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I!=E; ++I) + for (MCSubRegIterator SI(*I, this); SI.isValid(); ++SI) + if (Reserved.test(*SI)) Reserved.set(*I); + + return Reserved; +} + +const TargetRegisterClass * +ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &) const { + const TargetRegisterClass *Super = RC; + TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); + do { + switch (Super->getID()) { + case ARM::GPRRegClassID: + case ARM::SPRRegClassID: + case ARM::DPRRegClassID: + case ARM::QPRRegClassID: + case ARM::QQPRRegClassID: + case ARM::QQQQPRRegClassID: + case ARM::GPRPairRegClassID: + return Super; + } + Super = *I++; + } while (Super); + return RC; +} + +const TargetRegisterClass * +ARMBaseRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { + return &ARM::GPRRegClass; +} + +const TargetRegisterClass * +ARMBaseRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { + if (RC == &ARM::CCRRegClass) + return &ARM::rGPRRegClass; // Can't copy CCR registers. + return RC; +} + +unsigned +ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); + const ARMFrameLowering *TFI = getFrameLowering(MF); + + switch (RC->getID()) { + default: + return 0; + case ARM::tGPRRegClassID: + return TFI->hasFP(MF) ? 4 : 5; + case ARM::GPRRegClassID: { + unsigned FP = TFI->hasFP(MF) ? 1 : 0; + return 10 - FP - (STI.isR9Reserved() ? 1 : 0); + } + case ARM::SPRRegClassID: // Currently not used as 'rep' register class. + case ARM::DPRRegClassID: + return 32 - 10; + } +} + +// Get the other register in a GPRPair. +static unsigned getPairedGPR(unsigned Reg, bool Odd, const MCRegisterInfo *RI) { + for (MCSuperRegIterator Supers(Reg, RI); Supers.isValid(); ++Supers) + if (ARM::GPRPairRegClass.contains(*Supers)) + return RI->getSubReg(*Supers, Odd ? ARM::gsub_1 : ARM::gsub_0); + return 0; +} + +// Resolve the RegPairEven / RegPairOdd register allocator hints. +void +ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, + ArrayRef<MCPhysReg> Order, + SmallVectorImpl<MCPhysReg> &Hints, + const MachineFunction &MF, + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg); + + unsigned Odd; + switch (Hint.first) { + case ARMRI::RegPairEven: + Odd = 0; + break; + case ARMRI::RegPairOdd: + Odd = 1; + break; + default: + TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM); + return; + } + + // This register should preferably be even (Odd == 0) or odd (Odd == 1). + // Check if the other part of the pair has already been assigned, and provide + // the paired register as the first hint. + unsigned Paired = Hint.second; + if (Paired == 0) + return; + + unsigned PairedPhys = 0; + if (TargetRegisterInfo::isPhysicalRegister(Paired)) { + PairedPhys = Paired; + } else if (VRM && VRM->hasPhys(Paired)) { + PairedPhys = getPairedGPR(VRM->getPhys(Paired), Odd, this); + } + + // First prefer the paired physreg. 
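+  // A hinted register is only usable if it still appears in the current
+  // allocation order.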
+ if (PairedPhys && + std::find(Order.begin(), Order.end(), PairedPhys) != Order.end()) + Hints.push_back(PairedPhys); + + // Then prefer even or odd registers. + for (unsigned I = 0, E = Order.size(); I != E; ++I) { + unsigned Reg = Order[I]; + if (Reg == PairedPhys || (getEncodingValue(Reg) & 1) != Odd) + continue; + // Don't provide hints that are paired to a reserved register. + unsigned Paired = getPairedGPR(Reg, !Odd, this); + if (!Paired || MRI.isReserved(Paired)) + continue; + Hints.push_back(Reg); + } +} + +void +ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg, + MachineFunction &MF) const { + MachineRegisterInfo *MRI = &MF.getRegInfo(); + std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg); + if ((Hint.first == (unsigned)ARMRI::RegPairOdd || + Hint.first == (unsigned)ARMRI::RegPairEven) && + TargetRegisterInfo::isVirtualRegister(Hint.second)) { + // If 'Reg' is one of the even / odd register pair and it's now changed + // (e.g. coalesced) into a different register. The other register of the + // pair allocation hint must be updated to reflect the relationship + // change. + unsigned OtherReg = Hint.second; + Hint = MRI->getRegAllocationHint(OtherReg); + // Make sure the pair has not already divorced. + if (Hint.second == Reg) { + MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg); + if (TargetRegisterInfo::isVirtualRegister(NewReg)) + MRI->setRegAllocationHint(NewReg, + Hint.first == (unsigned)ARMRI::RegPairOdd ? ARMRI::RegPairEven + : ARMRI::RegPairOdd, OtherReg); + } + } +} + +bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const ARMFrameLowering *TFI = getFrameLowering(MF); + + // When outgoing call frames are so large that we adjust the stack pointer + // around the call, we can no longer use the stack pointer to reach the + // emergency spill slot. + if (needsStackRealignment(MF) && !TFI->hasReservedCallFrame(MF)) + return true; + + // Thumb has trouble with negative offsets from the FP. Thumb2 has a limited + // negative range for ldr/str (255), and thumb1 is positive offsets only. + // It's going to be better to use the SP or Base Pointer instead. When there + // are variable sized objects, we can't reference off of the SP, so we + // reserve a Base Pointer. + if (AFI->isThumbFunction() && MFI->hasVarSizedObjects()) { + // Conservatively estimate whether the negative offset from the frame + // pointer will be sufficient to reach. If a function has a smallish + // frame, it's less likely to have lots of spills and callee saved + // space, so it's all more likely to be within range of the frame pointer. + // If it's wrong, the scavenger will still enable access to work, it just + // won't be optimal. + if (AFI->isThumb2Function() && MFI->getLocalFrameSize() < 128) + return false; + return true; + } + + return false; +} + +bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const ARMFrameLowering *TFI = getFrameLowering(MF); + // We can't realign the stack if: + // 1. Dynamic stack realignment is explicitly disabled, + // 2. This is a Thumb1 function (it's not useful, so we don't bother), or + // 3. There are VLAs in the function and the base pointer is disabled. 
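+  // Each of these conditions is checked, in turn, below.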
+ if (!TargetRegisterInfo::canRealignStack(MF)) + return false; + if (AFI->isThumb1OnlyFunction()) + return false; + // Stack realignment requires a frame pointer. If we already started + // register allocation with frame pointer elimination, it is too late now. + if (!MRI->canReserveReg(getFramePointerReg(MF.getSubtarget<ARMSubtarget>()))) + return false; + // We may also need a base pointer if there are dynamic allocas or stack + // pointer adjustments around calls. + if (TFI->hasReservedCallFrame(MF)) + return true; + // A base pointer is required and allowed. Check that it isn't too late to + // reserve it. + return MRI->canReserveReg(BasePtr); +} + +bool ARMBaseRegisterInfo:: +cannotEliminateFrame(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack()) + return true; + return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() + || needsStackRealignment(MF); +} + +unsigned +ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); + const ARMFrameLowering *TFI = getFrameLowering(MF); + + if (TFI->hasFP(MF)) + return getFramePointerReg(STI); + return ARM::SP; +} + +/// emitLoadConstPool - Emits a load from constpool to materialize the +/// specified immediate. +void ARMBaseRegisterInfo:: +emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred, + unsigned PredReg, unsigned MIFlags) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + MachineConstantPool *ConstantPool = MF.getConstantPool(); + const Constant *C = + ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx) + .addImm(0).addImm(Pred).addReg(PredReg) + .setMIFlags(MIFlags); +} + +bool ARMBaseRegisterInfo:: +requiresRegisterScavenging(const MachineFunction &MF) const { + return true; +} + +bool ARMBaseRegisterInfo:: +trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + return true; +} + +bool ARMBaseRegisterInfo:: +requiresFrameIndexScavenging(const MachineFunction &MF) const { + return true; +} + +bool ARMBaseRegisterInfo:: +requiresVirtualBaseRegisters(const MachineFunction &MF) const { + return true; +} + +int64_t ARMBaseRegisterInfo:: +getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const { + const MCInstrDesc &Desc = MI->getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + int64_t InstrOffs = 0; + int Scale = 1; + unsigned ImmIdx = 0; + switch (AddrMode) { + case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i12: + case ARMII::AddrMode_i12: + InstrOffs = MI->getOperand(Idx+1).getImm(); + Scale = 1; + break; + case ARMII::AddrMode5: { + // VFP address mode. 
+    const MachineOperand &OffOp = MI->getOperand(Idx+1);
+    InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm());
+    if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub)
+      InstrOffs = -InstrOffs;
+    Scale = 4;
+    break;
+  }
+  case ARMII::AddrMode2: {
+    ImmIdx = Idx+2;
+    InstrOffs = ARM_AM::getAM2Offset(MI->getOperand(ImmIdx).getImm());
+    if (ARM_AM::getAM2Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+      InstrOffs = -InstrOffs;
+    break;
+  }
+  case ARMII::AddrMode3: {
+    ImmIdx = Idx+2;
+    InstrOffs = ARM_AM::getAM3Offset(MI->getOperand(ImmIdx).getImm());
+    if (ARM_AM::getAM3Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+      InstrOffs = -InstrOffs;
+    break;
+  }
+  case ARMII::AddrModeT1_s: {
+    ImmIdx = Idx+1;
+    InstrOffs = MI->getOperand(ImmIdx).getImm();
+    Scale = 4;
+    break;
+  }
+  default:
+    llvm_unreachable("Unsupported addressing mode!");
+  }
+
+  return InstrOffs * Scale;
+}
+
+/// needsFrameBaseReg - Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool ARMBaseRegisterInfo::
+needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+  for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) {
+    assert(i < MI->getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  // It's the load/store FI references that cause issues, as it can be
+  // difficult to materialize the offset if it won't fit in the literal field.
+  // Estimate based on the size of the local frame and some conservative
+  // assumptions about the rest of the stack frame (note, this is pre-regalloc,
+  // so we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores, so
+  // return false for everything else.
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  case ARM::LDRi12:   case ARM::LDRH:   case ARM::LDRBi12:
+  case ARM::STRi12:   case ARM::STRH:   case ARM::STRBi12:
+  case ARM::t2LDRi12: case ARM::t2LDRi8:
+  case ARM::t2STRi12: case ARM::t2STRi8:
+  case ARM::VLDRS:    case ARM::VLDRD:
+  case ARM::VSTRS:    case ARM::VSTRD:
+  case ARM::tSTRspi:  case ARM::tLDRspi:
+    break;
+  default:
+    return false;
+  }
+
+  // Without a virtual base register, if the function has variable sized
+  // objects, all fixed-size local references will be via the frame pointer.
+  // Approximate the offset and see if it's legal for the instruction.
+  // Note that the incoming offset is based on the SP value at function entry,
+  // so it'll be negative.
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const ARMFrameLowering *TFI = getFrameLowering(MF);
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Estimate an offset from the frame pointer.
+  // Conservatively assume all callee-saved registers get pushed. R4-R6
+  // will be earlier than the FP, so we ignore those.
+  // R7, LR
+  int64_t FPOffset = Offset - 8;
+  // ARM and Thumb2 functions also need to consider R8-R11 and D8-D15
+  if (!AFI->isThumbFunction() || !AFI->isThumb1OnlyFunction())
+    FPOffset -= 80;
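// For illustration only (not part of this patch): a standalone sketch of the
// scale-and-negate decode done for AddrMode5 above. The packed layout assumed
// here (word count in bits 7:0, a subtract flag in bit 8) is a stand-in for
// the demo; the authoritative helpers are ARM_AM::getAM5Offset/getAM5Op in
// MCTargetDesc/ARMAddressingModes.h.

#include <cstdio>

static int decodeAM5Bytes(unsigned PackedImm) {
  int Offs = PackedImm & 0xff;  // 8-bit offset, counted in words
  if ((PackedImm >> 8) & 1)     // assumed encoding of the "sub" operation
    Offs = -Offs;
  return Offs * 4;              // VFP offsets are scaled by 4 bytes
}

int main() {
  std::printf("%d\n", decodeAM5Bytes(0x00a));  //  10 words ->  40 bytes
  std::printf("%d\n", decodeAM5Bytes(0x10a));  // -10 words -> -40 bytes
  return 0;
}

+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relative to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.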
+  Offset += MFI->getLocalFrameSize();
+  // Assume that we'll have at least some spill slots allocated.
+  // FIXME: This is a total SWAG number. We should run some statistics
+  //        and pick a real one.
+  Offset += 128; // 128 bytes of spill slots
+
+  // If there's a frame pointer and the addressing mode allows it, try using it.
+  // The FP is only available if there is no dynamic realignment. We
+  // don't know for sure yet whether we'll need that, so we guess based
+  // on whether there are any local variables that would trigger it.
+  unsigned StackAlign = TFI->getStackAlignment();
+  if (TFI->hasFP(MF) &&
+      !((MFI->getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) {
+    if (isFrameOffsetLegal(MI, getFrameRegister(MF), FPOffset))
+      return false;
+  }
+  // If we can reference via the stack pointer, try that.
+  // FIXME: This (and the code that resolves the references) can be improved
+  //        to only disallow SP relative references in the live range of
+  //        the VLA(s). In practice, it's unclear how much difference that
+  //        would make, but it may be worth doing.
+  if (!MFI->hasVarSizedObjects() && isFrameOffsetLegal(MI, ARM::SP, Offset))
+    return false;
+
+  // The offset likely isn't legal; we want to allocate a virtual base register.
+  return true;
+}
+
+/// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to
+/// be a pointer to FrameIdx at the beginning of the basic block.
+void ARMBaseRegisterInfo::
+materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                             unsigned BaseReg, int FrameIdx,
+                             int64_t Offset) const {
+  ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
+  unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri :
+    (AFI->isThumb1OnlyFunction() ? ARM::tADDframe : ARM::t2ADDri);
+
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL;  // Defaults to "unknown"
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+
+  const MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const MCInstrDesc &MCID = TII.get(ADDriOpc);
+  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
+
+  MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+    .addFrameIndex(FrameIdx).addImm(Offset);
+
+  if (!AFI->isThumb1OnlyFunction())
+    AddDefaultCC(AddDefaultPred(MIB));
+}
+
+void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                            int64_t Offset) const {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  int Off = Offset; // ARM doesn't need the general 64-bit offsets
+  unsigned i = 0;
+
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This resolveFrameIndex does not support Thumb1!");
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  bool Done = false;
+  if (!AFI->isThumbFunction())
+    Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII);
+  else {
+    assert(AFI->isThumb2Function());
+    Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII);
+  }
+  assert(Done && "Unable to resolve frame index!");
+  (void)Done;
+}
+
+bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                             unsigned BaseReg,
+                                             int64_t Offset) const {
+  const MCInstrDesc &Desc = MI->getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  unsigned i = 0;
+
+  while (!MI->getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI->getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  // AddrMode4 and AddrMode6 cannot handle any offset.
+  if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6)
+    return Offset == 0;
+
+  unsigned NumBits = 0;
+  unsigned Scale = 1;
+  bool isSigned = true;
+  switch (AddrMode) {
+  case ARMII::AddrModeT2_i8:
+  case ARMII::AddrModeT2_i12:
+    // i8 supports only negative offsets and i12 supports only positive ones,
+    // so pick the appropriate form based on the sign of Offset.
+    Scale = 1;
+    if (Offset < 0) {
+      NumBits = 8;
+      Offset = -Offset;
+    } else {
+      NumBits = 12;
+    }
+    break;
+  case ARMII::AddrMode5:
+    // VFP address mode.
+    NumBits = 8;
+    Scale = 4;
+    break;
+  case ARMII::AddrMode_i12:
+  case ARMII::AddrMode2:
+    NumBits = 12;
+    break;
+  case ARMII::AddrMode3:
+    NumBits = 8;
+    break;
+  case ARMII::AddrModeT1_s:
+    NumBits = (BaseReg == ARM::SP ? 8 : 5);
+    Scale = 4;
+    isSigned = false;
+    break;
+  default:
+    llvm_unreachable("Unsupported addressing mode!");
+  }
+
+  Offset += getFrameIndexInstrOffset(MI, i);
+  // Make sure the offset is encodable for instructions that scale the
+  // immediate.
+  if ((Offset & (Scale-1)) != 0)
+    return false;
+
+  if (isSigned && Offset < 0)
+    Offset = -Offset;
+
+  unsigned Mask = (1 << NumBits) - 1;
+  if ((unsigned)Offset <= Mask * Scale)
+    return true;
+
+  return false;
+}
+
+void
+ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                         int SPAdj, unsigned FIOperandNum,
+                                         RegScavenger *RS) const {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const ARMFrameLowering *TFI = getFrameLowering(MF);
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This eliminateFrameIndex does not support Thumb1!");
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  unsigned FrameReg;
+
+  int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
+
+  // PEI::scavengeFrameVirtualRegs() cannot accurately track SPAdj because the
+  // call frame setup/destroy instructions have already been eliminated. That
+  // means the stack pointer cannot be used to access the emergency spill slot
+  // when !hasReservedCallFrame().
+#ifndef NDEBUG
+  if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)) {
+    assert(TFI->hasReservedCallFrame(MF) &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions without a reserved call frame");
+    assert(!MF.getFrameInfo()->hasVarSizedObjects() &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions with variable sized frame objects");
+  }
+#endif // NDEBUG
+
+  assert(!MI.isDebugValue() &&
+         "DBG_VALUEs should be handled in target-independent code");
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible
+  bool Done = false;
+  if (!AFI->isThumbFunction())
+    Done = rewriteARMFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+  else {
+    assert(AFI->isThumb2Function());
+    Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+  }
+  if (Done)
+    return;
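// For illustration only (not part of this patch): the legality test above
// reduces to "the offset is a multiple of the scale, respects the signedness
// of the form, and its magnitude fits in NumBits scaled units". A standalone
// sketch:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

static bool fitsImmediate(int64_t Offset, unsigned NumBits, unsigned Scale,
                          bool IsSigned) {
  if (Offset % Scale != 0)      // instructions encode Offset / Scale
    return false;
  if (!IsSigned && Offset < 0)  // unsigned forms reject negative offsets
    return false;
  uint64_t Mag = std::llabs(Offset);
  uint64_t Mask = (1ull << NumBits) - 1;
  return Mag <= Mask * Scale;
}

int main() {
  // Thumb2 ldr/str with a 12-bit unscaled immediate: 0..4095 is encodable.
  std::printf("%d\n", fitsImmediate(4095, 12, 1, true));   // 1
  // VFP load/store: 8-bit immediate scaled by 4, so at most +/-1020.
  std::printf("%d\n", fitsImmediate(-1024, 8, 4, true));   // 0
  return 0;
}

+
+  // If we get here, the immediate doesn't fit into the instruction. We folded
+  // as much as possible above; handle the rest by providing a register that is
+  // SP+LargeImm.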
+  assert((Offset ||
+          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 ||
+          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) &&
+         "This code isn't needed if offset already handled!");
+
+  unsigned ScratchReg = 0;
+  int PIdx = MI.findFirstPredOperandIdx();
+  ARMCC::CondCodes Pred = (PIdx == -1)
+    ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
+  unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
+  if (Offset == 0)
+    // Must be addrmode4/6.
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false);
+  else {
+    ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass);
+    if (!AFI->isThumbFunction())
+      emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+                              Offset, Pred, PredReg, TII);
+    else {
+      assert(AFI->isThumb2Function());
+      emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+                             Offset, Pred, PredReg, TII);
+    }
+    // Update the original instruction to use the scratch register.
+    MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
+  }
+}
+
+bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI,
+                                         const TargetRegisterClass *SrcRC,
+                                         unsigned SubReg,
+                                         const TargetRegisterClass *DstRC,
+                                         unsigned DstSubReg,
+                                         const TargetRegisterClass *NewRC) const {
+  auto MBB = MI->getParent();
+  auto MF = MBB->getParent();
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  // If not copying into a sub-register this should be ok because we shouldn't
+  // need to split the reg.
+  if (!DstSubReg)
+    return true;
+  // Small registers don't frequently cause a problem, so we can coalesce them.
+  if (NewRC->getSize() < 32 && DstRC->getSize() < 32 && SrcRC->getSize() < 32)
+    return true;
+
+  auto NewRCWeight =
+      MRI.getTargetRegisterInfo()->getRegClassWeight(NewRC);
+  auto SrcRCWeight =
+      MRI.getTargetRegisterInfo()->getRegClassWeight(SrcRC);
+  auto DstRCWeight =
+      MRI.getTargetRegisterInfo()->getRegClassWeight(DstRC);
+  // If the source register class is more expensive than the destination, the
+  // coalescing is probably profitable.
+  if (SrcRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+  if (DstRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+
+  // If the register allocator isn't constrained, we can always allow
+  // coalescing; unfortunately, we don't yet know whether we will be
+  // constrained. The goal of this heuristic is to restrict how many expensive
+  // registers we allow to coalesce in a given basic block.
+  auto AFI = MF->getInfo<ARMFunctionInfo>();
+  auto It = AFI->getCoalescedWeight(MBB);
+
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: "
+               << It->second << "\n");
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: "
+               << NewRCWeight.RegWeight << "\n");
+
+  // This number is the largest round number that meets the criteria:
+  //   (1) addresses PR18825
+  //   (2) generates better code in some test cases (like vldm-shed-a9.ll)
+  //   (3) doesn't regress any test cases (in-tree, test-suite, and SPEC)
+  // In practice the SizeMultiplier will only factor in for straight line code
+  // that uses a lot of NEON vectors, which isn't terribly common.
+  unsigned SizeMultiplier = MBB->size()/100;
+  SizeMultiplier = SizeMultiplier ?
SizeMultiplier : 1; + if (It->second < NewRCWeight.WeightLimit * SizeMultiplier) { + It->second += NewRCWeight.RegWeight; + return true; + } + return false; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h new file mode 100644 index 0000000..cea8b80 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -0,0 +1,191 @@ +//===-- ARMBaseRegisterInfo.h - ARM Register Information Impl ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the base ARM implementation of TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMBASEREGISTERINFO_H +#define LLVM_LIB_TARGET_ARM_ARMBASEREGISTERINFO_H + +#include "MCTargetDesc/ARMBaseInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#include "ARMGenRegisterInfo.inc" + +namespace llvm { +/// Register allocation hints. +namespace ARMRI { + enum { + RegPairOdd = 1, + RegPairEven = 2 + }; +} + +/// isARMArea1Register - Returns true if the register is a low register (r0-r7) +/// or a stack/pc register that we should push/pop. +static inline bool isARMArea1Register(unsigned Reg, bool isIOS) { + using namespace ARM; + switch (Reg) { + case R0: case R1: case R2: case R3: + case R4: case R5: case R6: case R7: + case LR: case SP: case PC: + return true; + case R8: case R9: case R10: case R11: case R12: + // For iOS we want r7 and lr to be next to each other. + return !isIOS; + default: + return false; + } +} + +static inline bool isARMArea2Register(unsigned Reg, bool isIOS) { + using namespace ARM; + switch (Reg) { + case R8: case R9: case R10: case R11: case R12: + // iOS has this second area. + return isIOS; + default: + return false; + } +} + +static inline bool isARMArea3Register(unsigned Reg, bool isIOS) { + using namespace ARM; + switch (Reg) { + case D15: case D14: case D13: case D12: + case D11: case D10: case D9: case D8: + return true; + default: + return false; + } +} + +static inline bool isCalleeSavedRegister(unsigned Reg, + const MCPhysReg *CSRegs) { + for (unsigned i = 0; CSRegs[i]; ++i) + if (Reg == CSRegs[i]) + return true; + return false; +} + +class ARMBaseRegisterInfo : public ARMGenRegisterInfo { +protected: + /// BasePtr - ARM physical register used as a base ptr in complex stack + /// frames. I.e., when we need a 3rd base, not just SP and FP, due to + /// variable size stack objects. + unsigned BasePtr; + + // Can be only subclassed. + explicit ARMBaseRegisterInfo(); + + // Return the opcode that implements 'Op', or 0 if no opcode + unsigned getOpcode(int Op) const; + +public: + /// Code Generation virtual methods... + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; + const uint32_t *getNoPreservedMask() const override; + + /// getThisReturnPreservedMask - Returns a call preserved mask specific to the + /// case that 'returned' is on an i32 first argument if the calling convention + /// is one that can (partially) model this attribute with a preserved mask + /// (i.e. 
it is a calling convention that uses the same register for the first + /// i32 argument and an i32 return value) + /// + /// Should return NULL in the case that the calling convention does not have + /// this property + const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF, + CallingConv::ID) const; + + BitVector getReservedRegs(const MachineFunction &MF) const override; + + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; + const TargetRegisterClass * + getCrossCopyRegClass(const TargetRegisterClass *RC) const override; + + const TargetRegisterClass * + getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const override; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; + + void getRegAllocationHints(unsigned VirtReg, + ArrayRef<MCPhysReg> Order, + SmallVectorImpl<MCPhysReg> &Hints, + const MachineFunction &MF, + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const override; + + void updateRegAllocHint(unsigned Reg, unsigned NewReg, + MachineFunction &MF) const override; + + bool hasBasePointer(const MachineFunction &MF) const; + + bool canRealignStack(const MachineFunction &MF) const override; + int64_t getFrameIndexInstrOffset(const MachineInstr *MI, + int Idx) const override; + bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; + void materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, int FrameIdx, + int64_t Offset) const override; + void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const override; + bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, + int64_t Offset) const override; + + bool cannotEliminateFrame(const MachineFunction &MF) const; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const override; + unsigned getBaseRegister() const { return BasePtr; } + + bool isLowRegister(unsigned Reg) const; + + + /// emitLoadConstPool - Emits a load from constpool to materialize the + /// specified immediate. + virtual void emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, unsigned DestReg, unsigned SubIdx, + int Val, ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0, + unsigned MIFlags = MachineInstr::NoFlags)const; + + /// Code Generation virtual methods... 
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override; + + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; + + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; + + bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override; + + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; + + /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true + bool shouldCoalesce(MachineInstr *MI, + const TargetRegisterClass *SrcRC, + unsigned SubReg, + const TargetRegisterClass *DstRC, + unsigned DstSubReg, + const TargetRegisterClass *NewRC) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h new file mode 100644 index 0000000..a731d00 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h @@ -0,0 +1,288 @@ +//=== ARMCallingConv.h - ARM Custom Calling Convention Routines -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the ARM Calling Convention that +// aren't done by tablegen. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMCALLINGCONV_H +#define LLVM_LIB_TARGET_ARM_ARMCALLINGCONV_H + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Target/TargetInstrInfo.h" + +namespace llvm { + +// APCS f64 is in register pairs, possibly split to stack +static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + CCState &State, bool CanFail) { + static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + + // Try to get the first register. + if (unsigned Reg = State.AllocateReg(RegList)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else { + // For the 2nd half of a v2f64, do not fail. + if (CanFail) + return false; + + // Put the whole thing on the stack. + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(8, 4), + LocVT, LocInfo)); + return true; + } + + // Try to get the second register. 
+  if (unsigned Reg = State.AllocateReg(RegList))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(4, 4),
+                                           LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags,
+                                   CCState &State) {
+  if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  if (LocVT == MVT::v2f64 &&
+      !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+    return false;
+  return true;  // we handled it
+}
+
+// AAPCS f64 is in aligned register pairs
+static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                           CCValAssign::LocInfo &LocInfo,
+                           CCState &State, bool CanFail) {
+  static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
+  static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
+  static const MCPhysReg ShadowRegList[] = { ARM::R0, ARM::R1 };
+  static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+  unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList);
+  if (Reg == 0) {
+
+    // If only R3 was left unallocated, we still must waste it now.
+    Reg = State.AllocateReg(GPRArgRegs);
+    assert((!Reg || Reg == ARM::R3) && "Wrong GPRs usage for f64");
+
+    // For the 2nd half of a v2f64, do not just fail.
+    if (CanFail)
+      return false;
+
+    // Put the whole thing on the stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 8),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  unsigned i;
+  for (i = 0; i < 2; ++i)
+    if (HiRegList[i] == Reg)
+      break;
+
+  unsigned T = State.AllocateReg(LoRegList[i]);
+  (void)T;
+  assert(T == LoRegList[i] && "Could not allocate register");
+
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                    CCValAssign::LocInfo &LocInfo,
+                                    ISD::ArgFlagsTy &ArgFlags,
+                                    CCState &State) {
+  if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  if (LocVT == MVT::v2f64 &&
+      !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+    return false;
+  return true;  // we handled it
+}
+
+static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                         CCValAssign::LocInfo &LocInfo, CCState &State) {
+  static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
+  static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
+
+  unsigned Reg = State.AllocateReg(HiRegList, LoRegList);
+  if (Reg == 0)
+    return false; // we didn't handle it
+
+  unsigned i;
+  for (i = 0; i < 2; ++i)
+    if (HiRegList[i] == Reg)
+      break;
+
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                      CCValAssign::LocInfo &LocInfo,
+                                      ISD::ArgFlagsTy &ArgFlags,
+                                      CCState &State) {
+  if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
+  if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
+  return true;  // we handled it
+}
+
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                       CCValAssign::LocInfo &LocInfo,
+                                       ISD::ArgFlagsTy &ArgFlags,
+                                       CCState &State) {
+  return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
+                                   State);
+}
+
+static const MCPhysReg RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+static const MCPhysReg SRegList[] = { ARM::S0,  ARM::S1,  ARM::S2,  ARM::S3,
+                                      ARM::S4,  ARM::S5,  ARM::S6,  ARM::S7,
+                                      ARM::S8,  ARM::S9,  ARM::S10, ARM::S11,
+                                      ARM::S12, ARM::S13, ARM::S14, ARM::S15 };
+static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+                                      ARM::D4, ARM::D5, ARM::D6, ARM::D7 };
+static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
+
+
+// Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA
+// has InConsecutiveRegs set, and that the last member also has
+// InConsecutiveRegsLast set. We must process all members of the HA before
+// we can allocate it, as we need to know the total number of registers that
+// will be needed in order to (attempt to) allocate a contiguous block.
+static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
+                                          MVT &LocVT,
+                                          CCValAssign::LocInfo &LocInfo,
+                                          ISD::ArgFlagsTy &ArgFlags,
+                                          CCState &State) {
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+  // AAPCS HFAs must have 1-4 elements, all of the same type
+  if (PendingMembers.size() > 0)
+    assert(PendingMembers[0].getLocVT() == LocVT);
+
+  // Add the argument to the list to be allocated once we know the size of the
+  // aggregate. Store the type's required alignment as extra info for later: in
+  // the [N x i64] case all trace has been removed by the time we actually get
+  // to do allocation.
+  PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo,
+                                                   ArgFlags.getOrigAlign()));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  // Try to allocate a contiguous block of registers, each of the correct
+  // size to hold one member.
+  auto &DL = State.getMachineFunction().getDataLayout();
+  unsigned StackAlign = DL.getStackAlignment();
+  unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign);
+
+  ArrayRef<MCPhysReg> RegList;
+  switch (LocVT.SimpleTy) {
+  case MVT::i32: {
+    RegList = RRegList;
+    unsigned RegIdx = State.getFirstUnallocated(RegList);
+
+    // First consume all registers that would give an unaligned object. Whether
+    // we go on stack or in regs, no-one will be using them in the future.
+    unsigned RegAlign = RoundUpToAlignment(Align, 4) / 4;
+    while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
+      State.AllocateReg(RegList[RegIdx++]);
+
+    break;
+  }
+  case MVT::f32:
+    RegList = SRegList;
+    break;
+  case MVT::f64:
+    RegList = DRegList;
+    break;
+  case MVT::v2f64:
+    RegList = QRegList;
+    break;
+  default:
+    llvm_unreachable("Unexpected member type for block aggregate");
+    break;
+  }
+
+  unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+  if (RegResult) {
+    for (SmallVectorImpl<CCValAssign>::iterator It = PendingMembers.begin();
+         It != PendingMembers.end(); ++It) {
+      It->convertToReg(RegResult);
+      State.addLoc(*It);
+      ++RegResult;
+    }
+    PendingMembers.clear();
+    return true;
+  }
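// For illustration only (not part of this patch): AllocateRegBlock above
// claims the first run of N consecutive free registers, or reports failure.
// A standalone sketch of that search; the real CCState returns the MCPhysReg
// of the first register in the block, modeled here as a 1-based index so
// that 0 can signal failure.

#include <cstdio>
#include <vector>

static unsigned allocateRegBlock(const std::vector<bool> &Allocated,
                                 unsigned N) {
  for (unsigned Begin = 0; Begin + N <= Allocated.size(); ++Begin) {
    bool Free = true;
    for (unsigned I = 0; I != N && Free; ++I)
      Free = !Allocated[Begin + I];
    if (Free)
      return Begin + 1;  // 1-based so that 0 can mean "no block available"
  }
  return 0;
}

int main() {
  // With s0 and s3 taken, a 3-element f32 HFA lands at s4..s6.
  std::vector<bool> SRegs(16, false);
  SRegs[0] = SRegs[3] = true;
  std::printf("block starts at s%u\n", allocateRegBlock(SRegs, 3) - 1);  // s4
  return 0;
}

+
+  // Register allocation failed; we'll need the stack.
+  unsigned Size = LocVT.getSizeInBits() / 8;
+  if (LocVT == MVT::i32 && State.getNextStackOffset() == 0) {
+    // If nothing else has used the stack until this point, a non-HFA aggregate
+    // can be split between regs and stack.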
+    unsigned RegIdx = State.getFirstUnallocated(RegList);
+    for (auto &It : PendingMembers) {
+      if (RegIdx >= RegList.size())
+        It.convertToMem(State.AllocateStack(Size, Size));
+      else
+        It.convertToReg(State.AllocateReg(RegList[RegIdx++]));
+
+      State.addLoc(It);
+    }
+    PendingMembers.clear();
+    return true;
+  } else if (LocVT != MVT::i32)
+    RegList = SRegList;
+
+  // Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core)
+  for (auto Reg : RegList)
+    State.AllocateReg(Reg);
+
+  for (auto &It : PendingMembers) {
+    It.convertToMem(State.AllocateStack(Size, Align));
+    State.addLoc(It);
+
+    // After the first item has been allocated, the rest are packed as tightly
+    // as possible. (E.g. an incoming i64 would have starting Align of 8, but
+    // we'll be allocating a bunch of i32 slots).
+    Align = Size;
+  }
+
+  // All pending members have now been allocated
+  PendingMembers.clear();
+
+  // This will be allocated by the last member of the aggregate
+  return true;
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
new file mode 100644
index 0000000..2335164
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -0,0 +1,246 @@
+//===-- ARMCallingConv.td - Calling Conventions for ARM ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for the ARM architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfAlign - Match on the original alignment of the arg
+class CCIfAlign<string Align, CCAction A>:
+  CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_ARM_APCS : CallingConv<[
+
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
+  CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
+
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+  CCIfType<[v2f64], CCAssignToStack<16, 4>>
+]>;
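// For illustration only (not part of this patch): the CallingConv combinators
// above are tried top to bottom, and the first matching action wins. A toy
// convention in the same style (a hypothetical def, shown commented out only
// to make the matching order concrete) might read:
//
//   def CC_Toy : CallingConv<[
//     CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,   // widen small ints
//     CCIfType<[i32], CCAssignToReg<[R0, R1]>>,        // first two in regs
//     CCIfType<[i32], CCAssignToStack<4, 4>>           // rest: 4-byte slots
//   ]>;

+def RetCC_ARM_APCS : CallingConv<[
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+
+  // Handle all vector types as either f64 or v2f64.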
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>, + + CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>> +]>; + +//===----------------------------------------------------------------------===// +// ARM APCS Calling Convention for FastCC (when VFP2 or later is available) +//===----------------------------------------------------------------------===// +def FastCC_ARM_APCS : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + + // CPRCs may be allocated to co-processor registers or the stack - they + // may never be allocated to core registers. + CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToStackWithShadow<8, 4, [Q0, Q1, Q2, Q3]>>, + CCIfType<[v2f64], CCAssignToStackWithShadow<16, 4, [Q0, Q1, Q2, Q3]>>, + + CCDelegateTo<CC_ARM_APCS> +]>; + +def RetFastCC_ARM_APCS : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<RetCC_ARM_APCS> +]>; + +//===----------------------------------------------------------------------===// +// ARM APCS Calling Convention for GHC +//===----------------------------------------------------------------------===// + +def CC_ARM_APCS_GHC : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>, + CCIfType<[f64], CCAssignToReg<[D8, D9, D10, D11]>>, + CCIfType<[f32], CCAssignToReg<[S16, S17, S18, S19, S20, S21, S22, S23]>>, + + // Promote i8/i16 arguments to i32. 
+ CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, SpLim + CCIfType<[i32], CCAssignToReg<[R4, R5, R6, R7, R8, R9, R10, R11]>> +]>; + +//===----------------------------------------------------------------------===// +// ARM AAPCS (EABI) Calling Convention, common parts +//===----------------------------------------------------------------------===// + +def CC_ARM_AAPCS_Common : CallingConv<[ + + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // i64/f64 is passed in even pairs of GPRs + // i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register + // (and the same is true for f64 if VFP is not enabled) + CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>, + CCIfType<[i32], CCIf<"ArgFlags.getOrigAlign() != 8", + CCAssignToReg<[R0, R1, R2, R3]>>>, + + CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, [R0, R1, R2, R3]>>>, + CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>, + CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>, + CCIfType<[v2f64], CCIfAlign<"16", + CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>, + CCIfType<[v2f64], CCAssignToStackWithShadow<16, 8, [Q0, Q1, Q2, Q3]>> +]>; + +def RetCC_ARM_AAPCS_Common : CallingConv<[ + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>> +]>; + +//===----------------------------------------------------------------------===// +// ARM AAPCS (EABI) Calling Convention +//===----------------------------------------------------------------------===// + +def CC_ARM_AAPCS : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<4, 4>>, + + // The 'nest' parameter, if any, is passed in R12. + CCIfNest<CCAssignToReg<[R12]>>, + + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>, + CCIfType<[f32], CCBitConvertToType<i32>>, + CCDelegateTo<CC_ARM_AAPCS_Common> +]>; + +def RetCC_ARM_AAPCS : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>, + CCIfType<[f32], CCBitConvertToType<i32>>, + CCDelegateTo<RetCC_ARM_AAPCS_Common> +]>; + +//===----------------------------------------------------------------------===// +// ARM AAPCS-VFP (EABI) Calling Convention +// Also used for FastCC (when VFP2 or later is available) +//===----------------------------------------------------------------------===// + +def CC_ARM_AAPCS_VFP : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<4, 4>>, + + // Handle all vector types as either f64 or v2f64. 
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + // HFAs are passed in a contiguous block of registers, or on the stack + CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<CC_ARM_AAPCS_Common> +]>; + +def RetCC_ARM_AAPCS_VFP : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<RetCC_ARM_AAPCS_Common> +]>; + +//===----------------------------------------------------------------------===// +// Callee-saved register lists. +//===----------------------------------------------------------------------===// + +def CSR_NoRegs : CalleeSavedRegs<(add)>; + +def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4, + (sequence "D%u", 15, 8))>; + +// Constructors and destructors return 'this' in the ARM C++ ABI; since 'this' +// and the pointer return value are both passed in R0 in these cases, this can +// be partially modelled by treating R0 as a callee-saved register +// Only the resulting RegMask is used; the SaveList is ignored +def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, + R5, R4, (sequence "D%u", 15, 8), + R0)>; + +// iOS ABI deviates from ARM standard ABI. R9 is not a callee-saved register. +// Also save R7-R4 first to match the stack frame fixed spill areas. +def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>; + +def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4, + (sub CSR_AAPCS_ThisReturn, R9))>; + +// The "interrupt" attribute is used to generate code that is acceptable in +// exception-handlers of various kinds. It makes us use a different return +// instruction (handled elsewhere) and affects which registers we must return to +// our "caller" in the same state as we receive them. + +// For most interrupts, all registers except SP and LR are shared with +// user-space. We mark LR to be saved anyway, since this is what the ARM backend +// generally does rather than tracking its liveness as a normal register. +def CSR_GenericInt : CalleeSavedRegs<(add LR, (sequence "R%u", 12, 0))>; + +// The fast interrupt handlers have more private state and get their own copies +// of R8-R12, in addition to SP and LR. As before, mark LR for saving too. + +// FIXME: we mark R11 as callee-saved since it's often the frame-pointer, and +// current frame lowering expects to encounter it while processing callee-saved +// registers. 
+def CSR_FIQ : CalleeSavedRegs<(add LR, R11, (sequence "R%u", 7, 0))>;
+
+
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
new file mode 100644
index 0000000..55c1684
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -0,0 +1,2281 @@
+//===-- ARMConstantIslandPass.cpp - ARM constant islands ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that splits the constant pool up into 'islands'
+// which are scattered throughout the function.  This is required due to the
+// limited pc-relative displacements that ARM has.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-cp-islands"
+
+STATISTIC(NumCPEs,       "Number of constpool entries");
+STATISTIC(NumSplit,      "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
+STATISTIC(NumUBrFixed,   "Number of uncond branches fixed");
+STATISTIC(NumTBs,        "Number of table branches generated");
+STATISTIC(NumT2CPShrunk, "Number of Thumb2 constantpool instructions shrunk");
+STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk");
+STATISTIC(NumCBZ,        "Number of CBZ / CBNZ formed");
+STATISTIC(NumJTMoved,    "Number of jump table destination blocks moved");
+STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted");
+
+
+static cl::opt<bool>
+AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
+          cl::desc("Adjust basic block layout to better use TB[BH]"));
+
+/// UnknownPadding - Return the worst case padding that could result from
+/// unknown offset bits.  This does not include alignment padding caused by
+/// known offset bits.
+///
+/// @param LogAlign log2(alignment)
+/// @param KnownBits Number of known low offset bits.
+static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
+  if (KnownBits < LogAlign)
+    return (1u << LogAlign) - (1u << KnownBits);
+  return 0;
+}
+
+namespace {
+  /// ARMConstantIslands - Due to limited PC-relative displacements, ARM
+  /// requires constant pool entries to be scattered among the instructions
+  /// inside a function.  To do this, it completely ignores the normal LLVM
+  /// constant pool; instead, it places constants wherever it feels like with
+  /// special instructions.
+  ///
+  /// The terminology used in this pass includes:
+  ///   Islands - Clumps of constants placed in the function.
+  ///   Water   - Potential places where an island could be formed.
+  ///   CPE     - A constant pool entry that has been placed somewhere, which
+  ///             tracks a list of users.
+  class ARMConstantIslands : public MachineFunctionPass {
+    /// BasicBlockInfo - Information about the offset and size of a single
+    /// basic block.
+    struct BasicBlockInfo {
+      /// Offset - Distance from the beginning of the function to the beginning
+      /// of this basic block.
+      ///
+      /// Offsets are computed assuming worst case padding before an aligned
+      /// block. This means that subtracting basic block offsets always gives a
+      /// conservative estimate of the real distance which may be smaller.
+      ///
+      /// Because worst case padding is used, the computed offset of an aligned
+      /// block may not actually be aligned.
+      unsigned Offset;
+
+      /// Size - Size of the basic block in bytes.  If the block contains
+      /// inline assembly, this is a worst case estimate.
+      ///
+      /// The size does not include any alignment padding whether from the
+      /// beginning of the block, or from an aligned jump table at the end.
+      unsigned Size;
+
+      /// KnownBits - The number of low bits in Offset that are known to be
+      /// exact.  The remaining bits of Offset are an upper bound.
+      uint8_t KnownBits;
+
+      /// Unalign - When non-zero, the block contains instructions (inline asm)
+      /// of unknown size.  The real size may be smaller than Size bytes by a
+      /// multiple of 1 << Unalign.
+      uint8_t Unalign;
+
+      /// PostAlign - When non-zero, the block terminator contains a .align
+      /// directive, so the end of the block is aligned to 1 << PostAlign
+      /// bytes.
+      uint8_t PostAlign;
+
+      BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0),
+        PostAlign(0) {}
+
+      /// Compute the number of known offset bits internally to this block.
+      /// This number should be used to predict worst case padding when
+      /// splitting the block.
+      unsigned internalKnownBits() const {
+        unsigned Bits = Unalign ? Unalign : KnownBits;
+        // If the block size isn't a multiple of the known bits, assume the
+        // worst case padding.
+        if (Size & ((1u << Bits) - 1))
+          Bits = countTrailingZeros(Size);
+        return Bits;
+      }
+
+      /// Compute the offset immediately following this block.  If LogAlign is
+      /// specified, return the offset the successor block will get if it has
+      /// this alignment.
+      unsigned postOffset(unsigned LogAlign = 0) const {
+        unsigned PO = Offset + Size;
+        unsigned LA = std::max(unsigned(PostAlign), LogAlign);
+        if (!LA)
+          return PO;
+        // Add alignment padding from the terminator.
+        return PO + UnknownPadding(LA, internalKnownBits());
+      }
+
+      /// Compute the number of known low bits of postOffset.  If this block
+      /// contains inline asm, the number of known bits drops to the
+      /// instruction alignment.  An aligned terminator may increase the number
+      /// of known bits.
+      /// If LogAlign is given, also consider the alignment of the next block.
+      unsigned postKnownBits(unsigned LogAlign = 0) const {
+        return std::max(std::max(unsigned(PostAlign), LogAlign),
+                        internalKnownBits());
+      }
+    };
+
+    std::vector<BasicBlockInfo> BBInfo;
+
+    /// WaterList - A sorted list of basic blocks where islands could be placed
+    /// (i.e. blocks that don't fall through to the following block, due
+    /// to a return, unreachable, or unconditional branch).
+    std::vector<MachineBasicBlock*> WaterList;
+
+    /// NewWaterList - The subset of WaterList that was created since the
+    /// previous iteration by inserting unconditional branches.
+ SmallSet<MachineBasicBlock*, 4> NewWaterList; + + typedef std::vector<MachineBasicBlock*>::iterator water_iterator; + + /// CPUser - One user of a constant pool, keeping the machine instruction + /// pointer, the constant pool being referenced, and the max displacement + /// allowed from the instruction to the CP. The HighWaterMark records the + /// highest basic block where a new CPEntry can be placed. To ensure this + /// pass terminates, the CP entries are initially placed at the end of the + /// function and then move monotonically to lower addresses. The + /// exception to this rule is when the current CP entry for a particular + /// CPUser is out of range, but there is another CP entry for the same + /// constant value in range. We want to use the existing in-range CP + /// entry, but if it later moves out of range, the search for new water + /// should resume where it left off. The HighWaterMark is used to record + /// that point. + struct CPUser { + MachineInstr *MI; + MachineInstr *CPEMI; + MachineBasicBlock *HighWaterMark; + unsigned MaxDisp; + bool NegOk; + bool IsSoImm; + bool KnownAlignment; + CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp, + bool neg, bool soimm) + : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm), + KnownAlignment(false) { + HighWaterMark = CPEMI->getParent(); + } + /// getMaxDisp - Returns the maximum displacement supported by MI. + /// Correct for unknown alignment. + /// Conservatively subtract 2 bytes to handle weird alignment effects. + unsigned getMaxDisp() const { + return (KnownAlignment ? MaxDisp : MaxDisp - 2) - 2; + } + }; + + /// CPUsers - Keep track of all of the machine instructions that use various + /// constant pools and their max displacement. + std::vector<CPUser> CPUsers; + + /// CPEntry - One per constant pool entry, keeping the machine instruction + /// pointer, the constpool index, and the number of CPUser's which + /// reference this entry. + struct CPEntry { + MachineInstr *CPEMI; + unsigned CPI; + unsigned RefCount; + CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0) + : CPEMI(cpemi), CPI(cpi), RefCount(rc) {} + }; + + /// CPEntries - Keep track of all of the constant pool entry machine + /// instructions. For each original constpool index (i.e. those that existed + /// upon entry to this pass), it keeps a vector of entries. Original + /// elements are cloned as we go along; the clones are put in the vector of + /// the original element, but have distinct CPIs. + /// + /// The first half of CPEntries contains generic constants, the second half + /// contains jump tables. Use getCombinedIndex on a generic CPEMI to look up + /// which vector it will be in here. + std::vector<std::vector<CPEntry> > CPEntries; + + /// Maps a JT index to the offset in CPEntries containing copies of that + /// table. The equivalent map for a CONSTPOOL_ENTRY is the identity. + DenseMap<int, int> JumpTableEntryIndices; + + /// Maps a JT index to the LEA that actually uses the index to calculate its + /// base address. + DenseMap<int, int> JumpTableUserIndices; + + /// ImmBranch - One per immediate branch, keeping the machine instruction + /// pointer, conditional or unconditional, the max displacement, + /// and (if isCond is true) the corresponding unconditional branch + /// opcode. 
+ struct ImmBranch { + MachineInstr *MI; + unsigned MaxDisp : 31; + bool isCond : 1; + unsigned UncondBr; + ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, unsigned ubr) + : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {} + }; + + /// ImmBranches - Keep track of all the immediate branch instructions. + /// + std::vector<ImmBranch> ImmBranches; + + /// PushPopMIs - Keep track of all the Thumb push / pop instructions. + /// + SmallVector<MachineInstr*, 4> PushPopMIs; + + /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions. + SmallVector<MachineInstr*, 4> T2JumpTables; + + /// HasFarJump - True if any far jump instruction has been emitted during + /// the branch fix up pass. + bool HasFarJump; + + MachineFunction *MF; + MachineConstantPool *MCP; + const ARMBaseInstrInfo *TII; + const ARMSubtarget *STI; + ARMFunctionInfo *AFI; + bool isThumb; + bool isThumb1; + bool isThumb2; + public: + static char ID; + ARMConstantIslands() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "ARM constant island placement and branch shortening pass"; + } + + private: + void doInitialConstPlacement(std::vector<MachineInstr *> &CPEMIs); + void doInitialJumpTablePlacement(std::vector<MachineInstr *> &CPEMIs); + bool BBHasFallthrough(MachineBasicBlock *MBB); + CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); + unsigned getCPELogAlign(const MachineInstr *CPEMI); + void scanFunctionJumpTables(); + void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs); + MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); + void updateForInsertedWaterBlock(MachineBasicBlock *NewBB); + void adjustBBOffsetsAfter(MachineBasicBlock *BB); + bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI); + unsigned getCombinedIndex(const MachineInstr *CPEMI); + int findInRangeCPEntry(CPUser& U, unsigned UserOffset); + bool findAvailableWater(CPUser&U, unsigned UserOffset, + water_iterator &WaterIter); + void createNewWater(unsigned CPUserIndex, unsigned UserOffset, + MachineBasicBlock *&NewMBB); + bool handleConstantPoolUser(unsigned CPUserIndex); + void removeDeadCPEMI(MachineInstr *CPEMI); + bool removeUnusedCPEntries(); + bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset, + MachineInstr *CPEMI, unsigned Disp, bool NegOk, + bool DoDump = false); + bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water, + CPUser &U, unsigned &Growth); + bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); + bool fixupImmediateBr(ImmBranch &Br); + bool fixupConditionalBr(ImmBranch &Br); + bool fixupUnconditionalBr(ImmBranch &Br); + bool undoLRSpillRestore(); + bool mayOptimizeThumb2Instruction(const MachineInstr *MI) const; + bool optimizeThumb2Instructions(); + bool optimizeThumb2Branches(); + bool reorderThumb2JumpTables(); + bool preserveBaseRegister(MachineInstr *JumpMI, MachineInstr *LEAMI, + unsigned &DeadSize, bool &CanDeleteLEA, + bool &BaseRegKill); + bool optimizeThumb2JumpTables(); + MachineBasicBlock *adjustJTTargetBlockForward(MachineBasicBlock *BB, + MachineBasicBlock *JTBB); + + void computeBlockSize(MachineBasicBlock *MBB); + unsigned getOffsetOf(MachineInstr *MI) const; + unsigned getUserOffset(CPUser&) const; + void dumpBBs(); + void verify(); + + bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset, + unsigned Disp, bool NegativeOK, bool IsSoImm = false); + bool isOffsetInRange(unsigned UserOffset, unsigned 
TrialOffset, + const CPUser &U) { + return isOffsetInRange(UserOffset, TrialOffset, + U.getMaxDisp(), U.NegOk, U.IsSoImm); + } + }; + char ARMConstantIslands::ID = 0; +} + +/// verify - check BBOffsets, BBSizes, alignment of islands +void ARMConstantIslands::verify() { +#ifndef NDEBUG + assert(std::is_sorted(MF->begin(), MF->end(), + [this](const MachineBasicBlock &LHS, + const MachineBasicBlock &RHS) { + return BBInfo[LHS.getNumber()].postOffset() < + BBInfo[RHS.getNumber()].postOffset(); + })); + DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n"); + for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { + CPUser &U = CPUsers[i]; + unsigned UserOffset = getUserOffset(U); + // Verify offset using the real max displacement without the safety + // adjustment. + if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, U.getMaxDisp()+2, U.NegOk, + /* DoDump = */ true)) { + DEBUG(dbgs() << "OK\n"); + continue; + } + DEBUG(dbgs() << "Out of range.\n"); + dumpBBs(); + DEBUG(MF->dump()); + llvm_unreachable("Constant pool entry out of range!"); + } +#endif +} + +/// print block size and offset information - debugging +void ARMConstantIslands::dumpBBs() { + DEBUG({ + for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) { + const BasicBlockInfo &BBI = BBInfo[J]; + dbgs() << format("%08x BB#%u\t", BBI.Offset, J) + << " kb=" << unsigned(BBI.KnownBits) + << " ua=" << unsigned(BBI.Unalign) + << " pa=" << unsigned(BBI.PostAlign) + << format(" size=%#x\n", BBInfo[J].Size); + } + }); +} + +/// createARMConstantIslandPass - returns an instance of the constpool +/// island pass. +FunctionPass *llvm::createARMConstantIslandPass() { + return new ARMConstantIslands(); +} + +bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + MCP = mf.getConstantPool(); + + DEBUG(dbgs() << "***** ARMConstantIslands: " + << MCP->getConstants().size() << " CP entries, aligned to " + << MCP->getConstantPoolAlignment() << " bytes *****\n"); + + STI = &static_cast<const ARMSubtarget &>(MF->getSubtarget()); + TII = STI->getInstrInfo(); + AFI = MF->getInfo<ARMFunctionInfo>(); + + isThumb = AFI->isThumbFunction(); + isThumb1 = AFI->isThumb1OnlyFunction(); + isThumb2 = AFI->isThumb2Function(); + + HasFarJump = false; + + // This pass invalidates liveness information when it splits basic blocks. + MF->getRegInfo().invalidateLiveness(); + + // Renumber all of the machine basic blocks in the function, guaranteeing that + // the numbers agree with the position of the block in the function. + MF->RenumberBlocks(); + + // Try to reorder and otherwise adjust the block layout to make good use + // of the TB[BH] instructions. + bool MadeChange = false; + if (isThumb2 && AdjustJumpTableBlocks) { + scanFunctionJumpTables(); + MadeChange |= reorderThumb2JumpTables(); + // Data is out of date, so clear it. It'll be re-computed later. + T2JumpTables.clear(); + // Blocks may have shifted around. Keep the numbering up to date. + MF->RenumberBlocks(); + } + + // Perform the initial placement of the constant pool entries. To start with, + // we put them all at the end of the function. + std::vector<MachineInstr*> CPEMIs; + if (!MCP->isEmpty()) + doInitialConstPlacement(CPEMIs); + + if (MF->getJumpTableInfo()) + doInitialJumpTablePlacement(CPEMIs); + + /// The next UID to take is the first unused one. + AFI->initPICLabelUId(CPEMIs.size()); + + // Do the initial scan of the function, building up information about the + // sizes of each block, the location of all the water, and finding all of the + // constant pool users. 
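+  // As an illustrative sketch (hypothetical layout, not real output): for a
+  // function laid out as
+  //   BB#0: ... ends in an unconditional branch   <- no fallthrough: "water"
+  //   BB#1: ... falls through
+  //   BB#2: ldr r0, [pc, #N]   ; references a constant pool entry
+  // the scan fills in BBInfo sizes and offsets for all three blocks, records
+  // BB#0 in WaterList, and creates a CPUser for the load in BB#2.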
+ initializeFunctionInfo(CPEMIs); + CPEMIs.clear(); + DEBUG(dumpBBs()); + + // Functions with jump tables need an alignment of 4 because they use the ADR + // instruction, which aligns the PC to 4 bytes before adding an offset. + if (!T2JumpTables.empty()) + MF->ensureAlignment(2); + + /// Remove dead constant pool entries. + MadeChange |= removeUnusedCPEntries(); + + // Iteratively place constant pool entries and fix up branches until there + // is no change. + unsigned NoCPIters = 0, NoBRIters = 0; + while (true) { + DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n'); + bool CPChange = false; + for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) + CPChange |= handleConstantPoolUser(i); + if (CPChange && ++NoCPIters > 30) + report_fatal_error("Constant Island pass failed to converge!"); + DEBUG(dumpBBs()); + + // Clear NewWaterList now. If we split a block for branches, it should + // appear as "new water" for the next iteration of constant pool placement. + NewWaterList.clear(); + + DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n'); + bool BRChange = false; + for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) + BRChange |= fixupImmediateBr(ImmBranches[i]); + if (BRChange && ++NoBRIters > 30) + report_fatal_error("Branch Fix Up pass failed to converge!"); + DEBUG(dumpBBs()); + + if (!CPChange && !BRChange) + break; + MadeChange = true; + } + + // Shrink 32-bit Thumb2 branch, load, and store instructions. + if (isThumb2 && !STI->prefers32BitThumb()) + MadeChange |= optimizeThumb2Instructions(); + + // After a while, this might be made debug-only, but it is not expensive. + verify(); + + // If LR has been forced spilled and no far jump (i.e. BL) has been issued, + // undo the spill / restore of LR if possible. + if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump()) + MadeChange |= undoLRSpillRestore(); + + // Save the mapping between original and cloned constpool entries. + for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { + for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) { + const CPEntry & CPE = CPEntries[i][j]; + if (CPE.CPEMI && CPE.CPEMI->getOperand(1).isCPI()) + AFI->recordCPEClone(i, CPE.CPI); + } + } + + DEBUG(dbgs() << '\n'; dumpBBs()); + + BBInfo.clear(); + WaterList.clear(); + CPUsers.clear(); + CPEntries.clear(); + JumpTableEntryIndices.clear(); + JumpTableUserIndices.clear(); + ImmBranches.clear(); + PushPopMIs.clear(); + T2JumpTables.clear(); + + return MadeChange; +} + +/// \brief Perform the initial placement of the regular constant pool entries. +/// To start with, we put them all at the end of the function. +void +ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) { + // Create the basic block to hold the CPE's. + MachineBasicBlock *BB = MF->CreateMachineBasicBlock(); + MF->push_back(BB); + + // MachineConstantPool measures alignment in bytes. We measure in log2(bytes). + unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment()); + + // Mark the basic block as required by the const-pool. + BB->setAlignment(MaxAlign); + + // The function needs to be as aligned as the basic blocks. The linker may + // move functions around based on their alignment. + MF->ensureAlignment(BB->getAlignment()); + + // Order the entries in BB by descending alignment. That ensures correct + // alignment of all entries as long as BB is sufficiently aligned. Keep + // track of the insertion point for each alignment. We are going to bucket + // sort the entries as they are created. 
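+  // Worked example (hypothetical): with MaxAlign = 2, InsPoint[0..2] all
+  // start at BB->end(). An 8-byte entry E1 with LogAlign = 2 is inserted at
+  // the end; a later 4-byte entry E2 with LogAlign = 1 also goes to the end,
+  // after E1, and InsPoint[2] is then retargeted to E2. A subsequent
+  // LogAlign = 2 entry therefore lands after E1 but before E2, keeping the
+  // block sorted by descending alignment.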
+ SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end()); + + // Add all of the constants from the constant pool to the end block, use an + // identity mapping of CPI's to CPE's. + const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants(); + + const DataLayout &TD = MF->getDataLayout(); + for (unsigned i = 0, e = CPs.size(); i != e; ++i) { + unsigned Size = TD.getTypeAllocSize(CPs[i].getType()); + assert(Size >= 4 && "Too small constant pool entry"); + unsigned Align = CPs[i].getAlignment(); + assert(isPowerOf2_32(Align) && "Invalid alignment"); + // Verify that all constant pool entries are a multiple of their alignment. + // If not, we would have to pad them out so that instructions stay aligned. + assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!"); + + // Insert CONSTPOOL_ENTRY before entries with a smaller alignment. + unsigned LogAlign = Log2_32(Align); + MachineBasicBlock::iterator InsAt = InsPoint[LogAlign]; + MachineInstr *CPEMI = + BuildMI(*BB, InsAt, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY)) + .addImm(i).addConstantPoolIndex(i).addImm(Size); + CPEMIs.push_back(CPEMI); + + // Ensure that future entries with higher alignment get inserted before + // CPEMI. This is bucket sort with iterators. + for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a) + if (InsPoint[a] == InsAt) + InsPoint[a] = CPEMI; + + // Add a new CPEntry, but no corresponding CPUser yet. + CPEntries.emplace_back(1, CPEntry(CPEMI, i)); + ++NumCPEs; + DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = " + << Size << ", align = " << Align <<'\n'); + } + DEBUG(BB->dump()); +} + +/// \brief Do initial placement of the jump tables. Because Thumb2's TBB and TBH +/// instructions can be made more efficient if the jump table immediately +/// follows the instruction, it's best to place them immediately next to their +/// jumps to begin with. In almost all cases they'll never be moved from that +/// position. +void ARMConstantIslands::doInitialJumpTablePlacement( + std::vector<MachineInstr *> &CPEMIs) { + unsigned i = CPEntries.size(); + auto MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + + MachineBasicBlock *LastCorrectlyNumberedBB = nullptr; + for (MachineBasicBlock &MBB : *MF) { + auto MI = MBB.getLastNonDebugInstr(); + if (MI == MBB.end()) + continue; + + unsigned JTOpcode; + switch (MI->getOpcode()) { + default: + continue; + case ARM::BR_JTadd: + case ARM::BR_JTr: + case ARM::tBR_JTr: + case ARM::BR_JTm: + JTOpcode = ARM::JUMPTABLE_ADDRS; + break; + case ARM::t2BR_JT: + JTOpcode = ARM::JUMPTABLE_INSTS; + break; + case ARM::t2TBB_JT: + JTOpcode = ARM::JUMPTABLE_TBB; + break; + case ARM::t2TBH_JT: + JTOpcode = ARM::JUMPTABLE_TBH; + break; + } + + unsigned NumOps = MI->getDesc().getNumOperands(); + MachineOperand JTOp = + MI->getOperand(NumOps - (MI->isPredicable() ? 
2 : 1)); + unsigned JTI = JTOp.getIndex(); + unsigned Size = JT[JTI].MBBs.size() * sizeof(uint32_t); + MachineBasicBlock *JumpTableBB = MF->CreateMachineBasicBlock(); + MF->insert(std::next(MachineFunction::iterator(MBB)), JumpTableBB); + MachineInstr *CPEMI = BuildMI(*JumpTableBB, JumpTableBB->begin(), + DebugLoc(), TII->get(JTOpcode)) + .addImm(i++) + .addJumpTableIndex(JTI) + .addImm(Size); + CPEMIs.push_back(CPEMI); + CPEntries.emplace_back(1, CPEntry(CPEMI, JTI)); + JumpTableEntryIndices.insert(std::make_pair(JTI, CPEntries.size() - 1)); + if (!LastCorrectlyNumberedBB) + LastCorrectlyNumberedBB = &MBB; + } + + // If we did anything then we need to renumber the subsequent blocks. + if (LastCorrectlyNumberedBB) + MF->RenumberBlocks(LastCorrectlyNumberedBB); +} + +/// BBHasFallthrough - Return true if the specified basic block can fallthrough +/// into the block immediately after it. +bool ARMConstantIslands::BBHasFallthrough(MachineBasicBlock *MBB) { + // Get the next machine basic block in the function. + MachineFunction::iterator MBBI = MBB->getIterator(); + // Can't fall off end of function. + if (std::next(MBBI) == MBB->getParent()->end()) + return false; + + MachineBasicBlock *NextBB = &*std::next(MBBI); + if (std::find(MBB->succ_begin(), MBB->succ_end(), NextBB) == MBB->succ_end()) + return false; + + // Try to analyze the end of the block. A potential fallthrough may already + // have an unconditional branch for whatever reason. + MachineBasicBlock *TBB, *FBB; + SmallVector<MachineOperand, 4> Cond; + bool TooDifficult = TII->AnalyzeBranch(*MBB, TBB, FBB, Cond); + return TooDifficult || FBB == nullptr; +} + +/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI, +/// look up the corresponding CPEntry. +ARMConstantIslands::CPEntry +*ARMConstantIslands::findConstPoolEntry(unsigned CPI, + const MachineInstr *CPEMI) { + std::vector<CPEntry> &CPEs = CPEntries[CPI]; + // Number of entries per constpool index should be small, just do a + // linear search. + for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { + if (CPEs[i].CPEMI == CPEMI) + return &CPEs[i]; + } + return nullptr; +} + +/// getCPELogAlign - Returns the required alignment of the constant pool entry +/// represented by CPEMI. Alignment is measured in log2(bytes) units. +unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) { + switch (CPEMI->getOpcode()) { + case ARM::CONSTPOOL_ENTRY: + break; + case ARM::JUMPTABLE_TBB: + return 0; + case ARM::JUMPTABLE_TBH: + case ARM::JUMPTABLE_INSTS: + return 1; + case ARM::JUMPTABLE_ADDRS: + return 2; + default: + llvm_unreachable("unknown constpool entry kind"); + } + + unsigned CPI = getCombinedIndex(CPEMI); + assert(CPI < MCP->getConstants().size() && "Invalid constant pool index."); + unsigned Align = MCP->getConstants()[CPI].getAlignment(); + assert(isPowerOf2_32(Align) && "Invalid CPE alignment"); + return Log2_32(Align); +} + +/// scanFunctionJumpTables - Do a scan of the function, building up +/// information about the sizes of each block and the locations of all +/// the jump tables. 
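/// (Only Thumb2 t2BR_JT terminators are collected here; all other branches
/// are sized and fixed up by the generic machinery below.)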
+void ARMConstantIslands::scanFunctionJumpTables() { + for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); + MBBI != E; ++MBBI) { + MachineBasicBlock &MBB = *MBBI; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) + if (I->isBranch() && I->getOpcode() == ARM::t2BR_JT) + T2JumpTables.push_back(I); + } +} + +/// initializeFunctionInfo - Do the initial scan of the function, building up +/// information about the sizes of each block, the location of all the water, +/// and finding all of the constant pool users. +void ARMConstantIslands:: +initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { + BBInfo.clear(); + BBInfo.resize(MF->getNumBlockIDs()); + + // First thing, compute the size of all basic blocks, and see if the function + // has any inline assembly in it. If so, we have to be conservative about + // alignment assumptions, as we don't know for sure the size of any + // instructions in the inline assembly. + for (MachineBasicBlock &MBB : *MF) + computeBlockSize(&MBB); + + // The known bits of the entry block offset are determined by the function + // alignment. + BBInfo.front().KnownBits = MF->getAlignment(); + + // Compute block offsets and known bits. + adjustBBOffsetsAfter(&MF->front()); + + // Now go back through the instructions and build up our data structures. + for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); + MBBI != E; ++MBBI) { + MachineBasicBlock &MBB = *MBBI; + + // If this block doesn't fall through into the next MBB, then this is + // 'water' that a constant pool island could be placed. + if (!BBHasFallthrough(&MBB)) + WaterList.push_back(&MBB); + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + if (I->isDebugValue()) + continue; + + unsigned Opc = I->getOpcode(); + if (I->isBranch()) { + bool isCond = false; + unsigned Bits = 0; + unsigned Scale = 1; + int UOpc = Opc; + switch (Opc) { + default: + continue; // Ignore other JT branches + case ARM::t2BR_JT: + T2JumpTables.push_back(I); + continue; // Does not get an entry in ImmBranches + case ARM::Bcc: + isCond = true; + UOpc = ARM::B; + // Fallthrough + case ARM::B: + Bits = 24; + Scale = 4; + break; + case ARM::tBcc: + isCond = true; + UOpc = ARM::tB; + Bits = 8; + Scale = 2; + break; + case ARM::tB: + Bits = 11; + Scale = 2; + break; + case ARM::t2Bcc: + isCond = true; + UOpc = ARM::t2B; + Bits = 20; + Scale = 2; + break; + case ARM::t2B: + Bits = 24; + Scale = 2; + break; + } + + // Record this immediate branch. + unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale; + ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc)); + } + + if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET) + PushPopMIs.push_back(I); + + if (Opc == ARM::CONSTPOOL_ENTRY || Opc == ARM::JUMPTABLE_ADDRS || + Opc == ARM::JUMPTABLE_INSTS || Opc == ARM::JUMPTABLE_TBB || + Opc == ARM::JUMPTABLE_TBH) + continue; + + // Scan the instructions for constant pool operands. + for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) + if (I->getOperand(op).isCPI() || I->getOperand(op).isJTI()) { + // We found one. The addressing mode tells us the max displacement + // from the PC that this instruction permits. + + // Basic size info comes from the TSFlags field. + unsigned Bits = 0; + unsigned Scale = 1; + bool NegOk = false; + bool IsSoImm = false; + + switch (Opc) { + default: + llvm_unreachable("Unknown addressing mode for CP reference!"); + + // Taking the address of a CP entry. 
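+          // For instance (schematic assembly, not generated here):
+          //   adr r0, .LCPI0_0   ; ARM::LEApcrel pseudo
+          // materializes the address of constant pool entry #0, so its reach
+          // is bounded by the add/sub-immediate encoding captured in the
+          // Bits/Scale values below.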
+          case ARM::LEApcrel:
+          case ARM::LEApcrelJT:
+            // This takes a SoImm, which is an 8-bit immediate rotated. We'll
+            // pretend the maximum offset is 255 * 4. Since each instruction
+            // is 4 bytes wide, this is always correct. We'll check for other
+            // displacements that fit in a SoImm as well.
+            Bits = 8;
+            Scale = 4;
+            NegOk = true;
+            IsSoImm = true;
+            break;
+          case ARM::t2LEApcrel:
+          case ARM::t2LEApcrelJT:
+            Bits = 12;
+            NegOk = true;
+            break;
+          case ARM::tLEApcrel:
+          case ARM::tLEApcrelJT:
+            Bits = 8;
+            Scale = 4;
+            break;
+
+          case ARM::LDRBi12:
+          case ARM::LDRi12:
+          case ARM::LDRcp:
+          case ARM::t2LDRpci:
+            Bits = 12;  // +-offset_12
+            NegOk = true;
+            break;
+
+          case ARM::tLDRpci:
+            Bits = 8;
+            Scale = 4;  // +(offset_8*4)
+            break;
+
+          case ARM::VLDRD:
+          case ARM::VLDRS:
+            Bits = 8;
+            Scale = 4;  // +-(offset_8*4)
+            NegOk = true;
+            break;
+          }
+
+          // Remember that this is a user of a CP entry.
+          unsigned CPI = I->getOperand(op).getIndex();
+          if (I->getOperand(op).isJTI()) {
+            JumpTableUserIndices.insert(std::make_pair(CPI, CPUsers.size()));
+            CPI = JumpTableEntryIndices[CPI];
+          }
+
+          MachineInstr *CPEMI = CPEMIs[CPI];
+          unsigned MaxOffs = ((1 << Bits)-1) * Scale;
+          CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm));
+
+          // Increment corresponding CPEntry reference count.
+          CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+          assert(CPE && "Cannot find a corresponding CPEntry!");
+          CPE->RefCount++;
+
+          // Instructions can only use one CP entry, don't bother scanning the
+          // rest of the operands.
+          break;
+        }
+    }
+  }
+}
+
+/// computeBlockSize - Compute the size and some alignment information for MBB.
+/// This function updates BBInfo directly.
+void ARMConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
+  BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
+  BBI.Size = 0;
+  BBI.Unalign = 0;
+  BBI.PostAlign = 0;
+
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+       ++I) {
+    BBI.Size += TII->GetInstSizeInBytes(I);
+    // For inline asm, GetInstSizeInBytes returns a conservative estimate.
+    // The actual size may be smaller, but still a multiple of the instr size.
+    if (I->isInlineAsm())
+      BBI.Unalign = isThumb ? 1 : 2;
+    // Also consider instructions that may be shrunk later.
+    else if (isThumb && mayOptimizeThumb2Instruction(I))
+      BBI.Unalign = 1;
+  }
+
+  // tBR_JTr contains a .align 2 directive.
+  if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) {
+    BBI.PostAlign = 2;
+    MBB->getParent()->ensureAlignment(2);
+  }
+}
+
+/// getOffsetOf - Return the current offset of the specified machine
+/// instruction from the start of the function. This offset changes as stuff
+/// is moved around inside the function.
+unsigned ARMConstantIslands::getOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the
+  // block it is in.
+  unsigned Offset = BBInfo[MBB->getNumber()].Offset;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    Offset += TII->GetInstSizeInBytes(I);
+  }
+  return Offset;
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS, + const MachineBasicBlock *RHS) { + return LHS->getNumber() < RHS->getNumber(); +} + +/// updateForInsertedWaterBlock - When a block is newly inserted into the +/// machine function, it upsets all of the block numbers. Renumber the blocks +/// and update the arrays that parallel this numbering. +void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) { + // Renumber the MBB's to keep them consecutive. + NewBB->getParent()->RenumberBlocks(NewBB); + + // Insert an entry into BBInfo to align it properly with the (newly + // renumbered) block numbers. + BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); + + // Next, update WaterList. Specifically, we need to add NewMBB as having + // available water after it. + water_iterator IP = + std::lower_bound(WaterList.begin(), WaterList.end(), NewBB, + CompareMBBNumbers); + WaterList.insert(IP, NewBB); +} + + +/// Split the basic block containing MI into two blocks, which are joined by +/// an unconditional branch. Update data structures and renumber blocks to +/// account for this change and returns the newly created block. +MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { + MachineBasicBlock *OrigBB = MI->getParent(); + + // Create a new MBB for the code after the OrigBB. + MachineBasicBlock *NewBB = + MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); + MachineFunction::iterator MBBI = ++OrigBB->getIterator(); + MF->insert(MBBI, NewBB); + + // Splice the instructions starting with MI over to NewBB. + NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); + + // Add an unconditional branch from OrigBB to NewBB. + // Note the new unconditional branch is not being recorded. + // There doesn't seem to be meaningful DebugInfo available; this doesn't + // correspond to anything in the source. + unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B; + if (!isThumb) + BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB); + else + BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB) + .addImm(ARMCC::AL).addReg(0); + ++NumSplit; + + // Update the CFG. All succs of OrigBB are now succs of NewBB. + NewBB->transferSuccessors(OrigBB); + + // OrigBB branches to NewBB. + OrigBB->addSuccessor(NewBB); + + // Update internal data structures to account for the newly inserted MBB. + // This is almost the same as updateForInsertedWaterBlock, except that + // the Water goes after OrigBB, not NewBB. + MF->RenumberBlocks(NewBB); + + // Insert an entry into BBInfo to align it properly with the (newly + // renumbered) block numbers. + BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); + + // Next, update WaterList. Specifically, we need to add OrigMBB as having + // available water after it (but not if it's already there, which happens + // when splitting before a conditional branch that is followed by an + // unconditional branch - in that case we want to insert NewBB). + water_iterator IP = + std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB, + CompareMBBNumbers); + MachineBasicBlock* WaterBB = *IP; + if (WaterBB == OrigBB) + WaterList.insert(std::next(IP), NewBB); + else + WaterList.insert(IP, OrigBB); + NewWaterList.insert(OrigBB); + + // Figure out how large the OrigBB is. As the first half of the original + // block, it cannot contain a tablejump. The size includes + // the new jump we added. 
(It should be possible to do this without
+  // recounting everything, but it's very confusing, and this is rarely
+  // executed.)
+  computeBlockSize(OrigBB);
+
+  // Figure out how large the NewBB is. As the second half of the original
+  // block, it may contain a tablejump.
+  computeBlockSize(NewBB);
+
+  // All BBOffsets following these blocks must be modified.
+  adjustBBOffsetsAfter(OrigBB);
+
+  return NewBB;
+}
+
+/// getUserOffset - Compute the offset of U.MI as seen by the hardware
+/// displacement computation. Update U.KnownAlignment to match its current
+/// basic block location.
+unsigned ARMConstantIslands::getUserOffset(CPUser &U) const {
+  unsigned UserOffset = getOffsetOf(U.MI);
+  const BasicBlockInfo &BBI = BBInfo[U.MI->getParent()->getNumber()];
+  unsigned KnownBits = BBI.internalKnownBits();
+
+  // The value read from PC is offset from the actual instruction address.
+  UserOffset += (isThumb ? 4 : 8);
+
+  // Because of inline assembly, we may not know the alignment (mod 4) of U.MI.
+  // Make sure U.getMaxDisp() returns a constrained range.
+  U.KnownAlignment = (KnownBits >= 2);
+
+  // On Thumb, offsets == 2 mod 4 are rounded down by the hardware for
+  // purposes of the displacement computation; compensate for that here.
+  // For unknown alignments, getMaxDisp() constrains the range instead.
+  if (isThumb && U.KnownAlignment)
+    UserOffset &= ~3u;
+
+  return UserOffset;
+}
+
+/// isOffsetInRange - Checks whether UserOffset (the location of a constant
+/// pool reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry).
+/// UserOffset is computed by getUserOffset above to include PC adjustments. If
+/// the mod 4 alignment of UserOffset is not known, the uncertainty must be
+/// subtracted from MaxDisp instead. CPUser::getMaxDisp() does that.
+bool ARMConstantIslands::isOffsetInRange(unsigned UserOffset,
+                                         unsigned TrialOffset, unsigned MaxDisp,
+                                         bool NegativeOK, bool IsSoImm) {
+  if (UserOffset <= TrialOffset) {
+    // User before the Trial.
+    if (TrialOffset - UserOffset <= MaxDisp)
+      return true;
+    // FIXME: Make use of the full range of soimm values.
+  } else if (NegativeOK) {
+    if (UserOffset - TrialOffset <= MaxDisp)
+      return true;
+    // FIXME: Make use of the full range of soimm values.
+  }
+  return false;
+}
+
+/// isWaterInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+///
+/// Compute how much the function will grow by inserting a CPE after Water.
+bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
+                                        MachineBasicBlock *Water, CPUser &U,
+                                        unsigned &Growth) {
+  unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
+  unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
+  unsigned NextBlockOffset, NextBlockAlignment;
+  MachineFunction::const_iterator NextBlock = Water->getIterator();
+  if (++NextBlock == MF->end()) {
+    NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
+    NextBlockAlignment = 0;
+  } else {
+    NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+    NextBlockAlignment = NextBlock->getAlignment();
+  }
+  unsigned Size = U.CPEMI->getOperand(2).getImm();
+  unsigned CPEEnd = CPEOffset + Size;
+
+  // The CPE may be able to hide in the alignment padding before the next
+  // block. It may also cause more padding to be required if it is more
+  // aligned than the next block.
+  if (CPEEnd > NextBlockOffset) {
+    Growth = CPEEnd - NextBlockOffset;
+    // Compute the padding that would go at the end of the CPE to align the
+    // next block.
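+    // E.g. (hypothetical numbers): with CPEEnd = 0x34, NextBlockOffset = 0x30
+    // and the next block aligned to 8 bytes, Growth starts at 4 and gains
+    // OffsetToAlignment(0x34, 8) = 4 more bytes of realignment padding.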
+ Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment); + + // If the CPE is to be inserted before the instruction, that will raise + // the offset of the instruction. Also account for unknown alignment padding + // in blocks between CPE and the user. + if (CPEOffset < UserOffset) + UserOffset += Growth + UnknownPadding(MF->getAlignment(), CPELogAlign); + } else + // CPE fits in existing padding. + Growth = 0; + + return isOffsetInRange(UserOffset, CPEOffset, U); +} + +/// isCPEntryInRange - Returns true if the distance between specific MI and +/// specific ConstPool entry instruction can fit in MI's displacement field. +bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset, + MachineInstr *CPEMI, unsigned MaxDisp, + bool NegOk, bool DoDump) { + unsigned CPEOffset = getOffsetOf(CPEMI); + + if (DoDump) { + DEBUG({ + unsigned Block = MI->getParent()->getNumber(); + const BasicBlockInfo &BBI = BBInfo[Block]; + dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm() + << " max delta=" << MaxDisp + << format(" insn address=%#x", UserOffset) + << " in BB#" << Block << ": " + << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI + << format("CPE address=%#x offset=%+d: ", CPEOffset, + int(CPEOffset-UserOffset)); + }); + } + + return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk); +} + +#ifndef NDEBUG +/// BBIsJumpedOver - Return true of the specified basic block's only predecessor +/// unconditionally branches to its only successor. +static bool BBIsJumpedOver(MachineBasicBlock *MBB) { + if (MBB->pred_size() != 1 || MBB->succ_size() != 1) + return false; + + MachineBasicBlock *Succ = *MBB->succ_begin(); + MachineBasicBlock *Pred = *MBB->pred_begin(); + MachineInstr *PredMI = &Pred->back(); + if (PredMI->getOpcode() == ARM::B || PredMI->getOpcode() == ARM::tB + || PredMI->getOpcode() == ARM::t2B) + return PredMI->getOperand(0).getMBB() == Succ; + return false; +} +#endif // NDEBUG + +void ARMConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) { + unsigned BBNum = BB->getNumber(); + for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) { + // Get the offset and known bits at the end of the layout predecessor. + // Include the alignment of the current block. + unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment(); + unsigned Offset = BBInfo[i - 1].postOffset(LogAlign); + unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign); + + // This is where block i begins. Stop if the offset is already correct, + // and we have updated 2 blocks. This is the maximum number of blocks + // changed before calling this function. + if (i > BBNum + 2 && + BBInfo[i].Offset == Offset && + BBInfo[i].KnownBits == KnownBits) + break; + + BBInfo[i].Offset = Offset; + BBInfo[i].KnownBits = KnownBits; + } +} + +/// decrementCPEReferenceCount - find the constant pool entry with index CPI +/// and instruction CPEMI, and decrement its refcount. If the refcount +/// becomes 0 remove the entry and instruction. Returns true if we removed +/// the entry, false if we didn't. + +bool ARMConstantIslands::decrementCPEReferenceCount(unsigned CPI, + MachineInstr *CPEMI) { + // Find the old entry. Eliminate it if it is no longer used. 
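+  // E.g. (hypothetical): a CPE with RefCount == 2 that loses one user here
+  // survives; once the last user is retargeted to a clone, the count hits 0
+  // and the CONSTPOOL_ENTRY instruction itself is deleted.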
+ CPEntry *CPE = findConstPoolEntry(CPI, CPEMI); + assert(CPE && "Unexpected!"); + if (--CPE->RefCount == 0) { + removeDeadCPEMI(CPEMI); + CPE->CPEMI = nullptr; + --NumCPEs; + return true; + } + return false; +} + +unsigned ARMConstantIslands::getCombinedIndex(const MachineInstr *CPEMI) { + if (CPEMI->getOperand(1).isCPI()) + return CPEMI->getOperand(1).getIndex(); + + return JumpTableEntryIndices[CPEMI->getOperand(1).getIndex()]; +} + +/// LookForCPEntryInRange - see if the currently referenced CPE is in range; +/// if not, see if an in-range clone of the CPE is in range, and if so, +/// change the data structures so the user references the clone. Returns: +/// 0 = no existing entry found +/// 1 = entry found, and there were no code insertions or deletions +/// 2 = entry found, and there were code insertions or deletions +int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) +{ + MachineInstr *UserMI = U.MI; + MachineInstr *CPEMI = U.CPEMI; + + // Check to see if the CPE is already in-range. + if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk, + true)) { + DEBUG(dbgs() << "In range\n"); + return 1; + } + + // No. Look for previously created clones of the CPE that are in range. + unsigned CPI = getCombinedIndex(CPEMI); + std::vector<CPEntry> &CPEs = CPEntries[CPI]; + for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { + // We already tried this one + if (CPEs[i].CPEMI == CPEMI) + continue; + // Removing CPEs can leave empty entries, skip + if (CPEs[i].CPEMI == nullptr) + continue; + if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(), + U.NegOk)) { + DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" + << CPEs[i].CPI << "\n"); + // Point the CPUser node to the replacement + U.CPEMI = CPEs[i].CPEMI; + // Change the CPI in the instruction operand to refer to the clone. + for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j) + if (UserMI->getOperand(j).isCPI()) { + UserMI->getOperand(j).setIndex(CPEs[i].CPI); + break; + } + // Adjust the refcount of the clone... + CPEs[i].RefCount++; + // ...and the original. If we didn't remove the old entry, none of the + // addresses changed, so we don't need another pass. + return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1; + } + } + return 0; +} + +/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in +/// the specific unconditional branch instruction. +static inline unsigned getUnconditionalBrDisp(int Opc) { + switch (Opc) { + case ARM::tB: + return ((1<<10)-1)*2; + case ARM::t2B: + return ((1<<23)-1)*2; + default: + break; + } + + return ((1<<23)-1)*4; +} + +/// findAvailableWater - Look for an existing entry in the WaterList in which +/// we can place the CPE referenced from U so it's within range of U's MI. +/// Returns true if found, false if not. If it returns true, WaterIter +/// is set to the WaterList entry. For Thumb, prefer water that will not +/// introduce padding to water that will. To ensure that this pass +/// terminates, the CPE location for a particular CPUser is only allowed to +/// move to a lower address, so search backward from the end of the list and +/// prefer the first water that is in range. 
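/// For instance (illustrative), if water exists after BB#7 with Growth = 0
/// and after BB#3 with Growth = 4, the backward scan reaches BB#7 first and
/// returns it immediately as a perfect, zero-padding candidate.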
+bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset, + water_iterator &WaterIter) { + if (WaterList.empty()) + return false; + + unsigned BestGrowth = ~0u; + for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();; + --IP) { + MachineBasicBlock* WaterBB = *IP; + // Check if water is in range and is either at a lower address than the + // current "high water mark" or a new water block that was created since + // the previous iteration by inserting an unconditional branch. In the + // latter case, we want to allow resetting the high water mark back to + // this new water since we haven't seen it before. Inserting branches + // should be relatively uncommon and when it does happen, we want to be + // sure to take advantage of it for all the CPEs near that block, so that + // we don't insert more branches than necessary. + unsigned Growth; + if (isWaterInRange(UserOffset, WaterBB, U, Growth) && + (WaterBB->getNumber() < U.HighWaterMark->getNumber() || + NewWaterList.count(WaterBB) || WaterBB == U.MI->getParent()) && + Growth < BestGrowth) { + // This is the least amount of required padding seen so far. + BestGrowth = Growth; + WaterIter = IP; + DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber() + << " Growth=" << Growth << '\n'); + + // Keep looking unless it is perfect. + if (BestGrowth == 0) + return true; + } + if (IP == B) + break; + } + return BestGrowth != ~0u; +} + +/// createNewWater - No existing WaterList entry will work for +/// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the +/// block is used if in range, and the conditional branch munged so control +/// flow is correct. Otherwise the block is split to create a hole with an +/// unconditional branch around it. In either case NewMBB is set to a +/// block following which the new island can be inserted (the WaterList +/// is not adjusted). +void ARMConstantIslands::createNewWater(unsigned CPUserIndex, + unsigned UserOffset, + MachineBasicBlock *&NewMBB) { + CPUser &U = CPUsers[CPUserIndex]; + MachineInstr *UserMI = U.MI; + MachineInstr *CPEMI = U.CPEMI; + unsigned CPELogAlign = getCPELogAlign(CPEMI); + MachineBasicBlock *UserMBB = UserMI->getParent(); + const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()]; + + // If the block does not end in an unconditional branch already, and if the + // end of the block is within range, make new water there. (The addition + // below is for the unconditional branch we will be adding: 4 bytes on ARM + + // Thumb2, 2 on Thumb1. + if (BBHasFallthrough(UserMBB)) { + // Size of branch to insert. + unsigned Delta = isThumb1 ? 2 : 4; + // Compute the offset where the CPE will begin. + unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta; + + if (isOffsetInRange(UserOffset, CPEOffset, U)) { + DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber() + << format(", expected CPE offset %#x\n", CPEOffset)); + NewMBB = &*++UserMBB->getIterator(); + // Add an unconditional branch from UserMBB to fallthrough block. Record + // it for branch lengthening; this new branch will not get out of range, + // but if the preceding conditional branch is out of range, the targets + // will be exchanged, and the altered branch may be out of range, so the + // machinery has to know about it. + int UncondBr = isThumb ? ((isThumb2) ? 
ARM::t2B : ARM::tB) : ARM::B; + if (!isThumb) + BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB); + else + BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB) + .addImm(ARMCC::AL).addReg(0); + unsigned MaxDisp = getUnconditionalBrDisp(UncondBr); + ImmBranches.push_back(ImmBranch(&UserMBB->back(), + MaxDisp, false, UncondBr)); + computeBlockSize(UserMBB); + adjustBBOffsetsAfter(UserMBB); + return; + } + } + + // What a big block. Find a place within the block to split it. This is a + // little tricky on Thumb1 since instructions are 2 bytes and constant pool + // entries are 4 bytes: if instruction I references island CPE, and + // instruction I+1 references CPE', it will not work well to put CPE as far + // forward as possible, since then CPE' cannot immediately follow it (that + // location is 2 bytes farther away from I+1 than CPE was from I) and we'd + // need to create a new island. So, we make a first guess, then walk through + // the instructions between the one currently being looked at and the + // possible insertion point, and make sure any other instructions that + // reference CPEs will be able to use the same island area; if not, we back + // up the insertion point. + + // Try to split the block so it's fully aligned. Compute the latest split + // point where we can add a 4-byte branch instruction, and then align to + // LogAlign which is the largest possible alignment in the function. + unsigned LogAlign = MF->getAlignment(); + assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry"); + unsigned KnownBits = UserBBI.internalKnownBits(); + unsigned UPad = UnknownPadding(LogAlign, KnownBits); + unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad; + DEBUG(dbgs() << format("Split in middle of big block before %#x", + BaseInsertOffset)); + + // The 4 in the following is for the unconditional branch we'll be inserting + // (allows for long branch on Thumb1). Alignment of the island is handled + // inside isOffsetInRange. + BaseInsertOffset -= 4; + + DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset) + << " la=" << LogAlign + << " kb=" << KnownBits + << " up=" << UPad << '\n'); + + // This could point off the end of the block if we've already got constant + // pool entries following this block; only the last one is in the water list. + // Back past any possible branches (allow for a conditional and a maximally + // long unconditional). + if (BaseInsertOffset + 8 >= UserBBI.postOffset()) { + // Ensure BaseInsertOffset is larger than the offset of the instruction + // following UserMI so that the loop which searches for the split point + // iterates at least once. 
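+  // Worked example (made-up offsets): with UserBBI.postOffset() = 0x200,
+  // UPad = 2, UserOffset = 0x1F0 and a 4-byte UserMI, the clamp below gives
+  // BaseInsertOffset = max(0x200 - 2 - 8, 0x1F0 + 4 + 1) = 0x1F6, one past
+  // the instruction following UserMI, so the split-point loop runs at least
+  // once.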
+    BaseInsertOffset =
+        std::max(UserBBI.postOffset() - UPad - 8,
+                 UserOffset + TII->GetInstSizeInBytes(UserMI) + 1);
+    DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+  }
+  unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
+    CPEMI->getOperand(2).getImm();
+  MachineBasicBlock::iterator MI = UserMI;
+  ++MI;
+  unsigned CPUIndex = CPUserIndex+1;
+  unsigned NumCPUsers = CPUsers.size();
+  MachineInstr *LastIT = nullptr;
+  for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
+       Offset < BaseInsertOffset;
+       Offset += TII->GetInstSizeInBytes(MI), MI = std::next(MI)) {
+    assert(MI != UserMBB->end() && "Fell off end of block");
+    if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+      CPUser &U = CPUsers[CPUIndex];
+      if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+        // Shift insertion point by one unit of alignment so it is within
+        // reach.
+        BaseInsertOffset -= 1u << LogAlign;
+        EndInsertOffset  -= 1u << LogAlign;
+      }
+      // This is overly conservative, as we don't account for CPEMIs being
+      // reused within the block, but it doesn't matter much. Also assume CPEs
+      // are added in order with alignment padding. We may eventually be able
+      // to pack the aligned CPEs better.
+      EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+      CPUIndex++;
+    }
+
+    // Remember the last IT instruction.
+    if (MI->getOpcode() == ARM::t2IT)
+      LastIT = MI;
+  }
+
+  --MI;
+
+  // Avoid splitting an IT block.
+  if (LastIT) {
+    unsigned PredReg = 0;
+    ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg);
+    if (CC != ARMCC::AL)
+      MI = LastIT;
+  }
+
+  // We really must not split an IT block.
+  DEBUG(unsigned PredReg;
+        assert(!isThumb || getITInstrPredicate(MI, PredReg) == ARMCC::AL));
+
+  NewMBB = splitBlockBeforeInstr(MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range. If so, pick up the constant pool value and move it some
+/// place in-range. Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI = U.CPEMI;
+  unsigned CPI = getCombinedIndex(CPEMI);
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  // Compute this only once, it's expensive.
+  unsigned UserOffset = getUserOffset(U);
+
+  // See if the current entry is within range, or there is a clone of it
+  // in range.
+  int result = findInRangeCPEntry(U, UserOffset);
+  if (result == 1) return false;
+  else if (result == 2) return true;
+
+  // No existing clone of this CPE is within range.
+  // We will be generating a new clone. Get a UID for it.
+  unsigned ID = AFI->createPICLabelUId();
+
+  // Look for water where we can place this CPE.
+  MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *NewMBB;
+  water_iterator IP;
+  if (findAvailableWater(U, UserOffset, IP)) {
+    DEBUG(dbgs() << "Found water in range\n");
+    MachineBasicBlock *WaterBB = *IP;
+
+    // If the original WaterList entry was "new water" on this iteration,
+    // propagate that to the new island. This is just keeping NewWaterList
+    // updated to match the WaterList, which will be updated below.
+    if (NewWaterList.erase(WaterBB))
+      NewWaterList.insert(NewIsland);
+
+    // The new CPE goes before the following block (NewMBB).
+    NewMBB = &*++WaterBB->getIterator();
+  } else {
+    // No water found.
+ DEBUG(dbgs() << "No water found\n"); + createNewWater(CPUserIndex, UserOffset, NewMBB); + + // splitBlockBeforeInstr adds to WaterList, which is important when it is + // called while handling branches so that the water will be seen on the + // next iteration for constant pools, but in this context, we don't want + // it. Check for this so it will be removed from the WaterList. + // Also remove any entry from NewWaterList. + MachineBasicBlock *WaterBB = &*--NewMBB->getIterator(); + IP = std::find(WaterList.begin(), WaterList.end(), WaterBB); + if (IP != WaterList.end()) + NewWaterList.erase(WaterBB); + + // We are adding new water. Update NewWaterList. + NewWaterList.insert(NewIsland); + } + + // Remove the original WaterList entry; we want subsequent insertions in + // this vicinity to go after the one we're about to insert. This + // considerably reduces the number of times we have to move the same CPE + // more than once and is also important to ensure the algorithm terminates. + if (IP != WaterList.end()) + WaterList.erase(IP); + + // Okay, we know we can put an island before NewMBB now, do it! + MF->insert(NewMBB->getIterator(), NewIsland); + + // Update internal data structures to account for the newly inserted MBB. + updateForInsertedWaterBlock(NewIsland); + + // Now that we have an island to add the CPE to, clone the original CPE and + // add it to the island. + U.HighWaterMark = NewIsland; + U.CPEMI = BuildMI(NewIsland, DebugLoc(), CPEMI->getDesc()) + .addImm(ID).addOperand(CPEMI->getOperand(1)).addImm(Size); + CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1)); + ++NumCPEs; + + // Decrement the old entry, and remove it if refcount becomes 0. + decrementCPEReferenceCount(CPI, CPEMI); + + // Mark the basic block as aligned as required by the const-pool entry. + NewIsland->setAlignment(getCPELogAlign(U.CPEMI)); + + // Increase the size of the island block to account for the new entry. + BBInfo[NewIsland->getNumber()].Size += Size; + adjustBBOffsetsAfter(&*--NewIsland->getIterator()); + + // Finally, change the CPI in the instruction operand to be ID. + for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) + if (UserMI->getOperand(i).isCPI()) { + UserMI->getOperand(i).setIndex(ID); + break; + } + + DEBUG(dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI + << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset)); + + return true; +} + +/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update +/// sizes and offsets of impacted basic blocks. +void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { + MachineBasicBlock *CPEBB = CPEMI->getParent(); + unsigned Size = CPEMI->getOperand(2).getImm(); + CPEMI->eraseFromParent(); + BBInfo[CPEBB->getNumber()].Size -= Size; + // All succeeding offsets have the current size value added in, fix this. + if (CPEBB->empty()) { + BBInfo[CPEBB->getNumber()].Size = 0; + + // This block no longer needs to be aligned. + CPEBB->setAlignment(0); + } else + // Entries are sorted by descending alignment, so realign from the front. + CPEBB->setAlignment(getCPELogAlign(CPEBB->begin())); + + adjustBBOffsetsAfter(CPEBB); + // An island has only one predecessor BB and one successor BB. Check if + // this BB's predecessor jumps directly to this BB's successor. This + // shouldn't happen currently. + assert(!BBIsJumpedOver(CPEBB) && "How did this happen?"); + // FIXME: remove the empty blocks after all the work is done? +} + +/// removeUnusedCPEntries - Remove constant pool entries whose refcounts +/// are zero. 
+bool ARMConstantIslands::removeUnusedCPEntries() {
+  bool MadeChange = false;
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+    std::vector<CPEntry> &CPEs = CPEntries[i];
+    for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+      if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
+        removeDeadCPEMI(CPEs[j].CPEMI);
+        CPEs[j].CPEMI = nullptr;
+        MadeChange = true;
+      }
+    }
+  }
+  return MadeChange;
+}
+
+/// isBBInRange - Returns true if the distance between the specified MI and the
+/// specified BB can fit in MI's displacement field.
+bool ARMConstantIslands::isBBInRange(MachineInstr *MI,
+                                     MachineBasicBlock *DestBB,
+                                     unsigned MaxDisp) {
+  unsigned PCAdj      = isThumb ? 4 : 8;
+  unsigned BrOffset   = getOffsetOf(MI) + PCAdj;
+  unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+  DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+               << " from BB#" << MI->getParent()->getNumber()
+               << " max delta=" << MaxDisp
+               << " from " << getOffsetOf(MI) << " to " << DestOffset
+               << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
+
+  if (BrOffset <= DestOffset) {
+    // Branch before the Dest.
+    if (DestOffset-BrOffset <= MaxDisp)
+      return true;
+  } else {
+    if (BrOffset-DestOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// fixupImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool ARMConstantIslands::fixupImmediateBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Check to see if the DestBB is already in-range.
+  if (isBBInRange(MI, DestBB, Br.MaxDisp))
+    return false;
+
+  if (!Br.isCond)
+    return fixupUnconditionalBr(Br);
+  return fixupConditionalBr(Br);
+}
+
+/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the epilogue, then we can use BL to implement a far jump.
+/// Otherwise, add an intermediate branch instruction to reach the destination.
+bool
+ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *MBB = MI->getParent();
+  if (!isThumb1)
+    llvm_unreachable("fixupUnconditionalBr is Thumb1 only!");
+
+  // Use BL to implement far jump.
+  Br.MaxDisp = (1 << 21) * 2;
+  MI->setDesc(TII->get(ARM::tBfar));
+  BBInfo[MBB->getNumber()].Size += 2;
+  adjustBBOffsetsAfter(MBB);
+  HasFarJump = true;
+  ++NumUBrFixed;
+
+  DEBUG(dbgs() << "  Changed B to long jump " << *MI);
+
+  return true;
+}
+
+/// fixupConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Add an unconditional branch to the destination and invert the branch
+  // condition to jump over it:
+  //   blt L1
+  // =>
+  //   bge L2
+  //   b   L1
+  // L2:
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm();
+  CC = ARMCC::getOppositeCondition(CC);
+  unsigned CCReg = MI->getOperand(2).getReg();
+
+  // If the branch is at the end of its MBB and that has a fall-through block,
+  // direct the updated conditional branch to the fall-through block.
+  // Otherwise, split the MBB before the next instruction.
+ MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *BMI = &MBB->back(); + bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); + + ++NumCBrFixed; + if (BMI != MI) { + if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) && + BMI->getOpcode() == Br.UncondBr) { + // Last MI in the BB is an unconditional branch. Can we simply invert the + // condition and swap destinations: + // beq L1 + // b L2 + // => + // bne L2 + // b L1 + MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); + if (isBBInRange(MI, NewDest, Br.MaxDisp)) { + DEBUG(dbgs() << " Invert Bcc condition and swap its destination with " + << *BMI); + BMI->getOperand(0).setMBB(DestBB); + MI->getOperand(0).setMBB(NewDest); + MI->getOperand(1).setImm(CC); + return true; + } + } + } + + if (NeedSplit) { + splitBlockBeforeInstr(MI); + // No need for the branch to the next block. We're adding an unconditional + // branch to the destination. + int delta = TII->GetInstSizeInBytes(&MBB->back()); + BBInfo[MBB->getNumber()].Size -= delta; + MBB->back().eraseFromParent(); + // BBInfo[SplitBB].Offset is wrong temporarily, fixed below + } + MachineBasicBlock *NextBB = &*++MBB->getIterator(); + + DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() + << " also invert condition and change dest. to BB#" + << NextBB->getNumber() << "\n"); + + // Insert a new conditional branch and a new unconditional branch. + // Also update the ImmBranch as well as adding a new entry for the new branch. + BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode())) + .addMBB(NextBB).addImm(CC).addReg(CCReg); + Br.MI = &MBB->back(); + BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + if (isThumb) + BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB) + .addImm(ARMCC::AL).addReg(0); + else + BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB); + BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr); + ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr)); + + // Remove the old conditional branch. It may or may not still be in MBB. + BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); + MI->eraseFromParent(); + adjustBBOffsetsAfter(MBB); + return true; +} + +/// undoLRSpillRestore - Remove Thumb push / pop instructions that only spills +/// LR / restores LR to pc. FIXME: This is done here because it's only possible +/// to do this if tBfar is not used. +bool ARMConstantIslands::undoLRSpillRestore() { + bool MadeChange = false; + for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) { + MachineInstr *MI = PushPopMIs[i]; + // First two operands are predicates. + if (MI->getOpcode() == ARM::tPOP_RET && + MI->getOperand(2).getReg() == ARM::PC && + MI->getNumExplicitOperands() == 3) { + // Create the new insn and copy the predicate from the old. + BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)); + MI->eraseFromParent(); + MadeChange = true; + } + } + return MadeChange; +} + +// mayOptimizeThumb2Instruction - Returns true if optimizeThumb2Instructions +// below may shrink MI. +bool +ARMConstantIslands::mayOptimizeThumb2Instruction(const MachineInstr *MI) const { + switch(MI->getOpcode()) { + // optimizeThumb2Instructions. + case ARM::t2LEApcrel: + case ARM::t2LDRpci: + // optimizeThumb2Branches. + case ARM::t2B: + case ARM::t2Bcc: + case ARM::tBcc: + // optimizeThumb2JumpTables. 
+ case ARM::t2BR_JT: + return true; + } + return false; +} + +bool ARMConstantIslands::optimizeThumb2Instructions() { + bool MadeChange = false; + + // Shrink ADR and LDR from constantpool. + for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { + CPUser &U = CPUsers[i]; + unsigned Opcode = U.MI->getOpcode(); + unsigned NewOpc = 0; + unsigned Scale = 1; + unsigned Bits = 0; + switch (Opcode) { + default: break; + case ARM::t2LEApcrel: + if (isARMLowRegister(U.MI->getOperand(0).getReg())) { + NewOpc = ARM::tLEApcrel; + Bits = 8; + Scale = 4; + } + break; + case ARM::t2LDRpci: + if (isARMLowRegister(U.MI->getOperand(0).getReg())) { + NewOpc = ARM::tLDRpci; + Bits = 8; + Scale = 4; + } + break; + } + + if (!NewOpc) + continue; + + unsigned UserOffset = getUserOffset(U); + unsigned MaxOffs = ((1 << Bits) - 1) * Scale; + + // Be conservative with inline asm. + if (!U.KnownAlignment) + MaxOffs -= 2; + + // FIXME: Check if offset is multiple of scale if scale is not 4. + if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) { + DEBUG(dbgs() << "Shrink: " << *U.MI); + U.MI->setDesc(TII->get(NewOpc)); + MachineBasicBlock *MBB = U.MI->getParent(); + BBInfo[MBB->getNumber()].Size -= 2; + adjustBBOffsetsAfter(MBB); + ++NumT2CPShrunk; + MadeChange = true; + } + } + + MadeChange |= optimizeThumb2Branches(); + MadeChange |= optimizeThumb2JumpTables(); + return MadeChange; +} + +bool ARMConstantIslands::optimizeThumb2Branches() { + bool MadeChange = false; + + // The order in which branches appear in ImmBranches is approximately their + // order within the function body. By visiting later branches first, we reduce + // the distance between earlier forward branches and their targets, making it + // more likely that the cbn?z optimization, which can only apply to forward + // branches, will succeed. + for (unsigned i = ImmBranches.size(); i != 0; --i) { + ImmBranch &Br = ImmBranches[i-1]; + unsigned Opcode = Br.MI->getOpcode(); + unsigned NewOpc = 0; + unsigned Scale = 1; + unsigned Bits = 0; + switch (Opcode) { + default: break; + case ARM::t2B: + NewOpc = ARM::tB; + Bits = 11; + Scale = 2; + break; + case ARM::t2Bcc: { + NewOpc = ARM::tBcc; + Bits = 8; + Scale = 2; + break; + } + } + if (NewOpc) { + unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale; + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + if (isBBInRange(Br.MI, DestBB, MaxOffs)) { + DEBUG(dbgs() << "Shrink branch: " << *Br.MI); + Br.MI->setDesc(TII->get(NewOpc)); + MachineBasicBlock *MBB = Br.MI->getParent(); + BBInfo[MBB->getNumber()].Size -= 2; + adjustBBOffsetsAfter(MBB); + ++NumT2BrShrunk; + MadeChange = true; + } + } + + Opcode = Br.MI->getOpcode(); + if (Opcode != ARM::tBcc) + continue; + + // If the conditional branch doesn't kill CPSR, then CPSR can be liveout + // so this transformation is not safe. + if (!Br.MI->killsRegister(ARM::CPSR)) + continue; + + NewOpc = 0; + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(Br.MI, PredReg); + if (Pred == ARMCC::EQ) + NewOpc = ARM::tCBZ; + else if (Pred == ARMCC::NE) + NewOpc = ARM::tCBNZ; + if (!NewOpc) + continue; + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + // Check if the distance is within 126. Subtract starting offset by 2 + // because the cmp will be eliminated. 
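+    // For example (illustrative):
+    //   cmp r2, #0
+    //   beq .LBB0_3
+    // =>
+    //   cbz r2, .LBB0_3
+    // saving 2 bytes, provided r2 is a low register and .LBB0_3 lies at most
+    // 126 bytes ahead.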
+ unsigned BrOffset = getOffsetOf(Br.MI) + 4 - 2; + unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset; + if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) { + MachineBasicBlock::iterator CmpMI = Br.MI; + if (CmpMI != Br.MI->getParent()->begin()) { + --CmpMI; + if (CmpMI->getOpcode() == ARM::tCMPi8) { + unsigned Reg = CmpMI->getOperand(0).getReg(); + Pred = getInstrPredicate(CmpMI, PredReg); + if (Pred == ARMCC::AL && + CmpMI->getOperand(1).getImm() == 0 && + isARMLowRegister(Reg)) { + MachineBasicBlock *MBB = Br.MI->getParent(); + DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI); + MachineInstr *NewBR = + BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc)) + .addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags()); + CmpMI->eraseFromParent(); + Br.MI->eraseFromParent(); + Br.MI = NewBR; + BBInfo[MBB->getNumber()].Size -= 2; + adjustBBOffsetsAfter(MBB); + ++NumCBZ; + MadeChange = true; + } + } + } + } + } + + return MadeChange; +} + +static bool isSimpleIndexCalc(MachineInstr &I, unsigned EntryReg, + unsigned BaseReg) { + if (I.getOpcode() != ARM::t2ADDrs) + return false; + + if (I.getOperand(0).getReg() != EntryReg) + return false; + + if (I.getOperand(1).getReg() != BaseReg) + return false; + + // FIXME: what about CC and IdxReg? + return true; +} + +/// \brief While trying to form a TBB/TBH instruction, we may (if the table +/// doesn't immediately follow the BR_JT) need access to the start of the +/// jump-table. We know one instruction that produces such a register; this +/// function works out whether that definition can be preserved to the BR_JT, +/// possibly by removing an intervening addition (which is usually needed to +/// calculate the actual entry to jump to). +bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI, + MachineInstr *LEAMI, + unsigned &DeadSize, + bool &CanDeleteLEA, + bool &BaseRegKill) { + if (JumpMI->getParent() != LEAMI->getParent()) + return false; + + // Now we hope that we have at least these instructions in the basic block: + // BaseReg = t2LEA ... + // [...] + // EntryReg = t2ADDrs BaseReg, ... + // [...] + // t2BR_JT EntryReg + // + // We have to be very conservative about what we recognise here though. The + // main perturbing factors to watch out for are: + // + Spills at any point in the chain: not direct problems but we would + // expect a blocking Def of the spilled register so in practice what we + // can do is limited. + // + EntryReg == BaseReg: this is the one situation we should allow a Def + // of BaseReg, but only if the t2ADDrs can be removed. + // + Some instruction other than t2ADDrs computing the entry. Not seen in + // the wild, but we should be careful. + unsigned EntryReg = JumpMI->getOperand(0).getReg(); + unsigned BaseReg = LEAMI->getOperand(0).getReg(); + + CanDeleteLEA = true; + BaseRegKill = false; + MachineInstr *RemovableAdd = nullptr; + MachineBasicBlock::iterator I(LEAMI); + for (++I; &*I != JumpMI; ++I) { + if (isSimpleIndexCalc(*I, EntryReg, BaseReg)) { + RemovableAdd = &*I; + break; + } + + for (unsigned K = 0, E = I->getNumOperands(); K != E; ++K) { + const MachineOperand &MO = I->getOperand(K); + if (!MO.isReg() || !MO.getReg()) + continue; + if (MO.isDef() && MO.getReg() == BaseReg) + return false; + if (MO.isUse() && MO.getReg() == BaseReg) { + BaseRegKill = BaseRegKill || MO.isKill(); + CanDeleteLEA = false; + } + } + } + + if (!RemovableAdd) + return true; + + // Check the add really is removable, and that nothing else in the block + // clobbers BaseReg. 
+  for (++I; &*I != JumpMI; ++I) {
+    for (unsigned K = 0, E = I->getNumOperands(); K != E; ++K) {
+      const MachineOperand &MO = I->getOperand(K);
+      if (!MO.isReg() || !MO.getReg())
+        continue;
+      if (MO.isDef() && MO.getReg() == BaseReg)
+        return false;
+      if (MO.isUse() && MO.getReg() == EntryReg)
+        RemovableAdd = nullptr;
+    }
+  }
+
+  if (RemovableAdd) {
+    RemovableAdd->eraseFromParent();
+    DeadSize += 4;
+  } else if (BaseReg == EntryReg) {
+    // The add wasn't removable, but clobbered the base for the TBB. So we
+    // can't preserve it.
+    return false;
+  }
+
+  // We reached the end of the block without seeing another definition of
+  // BaseReg (except, possibly the t2ADDrs, which was removed). BaseReg can be
+  // used in the TBB/TBH if necessary.
+  return true;
+}
+
+/// \brief Returns whether CPEMI is the first instruction in the block
+/// immediately following JTMI (assumed to be a TBB or TBH terminator). If so,
+/// we can switch the first register to PC and usually remove the address
+/// calculation that preceded it.
+static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) {
+  MachineFunction::iterator MBB = JTMI->getParent()->getIterator();
+  MachineFunction *MF = MBB->getParent();
+  ++MBB;
+
+  return MBB != MF->end() && MBB->begin() != MBB->end() &&
+         &*MBB->begin() == CPEMI;
+}
+
+/// optimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
+/// jumptables when it's possible.
+bool ARMConstantIslands::optimizeThumb2JumpTables() {
+  bool MadeChange = false;
+
+  // FIXME: After the tables are shrunk, can we get rid of some of the
+  // constantpool tables?
+  MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  if (!MJTI) return false;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+    MachineInstr *MI = T2JumpTables[i];
+    const MCInstrDesc &MCID = MI->getDesc();
+    unsigned NumOps = MCID.getNumOperands();
+    unsigned JTOpIdx = NumOps - (MI->isPredicable() ? 2 : 1);
+    MachineOperand JTOP = MI->getOperand(JTOpIdx);
+    unsigned JTI = JTOP.getIndex();
+    assert(JTI < JT.size());
+
+    bool ByteOk = true;
+    bool HalfWordOk = true;
+    unsigned JTOffset = getOffsetOf(MI) + 4;
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+    for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+      MachineBasicBlock *MBB = JTBBs[j];
+      unsigned DstOffset = BBInfo[MBB->getNumber()].Offset;
+      // Negative offset is not ok. FIXME: We should change BB layout to make
+      // sure all the branches are forward.
+      if (ByteOk && (DstOffset - JTOffset) > ((1<<8)-1)*2)
+        ByteOk = false;
+      unsigned TBHLimit = ((1<<16)-1)*2;
+      if (HalfWordOk && (DstOffset - JTOffset) > TBHLimit)
+        HalfWordOk = false;
+      if (!ByteOk && !HalfWordOk)
+        break;
+    }
+
+    if (!ByteOk && !HalfWordOk)
+      continue;
+
+    MachineBasicBlock *MBB = MI->getParent();
+    if (!MI->getOperand(0).isKill()) // FIXME: needed now?
+      continue;
+    unsigned IdxReg = MI->getOperand(1).getReg();
+    bool IdxRegKill = MI->getOperand(1).isKill();
+
+    CPUser &User = CPUsers[JumpTableUserIndices[JTI]];
+    unsigned DeadSize = 0;
+    bool CanDeleteLEA = false;
+    bool BaseRegKill = false;
+    bool PreservedBaseReg =
+        preserveBaseRegister(MI, User.MI, DeadSize, CanDeleteLEA, BaseRegKill);
+
+    if (!jumpTableFollowsTB(MI, User.CPEMI) && !PreservedBaseReg)
+      continue;
+
+    DEBUG(dbgs() << "Shrink JT: " << *MI);
+    MachineInstr *CPEMI = User.CPEMI;
+    unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
+    MachineBasicBlock::iterator MI_JT = MI;
+    MachineInstr *NewJTMI =
+        BuildMI(*MBB, MI_JT, MI->getDebugLoc(), TII->get(Opc))
+            .addReg(User.MI->getOperand(0).getReg(),
+                    getKillRegState(BaseRegKill))
+            .addReg(IdxReg, getKillRegState(IdxRegKill))
+            .addJumpTableIndex(JTI, JTOP.getTargetFlags())
+            .addImm(CPEMI->getOperand(0).getImm());
+    DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": " << *NewJTMI);
+
+    unsigned JTOpc = ByteOk ? ARM::JUMPTABLE_TBB : ARM::JUMPTABLE_TBH;
+    CPEMI->setDesc(TII->get(JTOpc));
+
+    if (jumpTableFollowsTB(MI, User.CPEMI)) {
+      NewJTMI->getOperand(0).setReg(ARM::PC);
+      NewJTMI->getOperand(0).setIsKill(false);
+
+      if (CanDeleteLEA) {
+        User.MI->eraseFromParent();
+        DeadSize += 4;
+
+        // The LEA was eliminated, so the TBB instruction becomes the only new
+        // user of the jump table.
+        User.MI = NewJTMI;
+        User.MaxDisp = 4;
+        User.NegOk = false;
+        User.IsSoImm = false;
+        User.KnownAlignment = false;
+      } else {
+        // The LEA couldn't be eliminated, so we must add another CPUser to
+        // record the TBB or TBH use.
+        int CPEntryIdx = JumpTableEntryIndices[JTI];
+        auto &CPEs = CPEntries[CPEntryIdx];
+        auto Entry = std::find_if(CPEs.begin(), CPEs.end(), [&](CPEntry &E) {
+          return E.CPEMI == User.CPEMI;
+        });
+        ++Entry->RefCount;
+        CPUsers.emplace_back(CPUser(NewJTMI, User.CPEMI, 4, false, false));
+      }
+    }
+
+    unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI);
+    unsigned OrigSize = TII->GetInstSizeInBytes(MI);
+    MI->eraseFromParent();
+
+    int Delta = OrigSize - NewSize + DeadSize;
+    BBInfo[MBB->getNumber()].Size -= Delta;
+    adjustBBOffsetsAfter(MBB);
+
+    ++NumTBs;
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+/// reorderThumb2JumpTables - Adjust the function's block layout to ensure that
+/// jump tables always branch forwards, since that's what tbb and tbh need.
+bool ARMConstantIslands::reorderThumb2JumpTables() {
+  bool MadeChange = false;
+
+  MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  if (!MJTI) return false;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+    MachineInstr *MI = T2JumpTables[i];
+    const MCInstrDesc &MCID = MI->getDesc();
+    unsigned NumOps = MCID.getNumOperands();
+    unsigned JTOpIdx = NumOps - (MI->isPredicable() ? 2 : 1);
+    MachineOperand JTOP = MI->getOperand(JTOpIdx);
+    unsigned JTI = JTOP.getIndex();
+    assert(JTI < JT.size());
+
+    // We prefer that target blocks for the jump table come after the jump
+    // instruction so we can use TB[BH]. Loop through the target blocks
+    // and try to adjust them so that this is true.
+    int JTNumber = MI->getParent()->getNumber();
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+    for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+      MachineBasicBlock *MBB = JTBBs[j];
+      int DTNumber = MBB->getNumber();
+
+      if (DTNumber < JTNumber) {
+        // The destination precedes the switch. Try to move the block forward
+        // so we have a positive offset.
+        MachineBasicBlock *NewBB =
+            adjustJTTargetBlockForward(MBB, MI->getParent());
+        if (NewBB)
+          MJTI->ReplaceMBBInJumpTable(JTI, JTBBs[j], NewBB);
+        MadeChange = true;
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+MachineBasicBlock *ARMConstantIslands::
+adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
+  // If the destination block is terminated by an unconditional branch,
+  // try to move it; otherwise, create a new block following the jump
+  // table that branches back to the actual target.
This is a very simple + // heuristic. FIXME: We can definitely improve it. + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector<MachineOperand, 4> Cond; + SmallVector<MachineOperand, 4> CondPrior; + MachineFunction::iterator BBi = BB->getIterator(); + MachineFunction::iterator OldPrior = std::prev(BBi); + + // If the block terminator isn't analyzable, don't try to move the block + bool B = TII->AnalyzeBranch(*BB, TBB, FBB, Cond); + + // If the block ends in an unconditional branch, move it. The prior block + // has to have an analyzable terminator for us to move this one. Be paranoid + // and make sure we're not trying to move the entry block of the function. + if (!B && Cond.empty() && BB != MF->begin() && + !TII->AnalyzeBranch(*OldPrior, TBB, FBB, CondPrior)) { + BB->moveAfter(JTBB); + OldPrior->updateTerminator(); + BB->updateTerminator(); + // Update numbering to account for the block being moved. + MF->RenumberBlocks(); + ++NumJTMoved; + return nullptr; + } + + // Create a new MBB for the code after the jump BB. + MachineBasicBlock *NewBB = + MF->CreateMachineBasicBlock(JTBB->getBasicBlock()); + MachineFunction::iterator MBBI = ++JTBB->getIterator(); + MF->insert(MBBI, NewBB); + + // Add an unconditional branch from NewBB to BB. + // There doesn't seem to be meaningful DebugInfo available; this doesn't + // correspond directly to anything in the source. + assert (isThumb2 && "Adjusting for TB[BH] but not in Thumb2?"); + BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B)).addMBB(BB) + .addImm(ARMCC::AL).addReg(0); + + // Update internal data structures to account for the newly inserted MBB. + MF->RenumberBlocks(NewBB); + + // Update the CFG. + NewBB->addSuccessor(BB); + JTBB->replaceSuccessor(BB, NewBB); + + ++NumJTInserted; + return NewBB; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp new file mode 100644 index 0000000..c9849b2 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -0,0 +1,261 @@ +//===-- ARMConstantPoolValue.cpp - ARM constantpool value -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARM specific constantpool value class. 
+// +//===----------------------------------------------------------------------===// + +#include "ARMConstantPoolValue.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/raw_ostream.h" +#include <cstdlib> +using namespace llvm; + +//===----------------------------------------------------------------------===// +// ARMConstantPoolValue +//===----------------------------------------------------------------------===// + +ARMConstantPoolValue::ARMConstantPoolValue(Type *Ty, unsigned id, + ARMCP::ARMCPKind kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier modifier, + bool addCurrentAddress) + : MachineConstantPoolValue(Ty), LabelId(id), Kind(kind), + PCAdjust(PCAdj), Modifier(modifier), + AddCurrentAddress(addCurrentAddress) {} + +ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, unsigned id, + ARMCP::ARMCPKind kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier modifier, + bool addCurrentAddress) + : MachineConstantPoolValue((Type*)Type::getInt32Ty(C)), + LabelId(id), Kind(kind), PCAdjust(PCAdj), Modifier(modifier), + AddCurrentAddress(addCurrentAddress) {} + +ARMConstantPoolValue::~ARMConstantPoolValue() {} + +const char *ARMConstantPoolValue::getModifierText() const { + switch (Modifier) { + // FIXME: Are these case sensitive? It'd be nice to lower-case all the + // strings if that's legal. + case ARMCP::no_modifier: return "none"; + case ARMCP::TLSGD: return "tlsgd"; + case ARMCP::GOT_PREL: return "GOT_PREL"; + case ARMCP::GOTTPOFF: return "gottpoff"; + case ARMCP::TPOFF: return "tpoff"; + } + llvm_unreachable("Unknown modifier!"); +} + +int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) { + llvm_unreachable("Shouldn't be calling this directly!"); +} + +void +ARMConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddInteger(LabelId); + ID.AddInteger(PCAdjust); +} + +bool +ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) { + if (ACPV->Kind == Kind && + ACPV->PCAdjust == PCAdjust && + ACPV->Modifier == Modifier) { + if (ACPV->LabelId == LabelId) + return true; + // Two PC relative constpool entries containing the same GV address or + // external symbols. FIXME: What about blockaddress? 
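+    // (The payload itself is compared by the subclass overrides of
+    // hasSameValue, which check the Constant, symbol, or MBB before
+    // delegating back to this method.)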
+ if (Kind == ARMCP::CPValue || Kind == ARMCP::CPExtSymbol) + return true; + } + return false; +} + +void ARMConstantPoolValue::dump() const { + errs() << " " << *this; +} + +void ARMConstantPoolValue::print(raw_ostream &O) const { + if (Modifier) O << "(" << getModifierText() << ")"; + if (PCAdjust != 0) { + O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust; + if (AddCurrentAddress) O << "-."; + O << ")"; + } +} + +//===----------------------------------------------------------------------===// +// ARMConstantPoolConstant +//===----------------------------------------------------------------------===// + +ARMConstantPoolConstant::ARMConstantPoolConstant(Type *Ty, + const Constant *C, + unsigned ID, + ARMCP::ARMCPKind Kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress) + : ARMConstantPoolValue(Ty, ID, Kind, PCAdj, Modifier, AddCurrentAddress), + CVal(C) {} + +ARMConstantPoolConstant::ARMConstantPoolConstant(const Constant *C, + unsigned ID, + ARMCP::ARMCPKind Kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress) + : ARMConstantPoolValue((Type*)C->getType(), ID, Kind, PCAdj, Modifier, + AddCurrentAddress), + CVal(C) {} + +ARMConstantPoolConstant * +ARMConstantPoolConstant::Create(const Constant *C, unsigned ID) { + return new ARMConstantPoolConstant(C, ID, ARMCP::CPValue, 0, + ARMCP::no_modifier, false); +} + +ARMConstantPoolConstant * +ARMConstantPoolConstant::Create(const GlobalValue *GV, + ARMCP::ARMCPModifier Modifier) { + return new ARMConstantPoolConstant((Type*)Type::getInt32Ty(GV->getContext()), + GV, 0, ARMCP::CPValue, 0, + Modifier, false); +} + +ARMConstantPoolConstant * +ARMConstantPoolConstant::Create(const Constant *C, unsigned ID, + ARMCP::ARMCPKind Kind, unsigned char PCAdj) { + return new ARMConstantPoolConstant(C, ID, Kind, PCAdj, + ARMCP::no_modifier, false); +} + +ARMConstantPoolConstant * +ARMConstantPoolConstant::Create(const Constant *C, unsigned ID, + ARMCP::ARMCPKind Kind, unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress) { + return new ARMConstantPoolConstant(C, ID, Kind, PCAdj, Modifier, + AddCurrentAddress); +} + +const GlobalValue *ARMConstantPoolConstant::getGV() const { + return dyn_cast_or_null<GlobalValue>(CVal); +} + +const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const { + return dyn_cast_or_null<BlockAddress>(CVal); +} + +int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) { + return getExistingMachineCPValueImpl<ARMConstantPoolConstant>(CP, Alignment); +} + +bool ARMConstantPoolConstant::hasSameValue(ARMConstantPoolValue *ACPV) { + const ARMConstantPoolConstant *ACPC = dyn_cast<ARMConstantPoolConstant>(ACPV); + return ACPC && ACPC->CVal == CVal && ARMConstantPoolValue::hasSameValue(ACPV); +} + +void ARMConstantPoolConstant::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddPointer(CVal); + ARMConstantPoolValue::addSelectionDAGCSEId(ID); +} + +void ARMConstantPoolConstant::print(raw_ostream &O) const { + O << CVal->getName(); + ARMConstantPoolValue::print(O); +} + +//===----------------------------------------------------------------------===// +// ARMConstantPoolSymbol +//===----------------------------------------------------------------------===// + +ARMConstantPoolSymbol::ARMConstantPoolSymbol(LLVMContext &C, const char *s, + unsigned id, + unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress) + : ARMConstantPoolValue(C, id, ARMCP::CPExtSymbol, PCAdj, 
Modifier, + AddCurrentAddress), + S(s) {} + +ARMConstantPoolSymbol * +ARMConstantPoolSymbol::Create(LLVMContext &C, const char *s, + unsigned ID, unsigned char PCAdj) { + return new ARMConstantPoolSymbol(C, s, ID, PCAdj, ARMCP::no_modifier, false); +} + +int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) { + return getExistingMachineCPValueImpl<ARMConstantPoolSymbol>(CP, Alignment); +} + +bool ARMConstantPoolSymbol::hasSameValue(ARMConstantPoolValue *ACPV) { + const ARMConstantPoolSymbol *ACPS = dyn_cast<ARMConstantPoolSymbol>(ACPV); + return ACPS && ACPS->S == S && ARMConstantPoolValue::hasSameValue(ACPV); +} + +void ARMConstantPoolSymbol::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddString(S); + ARMConstantPoolValue::addSelectionDAGCSEId(ID); +} + +void ARMConstantPoolSymbol::print(raw_ostream &O) const { + O << S; + ARMConstantPoolValue::print(O); +} + +//===----------------------------------------------------------------------===// +// ARMConstantPoolMBB +//===----------------------------------------------------------------------===// + +ARMConstantPoolMBB::ARMConstantPoolMBB(LLVMContext &C, + const MachineBasicBlock *mbb, + unsigned id, unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress) + : ARMConstantPoolValue(C, id, ARMCP::CPMachineBasicBlock, PCAdj, + Modifier, AddCurrentAddress), + MBB(mbb) {} + +ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C, + const MachineBasicBlock *mbb, + unsigned ID, + unsigned char PCAdj) { + return new ARMConstantPoolMBB(C, mbb, ID, PCAdj, ARMCP::no_modifier, false); +} + +int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) { + return getExistingMachineCPValueImpl<ARMConstantPoolMBB>(CP, Alignment); +} + +bool ARMConstantPoolMBB::hasSameValue(ARMConstantPoolValue *ACPV) { + const ARMConstantPoolMBB *ACPMBB = dyn_cast<ARMConstantPoolMBB>(ACPV); + return ACPMBB && ACPMBB->MBB == MBB && + ARMConstantPoolValue::hasSameValue(ACPV); +} + +void ARMConstantPoolMBB::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddPointer(MBB); + ARMConstantPoolValue::addSelectionDAGCSEId(ID); +} + +void ARMConstantPoolMBB::print(raw_ostream &O) const { + O << "BB#" << MBB->getNumber(); + ARMConstantPoolValue::print(O); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h new file mode 100644 index 0000000..6b18a4e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h @@ -0,0 +1,256 @@ +//===-- ARMConstantPoolValue.h - ARM constantpool value ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARM specific constantpool value class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H +#define LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H + +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include <cstddef> + +namespace llvm { + +class BlockAddress; +class Constant; +class GlobalValue; +class LLVMContext; +class MachineBasicBlock; + +namespace ARMCP { + enum ARMCPKind { + CPValue, + CPExtSymbol, + CPBlockAddress, + CPLSDA, + CPMachineBasicBlock + }; + + enum ARMCPModifier { + no_modifier, + TLSGD, + GOT_PREL, + GOTTPOFF, + TPOFF + }; +} + +/// ARMConstantPoolValue - ARM specific constantpool value. This is used to +/// represent PC-relative displacement between the address of the load +/// instruction and the constant being loaded, i.e. (&GV-(LPIC+8)). +class ARMConstantPoolValue : public MachineConstantPoolValue { + unsigned LabelId; // Label id of the load. + ARMCP::ARMCPKind Kind; // Kind of constant. + unsigned char PCAdjust; // Extra adjustment if constantpool is pc-relative. + // 8 for ARM, 4 for Thumb. + ARMCP::ARMCPModifier Modifier; // GV modifier i.e. (&GV(modifier)-(LPIC+8)) + bool AddCurrentAddress; + +protected: + ARMConstantPoolValue(Type *Ty, unsigned id, ARMCP::ARMCPKind Kind, + unsigned char PCAdj, ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + + ARMConstantPoolValue(LLVMContext &C, unsigned id, ARMCP::ARMCPKind Kind, + unsigned char PCAdj, ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + + template <typename Derived> + int getExistingMachineCPValueImpl(MachineConstantPool *CP, + unsigned Alignment) { + unsigned AlignMask = Alignment - 1; + const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants(); + for (unsigned i = 0, e = Constants.size(); i != e; ++i) { + if (Constants[i].isMachineConstantPoolEntry() && + (Constants[i].getAlignment() & AlignMask) == 0) { + ARMConstantPoolValue *CPV = + (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal; + if (Derived *APC = dyn_cast<Derived>(CPV)) + if (cast<Derived>(this)->equals(APC)) + return i; + } + } + + return -1; + } + +public: + ~ARMConstantPoolValue() override; + + ARMCP::ARMCPModifier getModifier() const { return Modifier; } + const char *getModifierText() const; + bool hasModifier() const { return Modifier != ARMCP::no_modifier; } + + bool mustAddCurrentAddress() const { return AddCurrentAddress; } + + unsigned getLabelId() const { return LabelId; } + unsigned char getPCAdjustment() const { return PCAdjust; } + + bool isGlobalValue() const { return Kind == ARMCP::CPValue; } + bool isExtSymbol() const { return Kind == ARMCP::CPExtSymbol; } + bool isBlockAddress() const { return Kind == ARMCP::CPBlockAddress; } + bool isLSDA() const { return Kind == ARMCP::CPLSDA; } + bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; } + + int getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) override; + + void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; + + /// hasSameValue - Return true if this ARM constpool value can share the same + /// constantpool entry as another ARM constpool value. 
+ virtual bool hasSameValue(ARMConstantPoolValue *ACPV); + + bool equals(const ARMConstantPoolValue *A) const { + return this->LabelId == A->LabelId && + this->PCAdjust == A->PCAdjust && + this->Modifier == A->Modifier; + } + + void print(raw_ostream &O) const override; + void print(raw_ostream *O) const { if (O) print(*O); } + void dump() const; +}; + +inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) { + V.print(O); + return O; +} + +/// ARMConstantPoolConstant - ARM-specific constant pool values for Constants, +/// Functions, and BlockAddresses. +class ARMConstantPoolConstant : public ARMConstantPoolValue { + const Constant *CVal; // Constant being loaded. + + ARMConstantPoolConstant(const Constant *C, + unsigned ID, + ARMCP::ARMCPKind Kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + ARMConstantPoolConstant(Type *Ty, const Constant *C, + unsigned ID, + ARMCP::ARMCPKind Kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + +public: + static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID); + static ARMConstantPoolConstant *Create(const GlobalValue *GV, + ARMCP::ARMCPModifier Modifier); + static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID, + ARMCP::ARMCPKind Kind, + unsigned char PCAdj); + static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID, + ARMCP::ARMCPKind Kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + + const GlobalValue *getGV() const; + const BlockAddress *getBlockAddress() const; + + int getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) override; + + /// hasSameValue - Return true if this ARM constpool value can share the same + /// constantpool entry as another ARM constpool value. + bool hasSameValue(ARMConstantPoolValue *ACPV) override; + + void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; + + void print(raw_ostream &O) const override; + static bool classof(const ARMConstantPoolValue *APV) { + return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA(); + } + + bool equals(const ARMConstantPoolConstant *A) const { + return CVal == A->CVal && ARMConstantPoolValue::equals(A); + } +}; + +/// ARMConstantPoolSymbol - ARM-specific constantpool values for external +/// symbols. +class ARMConstantPoolSymbol : public ARMConstantPoolValue { + const std::string S; // ExtSymbol being loaded. + + ARMConstantPoolSymbol(LLVMContext &C, const char *s, unsigned id, + unsigned char PCAdj, ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + +public: + static ARMConstantPoolSymbol *Create(LLVMContext &C, const char *s, + unsigned ID, unsigned char PCAdj); + + const char *getSymbol() const { return S.c_str(); } + + int getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) override; + + void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; + + /// hasSameValue - Return true if this ARM constpool value can share the same + /// constantpool entry as another ARM constpool value. + bool hasSameValue(ARMConstantPoolValue *ACPV) override; + + void print(raw_ostream &O) const override; + + static bool classof(const ARMConstantPoolValue *ACPV) { + return ACPV->isExtSymbol(); + } + + bool equals(const ARMConstantPoolSymbol *A) const { + return S == A->S && ARMConstantPoolValue::equals(A); + } +}; + +/// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic +/// block. 
+class ARMConstantPoolMBB : public ARMConstantPoolValue { + const MachineBasicBlock *MBB; // Machine basic block. + + ARMConstantPoolMBB(LLVMContext &C, const MachineBasicBlock *mbb, unsigned id, + unsigned char PCAdj, ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + +public: + static ARMConstantPoolMBB *Create(LLVMContext &C, + const MachineBasicBlock *mbb, + unsigned ID, unsigned char PCAdj); + + const MachineBasicBlock *getMBB() const { return MBB; } + + int getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) override; + + void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; + + /// hasSameValue - Return true if this ARM constpool value can share the same + /// constantpool entry as another ARM constpool value. + bool hasSameValue(ARMConstantPoolValue *ACPV) override; + + void print(raw_ostream &O) const override; + + static bool classof(const ARMConstantPoolValue *ACPV) { + return ACPV->isMachineBasicBlock(); + } + + bool equals(const ARMConstantPoolMBB *A) const { + return MBB == A->MBB && ARMConstantPoolValue::equals(A); + } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp new file mode 100644 index 0000000..56f3498 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -0,0 +1,1413 @@ +//===-- ARMExpandPseudoInsts.cpp - Expand pseudo instructions -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands pseudo instructions into target +// instructions to allow proper scheduling, if-conversion, and other late +// optimizations. This pass should be run after register allocation but before +// the post-regalloc scheduling pass. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMConstantPoolValue.h" +#include "ARMMachineFunctionInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove! 
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-pseudo"
+
+static cl::opt<bool>
+VerifyARMPseudo("verify-arm-pseudo-expand", cl::Hidden,
+                cl::desc("Verify machine code after expanding ARM pseudos"));
+
+namespace {
+  class ARMExpandPseudo : public MachineFunctionPass {
+  public:
+    static char ID;
+    ARMExpandPseudo() : MachineFunctionPass(ID) {}
+
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    ARMFunctionInfo *AFI;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return "ARM pseudo instruction expansion pass";
+    }
+
+  private:
+    void TransferImpOps(MachineInstr &OldMI,
+                        MachineInstrBuilder &UseMI,
+                        MachineInstrBuilder &DefMI);
+    bool ExpandMI(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MBBI);
+    bool ExpandMBB(MachineBasicBlock &MBB);
+    void ExpandVLD(MachineBasicBlock::iterator &MBBI);
+    void ExpandVST(MachineBasicBlock::iterator &MBBI);
+    void ExpandLaneOp(MachineBasicBlock::iterator &MBBI);
+    void ExpandVTBL(MachineBasicBlock::iterator &MBBI,
+                    unsigned Opc, bool IsExt);
+    void ExpandMOV32BitImm(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator &MBBI);
+  };
+  char ARMExpandPseudo::ID = 0;
+}
+
+/// TransferImpOps - Transfer implicit operands on the pseudo instruction to
+/// the instructions created from the expansion.
+void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI,
+                                     MachineInstrBuilder &UseMI,
+                                     MachineInstrBuilder &DefMI) {
+  const MCInstrDesc &Desc = OldMI.getDesc();
+  for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands();
+       i != e; ++i) {
+    const MachineOperand &MO = OldMI.getOperand(i);
+    assert(MO.isReg() && MO.getReg());
+    if (MO.isUse())
+      UseMI.addOperand(MO);
+    else
+      DefMI.addOperand(MO);
+  }
+}
+
+namespace {
+  // Constants for register spacing in NEON load/store instructions.
+  // For quad-register load-lane and store-lane pseudo instructions, the
+  // spacing is initially assumed to be EvenDblSpc, and that is changed to
+  // OddDblSpc depending on the lane number operand.
+  enum NEONRegSpacing {
+    SingleSpc,
+    EvenDblSpc,
+    OddDblSpc
+  };
+
+  // Entries for NEON load/store information table. The table is sorted by
+  // PseudoOpc for fast binary-search lookups.
+  struct NEONLdStTableEntry {
+    uint16_t PseudoOpc;
+    uint16_t RealOpc;
+    bool IsLoad;
+    bool isUpdating;
+    bool hasWritebackOperand;
+    uint8_t RegSpacing; // One of type NEONRegSpacing
+    uint8_t NumRegs; // D registers loaded or stored
+    uint8_t RegElts; // elements per D register; used for lane ops
+    // FIXME: Temporary flag to denote whether the real instruction takes
+    // a single register (like the encoding) or all of the registers in
+    // the list (like the asm syntax and the isel DAG). When all definitions
+    // are converted to take only the single encoded register, this will
+    // go away.
+    bool copyAllListRegs;
+
+    // Comparison methods for binary search of the table.
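+    // (The mixed entry/opcode overloads below let std::lower_bound in
+    // LookupNEONLdSt compare table entries directly against a raw opcode.)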
+ bool operator<(const NEONLdStTableEntry &TE) const { + return PseudoOpc < TE.PseudoOpc; + } + friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) { + return TE.PseudoOpc < PseudoOpc; + } + friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc, + const NEONLdStTableEntry &TE) { + return PseudoOpc < TE.PseudoOpc; + } + }; +} + +static const NEONLdStTableEntry NEONLdStTable[] = { +{ ARM::VLD1LNq16Pseudo, ARM::VLD1LNd16, true, false, false, EvenDblSpc, 1, 4 ,true}, +{ ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true, true, EvenDblSpc, 1, 4 ,true}, +{ ARM::VLD1LNq32Pseudo, ARM::VLD1LNd32, true, false, false, EvenDblSpc, 1, 2 ,true}, +{ ARM::VLD1LNq32Pseudo_UPD, ARM::VLD1LNd32_UPD, true, true, true, EvenDblSpc, 1, 2 ,true}, +{ ARM::VLD1LNq8Pseudo, ARM::VLD1LNd8, true, false, false, EvenDblSpc, 1, 8 ,true}, +{ ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, true, true, EvenDblSpc, 1, 8 ,true}, + +{ ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, false, SingleSpc, 4, 1 ,false}, +{ ARM::VLD1d64QPseudoWB_fixed, ARM::VLD1d64Qwb_fixed, true, true, false, SingleSpc, 4, 1 ,false}, +{ ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, false, SingleSpc, 3, 1 ,false}, +{ ARM::VLD1d64TPseudoWB_fixed, ARM::VLD1d64Twb_fixed, true, true, false, SingleSpc, 3, 1 ,false}, + +{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true}, +{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true}, +{ ARM::VLD2LNd32Pseudo, ARM::VLD2LNd32, true, false, false, SingleSpc, 2, 2 ,true}, +{ ARM::VLD2LNd32Pseudo_UPD, ARM::VLD2LNd32_UPD, true, true, true, SingleSpc, 2, 2 ,true}, +{ ARM::VLD2LNd8Pseudo, ARM::VLD2LNd8, true, false, false, SingleSpc, 2, 8 ,true}, +{ ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd8_UPD, true, true, true, SingleSpc, 2, 8 ,true}, +{ ARM::VLD2LNq16Pseudo, ARM::VLD2LNq16, true, false, false, EvenDblSpc, 2, 4 ,true}, +{ ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq16_UPD, true, true, true, EvenDblSpc, 2, 4 ,true}, +{ ARM::VLD2LNq32Pseudo, ARM::VLD2LNq32, true, false, false, EvenDblSpc, 2, 2 ,true}, +{ ARM::VLD2LNq32Pseudo_UPD, ARM::VLD2LNq32_UPD, true, true, true, EvenDblSpc, 2, 2 ,true}, + +{ ARM::VLD2q16Pseudo, ARM::VLD2q16, true, false, false, SingleSpc, 4, 4 ,false}, +{ ARM::VLD2q16PseudoWB_fixed, ARM::VLD2q16wb_fixed, true, true, false, SingleSpc, 4, 4 ,false}, +{ ARM::VLD2q16PseudoWB_register, ARM::VLD2q16wb_register, true, true, true, SingleSpc, 4, 4 ,false}, +{ ARM::VLD2q32Pseudo, ARM::VLD2q32, true, false, false, SingleSpc, 4, 2 ,false}, +{ ARM::VLD2q32PseudoWB_fixed, ARM::VLD2q32wb_fixed, true, true, false, SingleSpc, 4, 2 ,false}, +{ ARM::VLD2q32PseudoWB_register, ARM::VLD2q32wb_register, true, true, true, SingleSpc, 4, 2 ,false}, +{ ARM::VLD2q8Pseudo, ARM::VLD2q8, true, false, false, SingleSpc, 4, 8 ,false}, +{ ARM::VLD2q8PseudoWB_fixed, ARM::VLD2q8wb_fixed, true, true, false, SingleSpc, 4, 8 ,false}, +{ ARM::VLD2q8PseudoWB_register, ARM::VLD2q8wb_register, true, true, true, SingleSpc, 4, 8 ,false}, + +{ ARM::VLD3DUPd16Pseudo, ARM::VLD3DUPd16, true, false, false, SingleSpc, 3, 4,true}, +{ ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true, true, SingleSpc, 3, 4,true}, +{ ARM::VLD3DUPd32Pseudo, ARM::VLD3DUPd32, true, false, false, SingleSpc, 3, 2,true}, +{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, true, SingleSpc, 3, 2,true}, +{ ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd8, true, false, false, SingleSpc, 3, 8,true}, +{ ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, true, SingleSpc, 3, 
8,true}, + +{ ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, false, SingleSpc, 3, 4 ,true}, +{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true, SingleSpc, 3, 4 ,true}, +{ ARM::VLD3LNd32Pseudo, ARM::VLD3LNd32, true, false, false, SingleSpc, 3, 2 ,true}, +{ ARM::VLD3LNd32Pseudo_UPD, ARM::VLD3LNd32_UPD, true, true, true, SingleSpc, 3, 2 ,true}, +{ ARM::VLD3LNd8Pseudo, ARM::VLD3LNd8, true, false, false, SingleSpc, 3, 8 ,true}, +{ ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd8_UPD, true, true, true, SingleSpc, 3, 8 ,true}, +{ ARM::VLD3LNq16Pseudo, ARM::VLD3LNq16, true, false, false, EvenDblSpc, 3, 4 ,true}, +{ ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq16_UPD, true, true, true, EvenDblSpc, 3, 4 ,true}, +{ ARM::VLD3LNq32Pseudo, ARM::VLD3LNq32, true, false, false, EvenDblSpc, 3, 2 ,true}, +{ ARM::VLD3LNq32Pseudo_UPD, ARM::VLD3LNq32_UPD, true, true, true, EvenDblSpc, 3, 2 ,true}, + +{ ARM::VLD3d16Pseudo, ARM::VLD3d16, true, false, false, SingleSpc, 3, 4 ,true}, +{ ARM::VLD3d16Pseudo_UPD, ARM::VLD3d16_UPD, true, true, true, SingleSpc, 3, 4 ,true}, +{ ARM::VLD3d32Pseudo, ARM::VLD3d32, true, false, false, SingleSpc, 3, 2 ,true}, +{ ARM::VLD3d32Pseudo_UPD, ARM::VLD3d32_UPD, true, true, true, SingleSpc, 3, 2 ,true}, +{ ARM::VLD3d8Pseudo, ARM::VLD3d8, true, false, false, SingleSpc, 3, 8 ,true}, +{ ARM::VLD3d8Pseudo_UPD, ARM::VLD3d8_UPD, true, true, true, SingleSpc, 3, 8 ,true}, + +{ ARM::VLD3q16Pseudo_UPD, ARM::VLD3q16_UPD, true, true, true, EvenDblSpc, 3, 4 ,true}, +{ ARM::VLD3q16oddPseudo, ARM::VLD3q16, true, false, false, OddDblSpc, 3, 4 ,true}, +{ ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q16_UPD, true, true, true, OddDblSpc, 3, 4 ,true}, +{ ARM::VLD3q32Pseudo_UPD, ARM::VLD3q32_UPD, true, true, true, EvenDblSpc, 3, 2 ,true}, +{ ARM::VLD3q32oddPseudo, ARM::VLD3q32, true, false, false, OddDblSpc, 3, 2 ,true}, +{ ARM::VLD3q32oddPseudo_UPD, ARM::VLD3q32_UPD, true, true, true, OddDblSpc, 3, 2 ,true}, +{ ARM::VLD3q8Pseudo_UPD, ARM::VLD3q8_UPD, true, true, true, EvenDblSpc, 3, 8 ,true}, +{ ARM::VLD3q8oddPseudo, ARM::VLD3q8, true, false, false, OddDblSpc, 3, 8 ,true}, +{ ARM::VLD3q8oddPseudo_UPD, ARM::VLD3q8_UPD, true, true, true, OddDblSpc, 3, 8 ,true}, + +{ ARM::VLD4DUPd16Pseudo, ARM::VLD4DUPd16, true, false, false, SingleSpc, 4, 4,true}, +{ ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd16_UPD, true, true, true, SingleSpc, 4, 4,true}, +{ ARM::VLD4DUPd32Pseudo, ARM::VLD4DUPd32, true, false, false, SingleSpc, 4, 2,true}, +{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, true, SingleSpc, 4, 2,true}, +{ ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd8, true, false, false, SingleSpc, 4, 8,true}, +{ ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, true, SingleSpc, 4, 8,true}, + +{ ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, false, SingleSpc, 4, 4 ,true}, +{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true, SingleSpc, 4, 4 ,true}, +{ ARM::VLD4LNd32Pseudo, ARM::VLD4LNd32, true, false, false, SingleSpc, 4, 2 ,true}, +{ ARM::VLD4LNd32Pseudo_UPD, ARM::VLD4LNd32_UPD, true, true, true, SingleSpc, 4, 2 ,true}, +{ ARM::VLD4LNd8Pseudo, ARM::VLD4LNd8, true, false, false, SingleSpc, 4, 8 ,true}, +{ ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd8_UPD, true, true, true, SingleSpc, 4, 8 ,true}, +{ ARM::VLD4LNq16Pseudo, ARM::VLD4LNq16, true, false, false, EvenDblSpc, 4, 4 ,true}, +{ ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq16_UPD, true, true, true, EvenDblSpc, 4, 4 ,true}, +{ ARM::VLD4LNq32Pseudo, ARM::VLD4LNq32, true, false, false, EvenDblSpc, 4, 2 ,true}, +{ ARM::VLD4LNq32Pseudo_UPD, ARM::VLD4LNq32_UPD, true, 
true, true, EvenDblSpc, 4, 2 ,true}, + +{ ARM::VLD4d16Pseudo, ARM::VLD4d16, true, false, false, SingleSpc, 4, 4 ,true}, +{ ARM::VLD4d16Pseudo_UPD, ARM::VLD4d16_UPD, true, true, true, SingleSpc, 4, 4 ,true}, +{ ARM::VLD4d32Pseudo, ARM::VLD4d32, true, false, false, SingleSpc, 4, 2 ,true}, +{ ARM::VLD4d32Pseudo_UPD, ARM::VLD4d32_UPD, true, true, true, SingleSpc, 4, 2 ,true}, +{ ARM::VLD4d8Pseudo, ARM::VLD4d8, true, false, false, SingleSpc, 4, 8 ,true}, +{ ARM::VLD4d8Pseudo_UPD, ARM::VLD4d8_UPD, true, true, true, SingleSpc, 4, 8 ,true}, + +{ ARM::VLD4q16Pseudo_UPD, ARM::VLD4q16_UPD, true, true, true, EvenDblSpc, 4, 4 ,true}, +{ ARM::VLD4q16oddPseudo, ARM::VLD4q16, true, false, false, OddDblSpc, 4, 4 ,true}, +{ ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q16_UPD, true, true, true, OddDblSpc, 4, 4 ,true}, +{ ARM::VLD4q32Pseudo_UPD, ARM::VLD4q32_UPD, true, true, true, EvenDblSpc, 4, 2 ,true}, +{ ARM::VLD4q32oddPseudo, ARM::VLD4q32, true, false, false, OddDblSpc, 4, 2 ,true}, +{ ARM::VLD4q32oddPseudo_UPD, ARM::VLD4q32_UPD, true, true, true, OddDblSpc, 4, 2 ,true}, +{ ARM::VLD4q8Pseudo_UPD, ARM::VLD4q8_UPD, true, true, true, EvenDblSpc, 4, 8 ,true}, +{ ARM::VLD4q8oddPseudo, ARM::VLD4q8, true, false, false, OddDblSpc, 4, 8 ,true}, +{ ARM::VLD4q8oddPseudo_UPD, ARM::VLD4q8_UPD, true, true, true, OddDblSpc, 4, 8 ,true}, + +{ ARM::VST1LNq16Pseudo, ARM::VST1LNd16, false, false, false, EvenDblSpc, 1, 4 ,true}, +{ ARM::VST1LNq16Pseudo_UPD, ARM::VST1LNd16_UPD, false, true, true, EvenDblSpc, 1, 4 ,true}, +{ ARM::VST1LNq32Pseudo, ARM::VST1LNd32, false, false, false, EvenDblSpc, 1, 2 ,true}, +{ ARM::VST1LNq32Pseudo_UPD, ARM::VST1LNd32_UPD, false, true, true, EvenDblSpc, 1, 2 ,true}, +{ ARM::VST1LNq8Pseudo, ARM::VST1LNd8, false, false, false, EvenDblSpc, 1, 8 ,true}, +{ ARM::VST1LNq8Pseudo_UPD, ARM::VST1LNd8_UPD, false, true, true, EvenDblSpc, 1, 8 ,true}, + +{ ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, false, SingleSpc, 4, 1 ,false}, +{ ARM::VST1d64QPseudoWB_fixed, ARM::VST1d64Qwb_fixed, false, true, false, SingleSpc, 4, 1 ,false}, +{ ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true, SingleSpc, 4, 1 ,false}, +{ ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, false, SingleSpc, 3, 1 ,false}, +{ ARM::VST1d64TPseudoWB_fixed, ARM::VST1d64Twb_fixed, false, true, false, SingleSpc, 3, 1 ,false}, +{ ARM::VST1d64TPseudoWB_register, ARM::VST1d64Twb_register, false, true, true, SingleSpc, 3, 1 ,false}, + +{ ARM::VST2LNd16Pseudo, ARM::VST2LNd16, false, false, false, SingleSpc, 2, 4 ,true}, +{ ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, true, SingleSpc, 2, 4 ,true}, +{ ARM::VST2LNd32Pseudo, ARM::VST2LNd32, false, false, false, SingleSpc, 2, 2 ,true}, +{ ARM::VST2LNd32Pseudo_UPD, ARM::VST2LNd32_UPD, false, true, true, SingleSpc, 2, 2 ,true}, +{ ARM::VST2LNd8Pseudo, ARM::VST2LNd8, false, false, false, SingleSpc, 2, 8 ,true}, +{ ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd8_UPD, false, true, true, SingleSpc, 2, 8 ,true}, +{ ARM::VST2LNq16Pseudo, ARM::VST2LNq16, false, false, false, EvenDblSpc, 2, 4,true}, +{ ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq16_UPD, false, true, true, EvenDblSpc, 2, 4,true}, +{ ARM::VST2LNq32Pseudo, ARM::VST2LNq32, false, false, false, EvenDblSpc, 2, 2,true}, +{ ARM::VST2LNq32Pseudo_UPD, ARM::VST2LNq32_UPD, false, true, true, EvenDblSpc, 2, 2,true}, + +{ ARM::VST2q16Pseudo, ARM::VST2q16, false, false, false, SingleSpc, 4, 4 ,false}, +{ ARM::VST2q16PseudoWB_fixed, ARM::VST2q16wb_fixed, false, true, false, SingleSpc, 4, 4 ,false}, +{ ARM::VST2q16PseudoWB_register, 
ARM::VST2q16wb_register, false, true, true, SingleSpc, 4, 4 ,false}, +{ ARM::VST2q32Pseudo, ARM::VST2q32, false, false, false, SingleSpc, 4, 2 ,false}, +{ ARM::VST2q32PseudoWB_fixed, ARM::VST2q32wb_fixed, false, true, false, SingleSpc, 4, 2 ,false}, +{ ARM::VST2q32PseudoWB_register, ARM::VST2q32wb_register, false, true, true, SingleSpc, 4, 2 ,false}, +{ ARM::VST2q8Pseudo, ARM::VST2q8, false, false, false, SingleSpc, 4, 8 ,false}, +{ ARM::VST2q8PseudoWB_fixed, ARM::VST2q8wb_fixed, false, true, false, SingleSpc, 4, 8 ,false}, +{ ARM::VST2q8PseudoWB_register, ARM::VST2q8wb_register, false, true, true, SingleSpc, 4, 8 ,false}, + +{ ARM::VST3LNd16Pseudo, ARM::VST3LNd16, false, false, false, SingleSpc, 3, 4 ,true}, +{ ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd16_UPD, false, true, true, SingleSpc, 3, 4 ,true}, +{ ARM::VST3LNd32Pseudo, ARM::VST3LNd32, false, false, false, SingleSpc, 3, 2 ,true}, +{ ARM::VST3LNd32Pseudo_UPD, ARM::VST3LNd32_UPD, false, true, true, SingleSpc, 3, 2 ,true}, +{ ARM::VST3LNd8Pseudo, ARM::VST3LNd8, false, false, false, SingleSpc, 3, 8 ,true}, +{ ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd8_UPD, false, true, true, SingleSpc, 3, 8 ,true}, +{ ARM::VST3LNq16Pseudo, ARM::VST3LNq16, false, false, false, EvenDblSpc, 3, 4,true}, +{ ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq16_UPD, false, true, true, EvenDblSpc, 3, 4,true}, +{ ARM::VST3LNq32Pseudo, ARM::VST3LNq32, false, false, false, EvenDblSpc, 3, 2,true}, +{ ARM::VST3LNq32Pseudo_UPD, ARM::VST3LNq32_UPD, false, true, true, EvenDblSpc, 3, 2,true}, + +{ ARM::VST3d16Pseudo, ARM::VST3d16, false, false, false, SingleSpc, 3, 4 ,true}, +{ ARM::VST3d16Pseudo_UPD, ARM::VST3d16_UPD, false, true, true, SingleSpc, 3, 4 ,true}, +{ ARM::VST3d32Pseudo, ARM::VST3d32, false, false, false, SingleSpc, 3, 2 ,true}, +{ ARM::VST3d32Pseudo_UPD, ARM::VST3d32_UPD, false, true, true, SingleSpc, 3, 2 ,true}, +{ ARM::VST3d8Pseudo, ARM::VST3d8, false, false, false, SingleSpc, 3, 8 ,true}, +{ ARM::VST3d8Pseudo_UPD, ARM::VST3d8_UPD, false, true, true, SingleSpc, 3, 8 ,true}, + +{ ARM::VST3q16Pseudo_UPD, ARM::VST3q16_UPD, false, true, true, EvenDblSpc, 3, 4 ,true}, +{ ARM::VST3q16oddPseudo, ARM::VST3q16, false, false, false, OddDblSpc, 3, 4 ,true}, +{ ARM::VST3q16oddPseudo_UPD, ARM::VST3q16_UPD, false, true, true, OddDblSpc, 3, 4 ,true}, +{ ARM::VST3q32Pseudo_UPD, ARM::VST3q32_UPD, false, true, true, EvenDblSpc, 3, 2 ,true}, +{ ARM::VST3q32oddPseudo, ARM::VST3q32, false, false, false, OddDblSpc, 3, 2 ,true}, +{ ARM::VST3q32oddPseudo_UPD, ARM::VST3q32_UPD, false, true, true, OddDblSpc, 3, 2 ,true}, +{ ARM::VST3q8Pseudo_UPD, ARM::VST3q8_UPD, false, true, true, EvenDblSpc, 3, 8 ,true}, +{ ARM::VST3q8oddPseudo, ARM::VST3q8, false, false, false, OddDblSpc, 3, 8 ,true}, +{ ARM::VST3q8oddPseudo_UPD, ARM::VST3q8_UPD, false, true, true, OddDblSpc, 3, 8 ,true}, + +{ ARM::VST4LNd16Pseudo, ARM::VST4LNd16, false, false, false, SingleSpc, 4, 4 ,true}, +{ ARM::VST4LNd16Pseudo_UPD, ARM::VST4LNd16_UPD, false, true, true, SingleSpc, 4, 4 ,true}, +{ ARM::VST4LNd32Pseudo, ARM::VST4LNd32, false, false, false, SingleSpc, 4, 2 ,true}, +{ ARM::VST4LNd32Pseudo_UPD, ARM::VST4LNd32_UPD, false, true, true, SingleSpc, 4, 2 ,true}, +{ ARM::VST4LNd8Pseudo, ARM::VST4LNd8, false, false, false, SingleSpc, 4, 8 ,true}, +{ ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd8_UPD, false, true, true, SingleSpc, 4, 8 ,true}, +{ ARM::VST4LNq16Pseudo, ARM::VST4LNq16, false, false, false, EvenDblSpc, 4, 4,true}, +{ ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq16_UPD, false, true, true, EvenDblSpc, 4, 4,true}, +{ ARM::VST4LNq32Pseudo, 
ARM::VST4LNq32, false, false, false, EvenDblSpc, 4, 2,true}, +{ ARM::VST4LNq32Pseudo_UPD, ARM::VST4LNq32_UPD, false, true, true, EvenDblSpc, 4, 2,true}, + +{ ARM::VST4d16Pseudo, ARM::VST4d16, false, false, false, SingleSpc, 4, 4 ,true}, +{ ARM::VST4d16Pseudo_UPD, ARM::VST4d16_UPD, false, true, true, SingleSpc, 4, 4 ,true}, +{ ARM::VST4d32Pseudo, ARM::VST4d32, false, false, false, SingleSpc, 4, 2 ,true}, +{ ARM::VST4d32Pseudo_UPD, ARM::VST4d32_UPD, false, true, true, SingleSpc, 4, 2 ,true}, +{ ARM::VST4d8Pseudo, ARM::VST4d8, false, false, false, SingleSpc, 4, 8 ,true}, +{ ARM::VST4d8Pseudo_UPD, ARM::VST4d8_UPD, false, true, true, SingleSpc, 4, 8 ,true}, + +{ ARM::VST4q16Pseudo_UPD, ARM::VST4q16_UPD, false, true, true, EvenDblSpc, 4, 4 ,true}, +{ ARM::VST4q16oddPseudo, ARM::VST4q16, false, false, false, OddDblSpc, 4, 4 ,true}, +{ ARM::VST4q16oddPseudo_UPD, ARM::VST4q16_UPD, false, true, true, OddDblSpc, 4, 4 ,true}, +{ ARM::VST4q32Pseudo_UPD, ARM::VST4q32_UPD, false, true, true, EvenDblSpc, 4, 2 ,true}, +{ ARM::VST4q32oddPseudo, ARM::VST4q32, false, false, false, OddDblSpc, 4, 2 ,true}, +{ ARM::VST4q32oddPseudo_UPD, ARM::VST4q32_UPD, false, true, true, OddDblSpc, 4, 2 ,true}, +{ ARM::VST4q8Pseudo_UPD, ARM::VST4q8_UPD, false, true, true, EvenDblSpc, 4, 8 ,true}, +{ ARM::VST4q8oddPseudo, ARM::VST4q8, false, false, false, OddDblSpc, 4, 8 ,true}, +{ ARM::VST4q8oddPseudo_UPD, ARM::VST4q8_UPD, false, true, true, OddDblSpc, 4, 8 ,true} +}; + +/// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON +/// load or store pseudo instruction. +static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { +#ifndef NDEBUG + // Make sure the table is sorted. + static bool TableChecked = false; + if (!TableChecked) { + assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) && + "NEONLdStTable is not sorted!"); + TableChecked = true; + } +#endif + + auto I = std::lower_bound(std::begin(NEONLdStTable), + std::end(NEONLdStTable), Opcode); + if (I != std::end(NEONLdStTable) && I->PseudoOpc == Opcode) + return I; + return nullptr; +} + +/// GetDSubRegs - Get 4 D subregisters of a Q, QQ, or QQQQ register, +/// corresponding to the specified register spacing. Not all of the results +/// are necessarily valid, e.g., a Q register only has 2 D subregisters. +static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc, + const TargetRegisterInfo *TRI, unsigned &D0, + unsigned &D1, unsigned &D2, unsigned &D3) { + if (RegSpc == SingleSpc) { + D0 = TRI->getSubReg(Reg, ARM::dsub_0); + D1 = TRI->getSubReg(Reg, ARM::dsub_1); + D2 = TRI->getSubReg(Reg, ARM::dsub_2); + D3 = TRI->getSubReg(Reg, ARM::dsub_3); + } else if (RegSpc == EvenDblSpc) { + D0 = TRI->getSubReg(Reg, ARM::dsub_0); + D1 = TRI->getSubReg(Reg, ARM::dsub_2); + D2 = TRI->getSubReg(Reg, ARM::dsub_4); + D3 = TRI->getSubReg(Reg, ARM::dsub_6); + } else { + assert(RegSpc == OddDblSpc && "unknown register spacing"); + D0 = TRI->getSubReg(Reg, ARM::dsub_1); + D1 = TRI->getSubReg(Reg, ARM::dsub_3); + D2 = TRI->getSubReg(Reg, ARM::dsub_5); + D3 = TRI->getSubReg(Reg, ARM::dsub_7); + } +} + +/// ExpandVLD - Translate VLD pseudo instructions with Q, QQ or QQQQ register +/// operands to real VLD instructions with D register operands. 
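+/// (The Q/QQ/QQQQ super-register is also re-added as an implicit operand so
+/// that liveness of the full register tuple survives the expansion.)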
+void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); + assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed"); + NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing; + unsigned NumRegs = TableEntry->NumRegs; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(TableEntry->RealOpc)); + unsigned OpIdx = 0; + + bool DstIsDead = MI.getOperand(OpIdx).isDead(); + unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + unsigned D0, D1, D2, D3; + GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); + MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 1 && TableEntry->copyAllListRegs) + MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 2 && TableEntry->copyAllListRegs) + MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 3 && TableEntry->copyAllListRegs) + MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); + + if (TableEntry->isUpdating) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the addrmode6 operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + // Copy the am6offset operand. + if (TableEntry->hasWritebackOperand) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // For an instruction writing double-spaced subregs, the pseudo instruction + // has an extra operand that is a use of the super-register. Record the + // operand index and skip over it. + unsigned SrcOpIdx = 0; + if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc) + SrcOpIdx = OpIdx++; + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the super-register source operand used for double-spaced subregs over + // to the new instruction as an implicit operand. + if (SrcOpIdx != 0) { + MachineOperand MO = MI.getOperand(SrcOpIdx); + MO.setImplicit(true); + MIB.addOperand(MO); + } + // Add an implicit def for the super-register. + MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); + TransferImpOps(MI, MIB, MIB); + + // Transfer memoperands. + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + + MI.eraseFromParent(); +} + +/// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register +/// operands to real VST instructions with D register operands. +void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); + assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed"); + NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing; + unsigned NumRegs = TableEntry->NumRegs; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(TableEntry->RealOpc)); + unsigned OpIdx = 0; + if (TableEntry->isUpdating) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the addrmode6 operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + // Copy the am6offset operand. 
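+  // (Present only when the real instruction encodes an offset operand; the
+  // wb_fixed forms advance the address by the access size implicitly.)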
+ if (TableEntry->hasWritebackOperand) + MIB.addOperand(MI.getOperand(OpIdx++)); + + bool SrcIsKill = MI.getOperand(OpIdx).isKill(); + bool SrcIsUndef = MI.getOperand(OpIdx).isUndef(); + unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + unsigned D0, D1, D2, D3; + GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3); + MIB.addReg(D0, getUndefRegState(SrcIsUndef)); + if (NumRegs > 1 && TableEntry->copyAllListRegs) + MIB.addReg(D1, getUndefRegState(SrcIsUndef)); + if (NumRegs > 2 && TableEntry->copyAllListRegs) + MIB.addReg(D2, getUndefRegState(SrcIsUndef)); + if (NumRegs > 3 && TableEntry->copyAllListRegs) + MIB.addReg(D3, getUndefRegState(SrcIsUndef)); + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg. + MIB->addRegisterKilled(SrcReg, TRI, true); + else if (!SrcIsUndef) + MIB.addReg(SrcReg, RegState::Implicit); // Add implicit uses for src reg. + TransferImpOps(MI, MIB, MIB); + + // Transfer memoperands. + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + + MI.eraseFromParent(); +} + +/// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ +/// register operands to real instructions with D register operands. +void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); + assert(TableEntry && "NEONLdStTable lookup failed"); + NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing; + unsigned NumRegs = TableEntry->NumRegs; + unsigned RegElts = TableEntry->RegElts; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(TableEntry->RealOpc)); + unsigned OpIdx = 0; + // The lane operand is always the 3rd from last operand, before the 2 + // predicate operands. + unsigned Lane = MI.getOperand(MI.getDesc().getNumOperands() - 3).getImm(); + + // Adjust the lane and spacing as needed for Q registers. + assert(RegSpc != OddDblSpc && "unexpected register spacing for VLD/VST-lane"); + if (RegSpc == EvenDblSpc && Lane >= RegElts) { + RegSpc = OddDblSpc; + Lane -= RegElts; + } + assert(Lane < RegElts && "out of range lane for VLD/VST-lane"); + + unsigned D0 = 0, D1 = 0, D2 = 0, D3 = 0; + unsigned DstReg = 0; + bool DstIsDead = false; + if (TableEntry->IsLoad) { + DstIsDead = MI.getOperand(OpIdx).isDead(); + DstReg = MI.getOperand(OpIdx++).getReg(); + GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); + MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 1) + MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 2) + MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 3) + MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); + } + + if (TableEntry->isUpdating) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the addrmode6 operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + // Copy the am6offset operand. + if (TableEntry->hasWritebackOperand) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Grab the super-register source. + MachineOperand MO = MI.getOperand(OpIdx++); + if (!TableEntry->IsLoad) + GetDSubRegs(MO.getReg(), RegSpc, TRI, D0, D1, D2, D3); + + // Add the subregs as sources of the new instruction. 
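+  // (For example, a 16-bit lane op on lane 5 of a Q register was remapped
+  // above to lane 1 with OddDblSpc spacing, so the odd D subregisters are
+  // selected here.)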
+  unsigned SrcFlags = (getUndefRegState(MO.isUndef()) |
+                       getKillRegState(MO.isKill()));
+  MIB.addReg(D0, SrcFlags);
+  if (NumRegs > 1)
+    MIB.addReg(D1, SrcFlags);
+  if (NumRegs > 2)
+    MIB.addReg(D2, SrcFlags);
+  if (NumRegs > 3)
+    MIB.addReg(D3, SrcFlags);
+
+  // Add the lane number operand.
+  MIB.addImm(Lane);
+  OpIdx += 1;
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the super-register source to be an implicit source.
+  MO.setImplicit(true);
+  MIB.addOperand(MO);
+  if (TableEntry->IsLoad)
+    // Add an implicit def for the super-register.
+    MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+  TransferImpOps(MI, MIB, MIB);
+  // Transfer memoperands.
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MI.eraseFromParent();
+}
+
+/// ExpandVTBL - Translate VTBL and VTBX pseudo instructions with Q or QQ
+/// register operands to real instructions with D register operands.
+void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
+                                 unsigned Opc, bool IsExt) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc));
+  unsigned OpIdx = 0;
+
+  // Transfer the destination register operand.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  if (IsExt)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+  unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+  unsigned D0, D1, D2, D3;
+  GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3);
+  MIB.addReg(D0);
+
+  // Copy the other source register operand.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Add an implicit kill and use for the super-reg.
+  MIB.addReg(SrcReg, RegState::Implicit | getKillRegState(SrcIsKill));
+  TransferImpOps(MI, MIB, MIB);
+  MI.eraseFromParent();
+}
+
+static bool IsAnAddressOperand(const MachineOperand &MO) {
+  // This check is overly conservative. Unless we are certain that the machine
+  // operand is not a symbol reference, we report that it is a symbol
+  // reference. This is important as the load pair may not be split up on
+  // Windows.
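+  // (On Windows the movw/movt halves of a symbolic address are covered by a
+  // single IMAGE_REL_ARM_MOV32T relocation, so the pair must stay adjacent.)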
+ switch (MO.getType()) { + case MachineOperand::MO_Register: + case MachineOperand::MO_Immediate: + case MachineOperand::MO_CImmediate: + case MachineOperand::MO_FPImmediate: + return false; + case MachineOperand::MO_MachineBasicBlock: + return true; + case MachineOperand::MO_FrameIndex: + return false; + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_TargetIndex: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_BlockAddress: + return true; + case MachineOperand::MO_RegisterMask: + case MachineOperand::MO_RegisterLiveOut: + return false; + case MachineOperand::MO_Metadata: + case MachineOperand::MO_MCSymbol: + return true; + case MachineOperand::MO_CFIIndex: + return false; + } + llvm_unreachable("unhandled machine operand type"); +} + +void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg); + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm; + const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1); + bool RequiresBundling = STI->isTargetWindows() && IsAnAddressOperand(MO); + MachineInstrBuilder LO16, HI16; + + if (!STI->hasV6T2Ops() && + (Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) { + // FIXME Windows CE supports older ARM CPUs + assert(!STI->isTargetWindows() && "Windows on ARM requires ARMv7+"); + + // Expand into a movi + orr. + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg); + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg); + + assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!"); + unsigned ImmVal = (unsigned)MO.getImm(); + unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); + unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); + LO16 = LO16.addImm(SOImmValV1); + HI16 = HI16.addImm(SOImmValV2); + LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16.addImm(Pred).addReg(PredReg).addReg(0); + HI16.addImm(Pred).addReg(PredReg).addReg(0); + TransferImpOps(MI, LO16, HI16); + MI.eraseFromParent(); + return; + } + + unsigned LO16Opc = 0; + unsigned HI16Opc = 0; + if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) { + LO16Opc = ARM::t2MOVi16; + HI16Opc = ARM::t2MOVTi16; + } else { + LO16Opc = ARM::MOVi16; + HI16Opc = ARM::MOVTi16; + } + + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg); + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg); + + switch (MO.getType()) { + case MachineOperand::MO_Immediate: { + unsigned Imm = MO.getImm(); + unsigned Lo16 = Imm & 0xffff; + unsigned Hi16 = (Imm >> 16) & 0xffff; + LO16 = LO16.addImm(Lo16); + HI16 = HI16.addImm(Hi16); + break; + } + case MachineOperand::MO_ExternalSymbol: { + const char *ES = MO.getSymbolName(); + unsigned TF = MO.getTargetFlags(); + LO16 = LO16.addExternalSymbol(ES, TF | ARMII::MO_LO16); + HI16 = HI16.addExternalSymbol(ES, TF | ARMII::MO_HI16); + break; + } + default: { + const GlobalValue *GV = MO.getGlobal(); + unsigned TF 
= MO.getTargetFlags(); + LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16); + HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); + break; + } + } + + LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16.addImm(Pred).addReg(PredReg); + HI16.addImm(Pred).addReg(PredReg); + + if (RequiresBundling) + finalizeBundle(MBB, LO16->getIterator(), MBBI->getIterator()); + + TransferImpOps(MI, LO16, HI16); + MI.eraseFromParent(); +} + +bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + return false; + + case ARM::TCRETURNdi: + case ARM::TCRETURNri: { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->isReturn() && + "Can only insert epilog into returning blocks"); + unsigned RetOpcode = MBBI->getOpcode(); + DebugLoc dl = MBBI->getDebugLoc(); + const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>( + MBB.getParent()->getSubtarget().getInstrInfo()); + + // Tail call return: adjust the stack pointer and jump to callee. + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + + // Jump to label or value in register. + if (RetOpcode == ARM::TCRETURNdi) { + unsigned TCOpcode = + STI->isThumb() + ? (STI->isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) + : ARM::TAILJMPd; + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); + if (JumpTarget.isGlobal()) + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + else { + assert(JumpTarget.isSymbol()); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } + + // Add the default predicate in Thumb mode. + if (STI->isThumb()) + MIB.addImm(ARMCC::AL).addReg(0); + } else if (RetOpcode == ARM::TCRETURNri) { + BuildMI(MBB, MBBI, dl, + TII.get(STI->isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)) + .addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = std::prev(MBBI); + for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) + NewMI->addOperand(MBBI->getOperand(i)); + + // Delete the pseudo instruction TCRETURN. + MBB.erase(MBBI); + MBBI = NewMI; + return true; + } + case ARM::VMOVScc: + case ARM::VMOVDcc: { + unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc), + MI.getOperand(1).getReg()) + .addOperand(MI.getOperand(2)) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addOperand(MI.getOperand(4)); + + MI.eraseFromParent(); + return true; + } + case ARM::t2MOVCCr: + case ARM::MOVCCr: { + unsigned Opc = AFI->isThumbFunction() ? 
ARM::t2MOVr : ARM::MOVr;
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+            MI.getOperand(1).getReg())
+      .addOperand(MI.getOperand(2))
+      .addImm(MI.getOperand(3).getImm()) // 'pred'
+      .addOperand(MI.getOperand(4))
+      .addReg(0); // 's' bit
+
+    MI.eraseFromParent();
+    return true;
+  }
+  case ARM::MOVCCsi: {
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+            (MI.getOperand(1).getReg()))
+      .addOperand(MI.getOperand(2))
+      .addImm(MI.getOperand(3).getImm())
+      .addImm(MI.getOperand(4).getImm()) // 'pred'
+      .addOperand(MI.getOperand(5))
+      .addReg(0); // 's' bit
+
+    MI.eraseFromParent();
+    return true;
+  }
+  case ARM::MOVCCsr: {
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
+            (MI.getOperand(1).getReg()))
+      .addOperand(MI.getOperand(2))
+      .addOperand(MI.getOperand(3))
+      .addImm(MI.getOperand(4).getImm())
+      .addImm(MI.getOperand(5).getImm()) // 'pred'
+      .addOperand(MI.getOperand(6))
+      .addReg(0); // 's' bit
+
+    MI.eraseFromParent();
+    return true;
+  }
+  case ARM::t2MOVCCi16:
+  case ARM::MOVCCi16: {
+    unsigned NewOpc = AFI->isThumbFunction() ? ARM::t2MOVi16 : ARM::MOVi16;
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
+            MI.getOperand(1).getReg())
+      .addImm(MI.getOperand(2).getImm())
+      .addImm(MI.getOperand(3).getImm()) // 'pred'
+      .addOperand(MI.getOperand(4));
+    MI.eraseFromParent();
+    return true;
+  }
+  case ARM::t2MOVCCi:
+  case ARM::MOVCCi: {
+    unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVi : ARM::MOVi;
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+            MI.getOperand(1).getReg())
+      .addImm(MI.getOperand(2).getImm())
+      .addImm(MI.getOperand(3).getImm()) // 'pred'
+      .addOperand(MI.getOperand(4))
+      .addReg(0); // 's' bit
+
+    MI.eraseFromParent();
+    return true;
+  }
+  case ARM::t2MVNCCi:
+  case ARM::MVNCCi: {
+    unsigned Opc = AFI->isThumbFunction() ? ARM::t2MVNi : ARM::MVNi;
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+            MI.getOperand(1).getReg())
+      .addImm(MI.getOperand(2).getImm())
+      .addImm(MI.getOperand(3).getImm()) // 'pred'
+      .addOperand(MI.getOperand(4))
+      .addReg(0); // 's' bit
+
+    MI.eraseFromParent();
+    return true;
+  }
+  case ARM::t2MOVCClsl:
+  case ARM::t2MOVCClsr:
+  case ARM::t2MOVCCasr:
+  case ARM::t2MOVCCror: {
+    unsigned NewOpc;
+    switch (Opcode) {
+    case ARM::t2MOVCClsl: NewOpc = ARM::t2LSLri; break;
+    case ARM::t2MOVCClsr: NewOpc = ARM::t2LSRri; break;
+    case ARM::t2MOVCCasr: NewOpc = ARM::t2ASRri; break;
+    case ARM::t2MOVCCror: NewOpc = ARM::t2RORri; break;
+    default: llvm_unreachable("unexpected conditional move");
+    }
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
+            MI.getOperand(1).getReg())
+      .addOperand(MI.getOperand(2))
+      .addImm(MI.getOperand(3).getImm())
+      .addImm(MI.getOperand(4).getImm()) // 'pred'
+      .addOperand(MI.getOperand(5))
+      .addReg(0); // 's' bit
+    MI.eraseFromParent();
+    return true;
+  }
+  case ARM::Int_eh_sjlj_dispatchsetup: {
+    MachineFunction &MF = *MI.getParent()->getParent();
+    const ARMBaseInstrInfo *AII =
+      static_cast<const ARMBaseInstrInfo*>(TII);
+    const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+    // For functions using a base pointer, we rematerialize it (via the frame
+    // pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it
+    // for us. Otherwise, expand to nothing.
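(Editor's note, not part of the patch: stepping back to ExpandMOV32BitImm above, the common v6T2+ case splits the 32-bit immediate into exact low and high halves for MOVW/MOVT. The pre-v6T2 MOV+ORR path instead relies on ARM_AM::getSOImmTwoPart*, which must produce two valid rotated-immediate encodings and is not reproduced here. A standalone sketch of the halving:)

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Imm = 0xDEADBEEF;
  uint32_t Lo16 = Imm & 0xffff;         // as in the MO_Immediate case
  uint32_t Hi16 = (Imm >> 16) & 0xffff;
  std::printf("movw r0, #0x%04x\n", Lo16); // writes low half, zeroes high half
  std::printf("movt r0, #0x%04x\n", Hi16); // writes high half, keeps low half
}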
+    if (RI.hasBasePointer(MF)) {
+      int32_t NumBytes = AFI->getFramePtrSpillOffset();
+      unsigned FramePtr = RI.getFrameRegister(MF);
+      assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) &&
+             "base pointer without frame pointer?");
+
+      if (AFI->isThumb2Function()) {
+        emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                               FramePtr, -NumBytes, ARMCC::AL, 0, *TII);
+      } else if (AFI->isThumbFunction()) {
+        emitThumbRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                  FramePtr, -NumBytes, *TII, RI);
+      } else {
+        emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                FramePtr, -NumBytes, ARMCC::AL, 0,
+                                *TII);
+      }
+      // If there's dynamic realignment, adjust for it.
+      if (RI.needsStackRealignment(MF)) {
+        MachineFrameInfo *MFI = MF.getFrameInfo();
+        unsigned MaxAlign = MFI->getMaxAlignment();
+        assert(!AFI->isThumb1OnlyFunction());
+        // Emit bic r6, r6, MaxAlign
+        assert(MaxAlign <= 256 && "The BIC instruction cannot encode "
+                                  "immediates larger than 256 with all lower "
+                                  "bits set.");
+        unsigned bicOpc = AFI->isThumbFunction() ?
+          ARM::t2BICri : ARM::BICri;
+        AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                            TII->get(bicOpc), ARM::R6)
+                                    .addReg(ARM::R6, RegState::Kill)
+                                    .addImm(MaxAlign-1)));
+      }
+
+    }
+    MI.eraseFromParent();
+    return true;
+  }
+
+  case ARM::MOVsrl_flag:
+  case ARM::MOVsra_flag: {
+    // These are just fancy MOVs instructions.
+    AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+                           MI.getOperand(0).getReg())
+                   .addOperand(MI.getOperand(1))
+                   .addImm(ARM_AM::getSORegOpc((Opcode == ARM::MOVsrl_flag ?
+                                                ARM_AM::lsr : ARM_AM::asr),
+                                               1)))
+      .addReg(ARM::CPSR, RegState::Define);
+    MI.eraseFromParent();
+    return true;
+  }
+  case ARM::RRX: {
+    // This encodes as "MOVs Rd, Rm, rrx".
+    MachineInstrBuilder MIB =
+      AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+                             MI.getOperand(0).getReg())
+                     .addOperand(MI.getOperand(1))
+                     .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0)))
+      .addReg(0);
+    TransferImpOps(MI, MIB, MIB);
+    MI.eraseFromParent();
+    return true;
+  }
+  case ARM::tTPsoft:
+  case ARM::TPsoft: {
+    MachineInstrBuilder MIB;
+    if (Opcode == ARM::tTPsoft)
+      MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                    TII->get(ARM::tBL))
+        .addImm((unsigned)ARMCC::AL).addReg(0)
+        .addExternalSymbol("__aeabi_read_tp", 0);
+    else
+      MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                    TII->get(ARM::BL))
+        .addExternalSymbol("__aeabi_read_tp", 0);
+
+    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    TransferImpOps(MI, MIB, MIB);
+    MI.eraseFromParent();
+    return true;
+  }
+  case ARM::tLDRpci_pic:
+  case ARM::t2LDRpci_pic: {
+    unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic)
+                            ?
ARM::tLDRpci : ARM::t2LDRpci;
+    unsigned DstReg = MI.getOperand(0).getReg();
+    bool DstIsDead = MI.getOperand(0).isDead();
+    MachineInstrBuilder MIB1 =
+      AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                             TII->get(NewLdOpc), DstReg)
+                     .addOperand(MI.getOperand(1)));
+    MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                       TII->get(ARM::tPICADD))
+      .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+      .addReg(DstReg)
+      .addOperand(MI.getOperand(2));
+    TransferImpOps(MI, MIB1, MIB2);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  case ARM::LDRLIT_ga_abs:
+  case ARM::LDRLIT_ga_pcrel:
+  case ARM::LDRLIT_ga_pcrel_ldr:
+  case ARM::tLDRLIT_ga_abs:
+  case ARM::tLDRLIT_ga_pcrel: {
+    unsigned DstReg = MI.getOperand(0).getReg();
+    bool DstIsDead = MI.getOperand(0).isDead();
+    const MachineOperand &MO1 = MI.getOperand(1);
+    const GlobalValue *GV = MO1.getGlobal();
+    bool IsARM =
+        Opcode != ARM::tLDRLIT_ga_pcrel && Opcode != ARM::tLDRLIT_ga_abs;
+    bool IsPIC =
+        Opcode != ARM::LDRLIT_ga_abs && Opcode != ARM::tLDRLIT_ga_abs;
+    unsigned LDRLITOpc = IsARM ? ARM::LDRi12 : ARM::tLDRpci;
+    unsigned PICAddOpc =
+        IsARM
+            ? (Opcode == ARM::LDRLIT_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD)
+            : ARM::tPICADD;
+
+    // We need a new const-pool entry to load from.
+    MachineConstantPool *MCP = MBB.getParent()->getConstantPool();
+    unsigned ARMPCLabelIndex = 0;
+    MachineConstantPoolValue *CPV;
+
+    if (IsPIC) {
+      unsigned PCAdj = IsARM ? 8 : 4;
+      ARMPCLabelIndex = AFI->createPICLabelUId();
+      CPV = ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex,
+                                            ARMCP::CPValue, PCAdj);
+    } else
+      CPV = ARMConstantPoolConstant::Create(GV, ARMCP::no_modifier);
+
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LDRLITOpc), DstReg)
+            .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
+    if (IsARM)
+      MIB.addImm(0);
+    AddDefaultPred(MIB);
+
+    if (IsPIC) {
+      MachineInstrBuilder MIB =
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(PICAddOpc))
+              .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+              .addReg(DstReg)
+              .addImm(ARMPCLabelIndex);
+
+      if (IsARM)
+        AddDefaultPred(MIB);
+    }
+
+    MI.eraseFromParent();
+    return true;
+  }
+  case ARM::MOV_ga_pcrel:
+  case ARM::MOV_ga_pcrel_ldr:
+  case ARM::t2MOV_ga_pcrel: {
+    // Expand into movw + movt. Also "add pc" / ldr [pc] in PIC mode.
+    unsigned LabelId = AFI->createPICLabelUId();
+    unsigned DstReg = MI.getOperand(0).getReg();
+    bool DstIsDead = MI.getOperand(0).isDead();
+    const MachineOperand &MO1 = MI.getOperand(1);
+    const GlobalValue *GV = MO1.getGlobal();
+    unsigned TF = MO1.getTargetFlags();
+    bool isARM = Opcode != ARM::t2MOV_ga_pcrel;
+    unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel;
+    unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel : ARM::t2MOVTi16_ga_pcrel;
+    unsigned LO16TF = TF | ARMII::MO_LO16;
+    unsigned HI16TF = TF | ARMII::MO_HI16;
+    unsigned PICAddOpc = isARM
+      ? (Opcode == ARM::MOV_ga_pcrel_ldr ?
ARM::PICLDR : ARM::PICADD) + : ARM::tPICADD; + MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(LO16Opc), DstReg) + .addGlobalAddress(GV, MO1.getOffset(), TF | LO16TF) + .addImm(LabelId); + + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc), DstReg) + .addReg(DstReg) + .addGlobalAddress(GV, MO1.getOffset(), TF | HI16TF) + .addImm(LabelId); + + MachineInstrBuilder MIB3 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(PICAddOpc)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg).addImm(LabelId); + if (isARM) { + AddDefaultPred(MIB3); + if (Opcode == ARM::MOV_ga_pcrel_ldr) + MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + } + TransferImpOps(MI, MIB1, MIB3); + MI.eraseFromParent(); + return true; + } + + case ARM::MOVi32imm: + case ARM::MOVCCi32imm: + case ARM::t2MOVi32imm: + case ARM::t2MOVCCi32imm: + ExpandMOV32BitImm(MBB, MBBI); + return true; + + case ARM::SUBS_PC_LR: { + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC) + .addReg(ARM::LR) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)) + .addReg(ARM::CPSR, RegState::Undef); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + case ARM::VLDMQIA: { + unsigned NewOpc = ARM::VLDMDIA; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); + unsigned OpIdx = 0; + + // Grab the Q register destination. + bool DstIsDead = MI.getOperand(OpIdx).isDead(); + unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + + // Copy the source register. + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Add the destination operands (D subregs). + unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0); + unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1); + MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); + + // Add an implicit def for the super-register. + MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); + TransferImpOps(MI, MIB, MIB); + MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MI.eraseFromParent(); + return true; + } + + case ARM::VSTMQIA: { + unsigned NewOpc = ARM::VSTMDIA; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); + unsigned OpIdx = 0; + + // Grab the Q register source. + bool SrcIsKill = MI.getOperand(OpIdx).isKill(); + unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + + // Copy the destination register. + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Add the source operands (D subregs). + unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); + unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); + MIB.addReg(D0, SrcIsKill ? RegState::Kill : 0) + .addReg(D1, SrcIsKill ? RegState::Kill : 0); + + if (SrcIsKill) // Add an implicit kill for the Q register. 
+ MIB->addRegisterKilled(SrcReg, TRI, true); + + TransferImpOps(MI, MIB, MIB); + MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MI.eraseFromParent(); + return true; + } + + case ARM::VLD2q8Pseudo: + case ARM::VLD2q16Pseudo: + case ARM::VLD2q32Pseudo: + case ARM::VLD2q8PseudoWB_fixed: + case ARM::VLD2q16PseudoWB_fixed: + case ARM::VLD2q32PseudoWB_fixed: + case ARM::VLD2q8PseudoWB_register: + case ARM::VLD2q16PseudoWB_register: + case ARM::VLD2q32PseudoWB_register: + case ARM::VLD3d8Pseudo: + case ARM::VLD3d16Pseudo: + case ARM::VLD3d32Pseudo: + case ARM::VLD1d64TPseudo: + case ARM::VLD1d64TPseudoWB_fixed: + case ARM::VLD3d8Pseudo_UPD: + case ARM::VLD3d16Pseudo_UPD: + case ARM::VLD3d32Pseudo_UPD: + case ARM::VLD3q8Pseudo_UPD: + case ARM::VLD3q16Pseudo_UPD: + case ARM::VLD3q32Pseudo_UPD: + case ARM::VLD3q8oddPseudo: + case ARM::VLD3q16oddPseudo: + case ARM::VLD3q32oddPseudo: + case ARM::VLD3q8oddPseudo_UPD: + case ARM::VLD3q16oddPseudo_UPD: + case ARM::VLD3q32oddPseudo_UPD: + case ARM::VLD4d8Pseudo: + case ARM::VLD4d16Pseudo: + case ARM::VLD4d32Pseudo: + case ARM::VLD1d64QPseudo: + case ARM::VLD1d64QPseudoWB_fixed: + case ARM::VLD4d8Pseudo_UPD: + case ARM::VLD4d16Pseudo_UPD: + case ARM::VLD4d32Pseudo_UPD: + case ARM::VLD4q8Pseudo_UPD: + case ARM::VLD4q16Pseudo_UPD: + case ARM::VLD4q32Pseudo_UPD: + case ARM::VLD4q8oddPseudo: + case ARM::VLD4q16oddPseudo: + case ARM::VLD4q32oddPseudo: + case ARM::VLD4q8oddPseudo_UPD: + case ARM::VLD4q16oddPseudo_UPD: + case ARM::VLD4q32oddPseudo_UPD: + case ARM::VLD3DUPd8Pseudo: + case ARM::VLD3DUPd16Pseudo: + case ARM::VLD3DUPd32Pseudo: + case ARM::VLD3DUPd8Pseudo_UPD: + case ARM::VLD3DUPd16Pseudo_UPD: + case ARM::VLD3DUPd32Pseudo_UPD: + case ARM::VLD4DUPd8Pseudo: + case ARM::VLD4DUPd16Pseudo: + case ARM::VLD4DUPd32Pseudo: + case ARM::VLD4DUPd8Pseudo_UPD: + case ARM::VLD4DUPd16Pseudo_UPD: + case ARM::VLD4DUPd32Pseudo_UPD: + ExpandVLD(MBBI); + return true; + + case ARM::VST2q8Pseudo: + case ARM::VST2q16Pseudo: + case ARM::VST2q32Pseudo: + case ARM::VST2q8PseudoWB_fixed: + case ARM::VST2q16PseudoWB_fixed: + case ARM::VST2q32PseudoWB_fixed: + case ARM::VST2q8PseudoWB_register: + case ARM::VST2q16PseudoWB_register: + case ARM::VST2q32PseudoWB_register: + case ARM::VST3d8Pseudo: + case ARM::VST3d16Pseudo: + case ARM::VST3d32Pseudo: + case ARM::VST1d64TPseudo: + case ARM::VST3d8Pseudo_UPD: + case ARM::VST3d16Pseudo_UPD: + case ARM::VST3d32Pseudo_UPD: + case ARM::VST1d64TPseudoWB_fixed: + case ARM::VST1d64TPseudoWB_register: + case ARM::VST3q8Pseudo_UPD: + case ARM::VST3q16Pseudo_UPD: + case ARM::VST3q32Pseudo_UPD: + case ARM::VST3q8oddPseudo: + case ARM::VST3q16oddPseudo: + case ARM::VST3q32oddPseudo: + case ARM::VST3q8oddPseudo_UPD: + case ARM::VST3q16oddPseudo_UPD: + case ARM::VST3q32oddPseudo_UPD: + case ARM::VST4d8Pseudo: + case ARM::VST4d16Pseudo: + case ARM::VST4d32Pseudo: + case ARM::VST1d64QPseudo: + case ARM::VST4d8Pseudo_UPD: + case ARM::VST4d16Pseudo_UPD: + case ARM::VST4d32Pseudo_UPD: + case ARM::VST1d64QPseudoWB_fixed: + case ARM::VST1d64QPseudoWB_register: + case ARM::VST4q8Pseudo_UPD: + case ARM::VST4q16Pseudo_UPD: + case ARM::VST4q32Pseudo_UPD: + case ARM::VST4q8oddPseudo: + case ARM::VST4q16oddPseudo: + case ARM::VST4q32oddPseudo: + case ARM::VST4q8oddPseudo_UPD: + case ARM::VST4q16oddPseudo_UPD: + case ARM::VST4q32oddPseudo_UPD: + ExpandVST(MBBI); + return true; + + case ARM::VLD1LNq8Pseudo: + case ARM::VLD1LNq16Pseudo: + case ARM::VLD1LNq32Pseudo: + case ARM::VLD1LNq8Pseudo_UPD: + case ARM::VLD1LNq16Pseudo_UPD: + case 
ARM::VLD1LNq32Pseudo_UPD: + case ARM::VLD2LNd8Pseudo: + case ARM::VLD2LNd16Pseudo: + case ARM::VLD2LNd32Pseudo: + case ARM::VLD2LNq16Pseudo: + case ARM::VLD2LNq32Pseudo: + case ARM::VLD2LNd8Pseudo_UPD: + case ARM::VLD2LNd16Pseudo_UPD: + case ARM::VLD2LNd32Pseudo_UPD: + case ARM::VLD2LNq16Pseudo_UPD: + case ARM::VLD2LNq32Pseudo_UPD: + case ARM::VLD3LNd8Pseudo: + case ARM::VLD3LNd16Pseudo: + case ARM::VLD3LNd32Pseudo: + case ARM::VLD3LNq16Pseudo: + case ARM::VLD3LNq32Pseudo: + case ARM::VLD3LNd8Pseudo_UPD: + case ARM::VLD3LNd16Pseudo_UPD: + case ARM::VLD3LNd32Pseudo_UPD: + case ARM::VLD3LNq16Pseudo_UPD: + case ARM::VLD3LNq32Pseudo_UPD: + case ARM::VLD4LNd8Pseudo: + case ARM::VLD4LNd16Pseudo: + case ARM::VLD4LNd32Pseudo: + case ARM::VLD4LNq16Pseudo: + case ARM::VLD4LNq32Pseudo: + case ARM::VLD4LNd8Pseudo_UPD: + case ARM::VLD4LNd16Pseudo_UPD: + case ARM::VLD4LNd32Pseudo_UPD: + case ARM::VLD4LNq16Pseudo_UPD: + case ARM::VLD4LNq32Pseudo_UPD: + case ARM::VST1LNq8Pseudo: + case ARM::VST1LNq16Pseudo: + case ARM::VST1LNq32Pseudo: + case ARM::VST1LNq8Pseudo_UPD: + case ARM::VST1LNq16Pseudo_UPD: + case ARM::VST1LNq32Pseudo_UPD: + case ARM::VST2LNd8Pseudo: + case ARM::VST2LNd16Pseudo: + case ARM::VST2LNd32Pseudo: + case ARM::VST2LNq16Pseudo: + case ARM::VST2LNq32Pseudo: + case ARM::VST2LNd8Pseudo_UPD: + case ARM::VST2LNd16Pseudo_UPD: + case ARM::VST2LNd32Pseudo_UPD: + case ARM::VST2LNq16Pseudo_UPD: + case ARM::VST2LNq32Pseudo_UPD: + case ARM::VST3LNd8Pseudo: + case ARM::VST3LNd16Pseudo: + case ARM::VST3LNd32Pseudo: + case ARM::VST3LNq16Pseudo: + case ARM::VST3LNq32Pseudo: + case ARM::VST3LNd8Pseudo_UPD: + case ARM::VST3LNd16Pseudo_UPD: + case ARM::VST3LNd32Pseudo_UPD: + case ARM::VST3LNq16Pseudo_UPD: + case ARM::VST3LNq32Pseudo_UPD: + case ARM::VST4LNd8Pseudo: + case ARM::VST4LNd16Pseudo: + case ARM::VST4LNd32Pseudo: + case ARM::VST4LNq16Pseudo: + case ARM::VST4LNq32Pseudo: + case ARM::VST4LNd8Pseudo_UPD: + case ARM::VST4LNd16Pseudo_UPD: + case ARM::VST4LNd32Pseudo_UPD: + case ARM::VST4LNq16Pseudo_UPD: + case ARM::VST4LNq32Pseudo_UPD: + ExpandLaneOp(MBBI); + return true; + + case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true; + case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true; + case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true; + case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true; + } +} + +bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = std::next(MBBI); + Modified |= ExpandMI(MBB, MBBI); + MBBI = NMBBI; + } + + return Modified; +} + +bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) { + STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget()); + TII = STI->getInstrInfo(); + TRI = STI->getRegisterInfo(); + AFI = MF.getInfo<ARMFunctionInfo>(); + + bool Modified = false; + for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E; + ++MFI) + Modified |= ExpandMBB(*MFI); + if (VerifyARMPseudo) + MF.verify(this, "After expanding ARM pseudo instructions."); + return Modified; +} + +/// createARMExpandPseudoPass - returns an instance of the pseudo instruction +/// expansion pass. 
+FunctionPass *llvm::createARMExpandPseudoPass() { + return new ARMExpandPseudo(); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp new file mode 100644 index 0000000..9bdf823c --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -0,0 +1,3070 @@ +//===-- ARMFastISel.cpp - ARM FastISel implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the ARM-specific support for the FastISel class. Some +// of the target-specific code is generated by tablegen in the file +// ARMGenFastISel.inc, which is #included here. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMCallingConv.h" +#include "ARMConstantPoolValue.h" +#include "ARMISelLowering.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMSubtarget.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +namespace { + + // All possible address modes, plus some. + typedef struct Address { + enum { + RegBase, + FrameIndexBase + } BaseType; + + union { + unsigned Reg; + int FI; + } Base; + + int Offset; + + // Innocuous defaults for our address. + Address() + : BaseType(RegBase), Offset(0) { + Base.Reg = 0; + } + } Address; + +class ARMFastISel final : public FastISel { + + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. + const ARMSubtarget *Subtarget; + Module &M; + const TargetMachine &TM; + const TargetInstrInfo &TII; + const TargetLowering &TLI; + ARMFunctionInfo *AFI; + + // Convenience variables to avoid some queries. + bool isThumb2; + LLVMContext *Context; + + public: + explicit ARMFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) + : FastISel(funcInfo, libInfo), + Subtarget( + &static_cast<const ARMSubtarget &>(funcInfo.MF->getSubtarget())), + M(const_cast<Module &>(*funcInfo.Fn->getParent())), + TM(funcInfo.MF->getTarget()), TII(*Subtarget->getInstrInfo()), + TLI(*Subtarget->getTargetLowering()) { + AFI = funcInfo.MF->getInfo<ARMFunctionInfo>(); + isThumb2 = AFI->isThumbFunction(); + Context = &funcInfo.Fn->getContext(); + } + + // Code from FastISel.cpp. 
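(Editor's note, not part of the patch: the Address struct declared above is a small tagged union, with a base that is either a virtual register or a frame index, plus a byte offset. A standalone model of the shape; the field values are made up for illustration:)

#include <cstdio>

struct Address {
  enum BaseKind { RegBase, FrameIndexBase } BaseType;
  union { unsigned Reg; int FI; } Base;
  int Offset;
  Address() : BaseType(RegBase), Offset(0) { Base.Reg = 0; }
};

int main() {
  Address A;                           // defaults: register base 0, offset 0
  A.BaseType = Address::FrameIndexBase;
  A.Base.FI = 2;                       // hypothetical stack object #2
  A.Offset = 8;                        // 8 bytes into it
  std::printf("fi=%d offset=%d\n", A.Base.FI, A.Offset);
}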
+ private: + unsigned fastEmitInst_r(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill); + unsigned fastEmitInst_rr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill); + unsigned fastEmitInst_rrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + unsigned Op2, bool Op2IsKill); + unsigned fastEmitInst_ri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + uint64_t Imm); + unsigned fastEmitInst_rri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + uint64_t Imm); + unsigned fastEmitInst_i(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm); + + // Backend specific FastISel code. + private: + bool fastSelectInstruction(const Instruction *I) override; + unsigned fastMaterializeConstant(const Constant *C) override; + unsigned fastMaterializeAlloca(const AllocaInst *AI) override; + bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) override; + bool fastLowerArguments() override; + private: + #include "ARMGenFastISel.inc" + + // Instruction selection routines. + private: + bool SelectLoad(const Instruction *I); + bool SelectStore(const Instruction *I); + bool SelectBranch(const Instruction *I); + bool SelectIndirectBr(const Instruction *I); + bool SelectCmp(const Instruction *I); + bool SelectFPExt(const Instruction *I); + bool SelectFPTrunc(const Instruction *I); + bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode); + bool SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode); + bool SelectIToFP(const Instruction *I, bool isSigned); + bool SelectFPToI(const Instruction *I, bool isSigned); + bool SelectDiv(const Instruction *I, bool isSigned); + bool SelectRem(const Instruction *I, bool isSigned); + bool SelectCall(const Instruction *I, const char *IntrMemName); + bool SelectIntrinsicCall(const IntrinsicInst &I); + bool SelectSelect(const Instruction *I); + bool SelectRet(const Instruction *I); + bool SelectTrunc(const Instruction *I); + bool SelectIntExt(const Instruction *I); + bool SelectShift(const Instruction *I, ARM_AM::ShiftOpc ShiftTy); + + // Utility routines. 
+ private: + bool isTypeLegal(Type *Ty, MVT &VT); + bool isLoadTypeLegal(Type *Ty, MVT &VT); + bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, + bool isZExt); + bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + unsigned Alignment = 0, bool isZExt = true, + bool allocReg = true); + bool ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr, + unsigned Alignment = 0); + bool ARMComputeAddress(const Value *Obj, Address &Addr); + void ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3); + bool ARMIsMemCpySmall(uint64_t Len); + bool ARMTryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, + unsigned Alignment); + unsigned ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); + unsigned ARMMaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned ARMMaterializeInt(const Constant *C, MVT VT); + unsigned ARMMaterializeGV(const GlobalValue *GV, MVT VT); + unsigned ARMMoveToFPReg(MVT VT, unsigned SrcReg); + unsigned ARMMoveToIntReg(MVT VT, unsigned SrcReg); + unsigned ARMSelectCallOp(bool UseReg); + unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT); + + const TargetLowering *getTargetLowering() { return &TLI; } + + // Call handling routines. + private: + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, + bool Return, + bool isVarArg); + bool ProcessCallArgs(SmallVectorImpl<Value*> &Args, + SmallVectorImpl<unsigned> &ArgRegs, + SmallVectorImpl<MVT> &ArgVTs, + SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, + SmallVectorImpl<unsigned> &RegArgs, + CallingConv::ID CC, + unsigned &NumBytes, + bool isVarArg); + unsigned getLibcallReg(const Twine &Name); + bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, + const Instruction *I, CallingConv::ID CC, + unsigned &NumBytes, bool isVarArg); + bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call); + + // OptionalDef handling routines. + private: + bool isARMNEONPred(const MachineInstr *MI); + bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR); + const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB); + void AddLoadStoreOperands(MVT VT, Address &Addr, + const MachineInstrBuilder &MIB, + unsigned Flags, bool useAM3); +}; + +} // end anonymous namespace + +#include "ARMGenCallingConv.inc" + +// DefinesOptionalPredicate - This is different from DefinesPredicate in that +// we don't care about implicit defs here, just places we'll need to add a +// default CCReg argument. Sets CPSR if we're setting CPSR instead of CCR. +bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { + if (!MI->hasOptionalDef()) + return false; + + // Look to see if our OptionalDef is defining CPSR or CCR. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) continue; + if (MO.getReg() == ARM::CPSR) + *CPSR = true; + } + return true; +} + +bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) { + const MCInstrDesc &MCID = MI->getDesc(); + + // If we're a thumb2 or not NEON function we'll be handled via isPredicable. + if ((MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON || + AFI->isThumb2Function()) + return MI->isPredicable(); + + for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) + if (MCID.OpInfo[i].isPredicate()) + return true; + + return false; +} + +// If the machine is predicable go ahead and add the predicate operands, if +// it needs default CC operands add those. 
+// TODO: If we want to support thumb1 then we'll need to deal with optional +// CPSR defs that need to be added before the remaining operands. See s_cc_out +// for descriptions why. +const MachineInstrBuilder & +ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { + MachineInstr *MI = &*MIB; + + // Do we use a predicate? or... + // Are we NEON in ARM mode and have a predicate operand? If so, I know + // we're not predicable but add it anyways. + if (isARMNEONPred(MI)) + AddDefaultPred(MIB); + + // Do we optionally set a predicate? Preds is size > 0 iff the predicate + // defines CPSR. All other OptionalDefines in ARM are the CCR register. + bool CPSR = false; + if (DefinesOptionalPredicate(MI, &CPSR)) { + if (CPSR) + AddDefaultT1CC(MIB); + else + AddDefaultCC(MIB); + } + return MIB; +} + +unsigned ARMFastISel::fastEmitInst_r(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + // Make sure the input operand is sufficiently constrained to be legal + // for this instruction. + Op0 = constrainOperandRegClass(II, Op0, 1); + if (II.getNumDefs() >= 1) { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, + ResultReg).addReg(Op0, Op0IsKill * RegState::Kill)); + } else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(Op0, Op0IsKill * RegState::Kill)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::fastEmitInst_rr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + // Make sure the input operands are sufficiently constrained to be legal + // for this instruction. + Op0 = constrainOperandRegClass(II, Op0, 1); + Op1 = constrainOperandRegClass(II, Op1, 2); + + if (II.getNumDefs() >= 1) { + AddOptionalDefs( + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill)); + } else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::fastEmitInst_rrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + unsigned Op2, bool Op2IsKill) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + // Make sure the input operands are sufficiently constrained to be legal + // for this instruction. 
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  Op1 = constrainOperandRegClass(II, Op1, 2);
+  Op2 = constrainOperandRegClass(II, Op2, 3);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+            .addReg(Op0, Op0IsKill * RegState::Kill)
+            .addReg(Op1, Op1IsKill * RegState::Kill)
+            .addReg(Op2, Op2IsKill * RegState::Kill));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                    .addReg(Op0, Op0IsKill * RegState::Kill)
+                    .addReg(Op1, Op1IsKill * RegState::Kill)
+                    .addReg(Op2, Op2IsKill * RegState::Kill));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(TargetOpcode::COPY), ResultReg)
+                    .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  // Make sure the input operand is sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+            .addReg(Op0, Op0IsKill * RegState::Kill)
+            .addImm(Imm));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                    .addReg(Op0, Op0IsKill * RegState::Kill)
+                    .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(TargetOpcode::COPY), ResultReg)
+                    .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::fastEmitInst_rri(unsigned MachineInstOpcode,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Op0, bool Op0IsKill,
+                                       unsigned Op1, bool Op1IsKill,
+                                       uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  // Make sure the input operands are sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  Op1 = constrainOperandRegClass(II, Op1, 2);
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+            .addReg(Op0, Op0IsKill * RegState::Kill)
+            .addReg(Op1, Op1IsKill * RegState::Kill)
+            .addImm(Imm));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                    .addReg(Op0, Op0IsKill * RegState::Kill)
+                    .addReg(Op1, Op1IsKill * RegState::Kill)
+                    .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(TargetOpcode::COPY), ResultReg)
+                    .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::fastEmitInst_i(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II,
+                            ResultReg).addImm(Imm));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                    .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(TargetOpcode::COPY), ResultReg)
+                    .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+// TODO: Don't worry about 64-bit now, but when this is fixed remove the
+// checks from the various callers.
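(Editor's note, not part of the patch: every fastEmitInst_* helper above relies on the same kill-flag idiom: RegState::Kill is a single bit flag, so `Op0IsKill * RegState::Kill` evaluates to the flag when the bool is true and to 0 otherwise, with no branch. A standalone model; the bit value chosen here is illustrative, not LLVM's actual RegState encoding:)

#include <cstdio>

namespace RegState { enum { Kill = 1u << 2 }; } // illustrative bit position

static unsigned killFlag(bool IsKill) {
  return IsKill * RegState::Kill; // bool promotes to 0 or 1
}

int main() {
  std::printf("%u %u\n", killFlag(false), killFlag(true)); // prints: 0 4
}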
+unsigned ARMFastISel::ARMMoveToFPReg(MVT VT, unsigned SrcReg) { + if (VT == MVT::f64) return 0; + + unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::VMOVSR), MoveReg) + .addReg(SrcReg)); + return MoveReg; +} + +unsigned ARMFastISel::ARMMoveToIntReg(MVT VT, unsigned SrcReg) { + if (VT == MVT::i64) return 0; + + unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::VMOVRS), MoveReg) + .addReg(SrcReg)); + return MoveReg; +} + +// For double width floating point we need to materialize two constants +// (the high and the low) into integer registers then use a move to get +// the combined constant into an FP reg. +unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) { + const APFloat Val = CFP->getValueAPF(); + bool is64bit = VT == MVT::f64; + + // This checks to see if we can use VFP3 instructions to materialize + // a constant, otherwise we have to go through the constant pool. + if (TLI.isFPImmLegal(Val, VT)) { + int Imm; + unsigned Opc; + if (is64bit) { + Imm = ARM_AM::getFP64Imm(Val); + Opc = ARM::FCONSTD; + } else { + Imm = ARM_AM::getFP32Imm(Val); + Opc = ARM::FCONSTS; + } + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), DestReg).addImm(Imm)); + return DestReg; + } + + // Require VFP2 for loading fp constants. + if (!Subtarget->hasVFP2()) return false; + + // MachineConstantPool wants an explicit alignment. + unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); + if (Align == 0) { + // TODO: Figure out if this is correct. + Align = DL.getTypeAllocSize(CFP->getType()); + } + unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align); + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS; + + // The extra reg is for addrmode5. + AddOptionalDefs( + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addConstantPoolIndex(Idx) + .addReg(0)); + return DestReg; +} + +unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) { + + if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1) + return 0; + + // If we can do this in a single instruction without a constant pool entry + // do so now. + const ConstantInt *CI = cast<ConstantInt>(C); + if (Subtarget->hasV6T2Ops() && isUInt<16>(CI->getZExtValue())) { + unsigned Opc = isThumb2 ? ARM::t2MOVi16 : ARM::MOVi16; + const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass : + &ARM::GPRRegClass; + unsigned ImmReg = createResultReg(RC); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ImmReg) + .addImm(CI->getZExtValue())); + return ImmReg; + } + + // Use MVN to emit negative constants. + if (VT == MVT::i32 && Subtarget->hasV6T2Ops() && CI->isNegative()) { + unsigned Imm = (unsigned)~(CI->getSExtValue()); + bool UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) : + (ARM_AM::getSOImmVal(Imm) != -1); + if (UseImm) { + unsigned Opc = isThumb2 ? ARM::t2MVNi : ARM::MVNi; + const TargetRegisterClass *RC = isThumb2 ? 
&ARM::rGPRRegClass : + &ARM::GPRRegClass; + unsigned ImmReg = createResultReg(RC); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ImmReg) + .addImm(Imm)); + return ImmReg; + } + } + + unsigned ResultReg = 0; + if (Subtarget->useMovt(*FuncInfo.MF)) + ResultReg = fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue()); + + if (ResultReg) + return ResultReg; + + // Load from constant pool. For now 32-bit only. + if (VT != MVT::i32) + return 0; + + // MachineConstantPool wants an explicit alignment. + unsigned Align = DL.getPrefTypeAlignment(C->getType()); + if (Align == 0) { + // TODO: Figure out if this is correct. + Align = DL.getTypeAllocSize(C->getType()); + } + unsigned Idx = MCP.getConstantPoolIndex(C, Align); + ResultReg = createResultReg(TLI.getRegClassFor(VT)); + if (isThumb2) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::t2LDRpci), ResultReg) + .addConstantPoolIndex(Idx)); + else { + // The extra immediate is for addrmode2. + ResultReg = constrainOperandRegClass(TII.get(ARM::LDRcp), ResultReg, 0); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::LDRcp), ResultReg) + .addConstantPoolIndex(Idx) + .addImm(0)); + } + return ResultReg; +} + +unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { + // For now 32-bit only. + if (VT != MVT::i32) return 0; + + Reloc::Model RelocM = TM.getRelocationModel(); + bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM); + const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass + : &ARM::GPRRegClass; + unsigned DestReg = createResultReg(RC); + + // FastISel TLS support on non-MachO is broken, punt to SelectionDAG. + const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); + bool IsThreadLocal = GVar && GVar->isThreadLocal(); + if (!Subtarget->isTargetMachO() && IsThreadLocal) return 0; + + // Use movw+movt when possible, it avoids constant pool entries. + // Non-darwin targets only support static movt relocations in FastISel. + if (Subtarget->useMovt(*FuncInfo.MF) && + (Subtarget->isTargetMachO() || RelocM == Reloc::Static)) { + unsigned Opc; + unsigned char TF = 0; + if (Subtarget->isTargetMachO()) + TF = ARMII::MO_NONLAZY; + + switch (RelocM) { + case Reloc::PIC_: + Opc = isThumb2 ? ARM::t2MOV_ga_pcrel : ARM::MOV_ga_pcrel; + break; + default: + Opc = isThumb2 ? ARM::t2MOVi32imm : ARM::MOVi32imm; + break; + } + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), DestReg).addGlobalAddress(GV, 0, TF)); + } else { + // MachineConstantPool wants an explicit alignment. + unsigned Align = DL.getPrefTypeAlignment(GV->getType()); + if (Align == 0) { + // TODO: Figure out if this is correct. + Align = DL.getTypeAllocSize(GV->getType()); + } + + if (Subtarget->isTargetELF() && RelocM == Reloc::PIC_) + return ARMLowerPICELF(GV, Align, VT); + + // Grab index. + unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : + (Subtarget->isThumb() ? 4 : 8); + unsigned Id = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id, + ARMCP::CPValue, + PCAdj); + unsigned Idx = MCP.getConstantPoolIndex(CPV, Align); + + // Load value. + MachineInstrBuilder MIB; + if (isThumb2) { + unsigned Opc = (RelocM!=Reloc::PIC_) ? 
ARM::t2LDRpci : ARM::t2LDRpci_pic;
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                    DestReg).addConstantPoolIndex(Idx);
+      if (RelocM == Reloc::PIC_)
+        MIB.addImm(Id);
+      AddOptionalDefs(MIB);
+    } else {
+      // The extra immediate is for addrmode2.
+      DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                    TII.get(ARM::LDRcp), DestReg)
+              .addConstantPoolIndex(Idx)
+              .addImm(0);
+      AddOptionalDefs(MIB);
+
+      if (RelocM == Reloc::PIC_) {
+        unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD;
+        unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+
+        MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                          DbgLoc, TII.get(Opc), NewDestReg)
+                                    .addReg(DestReg)
+                                    .addImm(Id);
+        AddOptionalDefs(MIB);
+        return NewDestReg;
+      }
+    }
+  }
+
+  if (IsIndirect) {
+    MachineInstrBuilder MIB;
+    unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+    if (isThumb2)
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                    TII.get(ARM::t2LDRi12), NewDestReg)
+              .addReg(DestReg)
+              .addImm(0);
+    else
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                    TII.get(ARM::LDRi12), NewDestReg)
+              .addReg(DestReg)
+              .addImm(0);
+    DestReg = NewDestReg;
+    AddOptionalDefs(MIB);
+  }
+
+  return DestReg;
+}
+
+unsigned ARMFastISel::fastMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(DL, C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple()) return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return ARMMaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return ARMMaterializeGV(GV, VT);
+  else if (isa<ConstantInt>(C))
+    return ARMMaterializeInt(C, VT);
+
+  return 0;
+}
+
+// TODO: unsigned ARMFastISel::TargetMaterializeFloatZero(const ConstantFP *CF);
+
+unsigned ARMFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+  // Don't handle dynamic allocas.
+  if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
+
+  MVT VT;
+  if (!isLoadTypeLegal(AI->getType(), VT)) return 0;
+
+  DenseMap<const AllocaInst*, int>::iterator SI =
+    FuncInfo.StaticAllocaMap.find(AI);
+
+  // This will get lowered later into the correct offsets and registers
+  // via rewriteXFrameIndex.
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
+    const TargetRegisterClass* RC = TLI.getRegClassFor(VT);
+    unsigned ResultReg = createResultReg(RC);
+    ResultReg = constrainOperandRegClass(TII.get(Opc), ResultReg, 0);
+
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(Opc), ResultReg)
+                    .addFrameIndex(SI->second)
+                    .addImm(0));
+    return ResultReg;
+  }
+
+  return 0;
+}
+
+bool ARMFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT evt = TLI.getValueType(DL, Ty, true);
+
+  // Only handle simple types.
+  if (evt == MVT::Other || !evt.isSimple()) return false;
+  VT = evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+bool ARMFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT)) return true;
+
+  // If this is a type that can be sign or zero-extended to a basic operation,
+  // go ahead and accept it now.
+  if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+    return true;
+
+  return false;
+}
+
+// Computes the address to get to an object.
+bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { + // Some boilerplate from the X86 FastISel. + const User *U = nullptr; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(Obj)) { + // Don't walk into other basic blocks unless the object is an alloca from + // another block, otherwise it may not have a virtual register assigned. + if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) { + Opcode = C->getOpcode(); + U = C; + } + + if (PointerType *Ty = dyn_cast<PointerType>(Obj->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + switch (Opcode) { + default: + break; + case Instruction::BitCast: + // Look through bitcasts. + return ARMComputeAddress(U->getOperand(0), Addr); + case Instruction::IntToPtr: + // Look past no-op inttoptrs. + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) + return ARMComputeAddress(U->getOperand(0), Addr); + break; + case Instruction::PtrToInt: + // Look past no-op ptrtoints. + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) + return ARMComputeAddress(U->getOperand(0), Addr); + break; + case Instruction::GetElementPtr: { + Address SavedAddr = Addr; + int TmpOffset = Addr.Offset; + + // Iterate through the GEP folding the constants into offsets where + // we can. + gep_type_iterator GTI = gep_type_begin(U); + for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); + i != e; ++i, ++GTI) { + const Value *Op = *i; + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = DL.getStructLayout(STy); + unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); + TmpOffset += SL->getElementOffset(Idx); + } else { + uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + TmpOffset += CI->getSExtValue() * S; + break; + } + if (canFoldAddIntoGEP(U, Op)) { + // A compatible add with a constant operand. Fold the constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + TmpOffset += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + // Unsupported + goto unsupported_gep; + } + } + } + + // Try to grab the base operand now. + Addr.Offset = TmpOffset; + if (ARMComputeAddress(U->getOperand(0), Addr)) return true; + + // We failed, restore everything and try the other options. + Addr = SavedAddr; + + unsupported_gep: + break; + } + case Instruction::Alloca: { + const AllocaInst *AI = cast<AllocaInst>(Obj); + DenseMap<const AllocaInst*, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + if (SI != FuncInfo.StaticAllocaMap.end()) { + Addr.BaseType = Address::FrameIndexBase; + Addr.Base.FI = SI->second; + return true; + } + break; + } + } + + // Try to get this in a register if nothing else has worked. 
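(Editor's note, not part of the patch: the GetElementPtr case above folds constant indices into a single running byte offset: a struct field adds its StructLayout offset, and an array index adds index times element size. Modeled standalone with a hypothetical layout:)

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical struct S { int32_t a; int64_t b; }: field offsets 0 and 8,
  // allocation size 16. These numbers stand in for DataLayout queries.
  const int64_t FieldOffsetB = 8;
  const int64_t ElemSize = 16;

  // Folding a GEP equivalent to &base[3].b, as the loop above would:
  int64_t TmpOffset = 0;
  TmpOffset += 3 * ElemSize;    // array index: CI->getSExtValue() * S
  TmpOffset += FieldOffsetB;    // struct field: SL->getElementOffset(Idx)
  std::printf("folded offset = %lld\n", (long long)TmpOffset); // 56
}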
+ if (Addr.Base.Reg == 0) Addr.Base.Reg = getRegForValue(Obj); + return Addr.Base.Reg != 0; +} + +void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) { + bool needsLowering = false; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unhandled load/store type!"); + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + if (!useAM3) { + // Integer loads/stores handle 12-bit offsets. + needsLowering = ((Addr.Offset & 0xfff) != Addr.Offset); + // Handle negative offsets. + if (needsLowering && isThumb2) + needsLowering = !(Subtarget->hasV6T2Ops() && Addr.Offset < 0 && + Addr.Offset > -256); + } else { + // ARM halfword load/stores and signed byte loads use +/-imm8 offsets. + needsLowering = (Addr.Offset > 255 || Addr.Offset < -255); + } + break; + case MVT::f32: + case MVT::f64: + // Floating point operands handle 8-bit offsets. + needsLowering = ((Addr.Offset & 0xff) != Addr.Offset); + break; + } + + // If this is a stack pointer and the offset needs to be simplified then + // put the alloca address into a register, set the base type back to + // register and continue. This should almost never happen. + if (needsLowering && Addr.BaseType == Address::FrameIndexBase) { + const TargetRegisterClass *RC = isThumb2 ? &ARM::tGPRRegClass + : &ARM::GPRRegClass; + unsigned ResultReg = createResultReg(RC); + unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg) + .addFrameIndex(Addr.Base.FI) + .addImm(0)); + Addr.Base.Reg = ResultReg; + Addr.BaseType = Address::RegBase; + } + + // Since the offset is too large for the load/store instruction + // get the reg+offset into a register. + if (needsLowering) { + Addr.Base.Reg = fastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg, + /*Op0IsKill*/false, Addr.Offset, MVT::i32); + Addr.Offset = 0; + } +} + +void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, + const MachineInstrBuilder &MIB, + unsigned Flags, bool useAM3) { + // addrmode5 output depends on the selection dag addressing dividing the + // offset by 4 that it then later multiplies. Do this here as well. + if (VT.SimpleTy == MVT::f32 || VT.SimpleTy == MVT::f64) + Addr.Offset /= 4; + + // Frame base works a bit differently. Handle it separately. + if (Addr.BaseType == Address::FrameIndexBase) { + int FI = Addr.Base.FI; + int Offset = Addr.Offset; + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + // Now add the rest of the operands. + MIB.addFrameIndex(FI); + + // ARM halfword load/stores and signed byte loads need an additional + // operand. + if (useAM3) { + signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset; + MIB.addReg(0); + MIB.addImm(Imm); + } else { + MIB.addImm(Addr.Offset); + } + MIB.addMemOperand(MMO); + } else { + // Now add the rest of the operands. + MIB.addReg(Addr.Base.Reg); + + // ARM halfword load/stores and signed byte loads need an additional + // operand. + if (useAM3) { + signed Imm = (Addr.Offset < 0) ? 
+      MIB.addReg(0);
+      MIB.addImm(Imm);
+    } else {
+      MIB.addImm(Addr.Offset);
+    }
+  }
+  AddOptionalDefs(MIB);
+}
+
+bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                              unsigned Alignment, bool isZExt, bool allocReg) {
+  unsigned Opc;
+  bool useAM3 = false;
+  bool needVMOV = false;
+  const TargetRegisterClass *RC;
+  switch (VT.SimpleTy) {
+    // This is mostly going to be Neon/vector support.
+    default: return false;
+    case MVT::i1:
+    case MVT::i8:
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = isZExt ? ARM::t2LDRBi8 : ARM::t2LDRSBi8;
+        else
+          Opc = isZExt ? ARM::t2LDRBi12 : ARM::t2LDRSBi12;
+      } else {
+        if (isZExt) {
+          Opc = ARM::LDRBi12;
+        } else {
+          Opc = ARM::LDRSB;
+          useAM3 = true;
+        }
+      }
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      break;
+    case MVT::i16:
+      if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = isZExt ? ARM::t2LDRHi8 : ARM::t2LDRSHi8;
+        else
+          Opc = isZExt ? ARM::t2LDRHi12 : ARM::t2LDRSHi12;
+      } else {
+        Opc = isZExt ? ARM::LDRH : ARM::LDRSH;
+        useAM3 = true;
+      }
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      break;
+    case MVT::i32:
+      if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = ARM::t2LDRi8;
+        else
+          Opc = ARM::t2LDRi12;
+      } else {
+        Opc = ARM::LDRi12;
+      }
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      break;
+    case MVT::f32:
+      if (!Subtarget->hasVFP2()) return false;
+      // Unaligned loads need special handling. Floats require word-alignment.
+      if (Alignment && Alignment < 4) {
+        needVMOV = true;
+        VT = MVT::i32;
+        Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
+        RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      } else {
+        Opc = ARM::VLDRS;
+        RC = TLI.getRegClassFor(VT);
+      }
+      break;
+    case MVT::f64:
+      if (!Subtarget->hasVFP2()) return false;
+      // FIXME: Unaligned loads need special handling. Doublewords require
+      // word-alignment.
+      if (Alignment && Alignment < 4)
+        return false;
+
+      Opc = ARM::VLDRD;
+      RC = TLI.getRegClassFor(VT);
+      break;
+  }
+  // Simplify this down to something we can handle.
+  ARMSimplifyAddress(Addr, VT, useAM3);
+
+  // Create the base instruction, then add the operands.
+  if (allocReg)
+    ResultReg = createResultReg(RC);
+  assert (ResultReg > 255 && "Expected an allocated virtual register.");
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(Opc), ResultReg);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad, useAM3);
+
+  // If we had an unaligned load of a float we've converted it to a regular
+  // load. Now we must move from the GPR to the FP register.
+  if (needVMOV) {
+    unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::VMOVSR), MoveReg)
+                    .addReg(ResultReg));
+    ResultReg = MoveReg;
+  }
+  return true;
+}
+
+bool ARMFastISel::SelectLoad(const Instruction *I) {
+  // Atomic loads need special handling.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // See if we can handle this address.
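+  // ARMComputeAddress folds casts, GEP offsets, and static allocas into
+  // Addr, so the emitted load can use reg+imm or frame-index addressing.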
+ Address Addr; + if (!ARMComputeAddress(I->getOperand(0), Addr)) return false; + + unsigned ResultReg; + if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment())) + return false; + updateValueMap(I, ResultReg); + return true; +} + +bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr, + unsigned Alignment) { + unsigned StrOpc; + bool useAM3 = false; + switch (VT.SimpleTy) { + // This is mostly going to be Neon/vector support. + default: return false; + case MVT::i1: { + unsigned Res = createResultReg(isThumb2 ? &ARM::tGPRRegClass + : &ARM::GPRRegClass); + unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri; + SrcReg = constrainOperandRegClass(TII.get(Opc), SrcReg, 1); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), Res) + .addReg(SrcReg).addImm(1)); + SrcReg = Res; + } // Fallthrough here. + case MVT::i8: + if (isThumb2) { + if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops()) + StrOpc = ARM::t2STRBi8; + else + StrOpc = ARM::t2STRBi12; + } else { + StrOpc = ARM::STRBi12; + } + break; + case MVT::i16: + if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem()) + return false; + + if (isThumb2) { + if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops()) + StrOpc = ARM::t2STRHi8; + else + StrOpc = ARM::t2STRHi12; + } else { + StrOpc = ARM::STRH; + useAM3 = true; + } + break; + case MVT::i32: + if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem()) + return false; + + if (isThumb2) { + if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops()) + StrOpc = ARM::t2STRi8; + else + StrOpc = ARM::t2STRi12; + } else { + StrOpc = ARM::STRi12; + } + break; + case MVT::f32: + if (!Subtarget->hasVFP2()) return false; + // Unaligned stores need special handling. Floats require word-alignment. + if (Alignment && Alignment < 4) { + unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::VMOVRS), MoveReg) + .addReg(SrcReg)); + SrcReg = MoveReg; + VT = MVT::i32; + StrOpc = isThumb2 ? ARM::t2STRi12 : ARM::STRi12; + } else { + StrOpc = ARM::VSTRS; + } + break; + case MVT::f64: + if (!Subtarget->hasVFP2()) return false; + // FIXME: Unaligned stores need special handling. Doublewords require + // word-alignment. + if (Alignment && Alignment < 4) + return false; + + StrOpc = ARM::VSTRD; + break; + } + // Simplify this down to something we can handle. + ARMSimplifyAddress(Addr, VT, useAM3); + + // Create the base instruction, then add the operands. + SrcReg = constrainOperandRegClass(TII.get(StrOpc), SrcReg, 0); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(StrOpc)) + .addReg(SrcReg); + AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOStore, useAM3); + return true; +} + +bool ARMFastISel::SelectStore(const Instruction *I) { + Value *Op0 = I->getOperand(0); + unsigned SrcReg = 0; + + // Atomic stores need special handling. + if (cast<StoreInst>(I)->isAtomic()) + return false; + + // Verify we have a legal type before going any further. + MVT VT; + if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT)) + return false; + + // Get the value to be stored into a register. + SrcReg = getRegForValue(Op0); + if (SrcReg == 0) return false; + + // See if we can handle this address. 
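+  // Unlike a load, the address of a store is operand 1; operand 0 is the
+  // value being stored (already placed in SrcReg above).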
+ Address Addr; + if (!ARMComputeAddress(I->getOperand(1), Addr)) + return false; + + if (!ARMEmitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment())) + return false; + return true; +} + +static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) { + switch (Pred) { + // Needs two compares... + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UEQ: + default: + // AL is our "false" for now. The other two need more compares. + return ARMCC::AL; + case CmpInst::ICMP_EQ: + case CmpInst::FCMP_OEQ: + return ARMCC::EQ; + case CmpInst::ICMP_SGT: + case CmpInst::FCMP_OGT: + return ARMCC::GT; + case CmpInst::ICMP_SGE: + case CmpInst::FCMP_OGE: + return ARMCC::GE; + case CmpInst::ICMP_UGT: + case CmpInst::FCMP_UGT: + return ARMCC::HI; + case CmpInst::FCMP_OLT: + return ARMCC::MI; + case CmpInst::ICMP_ULE: + case CmpInst::FCMP_OLE: + return ARMCC::LS; + case CmpInst::FCMP_ORD: + return ARMCC::VC; + case CmpInst::FCMP_UNO: + return ARMCC::VS; + case CmpInst::FCMP_UGE: + return ARMCC::PL; + case CmpInst::ICMP_SLT: + case CmpInst::FCMP_ULT: + return ARMCC::LT; + case CmpInst::ICMP_SLE: + case CmpInst::FCMP_ULE: + return ARMCC::LE; + case CmpInst::FCMP_UNE: + case CmpInst::ICMP_NE: + return ARMCC::NE; + case CmpInst::ICMP_UGE: + return ARMCC::HS; + case CmpInst::ICMP_ULT: + return ARMCC::LO; + } +} + +bool ARMFastISel::SelectBranch(const Instruction *I) { + const BranchInst *BI = cast<BranchInst>(I); + MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + // Simple branch support. + + // If we can, avoid recomputing the compare - redoing it could lead to wonky + // behavior. + if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { + if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { + + // Get the compare predicate. + // Try to take advantage of fallthrough opportunities. + CmpInst::Predicate Predicate = CI->getPredicate(); + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + ARMCC::CondCodes ARMPred = getComparePred(Predicate); + + // We may not handle every CC for now. + if (ARMPred == ARMCC::AL) return false; + + // Emit the compare. + if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) + return false; + + unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc)) + .addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR); + finishCondBranch(BI->getParent(), TBB, FBB); + return true; + } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) { + unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri; + unsigned OpReg = getRegForValue(TI->getOperand(0)); + OpReg = constrainOperandRegClass(TII.get(TstOpc), OpReg, 0); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TstOpc)) + .addReg(OpReg).addImm(1)); + + unsigned CCMode = ARMCC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CCMode = ARMCC::EQ; + } + + unsigned BrOpc = isThumb2 ? 
ARM::t2Bcc : ARM::Bcc;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
+      .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
+
+      finishCondBranch(BI->getParent(), TBB, FBB);
+      return true;
+    }
+  } else if (const ConstantInt *CI =
+             dyn_cast<ConstantInt>(BI->getCondition())) {
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    fastEmitBranch(Target, DbgLoc);
+    return true;
+  }
+
+  unsigned CmpReg = getRegForValue(BI->getCondition());
+  if (CmpReg == 0) return false;
+
+  // We've been divorced from our compare! Our block was split, and
+  // now our compare lives in a predecessor block. We mustn't
+  // re-compare here, as the children of the compare aren't guaranteed
+  // live across the block boundary (we *could* check for this).
+  // Regardless, the compare has been done in the predecessor block,
+  // and it left a value for us in a virtual register. Ergo, we test
+  // the one-bit value left in the virtual register.
+  unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+  CmpReg = constrainOperandRegClass(TII.get(TstOpc), CmpReg, 0);
+  AddOptionalDefs(
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TstOpc))
+          .addReg(CmpReg)
+          .addImm(1));
+
+  unsigned CCMode = ARMCC::NE;
+  if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+    std::swap(TBB, FBB);
+    CCMode = ARMCC::EQ;
+  }
+
+  unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
+  .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
+  finishCondBranch(BI->getParent(), TBB, FBB);
+  return true;
+}
+
+bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
+  unsigned AddrReg = getRegForValue(I->getOperand(0));
+  if (AddrReg == 0) return false;
+
+  unsigned Opc = isThumb2 ? ARM::tBRIND : ARM::BX;
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc)).addReg(AddrReg));
+
+  const IndirectBrInst *IB = cast<IndirectBrInst>(I);
+  for (const BasicBlock *SuccBB : IB->successors())
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[SuccBB]);
+
+  return true;
+}
+
+bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
+                             bool isZExt) {
+  Type *Ty = Src1Value->getType();
+  EVT SrcEVT = TLI.getValueType(DL, Ty, true);
+  if (!SrcEVT.isSimple()) return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  bool isFloat = (Ty->isFloatTy() || Ty->isDoubleTy());
+  if (isFloat && !Subtarget->hasVFP2())
+    return false;
+
+  // Check to see if the 2nd operand is a constant that we can encode directly
+  // in the compare.
+  int Imm = 0;
+  bool UseImm = false;
+  bool isNegativeImm = false;
+  // FIXME: At -O0 we don't have anything that canonicalizes operand order.
+  // Thus, Src1Value may be a ConstantInt, but we're missing it.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
+    if (SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8 ||
+        SrcVT == MVT::i1) {
+      const APInt &CIVal = ConstInt->getValue();
+      Imm = (isZExt) ? (int)CIVal.getZExtValue() : (int)CIVal.getSExtValue();
+      // For INT_MIN/LONG_MIN (i.e., 0x80000000) we need to use a cmp, rather
+      // than a cmn, because there is no way to represent 2147483648 as a
+      // signed 32-bit int.
+      if (Imm < 0 && Imm != (int)0x80000000) {
+        isNegativeImm = true;
+        Imm = -Imm;
+      }
+      UseImm = isThumb2 ?
(ARM_AM::getT2SOImmVal(Imm) != -1) : + (ARM_AM::getSOImmVal(Imm) != -1); + } + } else if (const ConstantFP *ConstFP = dyn_cast<ConstantFP>(Src2Value)) { + if (SrcVT == MVT::f32 || SrcVT == MVT::f64) + if (ConstFP->isZero() && !ConstFP->isNegative()) + UseImm = true; + } + + unsigned CmpOpc; + bool isICmp = true; + bool needsExt = false; + switch (SrcVT.SimpleTy) { + default: return false; + // TODO: Verify compares. + case MVT::f32: + isICmp = false; + CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES; + break; + case MVT::f64: + isICmp = false; + CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED; + break; + case MVT::i1: + case MVT::i8: + case MVT::i16: + needsExt = true; + // Intentional fall-through. + case MVT::i32: + if (isThumb2) { + if (!UseImm) + CmpOpc = ARM::t2CMPrr; + else + CmpOpc = isNegativeImm ? ARM::t2CMNri : ARM::t2CMPri; + } else { + if (!UseImm) + CmpOpc = ARM::CMPrr; + else + CmpOpc = isNegativeImm ? ARM::CMNri : ARM::CMPri; + } + break; + } + + unsigned SrcReg1 = getRegForValue(Src1Value); + if (SrcReg1 == 0) return false; + + unsigned SrcReg2 = 0; + if (!UseImm) { + SrcReg2 = getRegForValue(Src2Value); + if (SrcReg2 == 0) return false; + } + + // We have i1, i8, or i16, we need to either zero extend or sign extend. + if (needsExt) { + SrcReg1 = ARMEmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt); + if (SrcReg1 == 0) return false; + if (!UseImm) { + SrcReg2 = ARMEmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt); + if (SrcReg2 == 0) return false; + } + } + + const MCInstrDesc &II = TII.get(CmpOpc); + SrcReg1 = constrainOperandRegClass(II, SrcReg1, 0); + if (!UseImm) { + SrcReg2 = constrainOperandRegClass(II, SrcReg2, 1); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(SrcReg1).addReg(SrcReg2)); + } else { + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(SrcReg1); + + // Only add immediate for icmp as the immediate for fcmp is an implicit 0.0. + if (isICmp) + MIB.addImm(Imm); + AddOptionalDefs(MIB); + } + + // For floating point we need to move the result to a comparison register + // that we can then use for branches. + if (Ty->isFloatTy() || Ty->isDoubleTy()) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::FMSTAT))); + return true; +} + +bool ARMFastISel::SelectCmp(const Instruction *I) { + const CmpInst *CI = cast<CmpInst>(I); + + // Get the compare predicate. + ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate()); + + // We may not handle every CC for now. + if (ARMPred == ARMCC::AL) return false; + + // Emit the compare. + if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) + return false; + + // Now set a register based on the comparison. Explicitly set the predicates + // here. + unsigned MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi; + const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass + : &ARM::GPRRegClass; + unsigned DestReg = createResultReg(RC); + Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0); + unsigned ZeroReg = fastMaterializeConstant(Zero); + // ARMEmitCmp emits a FMSTAT when necessary, so it's always safe to use CPSR. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc), DestReg) + .addReg(ZeroReg).addImm(1) + .addImm(ARMPred).addReg(ARM::CPSR); + + updateValueMap(I, DestReg); + return true; +} + +bool ARMFastISel::SelectFPExt(const Instruction *I) { + // Make sure we have VFP and that we're extending float to double. 
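+  // A float->double fpext is a single VCVTDS from an S-register into a
+  // D-register, so no libcall is needed once VFP2 is available.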
+ if (!Subtarget->hasVFP2()) return false; + + Value *V = I->getOperand(0); + if (!I->getType()->isDoubleTy() || + !V->getType()->isFloatTy()) return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) return false; + + unsigned Result = createResultReg(&ARM::DPRRegClass); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::VCVTDS), Result) + .addReg(Op)); + updateValueMap(I, Result); + return true; +} + +bool ARMFastISel::SelectFPTrunc(const Instruction *I) { + // Make sure we have VFP and that we're truncating double to float. + if (!Subtarget->hasVFP2()) return false; + + Value *V = I->getOperand(0); + if (!(I->getType()->isFloatTy() && + V->getType()->isDoubleTy())) return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) return false; + + unsigned Result = createResultReg(&ARM::SPRRegClass); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::VCVTSD), Result) + .addReg(Op)); + updateValueMap(I, Result); + return true; +} + +bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) { + // Make sure we have VFP. + if (!Subtarget->hasVFP2()) return false; + + MVT DstVT; + Type *Ty = I->getType(); + if (!isTypeLegal(Ty, DstVT)) + return false; + + Value *Src = I->getOperand(0); + EVT SrcEVT = TLI.getValueType(DL, Src->getType(), true); + if (!SrcEVT.isSimple()) + return false; + MVT SrcVT = SrcEVT.getSimpleVT(); + if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8) + return false; + + unsigned SrcReg = getRegForValue(Src); + if (SrcReg == 0) return false; + + // Handle sign-extension. + if (SrcVT == MVT::i16 || SrcVT == MVT::i8) { + SrcReg = ARMEmitIntExt(SrcVT, SrcReg, MVT::i32, + /*isZExt*/!isSigned); + if (SrcReg == 0) return false; + } + + // The conversion routine works on fp-reg to fp-reg and the operand above + // was an integer, move it to the fp registers if possible. + unsigned FP = ARMMoveToFPReg(MVT::f32, SrcReg); + if (FP == 0) return false; + + unsigned Opc; + if (Ty->isFloatTy()) Opc = isSigned ? ARM::VSITOS : ARM::VUITOS; + else if (Ty->isDoubleTy()) Opc = isSigned ? ARM::VSITOD : ARM::VUITOD; + else return false; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg).addReg(FP)); + updateValueMap(I, ResultReg); + return true; +} + +bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) { + // Make sure we have VFP. + if (!Subtarget->hasVFP2()) return false; + + MVT DstVT; + Type *RetTy = I->getType(); + if (!isTypeLegal(RetTy, DstVT)) + return false; + + unsigned Op = getRegForValue(I->getOperand(0)); + if (Op == 0) return false; + + unsigned Opc; + Type *OpTy = I->getOperand(0)->getType(); + if (OpTy->isFloatTy()) Opc = isSigned ? ARM::VTOSIZS : ARM::VTOUIZS; + else if (OpTy->isDoubleTy()) Opc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD; + else return false; + + // f64->s32/u32 or f32->s32/u32 both need an intermediate f32 reg. + unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg).addReg(Op)); + + // This result needs to be in an integer register, but the conversion only + // takes place in fp-regs. 
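+  // ARMMoveToIntReg copies the raw bits out of the FP register into a GPR;
+  // a result of 0 means the move couldn't be emitted.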
+ unsigned IntReg = ARMMoveToIntReg(DstVT, ResultReg); + if (IntReg == 0) return false; + + updateValueMap(I, IntReg); + return true; +} + +bool ARMFastISel::SelectSelect(const Instruction *I) { + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + // Things need to be register sized for register moves. + if (VT != MVT::i32) return false; + + unsigned CondReg = getRegForValue(I->getOperand(0)); + if (CondReg == 0) return false; + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) return false; + + // Check to see if we can use an immediate in the conditional move. + int Imm = 0; + bool UseImm = false; + bool isNegativeImm = false; + if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(2))) { + assert (VT == MVT::i32 && "Expecting an i32."); + Imm = (int)ConstInt->getValue().getZExtValue(); + if (Imm < 0) { + isNegativeImm = true; + Imm = ~Imm; + } + UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) : + (ARM_AM::getSOImmVal(Imm) != -1); + } + + unsigned Op2Reg = 0; + if (!UseImm) { + Op2Reg = getRegForValue(I->getOperand(2)); + if (Op2Reg == 0) return false; + } + + unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri; + CondReg = constrainOperandRegClass(TII.get(TstOpc), CondReg, 0); + AddOptionalDefs( + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TstOpc)) + .addReg(CondReg) + .addImm(1)); + + unsigned MovCCOpc; + const TargetRegisterClass *RC; + if (!UseImm) { + RC = isThumb2 ? &ARM::tGPRRegClass : &ARM::GPRRegClass; + MovCCOpc = isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr; + } else { + RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass; + if (!isNegativeImm) + MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi; + else + MovCCOpc = isThumb2 ? ARM::t2MVNCCi : ARM::MVNCCi; + } + unsigned ResultReg = createResultReg(RC); + if (!UseImm) { + Op2Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op2Reg, 1); + Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc), + ResultReg) + .addReg(Op2Reg) + .addReg(Op1Reg) + .addImm(ARMCC::NE) + .addReg(ARM::CPSR); + } else { + Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc), + ResultReg) + .addReg(Op1Reg) + .addImm(Imm) + .addImm(ARMCC::EQ) + .addReg(ARM::CPSR); + } + updateValueMap(I, ResultReg); + return true; +} + +bool ARMFastISel::SelectDiv(const Instruction *I, bool isSigned) { + MVT VT; + Type *Ty = I->getType(); + if (!isTypeLegal(Ty, VT)) + return false; + + // If we have integer div support we should have selected this automagically. + // In case we have a real miss go ahead and return false and we'll pick + // it up later. + if (Subtarget->hasDivide()) return false; + + // Otherwise emit a libcall. + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + if (VT == MVT::i8) + LC = isSigned ? RTLIB::SDIV_I8 : RTLIB::UDIV_I8; + else if (VT == MVT::i16) + LC = isSigned ? RTLIB::SDIV_I16 : RTLIB::UDIV_I16; + else if (VT == MVT::i32) + LC = isSigned ? RTLIB::SDIV_I32 : RTLIB::UDIV_I32; + else if (VT == MVT::i64) + LC = isSigned ? RTLIB::SDIV_I64 : RTLIB::UDIV_I64; + else if (VT == MVT::i128) + LC = isSigned ? 
RTLIB::SDIV_I128 : RTLIB::UDIV_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
+
+  return ARMEmitLibcall(I, LC);
+}
+
+bool ARMFastISel::SelectRem(const Instruction *I, bool isSigned) {
+  MVT VT;
+  Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, VT))
+    return false;
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i8)
+    LC = isSigned ? RTLIB::SREM_I8 : RTLIB::UREM_I8;
+  else if (VT == MVT::i16)
+    LC = isSigned ? RTLIB::SREM_I16 : RTLIB::UREM_I16;
+  else if (VT == MVT::i32)
+    LC = isSigned ? RTLIB::SREM_I32 : RTLIB::UREM_I32;
+  else if (VT == MVT::i64)
+    LC = isSigned ? RTLIB::SREM_I64 : RTLIB::UREM_I64;
+  else if (VT == MVT::i128)
+    LC = isSigned ? RTLIB::SREM_I128 : RTLIB::UREM_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
+
+  return ARMEmitLibcall(I, LC);
+}
+
+bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT DestVT = TLI.getValueType(DL, I->getType(), true);
+
+  // We can get here in the case when we have a binary operation on a non-legal
+  // type and the target independent selector doesn't know how to handle it.
+  if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
+    return false;
+
+  unsigned Opc;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::ADD:
+      Opc = isThumb2 ? ARM::t2ADDrr : ARM::ADDrr;
+      break;
+    case ISD::OR:
+      Opc = isThumb2 ? ARM::t2ORRrr : ARM::ORRrr;
+      break;
+    case ISD::SUB:
+      Opc = isThumb2 ? ARM::t2SUBrr : ARM::SUBrr;
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(I->getOperand(0));
+  if (SrcReg1 == 0) return false;
+
+  // TODO: Often the 2nd operand is an immediate, which can be encoded directly
+  // in the instruction, rather than materializing the value in a register.
+  unsigned SrcReg2 = getRegForValue(I->getOperand(1));
+  if (SrcReg2 == 0) return false;
+
+  unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
+  SrcReg1 = constrainOperandRegClass(TII.get(Opc), SrcReg1, 1);
+  SrcReg2 = constrainOperandRegClass(TII.get(Opc), SrcReg2, 2);
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc), ResultReg)
+                  .addReg(SrcReg1).addReg(SrcReg2));
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT FPVT = TLI.getValueType(DL, I->getType(), true);
+  if (!FPVT.isSimple()) return false;
+  MVT VT = FPVT.getSimpleVT();
+
+  // FIXME: Support vector types where possible.
+  if (VT.isVector())
+    return false;
+
+  // We can get here in the case when we want to use NEON for our fp
+  // operations, but can't figure out how to. Just use the vfp instructions
+  // if we have them.
+  // FIXME: It'd be nice to use NEON instructions.
+  Type *Ty = I->getType();
+  bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
+  if (isFloat && !Subtarget->hasVFP2())
+    return false;
+
+  unsigned Opc;
+  bool is64bit = VT == MVT::f64 || VT == MVT::i64;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::FADD:
+      Opc = is64bit ? ARM::VADDD : ARM::VADDS;
+      break;
+    case ISD::FSUB:
+      Opc = is64bit ? ARM::VSUBD : ARM::VSUBS;
+      break;
+    case ISD::FMUL:
+      Opc = is64bit ? ARM::VMULD : ARM::VMULS;
+      break;
+  }
+  unsigned Op1 = getRegForValue(I->getOperand(0));
+  if (Op1 == 0) return false;
+
+  unsigned Op2 = getRegForValue(I->getOperand(1));
+  if (Op2 == 0) return false;
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc), ResultReg)
+                  .addReg(Op1).addReg(Op2));
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Call Handling Code
+
+// This is largely taken directly from CCAssignFnForNode
+// TODO: We may not support all of this.
+CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
+                                           bool Return,
+                                           bool isVarArg) {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::Fast:
+    if (Subtarget->hasVFP2() && !isVarArg) {
+      if (!Subtarget->isAAPCS_ABI())
+        return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
+      // For AAPCS ABI targets, just use VFP variant of the calling convention.
+      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+    }
+    // Fallthrough
+  case CallingConv::C:
+    // Use target triple & subtarget features to do actual dispatch.
+    if (Subtarget->isAAPCS_ABI()) {
+      if (Subtarget->hasVFP2() &&
+          TM.Options.FloatABIType == FloatABI::Hard && !isVarArg)
+        return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+      else
+        return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+    } else {
+      return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+    }
+  case CallingConv::ARM_AAPCS_VFP:
+    if (!isVarArg)
+      return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+    // Fall through to the soft float variant; variadic functions don't
+    // use the hard floating point ABI.
+  case CallingConv::ARM_AAPCS:
+    return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+  case CallingConv::ARM_APCS:
+    return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+  case CallingConv::GHC:
+    if (Return)
+      llvm_unreachable("Can't return in GHC call convention");
+    else
+      return CC_ARM_APCS_GHC;
+  }
+}
+
+bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes,
+                                  bool isVarArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, *FuncInfo.MF, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags,
+                             CCAssignFnForCall(CC, false, isVarArg));
+
+  // Check that we can handle all of the arguments. If we can't, then bail out
+  // now before we add code to the MBB.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // We don't handle NEON/vector parameters yet.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+      return false;
+
+    // Now copy/store arg to correct locations.
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      continue;
+    } else if (VA.needsCustom()) {
+      // TODO: We need custom lowering for vector (v2f64) args.
+      if (VA.getLocVT() != MVT::f64 ||
+          // TODO: Only handle register args for now.
+          !VA.isRegLoc() || !ArgLocs[++i].isRegLoc())
+        return false;
+    } else {
+      switch (ArgVT.SimpleTy) {
+      default:
+        return false;
+      case MVT::i1:
+      case MVT::i8:
+      case MVT::i16:
+      case MVT::i32:
+        break;
+      case MVT::f32:
+        if (!Subtarget->hasVFP2())
+          return false;
+        break;
+      case MVT::f64:
+        if (!Subtarget->hasVFP2())
+          return false;
+        break;
+      }
+    }
+  }
+
+  // At this point, we are able to handle the call's arguments in fast isel.
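+  // Everything below emits instructions into the MBB, so any unsupported
+  // case had to be rejected above; there is no clean bail-out from here on.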
+ + // Get a count of how many bytes are to be pushed on the stack. + NumBytes = CCInfo.getNextStackOffset(); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AdjStackDown)) + .addImm(NumBytes)); + + // Process the args. + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + const Value *ArgVal = Args[VA.getValNo()]; + unsigned Arg = ArgRegs[VA.getValNo()]; + MVT ArgVT = ArgVTs[VA.getValNo()]; + + assert((!ArgVT.isVector() && ArgVT.getSizeInBits() <= 64) && + "We don't handle NEON/vector parameters yet."); + + // Handle arg promotion, etc. + switch (VA.getLocInfo()) { + case CCValAssign::Full: break; + case CCValAssign::SExt: { + MVT DestVT = VA.getLocVT(); + Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/false); + assert (Arg != 0 && "Failed to emit a sext"); + ArgVT = DestVT; + break; + } + case CCValAssign::AExt: + // Intentional fall-through. Handle AExt and ZExt. + case CCValAssign::ZExt: { + MVT DestVT = VA.getLocVT(); + Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true); + assert (Arg != 0 && "Failed to emit a zext"); + ArgVT = DestVT; + break; + } + case CCValAssign::BCvt: { + unsigned BC = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, Arg, + /*TODO: Kill=*/false); + assert(BC != 0 && "Failed to emit a bitcast!"); + Arg = BC; + ArgVT = VA.getLocVT(); + break; + } + default: llvm_unreachable("Unknown arg promotion!"); + } + + // Now copy/store arg to correct locations. + if (VA.isRegLoc() && !VA.needsCustom()) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg); + RegArgs.push_back(VA.getLocReg()); + } else if (VA.needsCustom()) { + // TODO: We need custom lowering for vector (v2f64) args. + assert(VA.getLocVT() == MVT::f64 && + "Custom lowering for v2f64 args not available"); + + CCValAssign &NextVA = ArgLocs[++i]; + + assert(VA.isRegLoc() && NextVA.isRegLoc() && + "We only handle register args!"); + + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::VMOVRRD), VA.getLocReg()) + .addReg(NextVA.getLocReg(), RegState::Define) + .addReg(Arg)); + RegArgs.push_back(VA.getLocReg()); + RegArgs.push_back(NextVA.getLocReg()); + } else { + assert(VA.isMemLoc()); + // Need to store on the stack. + + // Don't emit stores for undef values. + if (isa<UndefValue>(ArgVal)) + continue; + + Address Addr; + Addr.BaseType = Address::RegBase; + Addr.Base.Reg = ARM::SP; + Addr.Offset = VA.getLocMemOffset(); + + bool EmitRet = ARMEmitStore(ArgVT, Arg, Addr); (void)EmitRet; + assert(EmitRet && "Could not emit a store for argument!"); + } + } + + return true; +} + +bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, + const Instruction *I, CallingConv::ID CC, + unsigned &NumBytes, bool isVarArg) { + // Issue CALLSEQ_END + unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AdjStackUp)) + .addImm(NumBytes).addImm(0)); + + // Now the return value. + if (RetVT != MVT::isVoid) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg)); + + // Copy all of the result registers out of their specified physreg. 
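+    // Under a soft-float return convention an f64 comes back as two i32
+    // halves in a GPR pair; the VMOVDRR below glues them into one D-register.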
+ if (RVLocs.size() == 2 && RetVT == MVT::f64) { + // For this move we copy into two registers and then move into the + // double fp reg we want. + MVT DestVT = RVLocs[0].getValVT(); + const TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT); + unsigned ResultReg = createResultReg(DstRC); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::VMOVDRR), ResultReg) + .addReg(RVLocs[0].getLocReg()) + .addReg(RVLocs[1].getLocReg())); + + UsedRegs.push_back(RVLocs[0].getLocReg()); + UsedRegs.push_back(RVLocs[1].getLocReg()); + + // Finally update the result. + updateValueMap(I, ResultReg); + } else { + assert(RVLocs.size() == 1 &&"Can't handle non-double multi-reg retvals!"); + MVT CopyVT = RVLocs[0].getValVT(); + + // Special handling for extended integers. + if (RetVT == MVT::i1 || RetVT == MVT::i8 || RetVT == MVT::i16) + CopyVT = MVT::i32; + + const TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT); + + unsigned ResultReg = createResultReg(DstRC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), + ResultReg).addReg(RVLocs[0].getLocReg()); + UsedRegs.push_back(RVLocs[0].getLocReg()); + + // Finally update the result. + updateValueMap(I, ResultReg); + } + } + + return true; +} + +bool ARMFastISel::SelectRet(const Instruction *I) { + const ReturnInst *Ret = cast<ReturnInst>(I); + const Function &F = *I->getParent()->getParent(); + + if (!FuncInfo.CanLowerReturn) + return false; + + // Build a list of return value registers. + SmallVector<unsigned, 4> RetRegs; + + CallingConv::ID CC = F.getCallingConv(); + if (Ret->getNumOperands() > 0) { + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ValLocs; + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext()); + CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */, + F.isVarArg())); + + const Value *RV = Ret->getOperand(0); + unsigned Reg = getRegForValue(RV); + if (Reg == 0) + return false; + + // Only handle a single return value for now. + if (ValLocs.size() != 1) + return false; + + CCValAssign &VA = ValLocs[0]; + + // Don't bother handling odd stuff for now. + if (VA.getLocInfo() != CCValAssign::Full) + return false; + // Only handle register returns for now. + if (!VA.isRegLoc()) + return false; + + unsigned SrcReg = Reg + VA.getValNo(); + EVT RVEVT = TLI.getValueType(DL, RV->getType()); + if (!RVEVT.isSimple()) return false; + MVT RVVT = RVEVT.getSimpleVT(); + MVT DestVT = VA.getValVT(); + // Special handling for extended integers. + if (RVVT != DestVT) { + if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16) + return false; + + assert(DestVT == MVT::i32 && "ARM should always ext to i32"); + + // Perform extension if flagged as either zext or sext. Otherwise, do + // nothing. + if (Outs[0].Flags.isZExt() || Outs[0].Flags.isSExt()) { + SrcReg = ARMEmitIntExt(RVVT, SrcReg, DestVT, Outs[0].Flags.isZExt()); + if (SrcReg == 0) return false; + } + } + + // Make the copy. + unsigned DstReg = VA.getLocReg(); + const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); + // Avoid a cross-class copy. This is very unlikely. + if (!SrcRC->contains(DstReg)) + return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg); + + // Add register to return instruction. 
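+    // The register is attached to the return as an implicit use below,
+    // which keeps the copied value live until the return executes.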
+ RetRegs.push_back(VA.getLocReg()); + } + + unsigned RetOpc = isThumb2 ? ARM::tBX_RET : ARM::BX_RET; + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(RetOpc)); + AddOptionalDefs(MIB); + for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) + MIB.addReg(RetRegs[i], RegState::Implicit); + return true; +} + +unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) { + if (UseReg) + return isThumb2 ? ARM::tBLXr : ARM::BLX; + else + return isThumb2 ? ARM::tBL : ARM::BL; +} + +unsigned ARMFastISel::getLibcallReg(const Twine &Name) { + // Manually compute the global's type to avoid building it when unnecessary. + Type *GVTy = Type::getInt32PtrTy(*Context, /*AS=*/0); + EVT LCREVT = TLI.getValueType(DL, GVTy); + if (!LCREVT.isSimple()) return 0; + + GlobalValue *GV = new GlobalVariable(M, Type::getInt32Ty(*Context), false, + GlobalValue::ExternalLinkage, nullptr, + Name); + assert(GV->getType() == GVTy && "We miscomputed the type for the global!"); + return ARMMaterializeGV(GV, LCREVT.getSimpleVT()); +} + +// A quick function that will emit a call for a named libcall in F with the +// vector of passed arguments for the Instruction in I. We can assume that we +// can emit a call for any libcall we can produce. This is an abridged version +// of the full call infrastructure since we won't need to worry about things +// like computed function pointers or strange arguments at call sites. +// TODO: Try to unify this and the normal call bits for ARM, then try to unify +// with X86. +bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { + CallingConv::ID CC = TLI.getLibcallCallingConv(Call); + + // Handle *simple* calls for now. + Type *RetTy = I->getType(); + MVT RetVT; + if (RetTy->isVoidTy()) + RetVT = MVT::isVoid; + else if (!isTypeLegal(RetTy, RetVT)) + return false; + + // Can't handle non-double multi-reg retvals. + if (RetVT != MVT::isVoid && RetVT != MVT::i32) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, false)); + if (RVLocs.size() >= 2 && RetVT != MVT::f64) + return false; + } + + // Set up the argument vectors. + SmallVector<Value*, 8> Args; + SmallVector<unsigned, 8> ArgRegs; + SmallVector<MVT, 8> ArgVTs; + SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; + Args.reserve(I->getNumOperands()); + ArgRegs.reserve(I->getNumOperands()); + ArgVTs.reserve(I->getNumOperands()); + ArgFlags.reserve(I->getNumOperands()); + for (unsigned i = 0; i < I->getNumOperands(); ++i) { + Value *Op = I->getOperand(i); + unsigned Arg = getRegForValue(Op); + if (Arg == 0) return false; + + Type *ArgTy = Op->getType(); + MVT ArgVT; + if (!isTypeLegal(ArgTy, ArgVT)) return false; + + ISD::ArgFlagsTy Flags; + unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); + Flags.setOrigAlign(OriginalAlignment); + + Args.push_back(Op); + ArgRegs.push_back(Arg); + ArgVTs.push_back(ArgVT); + ArgFlags.push_back(Flags); + } + + // Handle the arguments now that we've gotten them. + SmallVector<unsigned, 4> RegArgs; + unsigned NumBytes; + if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, + RegArgs, CC, NumBytes, false)) + return false; + + unsigned CalleeReg = 0; + if (Subtarget->genLongCalls()) { + CalleeReg = getLibcallReg(TLI.getLibcallName(Call)); + if (CalleeReg == 0) return false; + } + + // Issue the call. 
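+  // For long calls the callee's address was materialized into CalleeReg
+  // above and we branch through the register; otherwise we emit a direct
+  // BL to the libcall's symbol.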
+ unsigned CallOpc = ARMSelectCallOp(Subtarget->genLongCalls()); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DbgLoc, TII.get(CallOpc)); + // BL / BLX don't take a predicate, but tBL / tBLX do. + if (isThumb2) + AddDefaultPred(MIB); + if (Subtarget->genLongCalls()) + MIB.addReg(CalleeReg); + else + MIB.addExternalSymbol(TLI.getLibcallName(Call)); + + // Add implicit physical register uses to the call. + for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) + MIB.addReg(RegArgs[i], RegState::Implicit); + + // Add a register mask with the call-preserved registers. + // Proper defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); + + // Finish off the call including any return values. + SmallVector<unsigned, 4> UsedRegs; + if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false; + + // Set all unused physreg defs as dead. + static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + + return true; +} + +bool ARMFastISel::SelectCall(const Instruction *I, + const char *IntrMemName = nullptr) { + const CallInst *CI = cast<CallInst>(I); + const Value *Callee = CI->getCalledValue(); + + // Can't handle inline asm. + if (isa<InlineAsm>(Callee)) return false; + + // Allow SelectionDAG isel to handle tail calls. + if (CI->isTailCall()) return false; + + // Check the calling convention. + ImmutableCallSite CS(CI); + CallingConv::ID CC = CS.getCallingConv(); + + // TODO: Avoid some calling conventions? + + PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); + FunctionType *FTy = cast<FunctionType>(PT->getElementType()); + bool isVarArg = FTy->isVarArg(); + + // Handle *simple* calls for now. + Type *RetTy = I->getType(); + MVT RetVT; + if (RetTy->isVoidTy()) + RetVT = MVT::isVoid; + else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 && + RetVT != MVT::i8 && RetVT != MVT::i1) + return false; + + // Can't handle non-double multi-reg retvals. + if (RetVT != MVT::isVoid && RetVT != MVT::i1 && RetVT != MVT::i8 && + RetVT != MVT::i16 && RetVT != MVT::i32) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg)); + if (RVLocs.size() >= 2 && RetVT != MVT::f64) + return false; + } + + // Set up the argument vectors. + SmallVector<Value*, 8> Args; + SmallVector<unsigned, 8> ArgRegs; + SmallVector<MVT, 8> ArgVTs; + SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; + unsigned arg_size = CS.arg_size(); + Args.reserve(arg_size); + ArgRegs.reserve(arg_size); + ArgVTs.reserve(arg_size); + ArgFlags.reserve(arg_size); + for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); + i != e; ++i) { + // If we're lowering a memory intrinsic instead of a regular call, skip the + // last two arguments, which shouldn't be passed to the underlying function. + if (IntrMemName && e-i <= 2) + break; + + ISD::ArgFlagsTy Flags; + unsigned AttrInd = i - CS.arg_begin() + 1; + if (CS.paramHasAttr(AttrInd, Attribute::SExt)) + Flags.setSExt(); + if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) + Flags.setZExt(); + + // FIXME: Only handle *easy* calls for now. 
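+    // Bail out on argument attributes whose ABI lowering isn't implemented
+    // here: inreg, sret, nest, and byval all need extra handling.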
+ if (CS.paramHasAttr(AttrInd, Attribute::InReg) || + CS.paramHasAttr(AttrInd, Attribute::StructRet) || + CS.paramHasAttr(AttrInd, Attribute::Nest) || + CS.paramHasAttr(AttrInd, Attribute::ByVal)) + return false; + + Type *ArgTy = (*i)->getType(); + MVT ArgVT; + if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8 && + ArgVT != MVT::i1) + return false; + + unsigned Arg = getRegForValue(*i); + if (Arg == 0) + return false; + + unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); + Flags.setOrigAlign(OriginalAlignment); + + Args.push_back(*i); + ArgRegs.push_back(Arg); + ArgVTs.push_back(ArgVT); + ArgFlags.push_back(Flags); + } + + // Handle the arguments now that we've gotten them. + SmallVector<unsigned, 4> RegArgs; + unsigned NumBytes; + if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, + RegArgs, CC, NumBytes, isVarArg)) + return false; + + bool UseReg = false; + const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); + if (!GV || Subtarget->genLongCalls()) UseReg = true; + + unsigned CalleeReg = 0; + if (UseReg) { + if (IntrMemName) + CalleeReg = getLibcallReg(IntrMemName); + else + CalleeReg = getRegForValue(Callee); + + if (CalleeReg == 0) return false; + } + + // Issue the call. + unsigned CallOpc = ARMSelectCallOp(UseReg); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DbgLoc, TII.get(CallOpc)); + + unsigned char OpFlags = 0; + + // Add MO_PLT for global address or external symbol in the PIC relocation + // model. + if (Subtarget->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_) + OpFlags = ARMII::MO_PLT; + + // ARM calls don't take a predicate, but tBL / tBLX do. + if(isThumb2) + AddDefaultPred(MIB); + if (UseReg) + MIB.addReg(CalleeReg); + else if (!IntrMemName) + MIB.addGlobalAddress(GV, 0, OpFlags); + else + MIB.addExternalSymbol(IntrMemName, OpFlags); + + // Add implicit physical register uses to the call. + for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) + MIB.addReg(RegArgs[i], RegState::Implicit); + + // Add a register mask with the call-preserved registers. + // Proper defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); + + // Finish off the call including any return values. + SmallVector<unsigned, 4> UsedRegs; + if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg)) + return false; + + // Set all unused physreg defs as dead. + static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + + return true; +} + +bool ARMFastISel::ARMIsMemCpySmall(uint64_t Len) { + return Len <= 16; +} + +bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src, + uint64_t Len, unsigned Alignment) { + // Make sure we don't bloat code by inlining very large memcpy's. + if (!ARMIsMemCpySmall(Len)) + return false; + + while (Len) { + MVT VT; + if (!Alignment || Alignment >= 4) { + if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else { + assert (Len == 1 && "Expected a length of 1!"); + VT = MVT::i8; + } + } else { + // Bound based on alignment. 
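+      // A 2-byte-aligned copy can use halfword chunks while at least two
+      // bytes remain; anything else degrades to byte-at-a-time.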
+ if (Len >= 2 && Alignment == 2) + VT = MVT::i16; + else { + VT = MVT::i8; + } + } + + bool RV; + unsigned ResultReg; + RV = ARMEmitLoad(VT, ResultReg, Src); + assert (RV == true && "Should be able to handle this load."); + RV = ARMEmitStore(VT, ResultReg, Dest); + assert (RV == true && "Should be able to handle this store."); + (void)RV; + + unsigned Size = VT.getSizeInBits()/8; + Len -= Size; + Dest.Offset += Size; + Src.Offset += Size; + } + + return true; +} + +bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { + // FIXME: Handle more intrinsics. + switch (I.getIntrinsicID()) { + default: return false; + case Intrinsic::frameaddress: { + MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + unsigned LdrOpc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12; + const TargetRegisterClass *RC = isThumb2 ? &ARM::tGPRRegClass + : &ARM::GPRRegClass; + + const ARMBaseRegisterInfo *RegInfo = + static_cast<const ARMBaseRegisterInfo *>(Subtarget->getRegisterInfo()); + unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); + unsigned SrcReg = FramePtr; + + // Recursively load frame address + // ldr r0 [fp] + // ldr r0 [r0] + // ldr r0 [r0] + // ... + unsigned DestReg; + unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue(); + while (Depth--) { + DestReg = createResultReg(RC); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(LdrOpc), DestReg) + .addReg(SrcReg).addImm(0)); + SrcReg = DestReg; + } + updateValueMap(&I, SrcReg); + return true; + } + case Intrinsic::memcpy: + case Intrinsic::memmove: { + const MemTransferInst &MTI = cast<MemTransferInst>(I); + // Don't handle volatile. + if (MTI.isVolatile()) + return false; + + // Disable inlining for memmove before calls to ComputeAddress. Otherwise, + // we would emit dead code because we don't currently handle memmoves. + bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy); + if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) { + // Small memcpy's are common enough that we want to do them without a call + // if possible. + uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue(); + if (ARMIsMemCpySmall(Len)) { + Address Dest, Src; + if (!ARMComputeAddress(MTI.getRawDest(), Dest) || + !ARMComputeAddress(MTI.getRawSource(), Src)) + return false; + unsigned Alignment = MTI.getAlignment(); + if (ARMTryEmitSmallMemCpy(Dest, Src, Len, Alignment)) + return true; + } + } + + if (!MTI.getLength()->getType()->isIntegerTy(32)) + return false; + + if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255) + return false; + + const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove"; + return SelectCall(&I, IntrMemName); + } + case Intrinsic::memset: { + const MemSetInst &MSI = cast<MemSetInst>(I); + // Don't handle volatile. + if (MSI.isVolatile()) + return false; + + if (!MSI.getLength()->getType()->isIntegerTy(32)) + return false; + + if (MSI.getDestAddressSpace() > 255) + return false; + + return SelectCall(&I, "memset"); + } + case Intrinsic::trap: { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get( + Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP)); + return true; + } + } +} + +bool ARMFastISel::SelectTrunc(const Instruction *I) { + // The high bits for a type smaller than the register size are assumed to be + // undefined. 
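+  // That makes a trunc to i8/i16 free: reuse the source vreg and let later
+  // users mask or extend as they need.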
+ Value *Op = I->getOperand(0); + + EVT SrcVT, DestVT; + SrcVT = TLI.getValueType(DL, Op->getType(), true); + DestVT = TLI.getValueType(DL, I->getType(), true); + + if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8) + return false; + if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1) + return false; + + unsigned SrcReg = getRegForValue(Op); + if (!SrcReg) return false; + + // Because the high bits are undefined, a truncate doesn't generate + // any code. + updateValueMap(I, SrcReg); + return true; +} + +unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, + bool isZExt) { + if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8) + return 0; + if (SrcVT != MVT::i16 && SrcVT != MVT::i8 && SrcVT != MVT::i1) + return 0; + + // Table of which combinations can be emitted as a single instruction, + // and which will require two. + static const uint8_t isSingleInstrTbl[3][2][2][2] = { + // ARM Thumb + // !hasV6Ops hasV6Ops !hasV6Ops hasV6Ops + // ext: s z s z s z s z + /* 1 */ { { { 0, 1 }, { 0, 1 } }, { { 0, 0 }, { 0, 1 } } }, + /* 8 */ { { { 0, 1 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } }, + /* 16 */ { { { 0, 0 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } } + }; + + // Target registers for: + // - For ARM can never be PC. + // - For 16-bit Thumb are restricted to lower 8 registers. + // - For 32-bit Thumb are restricted to non-SP and non-PC. + static const TargetRegisterClass *RCTbl[2][2] = { + // Instructions: Two Single + /* ARM */ { &ARM::GPRnopcRegClass, &ARM::GPRnopcRegClass }, + /* Thumb */ { &ARM::tGPRRegClass, &ARM::rGPRRegClass } + }; + + // Table governing the instruction(s) to be emitted. + static const struct InstructionTable { + uint32_t Opc : 16; + uint32_t hasS : 1; // Some instructions have an S bit, always set it to 0. + uint32_t Shift : 7; // For shift operand addressing mode, used by MOVsi. + uint32_t Imm : 8; // All instructions have either a shift or a mask. + } IT[2][2][3][2] = { + { // Two instructions (first is left shift, second is in this table). + { // ARM Opc S Shift Imm + /* 1 bit sext */ { { ARM::MOVsi , 1, ARM_AM::asr , 31 }, + /* 1 bit zext */ { ARM::MOVsi , 1, ARM_AM::lsr , 31 } }, + /* 8 bit sext */ { { ARM::MOVsi , 1, ARM_AM::asr , 24 }, + /* 8 bit zext */ { ARM::MOVsi , 1, ARM_AM::lsr , 24 } }, + /* 16 bit sext */ { { ARM::MOVsi , 1, ARM_AM::asr , 16 }, + /* 16 bit zext */ { ARM::MOVsi , 1, ARM_AM::lsr , 16 } } + }, + { // Thumb Opc S Shift Imm + /* 1 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift, 31 }, + /* 1 bit zext */ { ARM::tLSRri , 0, ARM_AM::no_shift, 31 } }, + /* 8 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift, 24 }, + /* 8 bit zext */ { ARM::tLSRri , 0, ARM_AM::no_shift, 24 } }, + /* 16 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift, 16 }, + /* 16 bit zext */ { ARM::tLSRri , 0, ARM_AM::no_shift, 16 } } + } + }, + { // Single instruction. 
+ { // ARM Opc S Shift Imm + /* 1 bit sext */ { { ARM::KILL , 0, ARM_AM::no_shift, 0 }, + /* 1 bit zext */ { ARM::ANDri , 1, ARM_AM::no_shift, 1 } }, + /* 8 bit sext */ { { ARM::SXTB , 0, ARM_AM::no_shift, 0 }, + /* 8 bit zext */ { ARM::ANDri , 1, ARM_AM::no_shift, 255 } }, + /* 16 bit sext */ { { ARM::SXTH , 0, ARM_AM::no_shift, 0 }, + /* 16 bit zext */ { ARM::UXTH , 0, ARM_AM::no_shift, 0 } } + }, + { // Thumb Opc S Shift Imm + /* 1 bit sext */ { { ARM::KILL , 0, ARM_AM::no_shift, 0 }, + /* 1 bit zext */ { ARM::t2ANDri, 1, ARM_AM::no_shift, 1 } }, + /* 8 bit sext */ { { ARM::t2SXTB , 0, ARM_AM::no_shift, 0 }, + /* 8 bit zext */ { ARM::t2ANDri, 1, ARM_AM::no_shift, 255 } }, + /* 16 bit sext */ { { ARM::t2SXTH , 0, ARM_AM::no_shift, 0 }, + /* 16 bit zext */ { ARM::t2UXTH , 0, ARM_AM::no_shift, 0 } } + } + } + }; + + unsigned SrcBits = SrcVT.getSizeInBits(); + unsigned DestBits = DestVT.getSizeInBits(); + (void) DestBits; + assert((SrcBits < DestBits) && "can only extend to larger types"); + assert((DestBits == 32 || DestBits == 16 || DestBits == 8) && + "other sizes unimplemented"); + assert((SrcBits == 16 || SrcBits == 8 || SrcBits == 1) && + "other sizes unimplemented"); + + bool hasV6Ops = Subtarget->hasV6Ops(); + unsigned Bitness = SrcBits / 8; // {1,8,16}=>{0,1,2} + assert((Bitness < 3) && "sanity-check table bounds"); + + bool isSingleInstr = isSingleInstrTbl[Bitness][isThumb2][hasV6Ops][isZExt]; + const TargetRegisterClass *RC = RCTbl[isThumb2][isSingleInstr]; + const InstructionTable *ITP = &IT[isSingleInstr][isThumb2][Bitness][isZExt]; + unsigned Opc = ITP->Opc; + assert(ARM::KILL != Opc && "Invalid table entry"); + unsigned hasS = ITP->hasS; + ARM_AM::ShiftOpc Shift = (ARM_AM::ShiftOpc) ITP->Shift; + assert(((Shift == ARM_AM::no_shift) == (Opc != ARM::MOVsi)) && + "only MOVsi has shift operand addressing mode"); + unsigned Imm = ITP->Imm; + + // 16-bit Thumb instructions always set CPSR (unless they're in an IT block). + bool setsCPSR = &ARM::tGPRRegClass == RC; + unsigned LSLOpc = isThumb2 ? ARM::tLSLri : ARM::MOVsi; + unsigned ResultReg; + // MOVsi encodes shift and immediate in shift operand addressing mode. + // The following condition has the same value when emitting two + // instruction sequences: both are shifts. + bool ImmIsSO = (Shift != ARM_AM::no_shift); + + // Either one or two instructions are emitted. + // They're always of the form: + // dst = in OP imm + // CPSR is set only by 16-bit Thumb instructions. + // Predicate, if any, is AL. + // S bit, if available, is always 0. + // When two are emitted the first's result will feed as the second's input, + // that value is then dead. + unsigned NumInstrsEmitted = isSingleInstr ? 1 : 2; + for (unsigned Instr = 0; Instr != NumInstrsEmitted; ++Instr) { + ResultReg = createResultReg(RC); + bool isLsl = (0 == Instr) && !isSingleInstr; + unsigned Opcode = isLsl ? LSLOpc : Opc; + ARM_AM::ShiftOpc ShiftAM = isLsl ? ARM_AM::lsl : Shift; + unsigned ImmEnc = ImmIsSO ? ARM_AM::getSORegOpc(ShiftAM, Imm) : Imm; + bool isKill = 1 == Instr; + MachineInstrBuilder MIB = BuildMI( + *FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opcode), ResultReg); + if (setsCPSR) + MIB.addReg(ARM::CPSR, RegState::Define); + SrcReg = constrainOperandRegClass(TII.get(Opcode), SrcReg, 1 + setsCPSR); + AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(ImmEnc)); + if (hasS) + AddDefaultCC(MIB); + // Second instruction consumes the first's result. 
+    SrcReg = ResultReg;
+  }
+
+  return ResultReg;
+}
+
+bool ARMFastISel::SelectIntExt(const Instruction *I) {
+  // On ARM, in general, integer casts don't involve legal types; this code
+  // handles promotable integers.
+  Type *DestTy = I->getType();
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+
+  bool isZExt = isa<ZExtInst>(I);
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg) return false;
+
+  EVT SrcEVT, DestEVT;
+  SrcEVT = TLI.getValueType(DL, SrcTy, true);
+  DestEVT = TLI.getValueType(DL, DestTy, true);
+  if (!SrcEVT.isSimple()) return false;
+  if (!DestEVT.isSimple()) return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DestVT = DestEVT.getSimpleVT();
+  unsigned ResultReg = ARMEmitIntExt(SrcVT, SrcReg, DestVT, isZExt);
+  if (ResultReg == 0) return false;
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectShift(const Instruction *I,
+                              ARM_AM::ShiftOpc ShiftTy) {
+  // In Thumb2 mode, shifts are left to the target-independent selector or
+  // to SelectionDAG ISel.
+  if (isThumb2)
+    return false;
+
+  // Only handle i32 now.
+  EVT DestVT = TLI.getValueType(DL, I->getType(), true);
+  if (DestVT != MVT::i32)
+    return false;
+
+  unsigned Opc = ARM::MOVsr;
+  unsigned ShiftImm;
+  Value *Src2Value = I->getOperand(1);
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Src2Value)) {
+    ShiftImm = CI->getZExtValue();
+
+    // Fall back to selection DAG isel if the shift amount
+    // is zero or greater than the width of the value type.
+    if (ShiftImm == 0 || ShiftImm >= 32)
+      return false;
+
+    Opc = ARM::MOVsi;
+  }
+
+  Value *Src1Value = I->getOperand(0);
+  unsigned Reg1 = getRegForValue(Src1Value);
+  if (Reg1 == 0) return false;
+
+  unsigned Reg2 = 0;
+  if (Opc == ARM::MOVsr) {
+    Reg2 = getRegForValue(Src2Value);
+    if (Reg2 == 0) return false;
+  }
+
+  unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
+  if (ResultReg == 0) return false;
+
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(Opc), ResultReg)
+                            .addReg(Reg1);
+
+  if (Opc == ARM::MOVsi)
+    MIB.addImm(ARM_AM::getSORegOpc(ShiftTy, ShiftImm));
+  else if (Opc == ARM::MOVsr) {
+    MIB.addReg(Reg2);
+    MIB.addImm(ARM_AM::getSORegOpc(ShiftTy, 0));
+  }
+
+  AddOptionalDefs(MIB);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// TODO: SoftFP support.
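+// This is the main dispatch from the target-independent fast-isel driver:
+// return true if the instruction was selected here, false to fall back to
+// SelectionDAG instruction selection.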
+bool ARMFastISel::fastSelectInstruction(const Instruction *I) { + + switch (I->getOpcode()) { + case Instruction::Load: + return SelectLoad(I); + case Instruction::Store: + return SelectStore(I); + case Instruction::Br: + return SelectBranch(I); + case Instruction::IndirectBr: + return SelectIndirectBr(I); + case Instruction::ICmp: + case Instruction::FCmp: + return SelectCmp(I); + case Instruction::FPExt: + return SelectFPExt(I); + case Instruction::FPTrunc: + return SelectFPTrunc(I); + case Instruction::SIToFP: + return SelectIToFP(I, /*isSigned*/ true); + case Instruction::UIToFP: + return SelectIToFP(I, /*isSigned*/ false); + case Instruction::FPToSI: + return SelectFPToI(I, /*isSigned*/ true); + case Instruction::FPToUI: + return SelectFPToI(I, /*isSigned*/ false); + case Instruction::Add: + return SelectBinaryIntOp(I, ISD::ADD); + case Instruction::Or: + return SelectBinaryIntOp(I, ISD::OR); + case Instruction::Sub: + return SelectBinaryIntOp(I, ISD::SUB); + case Instruction::FAdd: + return SelectBinaryFPOp(I, ISD::FADD); + case Instruction::FSub: + return SelectBinaryFPOp(I, ISD::FSUB); + case Instruction::FMul: + return SelectBinaryFPOp(I, ISD::FMUL); + case Instruction::SDiv: + return SelectDiv(I, /*isSigned*/ true); + case Instruction::UDiv: + return SelectDiv(I, /*isSigned*/ false); + case Instruction::SRem: + return SelectRem(I, /*isSigned*/ true); + case Instruction::URem: + return SelectRem(I, /*isSigned*/ false); + case Instruction::Call: + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + return SelectIntrinsicCall(*II); + return SelectCall(I); + case Instruction::Select: + return SelectSelect(I); + case Instruction::Ret: + return SelectRet(I); + case Instruction::Trunc: + return SelectTrunc(I); + case Instruction::ZExt: + case Instruction::SExt: + return SelectIntExt(I); + case Instruction::Shl: + return SelectShift(I, ARM_AM::lsl); + case Instruction::LShr: + return SelectShift(I, ARM_AM::lsr); + case Instruction::AShr: + return SelectShift(I, ARM_AM::asr); + default: break; + } + return false; +} + +namespace { +// This table describes sign- and zero-extend instructions which can be +// folded into a preceding load. All of these extends have an immediate +// (sometimes a mask and sometimes a shift) that's applied after +// extension. +const struct FoldableLoadExtendsStruct { + uint16_t Opc[2]; // ARM, Thumb. + uint8_t ExpectedImm; + uint8_t isZExt : 1; + uint8_t ExpectedVT : 7; +} FoldableLoadExtends[] = { + { { ARM::SXTH, ARM::t2SXTH }, 0, 0, MVT::i16 }, + { { ARM::UXTH, ARM::t2UXTH }, 0, 1, MVT::i16 }, + { { ARM::ANDri, ARM::t2ANDri }, 255, 1, MVT::i8 }, + { { ARM::SXTB, ARM::t2SXTB }, 0, 0, MVT::i8 }, + { { ARM::UXTB, ARM::t2UXTB }, 0, 1, MVT::i8 } +}; +} + +/// \brief The specified machine instr operand is a vreg, and that +/// vreg is being provided by the specified load instruction. If possible, +/// try to fold the load as an operand to the instruction, returning true if +/// successful. +bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) { + // Verify we have a legal type before going any further. + MVT VT; + if (!isLoadTypeLegal(LI->getType(), VT)) + return false; + + // Combine load followed by zero- or sign-extend. 
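+  // The extending load already produces the extended value, so the extend
+  // instruction itself can be removed once its result register is defined
+  // directly by the load: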
+ // ldrb r1, [r0] ldrb r1, [r0] + // uxtb r2, r1 => + // mov r3, r2 mov r3, r1 + if (MI->getNumOperands() < 3 || !MI->getOperand(2).isImm()) + return false; + const uint64_t Imm = MI->getOperand(2).getImm(); + + bool Found = false; + bool isZExt; + for (unsigned i = 0, e = array_lengthof(FoldableLoadExtends); + i != e; ++i) { + if (FoldableLoadExtends[i].Opc[isThumb2] == MI->getOpcode() && + (uint64_t)FoldableLoadExtends[i].ExpectedImm == Imm && + MVT((MVT::SimpleValueType)FoldableLoadExtends[i].ExpectedVT) == VT) { + Found = true; + isZExt = FoldableLoadExtends[i].isZExt; + } + } + if (!Found) return false; + + // See if we can handle this address. + Address Addr; + if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false; + + unsigned ResultReg = MI->getOperand(0).getReg(); + if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false)) + return false; + MI->eraseFromParent(); + return true; +} + +unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, + unsigned Align, MVT VT) { + bool UseGOT_PREL = + !(GV->hasHiddenVisibility() || GV->hasLocalLinkage()); + + LLVMContext *Context = &MF->getFunction()->getContext(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( + GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, + UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier, + /*AddCurrentAddress=*/UseGOT_PREL); + + unsigned ConstAlign = + MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context)); + unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign); + + unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); + unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp; + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg) + .addConstantPoolIndex(Idx); + if (Opc == ARM::LDRcp) + MIB.addImm(0); + AddDefaultPred(MIB); + + // Fix the address by adding pc. + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + Opc = Subtarget->isThumb() ? ARM::tPICADD : UseGOT_PREL ? ARM::PICLDR + : ARM::PICADD; + DestReg = constrainOperandRegClass(TII.get(Opc), DestReg, 0); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addReg(TempReg) + .addImm(ARMPCLabelIndex); + if (!Subtarget->isThumb()) + AddDefaultPred(MIB); + + if (UseGOT_PREL && Subtarget->isThumb()) { + unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::t2LDRi12), NewDestReg) + .addReg(DestReg) + .addImm(0); + DestReg = NewDestReg; + AddOptionalDefs(MIB); + } + return DestReg; +} + +bool ARMFastISel::fastLowerArguments() { + if (!FuncInfo.CanLowerReturn) + return false; + + const Function *F = FuncInfo.Fn; + if (F->isVarArg()) + return false; + + CallingConv::ID CC = F->getCallingConv(); + switch (CC) { + default: + return false; + case CallingConv::Fast: + case CallingConv::C: + case CallingConv::ARM_AAPCS_VFP: + case CallingConv::ARM_AAPCS: + case CallingConv::ARM_APCS: + break; + } + + // Only handle simple cases. i.e. Up to 4 i8/i16/i32 scalar arguments + // which are passed in r0 - r3. 
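+  // The loop below only vets the arguments; a second loop afterwards maps
+  // each one to its incoming GPR live-in.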
+ unsigned Idx = 1; + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I, ++Idx) { + if (Idx > 4) + return false; + + if (F->getAttributes().hasAttribute(Idx, Attribute::InReg) || + F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || + F->getAttributes().hasAttribute(Idx, Attribute::ByVal)) + return false; + + Type *ArgTy = I->getType(); + if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) + return false; + + EVT ArgVT = TLI.getValueType(DL, ArgTy); + if (!ArgVT.isSimple()) return false; + switch (ArgVT.getSimpleVT().SimpleTy) { + case MVT::i8: + case MVT::i16: + case MVT::i32: + break; + default: + return false; + } + } + + + static const MCPhysReg GPRArgRegs[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3 + }; + + const TargetRegisterClass *RC = &ARM::rGPRRegClass; + Idx = 0; + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I, ++Idx) { + unsigned SrcReg = GPRArgRegs[Idx]; + unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); + // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. + // Without this, EmitLiveInCopies may eliminate the livein if its only + // use is a bitcast (which isn't turned into an instruction). + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), + ResultReg).addReg(DstReg, getKillRegState(true)); + updateValueMap(&*I, ResultReg); + } + + return true; +} + +namespace llvm { + FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) { + if (funcInfo.MF->getSubtarget<ARMSubtarget>().useFastISel()) + return new ARMFastISel(funcInfo, libInfo); + + return nullptr; + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMFeatures.h b/contrib/llvm/lib/Target/ARM/ARMFeatures.h new file mode 100644 index 0000000..0c910ab --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMFeatures.h @@ -0,0 +1,97 @@ +//===-- ARMFeatures.h - Checks for ARM instruction features -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the code shared between ARM CodeGen and ARM MC +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMFEATURES_H +#define LLVM_LIB_TARGET_ARM_ARMFEATURES_H + +#include "MCTargetDesc/ARMMCTargetDesc.h" + +namespace llvm { + +template<typename InstrType> // could be MachineInstr or MCInst +bool IsCPSRDead(InstrType *Instr); + +template<typename InstrType> // could be MachineInstr or MCInst +inline bool isV8EligibleForIT(InstrType *Instr) { + switch (Instr->getOpcode()) { + default: + return false; + case ARM::tADC: + case ARM::tADDi3: + case ARM::tADDi8: + case ARM::tADDrr: + case ARM::tAND: + case ARM::tASRri: + case ARM::tASRrr: + case ARM::tBIC: + case ARM::tEOR: + case ARM::tLSLri: + case ARM::tLSLrr: + case ARM::tLSRri: + case ARM::tLSRrr: + case ARM::tMOVi8: + case ARM::tMUL: + case ARM::tMVN: + case ARM::tORR: + case ARM::tROR: + case ARM::tRSB: + case ARM::tSBC: + case ARM::tSUBi3: + case ARM::tSUBi8: + case ARM::tSUBrr: + // Outside of an IT block, these set CPSR. 
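+    // They are therefore only IT-eligible when that CPSR def would be dead.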
+ return IsCPSRDead(Instr); + case ARM::tADDrSPi: + case ARM::tCMNz: + case ARM::tCMPi8: + case ARM::tCMPr: + case ARM::tLDRBi: + case ARM::tLDRBr: + case ARM::tLDRHi: + case ARM::tLDRHr: + case ARM::tLDRSB: + case ARM::tLDRSH: + case ARM::tLDRi: + case ARM::tLDRr: + case ARM::tLDRspi: + case ARM::tSTRBi: + case ARM::tSTRBr: + case ARM::tSTRHi: + case ARM::tSTRHr: + case ARM::tSTRi: + case ARM::tSTRr: + case ARM::tSTRspi: + case ARM::tTST: + return true; +// there are some "conditionally deprecated" opcodes + case ARM::tADDspr: + case ARM::tBLXr: + return Instr->getOperand(2).getReg() != ARM::PC; + // ADD PC, SP and BLX PC were always unpredictable, + // now on top of it they're deprecated + case ARM::tADDrSP: + case ARM::tBX: + return Instr->getOperand(0).getReg() != ARM::PC; + case ARM::tADDhirr: + return Instr->getOperand(0).getReg() != ARM::PC && + Instr->getOperand(2).getReg() != ARM::PC; + case ARM::tCMPhir: + case ARM::tMOVr: + return Instr->getOperand(0).getReg() != ARM::PC && + Instr->getOperand(1).getReg() != ARM::PC; + } +} + +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp new file mode 100644 index 0000000..c5990bb --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -0,0 +1,2168 @@ +//===-- ARMFrameLowering.cpp - ARM Frame Information ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#include "ARMFrameLowering.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMConstantPoolValue.h" +#include "ARMMachineFunctionInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Function.h" +#include "llvm/MC/MCContext.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +static cl::opt<bool> +SpillAlignedNEONRegs("align-neon-spills", cl::Hidden, cl::init(true), + cl::desc("Align ARM NEON spills in prolog and epilog")); + +static MachineBasicBlock::iterator +skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI, + unsigned NumAlignedDPRCS2Regs); + +ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti) + : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4), + STI(sti) {} + +bool ARMFrameLowering::noFramePointerElim(const MachineFunction &MF) const { + // iOS always has a FP for backtracking, force other targets to keep their FP + // when doing FastISel. The emitted code is currently superior, and in cases + // like test-suite's lencod FastISel isn't quite correct when FP is eliminated. + return TargetFrameLowering::noFramePointerElim(MF) || + MF.getSubtarget<ARMSubtarget>().useFastISel(); +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. 
This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+bool ARMFrameLowering::hasFP(const MachineFunction &MF) const {
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+
+  // iOS requires FP not to be clobbered for backtracing purposes.
+  if (STI.isTargetIOS() || STI.isTargetWatchOS())
+    return true;
+
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Always eliminate non-leaf frame pointers.
+  return ((MF.getTarget().Options.DisableFramePointerElim(MF) &&
+           MFI->hasCalls()) ||
+          RegInfo->needsStackRealignment(MF) ||
+          MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function. This eliminates the need for
+/// add/sub sp brackets around call sites. Returns true if the call frame is
+/// included as part of the stack frame.
+bool ARMFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  unsigned CFSize = FFI->getMaxCallFrameSize();
+  // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has a small immediate offset range
+  // for addressing the stack frame, so a large call frame can cause poor
+  // codegen and may even make it impossible to scavenge a register.
+  if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12
+    return false;
+
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified. Unlike most targets, having a FP
+/// is not sufficient here since we still may reference some objects via SP
+/// even when FP is available in Thumb2 mode.
+bool
+ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+  return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+static bool isCSRestore(MachineInstr *MI,
+                        const ARMBaseInstrInfo &TII,
+                        const MCPhysReg *CSRegs) {
+  // Integer spill area is handled with "pop".
+  if (isPopOpcode(MI->getOpcode())) {
+    // The first two operands are predicates. The last two are
+    // imp-def and imp-use of SP. Check everything in between.
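+    // A pop qualifies as a callee-save restore only if every register it
+    // reloads is callee-saved.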
+ for (int i = 5, e = MI->getNumOperands(); i != e; ++i) + if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs)) + return false; + return true; + } + if ((MI->getOpcode() == ARM::LDR_POST_IMM || + MI->getOpcode() == ARM::LDR_POST_REG || + MI->getOpcode() == ARM::t2LDR_POST) && + isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) && + MI->getOperand(1).getReg() == ARM::SP) + return true; + + return false; +} + +static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + const ARMBaseInstrInfo &TII, unsigned DestReg, + unsigned SrcReg, int NumBytes, + unsigned MIFlags = MachineInstr::NoFlags, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0) { + if (isARM) + emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes, + Pred, PredReg, TII, MIFlags); + else + emitT2RegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes, + Pred, PredReg, TII, MIFlags); +} + +static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + const ARMBaseInstrInfo &TII, int NumBytes, + unsigned MIFlags = MachineInstr::NoFlags, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0) { + emitRegPlusImmediate(isARM, MBB, MBBI, dl, TII, ARM::SP, ARM::SP, NumBytes, + MIFlags, Pred, PredReg); +} + +static int sizeOfSPAdjustment(const MachineInstr *MI) { + int RegSize; + switch (MI->getOpcode()) { + case ARM::VSTMDDB_UPD: + RegSize = 8; + break; + case ARM::STMDB_UPD: + case ARM::t2STMDB_UPD: + RegSize = 4; + break; + case ARM::t2STR_PRE: + case ARM::STR_PRE_IMM: + return 4; + default: + llvm_unreachable("Unknown push or pop like instruction"); + } + + int count = 0; + // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+ + // pred) so the list starts at 4. + for (int i = MI->getNumOperands() - 1; i >= 4; --i) + count += RegSize; + return count; +} + +static bool WindowsRequiresStackProbe(const MachineFunction &MF, + size_t StackSizeInBytes) { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *F = MF.getFunction(); + unsigned StackProbeSize = (MFI->getStackProtectorIndex() > 0) ? 
4080 : 4096; + if (F->hasFnAttribute("stack-probe-size")) + F->getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + return StackSizeInBytes >= StackProbeSize; +} + +namespace { +struct StackAdjustingInsts { + struct InstInfo { + MachineBasicBlock::iterator I; + unsigned SPAdjust; + bool BeforeFPSet; + }; + + SmallVector<InstInfo, 4> Insts; + + void addInst(MachineBasicBlock::iterator I, unsigned SPAdjust, + bool BeforeFPSet = false) { + InstInfo Info = {I, SPAdjust, BeforeFPSet}; + Insts.push_back(Info); + } + + void addExtraBytes(const MachineBasicBlock::iterator I, unsigned ExtraBytes) { + auto Info = std::find_if(Insts.begin(), Insts.end(), + [&](InstInfo &Info) { return Info.I == I; }); + assert(Info != Insts.end() && "invalid sp adjusting instruction"); + Info->SPAdjust += ExtraBytes; + } + + void emitDefCFAOffsets(MachineModuleInfo &MMI, MachineBasicBlock &MBB, + DebugLoc dl, const ARMBaseInstrInfo &TII, bool HasFP) { + unsigned CFAOffset = 0; + for (auto &Info : Insts) { + if (HasFP && !Info.BeforeFPSet) + return; + + CFAOffset -= Info.SPAdjust; + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset)); + BuildMI(MBB, std::next(Info.I), dl, + TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + } +}; +} + +/// Emit an instruction sequence that will align the address in +/// register Reg by zero-ing out the lower bits. For versions of the +/// architecture that support Neon, this must be done in a single +/// instruction, since skipAlignedDPRCS2Spills assumes it is done in a +/// single instruction. That function only gets called when optimizing +/// spilling of D registers on a core with the Neon instruction set +/// present. 
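+/// For example, with Alignment == 16 the single-instruction form is
+/// "bfc Reg, #0, #4".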
+static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI, + const TargetInstrInfo &TII, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, const unsigned Reg, + const unsigned Alignment, + const bool MustBeSingleInstruction) { + const ARMSubtarget &AST = + static_cast<const ARMSubtarget &>(MF.getSubtarget()); + const bool CanUseBFC = AST.hasV6T2Ops() || AST.hasV7Ops(); + const unsigned AlignMask = Alignment - 1; + const unsigned NrBitsToZero = countTrailingZeros(Alignment); + assert(!AFI->isThumb1OnlyFunction() && "Thumb1 not supported"); + if (!AFI->isThumbFunction()) { + // if the BFC instruction is available, use that to zero the lower + // bits: + // bfc Reg, #0, log2(Alignment) + // otherwise use BIC, if the mask to zero the required number of bits + // can be encoded in the bic immediate field + // bic Reg, Reg, Alignment-1 + // otherwise, emit + // lsr Reg, Reg, log2(Alignment) + // lsl Reg, Reg, log2(Alignment) + if (CanUseBFC) { + AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BFC), Reg) + .addReg(Reg, RegState::Kill) + .addImm(~AlignMask)); + } else if (AlignMask <= 255) { + AddDefaultCC( + AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BICri), Reg) + .addReg(Reg, RegState::Kill) + .addImm(AlignMask))); + } else { + assert(!MustBeSingleInstruction && + "Shouldn't call emitAligningInstructions demanding a single " + "instruction to be emitted for large stack alignment for a target " + "without BFC."); + AddDefaultCC(AddDefaultPred( + BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg) + .addReg(Reg, RegState::Kill) + .addImm(ARM_AM::getSORegOpc(ARM_AM::lsr, NrBitsToZero)))); + AddDefaultCC(AddDefaultPred( + BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg) + .addReg(Reg, RegState::Kill) + .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, NrBitsToZero)))); + } + } else { + // Since this is only reached for Thumb-2 targets, the BFC instruction + // should always be available. + assert(CanUseBFC); + AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::t2BFC), Reg) + .addReg(Reg, RegState::Kill) + .addImm(~AlignMask)); + } +} + +void ARMFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + MachineModuleInfo &MMI = MF.getMMI(); + MCContext &Context = MMI.getContext(); + const TargetMachine &TM = MF.getTarget(); + const MCRegisterInfo *MRI = Context.getRegisterInfo(); + const ARMBaseRegisterInfo *RegInfo = STI.getRegisterInfo(); + const ARMBaseInstrInfo &TII = *STI.getInstrInfo(); + assert(!AFI->isThumb1OnlyFunction() && + "This emitPrologue does not support Thumb1!"); + bool isARM = !AFI->isThumbFunction(); + unsigned Align = STI.getFrameLowering()->getStackAlignment(); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); + unsigned NumBytes = MFI->getStackSize(); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; + + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + // Determine the sizes of each callee-save spill areas and record which frame + // belongs to which callee-save spill areas. + unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; + int FramePtrSpillFI = 0; + int D8SpillFI = 0; + + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. 
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + return; + + StackAdjustingInsts DefCFAOffsetCandidates; + bool HasFP = hasFP(MF); + + // Allocate the vararg register save area. + if (ArgRegsSaveSize) { + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize, + MachineInstr::FrameSetup); + DefCFAOffsetCandidates.addInst(std::prev(MBBI), ArgRegsSaveSize, true); + } + + if (!AFI->hasStackFrame() && + (!STI.isTargetWindows() || !WindowsRequiresStackProbe(MF, NumBytes))) { + if (NumBytes - ArgRegsSaveSize != 0) { + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -(NumBytes - ArgRegsSaveSize), + MachineInstr::FrameSetup); + DefCFAOffsetCandidates.addInst(std::prev(MBBI), + NumBytes - ArgRegsSaveSize, true); + } + DefCFAOffsetCandidates.emitDefCFAOffsets(MMI, MBB, dl, TII, HasFP); + return; + } + + // Determine spill area sizes. + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + int FI = CSI[i].getFrameIdx(); + switch (Reg) { + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + case ARM::R12: + if (STI.isTargetDarwin()) { + GPRCS2Size += 4; + break; + } + // fallthrough + case ARM::R0: + case ARM::R1: + case ARM::R2: + case ARM::R3: + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + GPRCS1Size += 4; + break; + default: + // This is a DPR. Exclude the aligned DPRCS2 spills. + if (Reg == ARM::D8) + D8SpillFI = FI; + if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs()) + DPRCSSize += 8; + } + } + + // Move past area 1. + MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push; + if (GPRCS1Size > 0) { + GPRCS1Push = LastPush = MBBI++; + DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, true); + } + + // Determine starting offsets of spill areas. + unsigned GPRCS1Offset = NumBytes - ArgRegsSaveSize - GPRCS1Size; + unsigned GPRCS2Offset = GPRCS1Offset - GPRCS2Size; + unsigned DPRAlign = DPRCSSize ? std::min(8U, Align) : 4U; + unsigned DPRGapSize = (GPRCS1Size + GPRCS2Size + ArgRegsSaveSize) % DPRAlign; + unsigned DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize; + int FramePtrOffsetInPush = 0; + if (HasFP) { + FramePtrOffsetInPush = + MFI->getObjectOffset(FramePtrSpillFI) + ArgRegsSaveSize; + AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + + NumBytes); + } + AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); + AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); + AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); + + // Move past area 2. + if (GPRCS2Size > 0) { + GPRCS2Push = LastPush = MBBI++; + DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size); + } + + // Prolog/epilog inserter assumes we correctly align DPRs on the stack, so our + // .cfi_offset operations will reflect that. + if (DPRGapSize) { + assert(DPRGapSize == 4 && "unexpected alignment requirements for DPRs"); + if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, DPRGapSize)) + DefCFAOffsetCandidates.addExtraBytes(LastPush, DPRGapSize); + else { + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize, + MachineInstr::FrameSetup); + DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize); + } + } + + // Move past area 3. + if (DPRCSSize > 0) { + // Since vpush register list cannot have gaps, there may be multiple vpush + // instructions in the prologue. + while (MBBI->getOpcode() == ARM::VSTMDDB_UPD) { + DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(MBBI)); + LastPush = MBBI++; + } + } + + // Move past the aligned DPRCS2 area. 
+ if (AFI->getNumAlignedDPRCS2Regs() > 0) { + MBBI = skipAlignedDPRCS2Spills(MBBI, AFI->getNumAlignedDPRCS2Regs()); + // The code inserted by emitAlignedDPRCS2Spills realigns the stack, and + // leaves the stack pointer pointing to the DPRCS2 area. + // + // Adjust NumBytes to represent the stack slots below the DPRCS2 area. + NumBytes += MFI->getObjectOffset(D8SpillFI); + } else + NumBytes = DPRCSOffset; + + if (STI.isTargetWindows() && WindowsRequiresStackProbe(MF, NumBytes)) { + uint32_t NumWords = NumBytes >> 2; + + if (NumWords < 65536) + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4) + .addImm(NumWords) + .setMIFlags(MachineInstr::FrameSetup)); + else + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R4) + .addImm(NumWords) + .setMIFlags(MachineInstr::FrameSetup); + + switch (TM.getCodeModel()) { + case CodeModel::Small: + case CodeModel::Medium: + case CodeModel::Default: + case CodeModel::Kernel: + BuildMI(MBB, MBBI, dl, TII.get(ARM::tBL)) + .addImm((unsigned)ARMCC::AL).addReg(0) + .addExternalSymbol("__chkstk") + .addReg(ARM::R4, RegState::Implicit) + .setMIFlags(MachineInstr::FrameSetup); + break; + case CodeModel::Large: + case CodeModel::JITDefault: + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R12) + .addExternalSymbol("__chkstk") + .setMIFlags(MachineInstr::FrameSetup); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::tBLXr)) + .addImm((unsigned)ARMCC::AL).addReg(0) + .addReg(ARM::R12, RegState::Kill) + .addReg(ARM::R4, RegState::Implicit) + .setMIFlags(MachineInstr::FrameSetup); + break; + } + + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), + ARM::SP) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::R4, RegState::Kill) + .setMIFlags(MachineInstr::FrameSetup))); + NumBytes = 0; + } + + if (NumBytes) { + // Adjust SP after all the callee-save spills. + if (AFI->getNumAlignedDPRCS2Regs() == 0 && + tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes)) + DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes); + else { + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, + MachineInstr::FrameSetup); + DefCFAOffsetCandidates.addInst(std::prev(MBBI), NumBytes); + } + + if (HasFP && isARM) + // Restore from fp only in ARM mode: e.g. sub sp, r7, #24 + // Note it's not safe to do this in Thumb2 mode because it would have + // taken two instructions: + // mov sp, r7 + // sub sp, #24 + // If an interrupt is taken between the two instructions, then sp is in + // an inconsistent state (pointing to the middle of callee-saved area). + // The interrupt handler can end up clobbering the registers. + AFI->setShouldRestoreSPFromFP(true); + } + + // Set FP to point to the stack slot that contains the previous FP. + // For iOS, FP is R7, which has now been stored in spill area 1. + // Otherwise, if this is not iOS, all the callee-saved registers go + // into spill area 1, including the FP in R11. In either case, it + // is in area one and the adjustment needs to take place just after + // that push. 
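+  // e.g. on iOS this materializes something like "add r7, sp, #8" right
+  // after the push (offset illustrative), and the CFI emitted below then
+  // redefines the CFA in terms of the frame pointer.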
+ if (HasFP) { + MachineBasicBlock::iterator AfterPush = std::next(GPRCS1Push); + unsigned PushSize = sizeOfSPAdjustment(GPRCS1Push); + emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, + dl, TII, FramePtr, ARM::SP, + PushSize + FramePtrOffsetInPush, + MachineInstr::FrameSetup); + if (FramePtrOffsetInPush + PushSize != 0) { + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa( + nullptr, MRI->getDwarfRegNum(FramePtr, true), + -(ArgRegsSaveSize - FramePtrOffsetInPush))); + BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } else { + unsigned CFIIndex = + MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister( + nullptr, MRI->getDwarfRegNum(FramePtr, true))); + BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + } + + // Now that the prologue's actual instructions are finalised, we can insert + // the necessary DWARF cf instructions to describe the situation. Start by + // recording where each register ended up: + if (GPRCS1Size > 0) { + MachineBasicBlock::iterator Pos = std::next(GPRCS1Push); + int CFIIndex; + for (const auto &Entry : CSI) { + unsigned Reg = Entry.getReg(); + int FI = Entry.getFrameIdx(); + switch (Reg) { + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + case ARM::R12: + if (STI.isTargetDarwin()) + break; + // fallthrough + case ARM::R0: + case ARM::R1: + case ARM::R2: + case ARM::R3: + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(Reg, true), MFI->getObjectOffset(FI))); + BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + break; + } + } + } + + if (GPRCS2Size > 0) { + MachineBasicBlock::iterator Pos = std::next(GPRCS2Push); + for (const auto &Entry : CSI) { + unsigned Reg = Entry.getReg(); + int FI = Entry.getFrameIdx(); + switch (Reg) { + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + case ARM::R12: + if (STI.isTargetDarwin()) { + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned Offset = MFI->getObjectOffset(FI); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + break; + } + } + } + + if (DPRCSSize > 0) { + // Since vpush register list cannot have gaps, there may be multiple vpush + // instructions in the prologue. + MachineBasicBlock::iterator Pos = std::next(LastPush); + for (const auto &Entry : CSI) { + unsigned Reg = Entry.getReg(); + int FI = Entry.getFrameIdx(); + if ((Reg >= ARM::D0 && Reg <= ARM::D31) && + (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) { + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned Offset = MFI->getObjectOffset(FI); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + } + } + + // Now we can emit descriptions of where the canonical frame address was + // throughout the process. 
If we have a frame pointer, it takes over the job + // half-way through, so only the first few .cfi_def_cfa_offset instructions + // actually get emitted. + DefCFAOffsetCandidates.emitDefCFAOffsets(MMI, MBB, dl, TII, HasFP); + + if (STI.isTargetELF() && hasFP(MF)) + MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - + AFI->getFramePtrSpillOffset()); + + AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); + AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); + AFI->setDPRCalleeSavedGapSize(DPRGapSize); + AFI->setDPRCalleeSavedAreaSize(DPRCSSize); + + // If we need dynamic stack realignment, do it here. Be paranoid and make + // sure if we also have VLAs, we have a base pointer for frame access. + // If aligned NEON registers were spilled, the stack has already been + // realigned. + if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) { + unsigned MaxAlign = MFI->getMaxAlignment(); + assert(!AFI->isThumb1OnlyFunction()); + if (!AFI->isThumbFunction()) { + emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign, + false); + } else { + // We cannot use sp as source/dest register here, thus we're using r4 to + // perform the calculations. We're emitting the following sequence: + // mov r4, sp + // -- use emitAligningInstructions to produce best sequence to zero + // -- out lower bits in r4 + // mov sp, r4 + // FIXME: It will be better just to find spare register here. + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4) + .addReg(ARM::SP, RegState::Kill)); + emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign, + false); + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP) + .addReg(ARM::R4, RegState::Kill)); + } + + AFI->setShouldRestoreSPFromFP(true); + } + + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + // FIXME: Clarify FrameSetup flags here. + if (RegInfo->hasBasePointer(MF)) { + if (isARM) + BuildMI(MBB, MBBI, dl, + TII.get(ARM::MOVr), RegInfo->getBaseRegister()) + .addReg(ARM::SP) + .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + else + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), + RegInfo->getBaseRegister()) + .addReg(ARM::SP)); + } + + // If the frame has variable sized objects then the epilogue must restore + // the sp from fp. We can assume there's an FP here since hasFP already + // checks for hasVarSizedObjects. + if (MFI->hasVarSizedObjects()) + AFI->setShouldRestoreSPFromFP(true); +} + +void ARMFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + const ARMBaseInstrInfo &TII = + *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo()); + assert(!AFI->isThumb1OnlyFunction() && + "This emitEpilogue does not support Thumb1!"); + bool isARM = !AFI->isThumbFunction(); + + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); + int NumBytes = (int)MFI->getStackSize(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + return; + + // First put ourselves on the first (from top) terminator instructions. 
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + if (!AFI->hasStackFrame()) { + if (NumBytes - ArgRegsSaveSize != 0) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize); + } else { + // Unwind MBBI to point to first LDR / VLDRD. + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + if (MBBI != MBB.begin()) { + do { + --MBBI; + } while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs)); + if (!isCSRestore(MBBI, TII, CSRegs)) + ++MBBI; + } + + // Move SP to start of FP callee save spill area. + NumBytes -= (ArgRegsSaveSize + + AFI->getGPRCalleeSavedArea1Size() + + AFI->getGPRCalleeSavedArea2Size() + + AFI->getDPRCalleeSavedGapSize() + + AFI->getDPRCalleeSavedAreaSize()); + + // Reset SP based on frame pointer only if the stack frame extends beyond + // frame pointer stack slot or target is ELF and the function has FP. + if (AFI->shouldRestoreSPFromFP()) { + NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; + if (NumBytes) { + if (isARM) + emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, + ARMCC::AL, 0, TII); + else { + // It's not possible to restore SP from FP in a single instruction. + // For iOS, this looks like: + // mov sp, r7 + // sub sp, #24 + // This is bad, if an interrupt is taken after the mov, sp is in an + // inconsistent state. + // Use the first callee-saved register as a scratch register. + assert(!MFI->getPristineRegs(MF).test(ARM::R4) && + "No scratch register to restore SP from FP!"); + emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, + ARMCC::AL, 0, TII); + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), + ARM::SP) + .addReg(ARM::R4)); + } + } else { + // Thumb2 or ARM. + if (isARM) + BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP) + .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + else + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), + ARM::SP) + .addReg(FramePtr)); + } + } else if (NumBytes && + !tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes)) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); + + // Increment past our save areas. + if (AFI->getDPRCalleeSavedAreaSize()) { + MBBI++; + // Since vpop register list cannot have gaps, there may be multiple vpop + // instructions in the epilogue. + while (MBBI->getOpcode() == ARM::VLDMDIA_UPD) + MBBI++; + } + if (AFI->getDPRCalleeSavedGapSize()) { + assert(AFI->getDPRCalleeSavedGapSize() == 4 && + "unexpected DPR alignment gap"); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedGapSize()); + } + + if (AFI->getGPRCalleeSavedArea2Size()) MBBI++; + if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; + } + + if (ArgRegsSaveSize) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize); +} + +/// getFrameIndexReference - Provide a base+offset reference to an FI slot for +/// debug info. It's the same as what we use for resolving the code-gen +/// references for now. FIXME: This can go wrong when references are +/// SP-relative and simple call frames aren't used. 
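+/// The returned offset is relative to whichever register is written to
+/// FrameReg: SP, FP, or the base pointer.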
+int +ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + return ResolveFrameIndexReference(MF, FI, FrameReg, 0); +} + +int +ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg, + int SPAdj) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); + int FPOffset = Offset - AFI->getFramePtrSpillOffset(); + bool isFixed = MFI->isFixedObjectIndex(FI); + + FrameReg = ARM::SP; + Offset += SPAdj; + + // SP can move around if there are allocas. We may also lose track of SP + // when emergency spilling inside a non-reserved call frame setup. + bool hasMovingSP = !hasReservedCallFrame(MF); + + // When dynamically realigning the stack, use the frame pointer for + // parameters, and the stack/base pointer for locals. + if (RegInfo->needsStackRealignment(MF)) { + assert (hasFP(MF) && "dynamic stack realignment without a FP!"); + if (isFixed) { + FrameReg = RegInfo->getFrameRegister(MF); + Offset = FPOffset; + } else if (hasMovingSP) { + assert(RegInfo->hasBasePointer(MF) && + "VLAs and dynamic stack alignment, but missing base pointer!"); + FrameReg = RegInfo->getBaseRegister(); + } + return Offset; + } + + // If there is a frame pointer, use it when we can. + if (hasFP(MF) && AFI->hasStackFrame()) { + // Use frame pointer to reference fixed objects. Use it for locals if + // there are VLAs (and thus the SP isn't reliable as a base). + if (isFixed || (hasMovingSP && !RegInfo->hasBasePointer(MF))) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } else if (hasMovingSP) { + assert(RegInfo->hasBasePointer(MF) && "missing base pointer!"); + if (AFI->isThumb2Function()) { + // Try to use the frame pointer if we can, else use the base pointer + // since it's available. This is handy for the emergency spill slot, in + // particular. + if (FPOffset >= -255 && FPOffset < 0) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + } + } else if (AFI->isThumb2Function()) { + // Use add <rd>, sp, #<imm8> + // ldr <rd>, [sp, #<imm8>] + // if at all possible to save space. + if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020) + return Offset; + // In Thumb2 mode, the negative offset is very limited. Try to avoid + // out of range references. ldr <rt>,[<rn>, #-<imm8>] + if (FPOffset >= -255 && FPOffset < 0) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + } else if (Offset > (FPOffset < 0 ? -FPOffset : FPOffset)) { + // Otherwise, use SP or FP, whichever is closer to the stack slot. + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + } + // Use the base pointer if we have one. 
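+  // An SP-relative offset is still valid against the base pointer, since the
+  // prologue sets the base pointer to the post-prologue value of SP.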
+ if (RegInfo->hasBasePointer(MF)) + FrameReg = RegInfo->getBaseRegister(); + return Offset; +} + +void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + unsigned StmOpc, unsigned StrOpc, + bool NoGap, + bool(*Func)(unsigned, bool), + unsigned NumAlignedDPRCS2Regs, + unsigned MIFlags) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + + DebugLoc DL; + + SmallVector<std::pair<unsigned,bool>, 4> Regs; + unsigned i = CSI.size(); + while (i != 0) { + unsigned LastReg = 0; + for (; i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (!(Func)(Reg, STI.isTargetDarwin())) continue; + + // D-registers in the aligned area DPRCS2 are NOT spilled here. + if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) + continue; + + // Add the callee-saved register as live-in unless it's LR and + // @llvm.returnaddress is called. If LR is returned for + // @llvm.returnaddress then it's already added to the function and + // entry block live-in sets. + bool isKill = true; + if (Reg == ARM::LR) { + if (MF.getFrameInfo()->isReturnAddressTaken() && + MF.getRegInfo().isLiveIn(Reg)) + isKill = false; + } + + if (isKill) + MBB.addLiveIn(Reg); + + // If NoGap is true, push consecutive registers and then leave the rest + // for other instructions. e.g. + // vpush {d8, d10, d11} -> vpush {d8}, vpush {d10, d11} + if (NoGap && LastReg && LastReg != Reg-1) + break; + LastReg = Reg; + Regs.push_back(std::make_pair(Reg, isKill)); + } + + if (Regs.empty()) + continue; + if (Regs.size() > 1 || StrOpc== 0) { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP) + .addReg(ARM::SP).setMIFlags(MIFlags)); + for (unsigned i = 0, e = Regs.size(); i < e; ++i) + MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second)); + } else if (Regs.size() == 1) { + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc), + ARM::SP) + .addReg(Regs[0].first, getKillRegState(Regs[0].second)) + .addReg(ARM::SP).setMIFlags(MIFlags) + .addImm(-4); + AddDefaultPred(MIB); + } + Regs.clear(); + + // Put any subsequent vpush instructions before this one: they will refer to + // higher register numbers so need to be pushed first in order to preserve + // monotonicity. 
+ if (MI != MBB.begin()) + --MI; + } +} + +void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + unsigned LdmOpc, unsigned LdrOpc, + bool isVarArg, bool NoGap, + bool(*Func)(unsigned, bool), + unsigned NumAlignedDPRCS2Regs) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + DebugLoc DL; + bool isTailCall = false; + bool isInterrupt = false; + bool isTrap = false; + if (MBB.end() != MI) { + DL = MI->getDebugLoc(); + unsigned RetOpcode = MI->getOpcode(); + isTailCall = (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri); + isInterrupt = + RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR; + isTrap = + RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl || + RetOpcode == ARM::tTRAP; + } + + SmallVector<unsigned, 4> Regs; + unsigned i = CSI.size(); + while (i != 0) { + unsigned LastReg = 0; + bool DeleteRet = false; + for (; i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (!(Func)(Reg, STI.isTargetDarwin())) continue; + + // The aligned reloads from area DPRCS2 are not inserted here. + if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) + continue; + + if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && + !isTrap && STI.hasV5TOps()) { + if (MBB.succ_empty()) { + Reg = ARM::PC; + DeleteRet = true; + LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; + } else + LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD; + // Fold the return instruction into the LDM. + } + + // If NoGap is true, pop consecutive registers and then leave the rest + // for other instructions. e.g. + // vpop {d8, d10, d11} -> vpop {d8}, vpop {d10, d11} + if (NoGap && LastReg && LastReg != Reg-1) + break; + + LastReg = Reg; + Regs.push_back(Reg); + } + + if (Regs.empty()) + continue; + if (Regs.size() > 1 || LdrOpc == 0) { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP) + .addReg(ARM::SP)); + for (unsigned i = 0, e = Regs.size(); i < e; ++i) + MIB.addReg(Regs[i], getDefRegState(true)); + if (DeleteRet && MI != MBB.end()) { + MIB.copyImplicitOps(&*MI); + MI->eraseFromParent(); + } + MI = MIB; + } else if (Regs.size() == 1) { + // If we adjusted the reg to PC from LR above, switch it back here. We + // only do that for LDM. + if (Regs[0] == ARM::PC) + Regs[0] = ARM::LR; + MachineInstrBuilder MIB = + BuildMI(MBB, MI, DL, TII.get(LdrOpc), Regs[0]) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::SP); + // ARM mode needs an extra reg0 here due to addrmode2. Will go away once + // that refactoring is complete (eventually). + if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) { + MIB.addReg(0); + MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift)); + } else + MIB.addImm(4); + AddDefaultPred(MIB); + } + Regs.clear(); + + // Put any subsequent vpop instructions after this one: they will refer to + // higher register numbers so need to be popped afterwards. + if (MI != MBB.end()) + ++MI; + } +} + +/// Emit aligned spill instructions for NumAlignedDPRCS2Regs D-registers +/// starting from d8. Also insert stack realignment code and leave the stack +/// pointer pointing to the d8 spill slot. 
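+/// The spills are emitted as 16-byte aligned vst1.64 stores of four or two
+/// d-registers at a time, with a plain vstr for any final odd register.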
+static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned NumAlignedDPRCS2Regs, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) { + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + + // Mark the D-register spill slots as properly aligned. Since MFI computes + // stack slot layout backwards, this can actually mean that the d-reg stack + // slot offsets can be wrong. The offset for d8 will always be correct. + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned DNum = CSI[i].getReg() - ARM::D8; + if (DNum > NumAlignedDPRCS2Regs - 1) + continue; + int FI = CSI[i].getFrameIdx(); + // The even-numbered registers will be 16-byte aligned, the odd-numbered + // registers will be 8-byte aligned. + MFI.setObjectAlignment(FI, DNum % 2 ? 8 : 16); + + // The stack slot for D8 needs to be maximally aligned because this is + // actually the point where we align the stack pointer. MachineFrameInfo + // computes all offsets relative to the incoming stack pointer which is a + // bit weird when realigning the stack. Any extra padding for this + // over-alignment is not realized because the code inserted below adjusts + // the stack pointer by numregs * 8 before aligning the stack pointer. + if (DNum == 0) + MFI.setObjectAlignment(FI, MFI.getMaxAlignment()); + } + + // Move the stack pointer to the d8 spill slot, and align it at the same + // time. Leave the stack slot address in the scratch register r4. + // + // sub r4, sp, #numregs * 8 + // bic r4, r4, #align - 1 + // mov sp, r4 + // + bool isThumb = AFI->isThumbFunction(); + assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1"); + AFI->setShouldRestoreSPFromFP(true); + + // sub r4, sp, #numregs * 8 + // The immediate is <= 64, so it doesn't need any special encoding. + unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri; + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4) + .addReg(ARM::SP) + .addImm(8 * NumAlignedDPRCS2Regs))); + + unsigned MaxAlign = MF.getFrameInfo()->getMaxAlignment(); + // We must set parameter MustBeSingleInstruction to true, since + // skipAlignedDPRCS2Spills expects exactly 3 instructions to perform + // stack alignment. Luckily, this can always be done since all ARM + // architecture versions that support Neon also support the BFC + // instruction. + emitAligningInstructions(MF, AFI, TII, MBB, MI, DL, ARM::R4, MaxAlign, true); + + // mov sp, r4 + // The stack pointer must be adjusted before spilling anything, otherwise + // the stack slots could be clobbered by an interrupt handler. + // Leave r4 live, it is used below. + Opc = isThumb ? ARM::tMOVr : ARM::MOVr; + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(Opc), ARM::SP) + .addReg(ARM::R4); + MIB = AddDefaultPred(MIB); + if (!isThumb) + AddDefaultCC(MIB); + + // Now spill NumAlignedDPRCS2Regs registers starting from d8. + // r4 holds the stack slot address. + unsigned NextReg = ARM::D8; + + // 16-byte aligned vst1.64 with 4 d-regs and address writeback. + // The writeback is only needed when emitting two vst1.64 instructions. 
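+  // e.g. seven aligned d-registers are spilled roughly as:
+  //   vst1.64 {d8, d9, d10, d11}, [r4:128]!
+  //   vst1.64 {d12, d13}, [r4:128]
+  //   vstr    d14, [r4, #16]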
+ if (NumAlignedDPRCS2Regs >= 6) { + unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, + &ARM::QQPRRegClass); + MBB.addLiveIn(SupReg); + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed), + ARM::R4) + .addReg(ARM::R4, RegState::Kill).addImm(16) + .addReg(NextReg) + .addReg(SupReg, RegState::ImplicitKill)); + NextReg += 4; + NumAlignedDPRCS2Regs -= 4; + } + + // We won't modify r4 beyond this point. It currently points to the next + // register to be spilled. + unsigned R4BaseReg = NextReg; + + // 16-byte aligned vst1.64 with 4 d-regs, no writeback. + if (NumAlignedDPRCS2Regs >= 4) { + unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, + &ARM::QQPRRegClass); + MBB.addLiveIn(SupReg); + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q)) + .addReg(ARM::R4).addImm(16).addReg(NextReg) + .addReg(SupReg, RegState::ImplicitKill)); + NextReg += 4; + NumAlignedDPRCS2Regs -= 4; + } + + // 16-byte aligned vst1.64 with 2 d-regs. + if (NumAlignedDPRCS2Regs >= 2) { + unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, + &ARM::QPRRegClass); + MBB.addLiveIn(SupReg); + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64)) + .addReg(ARM::R4).addImm(16).addReg(SupReg)); + NextReg += 2; + NumAlignedDPRCS2Regs -= 2; + } + + // Finally, use a vanilla vstr.64 for the odd last register. + if (NumAlignedDPRCS2Regs) { + MBB.addLiveIn(NextReg); + // vstr.64 uses addrmode5 which has an offset scale of 4. + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VSTRD)) + .addReg(NextReg) + .addReg(ARM::R4).addImm((NextReg-R4BaseReg)*2)); + } + + // The last spill instruction inserted should kill the scratch register r4. + std::prev(MI)->addRegisterKilled(ARM::R4, TRI); +} + +/// Skip past the code inserted by emitAlignedDPRCS2Spills, and return an +/// iterator to the following instruction. +static MachineBasicBlock::iterator +skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI, + unsigned NumAlignedDPRCS2Regs) { + // sub r4, sp, #numregs * 8 + // bic r4, r4, #align - 1 + // mov sp, r4 + ++MI; ++MI; ++MI; + assert(MI->mayStore() && "Expecting spill instruction"); + + // These switches all fall through. + switch(NumAlignedDPRCS2Regs) { + case 7: + ++MI; + assert(MI->mayStore() && "Expecting spill instruction"); + default: + ++MI; + assert(MI->mayStore() && "Expecting spill instruction"); + case 1: + case 2: + case 4: + assert(MI->killsRegister(ARM::R4) && "Missed kill flag"); + ++MI; + } + return MI; +} + +/// Emit aligned reload instructions for NumAlignedDPRCS2Regs D-registers +/// starting from d8. These instructions are assumed to execute while the +/// stack is still aligned, unlike the code inserted by emitPopInst. +static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned NumAlignedDPRCS2Regs, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) { + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + + // Find the frame index assigned to d8. + int D8SpillFI = 0; + for (unsigned i = 0, e = CSI.size(); i != e; ++i) + if (CSI[i].getReg() == ARM::D8) { + D8SpillFI = CSI[i].getFrameIdx(); + break; + } + + // Materialize the address of the d8 spill slot into the scratch register r4. 
+  // This can be fairly complicated if the stack frame is large, so just use
+  // the normal frame index elimination mechanism to do it. This code runs as
+  // the initial part of the epilog where the stack and base pointers haven't
+  // been changed yet.
+  bool isThumb = AFI->isThumbFunction();
+  assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
+
+  unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri;
+  AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+                                  .addFrameIndex(D8SpillFI).addImm(0)));
+
+  // Now restore NumAlignedDPRCS2Regs registers starting from d8.
+  unsigned NextReg = ARM::D8;
+
+  // 16-byte aligned vld1.64 with 4 d-regs and writeback.
+  if (NumAlignedDPRCS2Regs >= 6) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg)
+                       .addReg(ARM::R4, RegState::Define)
+                       .addReg(ARM::R4, RegState::Kill).addImm(16)
+                       .addReg(SupReg, RegState::ImplicitDefine));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // We won't modify r4 beyond this point. It currently points to the next
+  // register to be reloaded.
+  unsigned R4BaseReg = NextReg;
+
+  // 16-byte aligned vld1.64 with 4 d-regs, no writeback.
+  if (NumAlignedDPRCS2Regs >= 4) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg)
+                       .addReg(ARM::R4).addImm(16)
+                       .addReg(SupReg, RegState::ImplicitDefine));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // 16-byte aligned vld1.64 with 2 d-regs.
+  if (NumAlignedDPRCS2Regs >= 2) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QPRRegClass);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg)
+                       .addReg(ARM::R4).addImm(16));
+    NextReg += 2;
+    NumAlignedDPRCS2Regs -= 2;
+  }
+
+  // Finally, use a vanilla vldr.64 for the remaining odd register.
+  if (NumAlignedDPRCS2Regs)
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLDRD), NextReg)
+                       .addReg(ARM::R4).addImm(2*(NextReg-R4BaseReg)));
+
+  // The last reload kills r4.
+  std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
+}
+
+bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD;
+  unsigned PushOneOpc = AFI->isThumbFunction() ?
+    ARM::t2STR_PRE : ARM::STR_PRE_IMM;
+  unsigned FltOpc = ARM::VSTMDDB_UPD;
+  unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs();
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, 0,
+               MachineInstr::FrameSetup);
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, 0,
+               MachineInstr::FrameSetup);
+  emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register,
+               NumAlignedDPRCS2Regs, MachineInstr::FrameSetup);
+
+  // The code above does not insert spill code for the aligned DPRCS2 registers.
+  // The stack realignment code will be inserted between the push instructions
+  // and these spills.
+ if (NumAlignedDPRCS2Regs) + emitAlignedDPRCS2Spills(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI); + + return true; +} + +bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + bool isVarArg = AFI->getArgRegsSaveSize() > 0; + unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs(); + + // The emitPopInst calls below do not insert reloads for the aligned DPRCS2 + // registers. Do that here instead. + if (NumAlignedDPRCS2Regs) + emitAlignedDPRCS2Restores(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI); + + unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD; + unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST :ARM::LDR_POST_IMM; + unsigned FltOpc = ARM::VLDMDIA_UPD; + emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register, + NumAlignedDPRCS2Regs); + emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, + &isARMArea2Register, 0); + emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, + &isARMArea1Register, 0); + + return true; +} + +// FIXME: Make generic? +static unsigned GetFunctionSizeInBytes(const MachineFunction &MF, + const ARMBaseInstrInfo &TII) { + unsigned FnSize = 0; + for (auto &MBB : MF) { + for (auto &MI : MBB) + FnSize += TII.GetInstSizeInBytes(&MI); + } + return FnSize; +} + +/// estimateRSStackSizeLimit - Look at each instruction that references stack +/// frames and return the stack size limit beyond which some of these +/// instructions will require a scratch register during their expansion later. +// FIXME: Move to TII? +static unsigned estimateRSStackSizeLimit(MachineFunction &MF, + const TargetFrameLowering *TFI) { + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned Limit = (1 << 12) - 1; + for (auto &MBB : MF) { + for (auto &MI : MBB) { + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (!MI.getOperand(i).isFI()) + continue; + + // When using ADDri to get the address of a stack object, 255 is the + // largest offset guaranteed to fit in the immediate offset. + if (MI.getOpcode() == ARM::ADDri) { + Limit = std::min(Limit, (1U << 8) - 1); + break; + } + + // Otherwise check the addressing mode. + switch (MI.getDesc().TSFlags & ARMII::AddrModeMask) { + case ARMII::AddrMode3: + case ARMII::AddrModeT2_i8: + Limit = std::min(Limit, (1U << 8) - 1); + break; + case ARMII::AddrMode5: + case ARMII::AddrModeT2_i8s4: + Limit = std::min(Limit, ((1U << 8) - 1) * 4); + break; + case ARMII::AddrModeT2_i12: + // i12 supports only positive offset so these will be converted to + // i8 opcodes. See llvm::rewriteT2FrameIndex. + if (TFI->hasFP(MF) && AFI->hasStackFrame()) + Limit = std::min(Limit, (1U << 8) - 1); + break; + case ARMII::AddrMode4: + case ARMII::AddrMode6: + // Addressing modes 4 & 6 (load/store) instructions can't encode an + // immediate offset for stack references. + return 0; + default: + break; + } + break; // At most one FI per instruction + } + } + } + + return Limit; +} + +// In functions that realign the stack, it can be an advantage to spill the +// callee-saved vector registers after realigning the stack. The vst1 and vld1 +// instructions take alignment hints that can improve performance. 
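+// For instance, once the stack is known to be 16-byte aligned, four
+// d-registers can be saved with a single hinted store (a sketch of the
+// expected assembly):
+//   vst1.64 {d8, d9, d10, d11}, [r4:128]!
+// whereas the normal DPRCS area is written with vstmdb, which carries no
+// alignment hint.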
+//
+static void
+checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
+  MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(0);
+  if (!SpillAlignedNEONRegs)
+    return;
+
+  // Naked functions don't spill callee-saved registers.
+  if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+    return;
+
+  // We are planning to use NEON instructions vst1 / vld1.
+  if (!static_cast<const ARMSubtarget &>(MF.getSubtarget()).hasNEON())
+    return;
+
+  // Don't bother if the default stack alignment is sufficiently high.
+  if (MF.getSubtarget().getFrameLowering()->getStackAlignment() >= 8)
+    return;
+
+  // Aligned spills require stack realignment.
+  if (!static_cast<const ARMBaseRegisterInfo *>(
+          MF.getSubtarget().getRegisterInfo())->canRealignStack(MF))
+    return;
+
+  // We always spill contiguous d-registers starting from d8. Count how many
+  // need spilling. The register allocator will almost always use the
+  // callee-saved registers in order, but it can happen that there are holes in
+  // the range. Registers above the hole will be spilled to the standard DPRCS
+  // area.
+  unsigned NumSpills = 0;
+  for (; NumSpills < 8; ++NumSpills)
+    if (!SavedRegs.test(ARM::D8 + NumSpills))
+      break;
+
+  // Don't do this for just one d-register. It's not worth it.
+  if (NumSpills < 2)
+    return;
+
+  // Spill the first NumSpills D-registers after realigning the stack.
+  MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(NumSpills);
+
+  // A scratch register is required for the vst1 / vld1 instructions.
+  SavedRegs.set(ARM::R4);
+}
+
+void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                            BitVector &SavedRegs,
+                                            RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+  // This tells PEI to spill the FP as if it is any other callee-save register
+  // to take advantage of the eliminateFrameIndex machinery. This also ensures
+  // it is spilled in the order specified by getCalleeSavedRegs() to make it
+  // easier to combine multiple loads / stores.
+  bool CanEliminateFrame = true;
+  bool CS1Spilled = false;
+  bool LRSpilled = false;
+  unsigned NumGPRSpills = 0;
+  SmallVector<unsigned, 4> UnspilledCS1GPRs;
+  SmallVector<unsigned, 4> UnspilledCS2GPRs;
+  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  // Spill R4 if a Thumb2 function requires stack realignment - it will be used
+  // as a scratch register. Also spill R4 if the function has varsized objects,
+  // since it's not always possible to restore sp from fp in a single
+  // instruction.
+  // FIXME: It will be better just to find spare register here.
+  if (AFI->isThumb2Function() &&
+      (MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
+    SavedRegs.set(ARM::R4);
+
+  if (AFI->isThumb1OnlyFunction()) {
+    // Spill LR if Thumb1 function uses variable length argument lists.
+    if (AFI->getArgRegsSaveSize() > 0)
+      SavedRegs.set(ARM::LR);
+
+    // Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know
+    // for sure what the stack size will be, but for this, an estimate is good 
enough. If anything changes it, it'll be a spill, which implies
+    // we've used all the registers and so R4 is already used, so not marking
+    // it here will be OK.
+    // FIXME: It will be better just to find spare register here.
+    unsigned StackSize = MFI->estimateStackSize(MF);
+    if (MFI->hasVarSizedObjects() || StackSize > 508)
+      SavedRegs.set(ARM::R4);
+  }
+
+  // See if we can spill vector registers to aligned stack.
+  checkNumAlignedDPRCS2Regs(MF, SavedRegs);
+
+  // Spill the BasePtr if it's used.
+  if (RegInfo->hasBasePointer(MF))
+    SavedRegs.set(RegInfo->getBaseRegister());
+
+  // Don't spill FP if the frame can be eliminated. This is determined
+  // by scanning the callee-save registers to see if any is modified.
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    unsigned Reg = CSRegs[i];
+    bool Spilled = false;
+    if (SavedRegs.test(Reg)) {
+      Spilled = true;
+      CanEliminateFrame = false;
+    }
+
+    if (!ARM::GPRRegClass.contains(Reg))
+      continue;
+
+    if (Spilled) {
+      NumGPRSpills++;
+
+      if (!STI.isTargetDarwin()) {
+        if (Reg == ARM::LR)
+          LRSpilled = true;
+        CS1Spilled = true;
+        continue;
+      }
+
+      // Keep track of whether LR and any of R4, R5, R6, and R7 are spilled.
+      switch (Reg) {
+      case ARM::LR:
+        LRSpilled = true;
+        // Fallthrough
+      case ARM::R0: case ARM::R1:
+      case ARM::R2: case ARM::R3:
+      case ARM::R4: case ARM::R5:
+      case ARM::R6: case ARM::R7:
+        CS1Spilled = true;
+        break;
+      default:
+        break;
+      }
+    } else {
+      if (!STI.isTargetDarwin()) {
+        UnspilledCS1GPRs.push_back(Reg);
+        continue;
+      }
+
+      switch (Reg) {
+      case ARM::R0: case ARM::R1:
+      case ARM::R2: case ARM::R3:
+      case ARM::R4: case ARM::R5:
+      case ARM::R6: case ARM::R7:
+      case ARM::LR:
+        UnspilledCS1GPRs.push_back(Reg);
+        break;
+      default:
+        UnspilledCS2GPRs.push_back(Reg);
+        break;
+      }
+    }
+  }
+
+  bool ForceLRSpill = false;
+  if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
+    unsigned FnSize = GetFunctionSizeInBytes(MF, TII);
+    // Force LR to be spilled if the Thumb function size is > 2048. This
+    // enables the use of BL to implement a far jump. If it turns out that it's
+    // not needed then the branch fix-up path will undo it.
+    if (FnSize >= (1 << 11)) {
+      CanEliminateFrame = false;
+      ForceLRSpill = true;
+    }
+  }
+
+  // If any of the stack slot references may be out of range of an immediate
+  // offset, make sure a register (or a spill slot) is available for the
+  // register scavenger. Note that if we're indexing off the frame pointer, the
+  // effective stack size is 4 bytes larger since the FP points to the stack
+  // slot of the previous FP. Also, if we have variable sized objects in the
+  // function, stack slot references will often be negative, and some of
+  // our instructions are positive-offset only, so conservatively consider
+  // that case to want a spill slot (or register) as well. Similarly, if
+  // the function adjusts the stack pointer during execution and the
+  // adjustments aren't already part of our stack size estimate, our offset
+  // calculations may be off, so be conservative.
+  // FIXME: We could add logic to be more precise about negative offsets
+  // and which instructions will need a scratch register for them. Is it
+  // worth the effort and added fragility?
+  bool BigStack = (RS && (MFI->estimateStackSize(MF) +
+                              ((hasFP(MF) && AFI->hasStackFrame()) ? 
4 : 0) >=
+                          estimateRSStackSizeLimit(MF, this))) ||
+                  MFI->hasVarSizedObjects() ||
+                  (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF));
+
+  bool ExtraCSSpill = false;
+  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
+    AFI->setHasStackFrame(true);
+
+    // If LR is not spilled, but at least one of R4, R5, R6, and R7 is, spill
+    // LR as well so we can fold BX_RET into the register restore (LDM).
+    if (!LRSpilled && CS1Spilled) {
+      SavedRegs.set(ARM::LR);
+      NumGPRSpills++;
+      SmallVectorImpl<unsigned>::iterator LRPos;
+      LRPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(),
+                        (unsigned)ARM::LR);
+      if (LRPos != UnspilledCS1GPRs.end())
+        UnspilledCS1GPRs.erase(LRPos);
+
+      ForceLRSpill = false;
+      ExtraCSSpill = true;
+    }
+
+    if (hasFP(MF)) {
+      SavedRegs.set(FramePtr);
+      auto FPPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(),
+                             FramePtr);
+      if (FPPos != UnspilledCS1GPRs.end())
+        UnspilledCS1GPRs.erase(FPPos);
+      NumGPRSpills++;
+    }
+
+    // If stack and double are 8-byte aligned and we are spilling an odd number
+    // of GPRs, spill one extra callee save GPR so we won't have to pad between
+    // the integer and double callee save areas.
+    unsigned TargetAlign = getStackAlignment();
+    if (TargetAlign >= 8 && (NumGPRSpills & 1)) {
+      if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
+        for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
+          unsigned Reg = UnspilledCS1GPRs[i];
+          // Don't spill high registers if the function is Thumb. In the case
+          // of Windows on ARM, accept R11 (the frame pointer).
+          if (!AFI->isThumbFunction() ||
+              (STI.isTargetWindows() && Reg == ARM::R11) ||
+              isARMLowRegister(Reg) || Reg == ARM::LR) {
+            SavedRegs.set(Reg);
+            if (!MRI.isReserved(Reg))
+              ExtraCSSpill = true;
+            break;
+          }
+        }
+      } else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) {
+        unsigned Reg = UnspilledCS2GPRs.front();
+        SavedRegs.set(Reg);
+        if (!MRI.isReserved(Reg))
+          ExtraCSSpill = true;
+      }
+    }
+
+    // Estimate if we might need to scavenge a register at some point in order
+    // to materialize a stack offset. If so, either spill one additional
+    // callee-saved register or reserve a special spill slot to facilitate
+    // register scavenging. Thumb1 needs a spill slot for stack pointer
+    // adjustments also, even when the frame itself is small.
+    if (BigStack && !ExtraCSSpill) {
+      // If any non-reserved CS register isn't spilled, just spill one or two
+      // extra. That should take care of it!
+      unsigned NumExtras = TargetAlign / 4;
+      SmallVector<unsigned, 2> Extras;
+      while (NumExtras && !UnspilledCS1GPRs.empty()) {
+        unsigned Reg = UnspilledCS1GPRs.back();
+        UnspilledCS1GPRs.pop_back();
+        if (!MRI.isReserved(Reg) &&
+            (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) ||
+             Reg == ARM::LR)) {
+          Extras.push_back(Reg);
+          NumExtras--;
+        }
+      }
+      // For non-Thumb1 functions, also check for hi-reg CS registers
+      if (!AFI->isThumb1OnlyFunction()) {
+        while (NumExtras && !UnspilledCS2GPRs.empty()) {
+          unsigned Reg = UnspilledCS2GPRs.back();
+          UnspilledCS2GPRs.pop_back();
+          if (!MRI.isReserved(Reg)) {
+            Extras.push_back(Reg);
+            NumExtras--;
+          }
+        }
+      }
+      if (Extras.size() && NumExtras == 0) {
+        for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
+          SavedRegs.set(Extras[i]);
+        }
+      } else if (!AFI->isThumb1OnlyFunction()) {
+        // note: Thumb1 functions spill to R12, not the stack. Reserve a slot
+        // closest to SP or frame pointer. 
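+        // A sketch of why the emergency slot matters: rewriting a frame index
+        // whose offset does not fit the addressing mode needs a temporary,
+        // e.g. (rT here is whatever register the scavenger provides)
+        //   ldr r0, [sp, #4100]   ; offset exceeds the imm12 range
+        // must become something like
+        //   mov rT, #4100
+        //   add rT, sp, rT
+        //   ldr r0, [rT]
+        // and if no register is free, the scavenger spills one through the
+        // slot reserved here.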
+        const TargetRegisterClass *RC = &ARM::GPRRegClass;
+        RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                           RC->getAlignment(),
+                                                           false));
+      }
+    }
+  }
+
+  if (ForceLRSpill) {
+    SavedRegs.set(ARM::LR);
+    AFI->setLRIsSpilledForFarJump(true);
+  }
+}
+
+
+void ARMFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  if (!hasReservedCallFrame(MF)) {
+    // If we have alloca, convert as follows:
+    // ADJCALLSTACKDOWN -> sub sp, sp, amount
+    // ADJCALLSTACKUP   -> add sp, sp, amount
+    MachineInstr *Old = I;
+    DebugLoc dl = Old->getDebugLoc();
+    unsigned Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly. To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      Amount = alignSPAdjust(Amount);
+
+      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+      assert(!AFI->isThumb1OnlyFunction() &&
+             "This eliminateCallFramePseudoInstr does not support Thumb1!");
+      bool isARM = !AFI->isThumbFunction();
+
+      // Replace the pseudo instruction with a new instruction...
+      unsigned Opc = Old->getOpcode();
+      int PIdx = Old->findFirstPredOperandIdx();
+      ARMCC::CondCodes Pred = (PIdx == -1)
+        ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(PIdx).getImm();
+      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+        // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
+        unsigned PredReg = Old->getOperand(2).getReg();
+        emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags,
+                     Pred, PredReg);
+      } else {
+        // Note: PredReg is operand 3 for ADJCALLSTACKUP.
+        unsigned PredReg = Old->getOperand(3).getReg();
+        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+        emitSPUpdate(isARM, MBB, I, dl, TII, Amount, MachineInstr::NoFlags,
+                     Pred, PredReg);
+      }
+    }
+  }
+  MBB.erase(I);
+}
+
+/// Get the minimum constant for ARM that is greater than or equal to the
+/// argument. In ARM, constants can have any value that can be produced by
+/// rotating an 8-bit value to the right by an even number of bits within a
+/// 32-bit word.
+static uint32_t alignToARMConstant(uint32_t Value) {
+  unsigned Shifted = 0;
+
+  if (Value == 0)
+    return 0;
+
+  while (!(Value & 0xC0000000)) {
+    Value = Value << 2;
+    Shifted += 2;
+  }
+
+  bool Carry = (Value & 0x00FFFFFF);
+  Value = ((Value & 0xFF000000) >> 24) + Carry;
+
+  if (Value & 0x0000100)
+    Value = Value & 0x000001FC;
+
+  if (Shifted > 24)
+    Value = Value >> (Shifted - 24);
+  else
+    Value = Value << (24 - Shifted);
+
+  return Value;
+}
+
+// The stack limit in the TCB is set to this many bytes above the actual
+// stack limit.
+static const uint64_t kSplitStackAvailable = 256;
+
+// Adjust the function prologue to enable split stacks. This currently only
+// supports android and linux.
+//
+// The ABI of the segmented stack prologue is somewhat arbitrary, but it
+// must be well defined in order to allow for consistent implementations of the
+// __morestack helper function. The ABI is also not a normal ABI in that it
+// doesn't follow the normal calling conventions because this allows the
+// prologue of each function to be optimized further.
+//
+// Currently, the ABI looks like (when calling __morestack)
+//
+//  * r4 holds the minimum stack size requested for this function call
+//  * r5 holds the stack size of the arguments to the function
+//  * the beginning of the function is 3 instructions after the call to
+//    __morestack
+//
+// Implementations of __morestack should use r4 to allocate a new stack, r5 to
+// place the arguments onto the new stack, and the 3-instruction knowledge to
+// jump directly to the body of the function when working on the new stack.
+//
+// An old (and possibly no longer compatible) implementation of __morestack for
+// ARM can be found at [1].
+//
+// [1] - https://github.com/mozilla/rust/blob/86efd9/src/rt/arch/arm/morestack.S
+void ARMFrameLowering::adjustForSegmentedStacks(
+    MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
+  unsigned Opcode;
+  unsigned CFIIndex;
+  const ARMSubtarget *ST = &MF.getSubtarget<ARMSubtarget>();
+  bool Thumb = ST->isThumb();
+
+  // Sadly, this currently doesn't support varargs or platforms other than
+  // android/linux. Note that thumb1/thumb2 are supported on android/linux.
+  if (MF.getFunction()->isVarArg())
+    report_fatal_error("Segmented stacks do not support vararg functions.");
+  if (!ST->isTargetAndroid() && !ST->isTargetLinux())
+    report_fatal_error("Segmented stacks not supported on this platform.");
+
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  MCContext &Context = MMI.getContext();
+  const MCRegisterInfo *MRI = Context.getRegisterInfo();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  ARMFunctionInfo *ARMFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL;
+
+  uint64_t StackSize = MFI->getStackSize();
+
+  // Do not generate a prologue for functions with a stack of size zero.
+  if (StackSize == 0)
+    return;
+
+  // Use R4 and R5 as scratch registers.
+  // We save R4 and R5 before use and restore them before leaving the function.
+  unsigned ScratchReg0 = ARM::R4;
+  unsigned ScratchReg1 = ARM::R5;
+  uint64_t AlignedStackSize;
+
+  MachineBasicBlock *PrevStackMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *PostStackMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *AllocMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *GetMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *McrMBB = MF.CreateMachineBasicBlock();
+
+  // Grab everything that reaches PrologueMBB to update their liveness as well.
+  SmallPtrSet<MachineBasicBlock *, 8> BeforePrologueRegion;
+  SmallVector<MachineBasicBlock *, 2> WalkList;
+  WalkList.push_back(&PrologueMBB);
+
+  do {
+    MachineBasicBlock *CurMBB = WalkList.pop_back_val();
+    for (MachineBasicBlock *PredBB : CurMBB->predecessors()) {
+      if (BeforePrologueRegion.insert(PredBB).second)
+        WalkList.push_back(PredBB);
+    }
+  } while (!WalkList.empty());
+
+  // The order in that list is important.
+  // The blocks will all be inserted before PrologueMBB using that order.
+  // Therefore the block that should appear first in the CFG should appear
+  // first in the list.
+  MachineBasicBlock *AddedBlocks[] = {PrevStackMBB, McrMBB, GetMBB, AllocMBB,
+                                      PostStackMBB};
+
+  for (MachineBasicBlock *B : AddedBlocks)
+    BeforePrologueRegion.insert(B);
+
+  for (const auto &LI : PrologueMBB.liveins()) {
+    for (MachineBasicBlock *PredBB : BeforePrologueRegion)
+      PredBB->addLiveIn(LI);
+  }
+
+  // Remove the newly added blocks from the list, since we know
+  // we do not have to do the following updates for them.
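+  // (For orientation, a sketch of what each added block will contain:
+  //    PrevStackMBB: push {r4, r5} to free up the scratch registers,
+  //    McrMBB:       read the TLS base and compute SP minus the frame size,
+  //    GetMBB:       load the cached stack limit and compare the two,
+  //    AllocMBB:     call __morestack and return,
+  //    PostStackMBB: pop {r4, r5} before falling through to the prologue.)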
+ for (MachineBasicBlock *B : AddedBlocks) { + BeforePrologueRegion.erase(B); + MF.insert(PrologueMBB.getIterator(), B); + } + + for (MachineBasicBlock *MBB : BeforePrologueRegion) { + // Make sure the LiveIns are still sorted and unique. + MBB->sortUniqueLiveIns(); + // Replace the edges to PrologueMBB by edges to the sequences + // we are about to add. + MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]); + } + + // The required stack size that is aligned to ARM constant criterion. + AlignedStackSize = alignToARMConstant(StackSize); + + // When the frame size is less than 256 we just compare the stack + // boundary directly to the value of the stack pointer, per gcc. + bool CompareStackPointer = AlignedStackSize < kSplitStackAvailable; + + // We will use two of the callee save registers as scratch registers so we + // need to save those registers onto the stack. + // We will use SR0 to hold stack limit and SR1 to hold the stack size + // requested and arguments for __morestack(). + // SR0: Scratch Register #0 + // SR1: Scratch Register #1 + // push {SR0, SR1} + if (Thumb) { + AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::tPUSH))) + .addReg(ScratchReg0).addReg(ScratchReg1); + } else { + AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::STMDB_UPD)) + .addReg(ARM::SP, RegState::Define).addReg(ARM::SP)) + .addReg(ScratchReg0).addReg(ScratchReg1); + } + + // Emit the relevant DWARF information about the change in stack pointer as + // well as where to find both r4 and r5 (the callee-save registers) + CFIIndex = + MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -8)); + BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(ScratchReg1, true), -4)); + BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(ScratchReg0, true), -8)); + BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + // mov SR1, sp + if (Thumb) { + AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), ScratchReg1) + .addReg(ARM::SP)); + } else if (CompareStackPointer) { + AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MOVr), ScratchReg1) + .addReg(ARM::SP)).addReg(0); + } + + // sub SR1, sp, #StackSize + if (!CompareStackPointer && Thumb) { + AddDefaultPred( + AddDefaultCC(BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1)) + .addReg(ScratchReg1).addImm(AlignedStackSize)); + } else if (!CompareStackPointer) { + AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1) + .addReg(ARM::SP).addImm(AlignedStackSize)).addReg(0); + } + + if (Thumb && ST->isThumb1Only()) { + unsigned PCLabelId = ARMFI->createPICLabelUId(); + ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create( + MF.getFunction()->getContext(), "__STACK_LIMIT", PCLabelId, 0); + MachineConstantPool *MCP = MF.getConstantPool(); + unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4); + + // ldr SR0, [pc, offset(STACK_LIMIT)] + AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0) + .addConstantPoolIndex(CPI)); + + // ldr SR0, [SR0] + AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRi), ScratchReg0) + .addReg(ScratchReg0).addImm(0)); + } else { + // Get TLS base address from the coprocessor + // mrc p15, #0, SR0, c13, c0, #3 + AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MRC), ScratchReg0) + 
.addImm(15)
+                     .addImm(0)
+                     .addImm(13)
+                     .addImm(0)
+                     .addImm(3));
+
+    // Use the last tls slot on android and a private field of the TCB on
+    // linux.
+    assert(ST->isTargetAndroid() || ST->isTargetLinux());
+    unsigned TlsOffset = ST->isTargetAndroid() ? 63 : 1;
+
+    // Get the stack limit from the right offset
+    // ldr SR0, [sr0, #4 * TlsOffset]
+    AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ScratchReg0)
+                   .addReg(ScratchReg0).addImm(4 * TlsOffset));
+  }
+
+  // Compare stack limit with stack size requested.
+  // cmp SR0, SR1
+  Opcode = Thumb ? ARM::tCMPr : ARM::CMPrr;
+  AddDefaultPred(BuildMI(GetMBB, DL, TII.get(Opcode))
+                 .addReg(ScratchReg0)
+                 .addReg(ScratchReg1));
+
+  // This jump is taken if StackLimit < SP - stack required.
+  Opcode = Thumb ? ARM::tBcc : ARM::Bcc;
+  BuildMI(GetMBB, DL, TII.get(Opcode)).addMBB(PostStackMBB)
+       .addImm(ARMCC::LO)
+       .addReg(ARM::CPSR);
+
+
+  // Calling __morestack(StackSize, Size of stack arguments).
+  // __morestack knows that the stack size requested is in SR0 (r4)
+  // and the size of the stack arguments is in SR1 (r5).
+
+  // Pass the first argument to __morestack in Scratch Register #0:
+  //   the amount of stack required.
+  if (Thumb) {
+    AddDefaultPred(AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8),
+                                        ScratchReg0)).addImm(AlignedStackSize));
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0)
+                   .addImm(AlignedStackSize)).addReg(0);
+  }
+  // Pass the second argument to __morestack in Scratch Register #1:
+  //   the amount of stack used to save the function arguments.
+  if (Thumb) {
+    AddDefaultPred(
+        AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1))
+            .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())));
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1)
+                   .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())))
+                   .addReg(0);
+  }
+
+  // push {lr} - Save return address of this function.
+  if (Thumb) {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPUSH)))
+        .addReg(ARM::LR);
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::STMDB_UPD))
+                   .addReg(ARM::SP, RegState::Define)
+                   .addReg(ARM::SP))
+        .addReg(ARM::LR);
+  }
+
+  // Emit the DWARF info about the change in stack as well as where to find the
+  // previous link register.
+  CFIIndex =
+      MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -12));
+  BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+      nullptr, MRI->getDwarfRegNum(ARM::LR, true), -12));
+  BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // Call __morestack().
+  if (Thumb) {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tBL)))
+        .addExternalSymbol("__morestack");
+  } else {
+    BuildMI(AllocMBB, DL, TII.get(ARM::BL))
+        .addExternalSymbol("__morestack");
+  }
+
+  // pop {lr} - Restore return address of this original function.
+  if (Thumb) {
+    if (ST->isThumb1Only()) {
+      AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
+          .addReg(ScratchReg0);
+      AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::LR)
+                     .addReg(ScratchReg0));
+    } else {
+      AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::t2LDR_POST))
+                     .addReg(ARM::LR, RegState::Define)
+                     .addReg(ARM::SP, RegState::Define)
+                     .addReg(ARM::SP)
+                     .addImm(4));
+    }
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+                   .addReg(ARM::SP, RegState::Define)
+                   .addReg(ARM::SP))
+        .addReg(ARM::LR);
+  }
+
+  // Restore SR0 and SR1 in case __morestack() was called.
+  // __morestack() will skip the PostStackMBB block so we need to restore
+  // the scratch registers from here.
+  // pop {SR0, SR1}
+  if (Thumb) {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
+        .addReg(ScratchReg0)
+        .addReg(ScratchReg1);
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+                   .addReg(ARM::SP, RegState::Define)
+                   .addReg(ARM::SP))
+        .addReg(ScratchReg0)
+        .addReg(ScratchReg1);
+  }
+
+  // Update the CFA offset now that we've popped.
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+  BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // bx lr - Return from this function.
+  Opcode = Thumb ? ARM::tBX_RET : ARM::BX_RET;
+  AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(Opcode)));
+
+  // Restore SR0 and SR1 in case __morestack() was not called.
+  // pop {SR0, SR1}
+  if (Thumb) {
+    AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::tPOP)))
+        .addReg(ScratchReg0)
+        .addReg(ScratchReg1);
+  } else {
+    AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::LDMIA_UPD))
+                   .addReg(ARM::SP, RegState::Define)
+                   .addReg(ARM::SP))
+        .addReg(ScratchReg0)
+        .addReg(ScratchReg1);
+  }
+
+  // Update the CFA offset now that we've popped.
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+  BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // Tell debuggers that r4 and r5 are now the same as they were in the
+  // previous function, that they're the "Same Value".
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createSameValue(
+      nullptr, MRI->getDwarfRegNum(ScratchReg0, true)));
+  BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+  CFIIndex = MMI.addFrameInst(MCCFIInstruction::createSameValue(
+      nullptr, MRI->getDwarfRegNum(ScratchReg1, true)));
+  BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // Organizing MBB lists
+  PostStackMBB->addSuccessor(&PrologueMBB);
+
+  AllocMBB->addSuccessor(PostStackMBB);
+
+  GetMBB->addSuccessor(PostStackMBB);
+  GetMBB->addSuccessor(AllocMBB);
+
+  McrMBB->addSuccessor(GetMBB);
+
+  PrevStackMBB->addSuccessor(McrMBB);
+
+#ifdef XDEBUG
+  MF.verify();
+#endif
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
new file mode 100644
index 0000000..66f4dfb
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -0,0 +1,85 @@
+//==-- ARMTargetFrameLowering.h - Define frame lowering for ARM --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H +#define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H + +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + class ARMSubtarget; + +class ARMFrameLowering : public TargetFrameLowering { +protected: + const ARMSubtarget &STI; + +public: + explicit ARMFrameLowering(const ARMSubtarget &sti); + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const override; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const override; + + bool noFramePointerElim(const MachineFunction &MF) const override; + + bool hasFP(const MachineFunction &MF) const override; + bool hasReservedCallFrame(const MachineFunction &MF) const override; + bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + int ResolveFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg, int SPAdj) const; + + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; + + void adjustForSegmentedStacks(MachineFunction &MF, + MachineBasicBlock &MBB) const override; + + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override { + return true; + } + + private: + void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc, + unsigned StrOpc, bool NoGap, + bool(*Func)(unsigned, bool), unsigned NumAlignedDPRCS2Regs, + unsigned MIFlags = 0) const; + void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc, + unsigned LdrOpc, bool isVarArg, bool NoGap, + bool(*Func)(unsigned, bool), + unsigned NumAlignedDPRCS2Regs) const; + + void + eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp new file mode 100644 index 0000000..0157c0a --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -0,0 +1,102 @@ +//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "ARMHazardRecognizer.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Target/TargetRegisterInfo.h" +using namespace llvm; + +static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI, + const TargetRegisterInfo &TRI) { + // FIXME: Detect integer instructions properly. + const MCInstrDesc &MCID = MI->getDesc(); + unsigned Domain = MCID.TSFlags & ARMII::DomainMask; + if (MI->mayStore()) + return false; + unsigned Opcode = MCID.getOpcode(); + if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) + return false; + if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON)) + return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI); + return false; +} + +ScheduleHazardRecognizer::HazardType +ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { + assert(Stalls == 0 && "ARM hazards don't support scoreboard lookahead"); + + MachineInstr *MI = SU->getInstr(); + + if (!MI->isDebugValue()) { + // Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following + // a VMLA / VMLS will cause 4 cycle stall. + const MCInstrDesc &MCID = MI->getDesc(); + if (LastMI && (MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) { + MachineInstr *DefMI = LastMI; + const MCInstrDesc &LastMCID = LastMI->getDesc(); + const MachineFunction *MF = MI->getParent()->getParent(); + const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>( + MF->getSubtarget().getInstrInfo()); + + // Skip over one non-VFP / NEON instruction. + if (!LastMI->isBarrier() && + // On A9, AGU and NEON/FPU are muxed. + !(TII.getSubtarget().isLikeA9() && LastMI->mayLoadOrStore()) && + (LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) { + MachineBasicBlock::iterator I = LastMI; + if (I != LastMI->getParent()->begin()) { + I = std::prev(I); + DefMI = &*I; + } + } + + if (TII.isFpMLxInstruction(DefMI->getOpcode()) && + (TII.canCauseFpMLxStall(MI->getOpcode()) || + hasRAWHazard(DefMI, MI, TII.getRegisterInfo()))) { + // Try to schedule another instruction for the next 4 cycles. + if (FpMLxStalls == 0) + FpMLxStalls = 4; + return Hazard; + } + } + } + + return ScoreboardHazardRecognizer::getHazardType(SU, Stalls); +} + +void ARMHazardRecognizer::Reset() { + LastMI = nullptr; + FpMLxStalls = 0; + ScoreboardHazardRecognizer::Reset(); +} + +void ARMHazardRecognizer::EmitInstruction(SUnit *SU) { + MachineInstr *MI = SU->getInstr(); + if (!MI->isDebugValue()) { + LastMI = MI; + FpMLxStalls = 0; + } + + ScoreboardHazardRecognizer::EmitInstruction(SU); +} + +void ARMHazardRecognizer::AdvanceCycle() { + if (FpMLxStalls && --FpMLxStalls == 0) + // Stalled for 4 cycles but still can't schedule any other instructions. + LastMI = nullptr; + ScoreboardHazardRecognizer::AdvanceCycle(); +} + +void ARMHazardRecognizer::RecedeCycle() { + llvm_unreachable("reverse ARM hazard checking unsupported"); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h new file mode 100644 index 0000000..ccf09db --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h @@ -0,0 +1,49 @@ +//===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines hazard recognizers for scheduling ARM functions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMHAZARDRECOGNIZER_H +#define LLVM_LIB_TARGET_ARM_ARMHAZARDRECOGNIZER_H + +#include "llvm/CodeGen/ScoreboardHazardRecognizer.h" + +namespace llvm { + +class ARMBaseInstrInfo; +class ARMBaseRegisterInfo; +class ARMSubtarget; +class MachineInstr; + +/// ARMHazardRecognizer handles special constraints that are not expressed in +/// the scheduling itinerary. This is only used during postRA scheduling. The +/// ARM preRA scheduler uses an unspecialized instance of the +/// ScoreboardHazardRecognizer. +class ARMHazardRecognizer : public ScoreboardHazardRecognizer { + MachineInstr *LastMI; + unsigned FpMLxStalls; + +public: + ARMHazardRecognizer(const InstrItineraryData *ItinData, + const ScheduleDAG *DAG) + : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"), + LastMI(nullptr) {} + + HazardType getHazardType(SUnit *SU, int Stalls) override; + void Reset() override; + void EmitInstruction(SUnit *SU) override; + void AdvanceCycle() override; + void RecedeCycle() override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp new file mode 100644 index 0000000..0242440 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -0,0 +1,3954 @@ +//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the ARM target. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMTargetMachine.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-isel" + +static cl::opt<bool> +DisableShifterOp("disable-shifter-op", cl::Hidden, + cl::desc("Disable isel of shifter-op"), + cl::init(false)); + +static cl::opt<bool> +CheckVMLxHazard("check-vmlx-hazard", cl::Hidden, + cl::desc("Check fp vmla / vmls hazard at isel time"), + cl::init(true)); + +//===--------------------------------------------------------------------===// +/// ARMDAGToDAGISel - ARM specific code to select ARM machine +/// instructions for SelectionDAG operations. 
+/// +namespace { + +enum AddrMode2Type { + AM2_BASE, // Simple AM2 (+-imm12) + AM2_SHOP // Shifter-op AM2 +}; + +class ARMDAGToDAGISel : public SelectionDAGISel { + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. + const ARMSubtarget *Subtarget; + +public: + explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + // Reset the subtarget each time through. + Subtarget = &MF.getSubtarget<ARMSubtarget>(); + SelectionDAGISel::runOnMachineFunction(MF); + return true; + } + + const char *getPassName() const override { + return "ARM Instruction Selection"; + } + + void PreprocessISelDAG() override; + + /// getI32Imm - Return a target constant of type i32 with the specified + /// value. + inline SDValue getI32Imm(unsigned Imm, SDLoc dl) { + return CurDAG->getTargetConstant(Imm, dl, MVT::i32); + } + + SDNode *Select(SDNode *N) override; + + + bool hasNoVMLxHazardUse(SDNode *N) const; + bool isShifterOpProfitable(const SDValue &Shift, + ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt); + bool SelectRegShifterOperand(SDValue N, SDValue &A, + SDValue &B, SDValue &C, + bool CheckProfitability = true); + bool SelectImmShifterOperand(SDValue N, SDValue &A, + SDValue &B, bool CheckProfitability = true); + bool SelectShiftRegShifterOperand(SDValue N, SDValue &A, + SDValue &B, SDValue &C) { + // Don't apply the profitability check + return SelectRegShifterOperand(N, A, B, C, false); + } + bool SelectShiftImmShifterOperand(SDValue N, SDValue &A, + SDValue &B) { + // Don't apply the profitability check + return SelectImmShifterOperand(N, A, B, false); + } + + bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); + bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc); + + AddrMode2Type SelectAddrMode2Worker(SDValue N, SDValue &Base, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode2Base(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &Opc) { + return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_BASE; + } + + bool SelectAddrMode2ShOp(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &Opc) { + return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_SHOP; + } + + bool SelectAddrMode2(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &Opc) { + SelectAddrMode2Worker(N, Base, Offset, Opc); +// return SelectAddrMode2ShOp(N, Base, Offset, Opc); + // This always matches one way or another. 
+ return true; + } + + bool SelectCMOVPred(SDValue N, SDValue &Pred, SDValue &Reg) { + const ConstantSDNode *CN = cast<ConstantSDNode>(N); + Pred = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(N), MVT::i32); + Reg = CurDAG->getRegister(ARM::CPSR, MVT::i32); + return true; + } + + bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc); + bool SelectAddrOffsetNone(SDValue N, SDValue &Base); + bool SelectAddrMode3(SDValue N, SDValue &Base, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode3Offset(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode5(SDValue N, SDValue &Base, + SDValue &Offset); + bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align); + bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset); + + bool SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label); + + // Thumb Addressing Modes: + bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset); + bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base, + SDValue &OffImm); + bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm); + + // Thumb 2 Addressing Modes: + bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); + bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, + SDValue &OffImm); + bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base, + SDValue &OffReg, SDValue &ShImm); + bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm); + + inline bool is_so_imm(unsigned Imm) const { + return ARM_AM::getSOImmVal(Imm) != -1; + } + + inline bool is_so_imm_not(unsigned Imm) const { + return ARM_AM::getSOImmVal(~Imm) != -1; + } + + inline bool is_t2_so_imm(unsigned Imm) const { + return ARM_AM::getT2SOImmVal(Imm) != -1; + } + + inline bool is_t2_so_imm_not(unsigned Imm) const { + return ARM_AM::getT2SOImmVal(~Imm) != -1; + } + + // Include the pieces autogenerated from the target description. +#include "ARMGenDAGISel.inc" + +private: + /// SelectARMIndexedLoad - Indexed (pre/post inc/dec) load matching code for + /// ARM. + SDNode *SelectARMIndexedLoad(SDNode *N); + SDNode *SelectT2IndexedLoad(SDNode *N); + + /// SelectVLD - Select NEON load intrinsics. NumVecs should be + /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for + /// loads of D registers and even subregs and odd subregs of Q registers. + /// For NumVecs <= 2, QOpcodes1 is not used. + SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, const uint16_t *QOpcodes1); + + /// SelectVST - Select NEON store intrinsics. NumVecs should + /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for + /// stores of D registers and even subregs and odd subregs of Q registers. + /// For NumVecs <= 2, QOpcodes1 is not used. 
+ SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, const uint16_t *QOpcodes1); + + /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should + /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// load/store of D registers and Q registers. + SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, + bool isUpdating, unsigned NumVecs, + const uint16_t *DOpcodes, const uint16_t *QOpcodes); + + /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs + /// should be 2, 3 or 4. The opcode array specifies the instructions used + /// for loading D registers. (Q registers are not supported.) + SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *Opcodes); + + /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, + /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be + /// generated to force the table registers to be consecutive. + SDNode *SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc); + + /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM. + SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned); + + // Select special operations if node forms integer ABS pattern + SDNode *SelectABSOp(SDNode *N); + + SDNode *SelectReadRegister(SDNode *N); + SDNode *SelectWriteRegister(SDNode *N); + + SDNode *SelectInlineAsm(SDNode *N); + + SDNode *SelectConcatVector(SDNode *N); + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. + bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, + std::vector<SDValue> &OutOps) override; + + // Form pairs of consecutive R, S, D, or Q registers. + SDNode *createGPRPairNode(EVT VT, SDValue V0, SDValue V1); + SDNode *createSRegPairNode(EVT VT, SDValue V0, SDValue V1); + SDNode *createDRegPairNode(EVT VT, SDValue V0, SDValue V1); + SDNode *createQRegPairNode(EVT VT, SDValue V0, SDValue V1); + + // Form sequences of 4 consecutive S, D, or Q registers. + SDNode *createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); + SDNode *createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); + SDNode *createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); + + // Get the alignment operand for a NEON VLD or VST instruction. + SDValue GetVLDSTAlign(SDValue Align, SDLoc dl, unsigned NumVecs, + bool is64BitVector); + + /// Returns the number of instructions required to materialize the given + /// constant in a register, or 3 if a literal pool load is needed. + unsigned ConstantMaterializationCost(unsigned Val) const; + + /// Checks if N is a multiplication by a constant where we can extract out a + /// power of two from the constant so that it can be used in a shift, but only + /// if it simplifies the materialization of the constant. Returns true if it + /// is, and assigns to PowerOfTwo the power of two that should be extracted + /// out and to NewMulConst the new constant to be multiplied by. + bool canExtractShiftFromMul(const SDValue &N, unsigned MaxShift, + unsigned &PowerOfTwo, SDValue &NewMulConst) const; + + /// Replace N with M in CurDAG, in a way that also ensures that M gets + /// selected when N would have been selected. + void replaceDAGValue(const SDValue &N, SDValue M); +}; +} + +/// isInt32Immediate - This method tests to see if the node is a 32-bit constant +/// operand. If so Imm will receive the 32-bit value. 
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+    Imm = cast<ConstantSDNode>(N)->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+// isInt32Immediate - This method tests to see if the operand is a 32-bit
+// constant. If so, Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+  return isInt32Immediate(N.getNode(), Imm);
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has an immediate integer right operand.
+// If so, Imm will receive the 32-bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+  return N->getOpcode() == Opc &&
+         isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+/// \brief Check whether a particular node is a constant value representable as
+/// (N * Scale) where N is in the range [\p RangeMin, \p RangeMax).
+///
+/// \param ScaledConstant [out] - On success, the pre-scaled constant value.
+static bool isScaledConstantInRange(SDValue Node, int Scale,
+                                    int RangeMin, int RangeMax,
+                                    int &ScaledConstant) {
+  assert(Scale > 0 && "Invalid scale!");
+
+  // Check that this is a constant.
+  const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Node);
+  if (!C)
+    return false;
+
+  ScaledConstant = (int) C->getZExtValue();
+  if ((ScaledConstant % Scale) != 0)
+    return false;
+
+  ScaledConstant /= Scale;
+  return ScaledConstant >= RangeMin && ScaledConstant < RangeMax;
+}
+
+void ARMDAGToDAGISel::PreprocessISelDAG() {
+  if (!Subtarget->hasV6T2Ops())
+    return;
+
+  bool isThumb2 = Subtarget->isThumb();
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+       E = CurDAG->allnodes_end(); I != E; ) {
+    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
+
+    if (N->getOpcode() != ISD::ADD)
+      continue;
+
+    // Look for (add X1, (and (srl X2, c1), c2)) where c2 is constant with
+    // leading zeros, followed by consecutive set bits, followed by 1 or 2
+    // trailing zeros, e.g. 1020.
+    // Transform the expression to
+    // (add X1, (shl (and (srl X2, c1), (c2>>tz)), tz)) where tz is the number
+    // of trailing zeros of c2. The left shift would be folded as a shifter
+    // operand of 'add' and the 'and' and 'srl' would become a bits extraction
+    // node (UBFX).
+
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    unsigned And_imm = 0;
+    if (!isOpcWithIntImmediate(N1.getNode(), ISD::AND, And_imm)) {
+      if (isOpcWithIntImmediate(N0.getNode(), ISD::AND, And_imm))
+        std::swap(N0, N1);
+    }
+    if (!And_imm)
+      continue;
+
+    // Check if the AND mask is an immediate of the form: 000.....1111111100
+    unsigned TZ = countTrailingZeros(And_imm);
+    if (TZ != 1 && TZ != 2)
+      // Be conservative here. Shifter operands aren't always free, e.g. on
+      // Swift a left shifter operand of 1 or 2 is free, but others are not.
+      // e.g.
+      //   ubfx   r3, r1, #16, #8
+      //   ldr.w  r3, [r0, r3, lsl #2]
+      // vs.
+      //   mov.w  r9, #1020
+      //   and.w  r2, r9, r1, lsr #14
+      //   ldr    r2, [r0, r2]
+      continue;
+    And_imm >>= TZ;
+    if (And_imm & (And_imm + 1))
+      continue;
+
+    // Look for (and (srl X, c1), c2).
+    SDValue Srl = N1.getOperand(0);
+    unsigned Srl_imm = 0;
+    if (!isOpcWithIntImmediate(Srl.getNode(), ISD::SRL, Srl_imm) ||
+        (Srl_imm <= 2))
+      continue;
+
+    // Make sure first operand is not a shifter operand which would prevent
+    // folding of the left shift.
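+    // (An ARM data-processing instruction can only fold a single shifter
+    // operand, so if N0 already supplies one, the shl created below could not
+    // be folded into the add and the rewrite would be a pessimization.)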
+    SDValue CPTmp0;
+    SDValue CPTmp1;
+    SDValue CPTmp2;
+    if (isThumb2) {
+      if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1))
+        continue;
+    } else {
+      if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1) ||
+          SelectRegShifterOperand(N0, CPTmp0, CPTmp1, CPTmp2))
+        continue;
+    }
+
+    // Now make the transformation.
+    Srl = CurDAG->getNode(ISD::SRL, SDLoc(Srl), MVT::i32,
+                          Srl.getOperand(0),
+                          CurDAG->getConstant(Srl_imm + TZ, SDLoc(Srl),
+                                              MVT::i32));
+    N1 = CurDAG->getNode(ISD::AND, SDLoc(N1), MVT::i32,
+                         Srl,
+                         CurDAG->getConstant(And_imm, SDLoc(Srl), MVT::i32));
+    N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32,
+                         N1, CurDAG->getConstant(TZ, SDLoc(Srl), MVT::i32));
+    CurDAG->UpdateNodeOperands(N, N0, N1);
+  }
+}
+
+/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
+/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
+/// least on current ARM implementations) which should be avoided.
+bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
+  if (OptLevel == CodeGenOpt::None)
+    return true;
+
+  if (!CheckVMLxHazard)
+    return true;
+
+  if (!Subtarget->isCortexA7() && !Subtarget->isCortexA8() &&
+      !Subtarget->isCortexA9() && !Subtarget->isSwift())
+    return true;
+
+  if (!N->hasOneUse())
+    return false;
+
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() == ISD::CopyToReg)
+    return true;
+  if (Use->isMachineOpcode()) {
+    const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo *>(
+        CurDAG->getSubtarget().getInstrInfo());
+
+    const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode());
+    if (MCID.mayStore())
+      return true;
+    unsigned Opcode = MCID.getOpcode();
+    if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+      return true;
+    // vmlx feeding into another vmlx. We actually want to unfold
+    // the use later in the MLxExpansion pass. e.g.
+    // vmla
+    // vmla (stall 8 cycles)
+    //
+    // vmul (5 cycles)
+    // vadd (5 cycles)
+    // vmla
+    // This adds up to about 18 - 19 cycles.
+    //
+    // vmla
+    // vmul (stall 4 cycles)
+    // vadd
+    // This adds up to about 14 cycles.
+    return TII->isFpMLxInstruction(Opcode);
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
+                                            ARM_AM::ShiftOpc ShOpcVal,
+                                            unsigned ShAmt) {
+  if (!Subtarget->isLikeA9() && !Subtarget->isSwift())
+    return true;
+  if (Shift.hasOneUse())
+    return true;
+  // R << 2 is free.
+  return ShOpcVal == ARM_AM::lsl &&
+         (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1));
+}
+
+unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
+  if (Subtarget->isThumb()) {
+    if (Val <= 255) return 1;                               // MOV
+    if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
+    if (~Val <= 255) return 2;                              // MOV + MVN
+    if (ARM_AM::isThumbImmShiftedVal(Val)) return 2;        // MOV + LSL
+  } else {
+    if (ARM_AM::getSOImmVal(Val) != -1) return 1;           // MOV
+    if (ARM_AM::getSOImmVal(~Val) != -1) return 1;          // MVN
+    if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
+    if (ARM_AM::isSOImmTwoPartVal(Val)) return 2;           // two instrs
+  }
+  if (Subtarget->useMovt(*MF)) return 2;                    // MOVW + MOVT
+  return 3;                                                 // Literal pool load
+}
+
+bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
+                                             unsigned MaxShift,
+                                             unsigned &PowerOfTwo,
+                                             SDValue &NewMulConst) const {
+  assert(N.getOpcode() == ISD::MUL);
+  assert(MaxShift > 0);
+
+  // If the multiply is used in more than one place then changing the constant
+  // will make other uses incorrect, so don't.
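+  // Worked example (a sketch): without MOVW, the ARM constant 0x1FE0 is not
+  // a valid modified immediate (it is 0xFF rotated by an odd amount), so it
+  // costs two instructions to materialize, but since
+  //   x * 0x1FE0  ==  (x * 0xFF) << 5
+  // extracting PowerOfTwo = 5 leaves NewMulConst = 0xFF, a one-instruction
+  // constant, and the shift can then be folded into a shifter operand. The
+  // use-count checks below keep the rewrite from breaking other users.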
+ if (!N.hasOneUse()) return false; + // Check if the multiply is by a constant + ConstantSDNode *MulConst = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!MulConst) return false; + // If the constant is used in more than one place then modifying it will mean + // we need to materialize two constants instead of one, which is a bad idea. + if (!MulConst->hasOneUse()) return false; + unsigned MulConstVal = MulConst->getZExtValue(); + if (MulConstVal == 0) return false; + + // Find the largest power of 2 that MulConstVal is a multiple of + PowerOfTwo = MaxShift; + while ((MulConstVal % (1 << PowerOfTwo)) != 0) { + --PowerOfTwo; + if (PowerOfTwo == 0) return false; + } + + // Only optimise if the new cost is better + unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo); + NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32); + unsigned OldCost = ConstantMaterializationCost(MulConstVal); + unsigned NewCost = ConstantMaterializationCost(NewMulConstVal); + return NewCost < OldCost; +} + +void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) { + CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode()); + CurDAG->ReplaceAllUsesWith(N, M); +} + +bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, + SDValue &BaseReg, + SDValue &Opc, + bool CheckProfitability) { + if (DisableShifterOp) + return false; + + // If N is a multiply-by-constant and it's profitable to extract a shift and + // use it in a shifted operand do so. + if (N.getOpcode() == ISD::MUL) { + unsigned PowerOfTwo = 0; + SDValue NewMulConst; + if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) { + BaseReg = SDValue(Select(CurDAG->getNode(ISD::MUL, SDLoc(N), MVT::i32, + N.getOperand(0), NewMulConst) + .getNode()), + 0); + replaceDAGValue(N.getOperand(1), NewMulConst); + Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl, + PowerOfTwo), + SDLoc(N), MVT::i32); + return true; + } + } + + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); + + // Don't match base register only case. That is matched to a separate + // lower complexity pattern with explicit register operand. + if (ShOpcVal == ARM_AM::no_shift) return false; + + BaseReg = N.getOperand(0); + unsigned ShImmVal = 0; + ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!RHS) return false; + ShImmVal = RHS->getZExtValue() & 31; + Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), + SDLoc(N), MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N, + SDValue &BaseReg, + SDValue &ShReg, + SDValue &Opc, + bool CheckProfitability) { + if (DisableShifterOp) + return false; + + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); + + // Don't match base register only case. That is matched to a separate + // lower complexity pattern with explicit register operand. + if (ShOpcVal == ARM_AM::no_shift) return false; + + BaseReg = N.getOperand(0); + unsigned ShImmVal = 0; + ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (RHS) return false; + + ShReg = N.getOperand(1); + if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, ShImmVal)) + return false; + Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), + SDLoc(N), MVT::i32); + return true; +} + + +bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, + SDValue &Base, + SDValue &OffImm) { + // Match simple R + imm12 operands. + + // Base only. 
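+  // (i.e. no offset can be folded: the whole expression, e.g. a bare
+  // register or frame index, becomes the base and the immediate is 0.)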
+ if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + !CurDAG->isBaseWithConstantOffset(N)) { + if (N.getOpcode() == ISD::FrameIndex) { + // Match frame index. + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; + } + + if (N.getOpcode() == ARMISD::Wrapper && + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + Base = N.getOperand(0); + } else + Base = N; + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; + } + + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getSExtValue(); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if (RHSC > -0x1000 && RHSC < 0x1000) { // 12 bits + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } + OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); + return true; + } + } + + // Base only. + Base = N; + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; +} + + + +bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &Opc) { + if (N.getOpcode() == ISD::MUL && + ((!Subtarget->isLikeA9() && !Subtarget->isSwift()) || N.hasOneUse())) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + // X * [3,5,9] -> X + X * [2,4,8] etc. + int RHSC = (int)RHS->getZExtValue(); + if (RHSC & 1) { + RHSC = RHSC & ~1; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + if (isPowerOf2_32(RHSC)) { + unsigned ShAmt = Log2_32(RHSC); + Base = Offset = N.getOperand(0); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, + ARM_AM::lsl), + SDLoc(N), MVT::i32); + return true; + } + } + } + } + + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + // ISD::OR that is equivalent to an ISD::ADD. + !CurDAG->isBaseWithConstantOffset(N)) + return false; + + // Leave simple R +/- imm12 operands for LDRi12 + if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1, + -0x1000+1, 0x1000, RHSC)) // 12 bits. + return false; + } + + // Otherwise this is R +/- [possibly shifted] R. + ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::SUB ? ARM_AM::sub:ARM_AM::add; + ARM_AM::ShiftOpc ShOpcVal = + ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode()); + unsigned ShAmt = 0; + + Base = N.getOperand(0); + Offset = N.getOperand(1); + + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt)) + Offset = N.getOperand(1).getOperand(0); + else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + // Try matching (R shl C) + (R). + if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift && + !(Subtarget->isLikeA9() || Subtarget->isSwift() || + N.getOperand(0).hasOneUse())) { + ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode()); + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't + // fold it. 
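+      // e.g. for (add (shl r1, 2), r0) this selects base r0 and offset r1
+      // with an lsl #2 shifter operand; a variable shift amount cannot be
+      // folded into addrmode2.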
+ if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) { + Offset = N.getOperand(0).getOperand(0); + Base = N.getOperand(1); + } else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + } + + // If Offset is a multiply-by-constant and it's profitable to extract a shift + // and use it in a shifted operand do so. + if (Offset.getOpcode() == ISD::MUL) { + unsigned PowerOfTwo = 0; + SDValue NewMulConst; + if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) { + replaceDAGValue(Offset.getOperand(1), NewMulConst); + ShAmt = PowerOfTwo; + ShOpcVal = ARM_AM::lsl; + } + } + + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), + SDLoc(N), MVT::i32); + return true; +} + + +//----- + +AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, + SDValue &Base, + SDValue &Offset, + SDValue &Opc) { + if (N.getOpcode() == ISD::MUL && + (!(Subtarget->isLikeA9() || Subtarget->isSwift()) || N.hasOneUse())) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + // X * [3,5,9] -> X + X * [2,4,8] etc. + int RHSC = (int)RHS->getZExtValue(); + if (RHSC & 1) { + RHSC = RHSC & ~1; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + if (isPowerOf2_32(RHSC)) { + unsigned ShAmt = Log2_32(RHSC); + Base = Offset = N.getOperand(0); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, + ARM_AM::lsl), + SDLoc(N), MVT::i32); + return AM2_SHOP; + } + } + } + } + + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + // ISD::OR that is equivalent to an ADD. + !CurDAG->isBaseWithConstantOffset(N)) { + Base = N; + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } else if (N.getOpcode() == ARMISD::Wrapper && + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + Base = N.getOperand(0); + } + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0, + ARM_AM::no_shift), + SDLoc(N), MVT::i32); + return AM2_BASE; + } + + // Match simple R +/- imm12 operands. + if (N.getOpcode() != ISD::SUB) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1, + -0x1000+1, 0x1000, RHSC)) { // 12 bits. + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } + Offset = CurDAG->getRegister(0, MVT::i32); + + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC, + ARM_AM::no_shift), + SDLoc(N), MVT::i32); + return AM2_BASE; + } + } + + if ((Subtarget->isLikeA9() || Subtarget->isSwift()) && !N.hasOneUse()) { + // Compute R +/- (R << N) and reuse it. + Base = N; + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0, + ARM_AM::no_shift), + SDLoc(N), MVT::i32); + return AM2_BASE; + } + + // Otherwise this is R +/- [possibly shifted] R. + ARM_AM::AddrOpc AddSub = N.getOpcode() != ISD::SUB ? 
ARM_AM::add:ARM_AM::sub; + ARM_AM::ShiftOpc ShOpcVal = + ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode()); + unsigned ShAmt = 0; + + Base = N.getOperand(0); + Offset = N.getOperand(1); + + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt)) + Offset = N.getOperand(1).getOperand(0); + else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + // Try matching (R shl C) + (R). + if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift && + !(Subtarget->isLikeA9() || Subtarget->isSwift() || + N.getOperand(0).hasOneUse())) { + ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode()); + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't + // fold it. + if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) { + Offset = N.getOperand(0).getOperand(0); + Base = N.getOperand(1); + } else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + } + + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), + SDLoc(N), MVT::i32); + return AM2_SHOP; +} + +bool ARMDAGToDAGISel::SelectAddrMode2OffsetReg(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc) { + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) + ? ARM_AM::add : ARM_AM::sub; + int Val; + if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) + return false; + + Offset = N; + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); + unsigned ShAmt = 0; + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (isShifterOpProfitable(N, ShOpcVal, ShAmt)) + Offset = N.getOperand(0); + else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), + SDLoc(N), MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc) { + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) + ? ARM_AM::add : ARM_AM::sub; + int Val; + if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits. + if (AddSub == ARM_AM::sub) Val *= -1; + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(Val, SDLoc(Op), MVT::i32); + return true; + } + + return false; +} + + +bool ARMDAGToDAGISel::SelectAddrMode2OffsetImm(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc) { + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? 
cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
+                                                      ARM_AM::no_shift),
+                                    SDLoc(Op), MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectAddrOffsetNone(SDValue N, SDValue &Base) {
+  Base = N;
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
+                                      SDValue &Base, SDValue &Offset,
+                                      SDValue &Opc) {
+  if (N.getOpcode() == ISD::SUB) {
+    // X - C is canonicalized to X + -C, no need to handle it here.
+    Base = N.getOperand(0);
+    Offset = N.getOperand(1);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0), SDLoc(N),
+                                    MVT::i32);
+    return true;
+  }
+
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), SDLoc(N),
+                                    MVT::i32);
+    return true;
+  }
+
+  // If the RHS is +/- imm8, fold into addr mode.
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                              -256 + 1, 256, RHSC)) { // 8 bits.
+    Base = N.getOperand(0);
+    if (Base.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (RHSC < 0) {
+      AddSub = ARM_AM::sub;
+      RHSC = -RHSC;
+    }
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC), SDLoc(N),
+                                    MVT::i32);
+    return true;
+  }
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), SDLoc(N),
+                                  MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 256, Val)) { // 8 bits.
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), SDLoc(Op),
+                                    MVT::i32);
+    return true;
+  }
+
+  Offset = N;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, 0), SDLoc(Op),
+                                  MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
+                                      SDValue &Base, SDValue &Offset) {
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    } else if (N.getOpcode() == ARMISD::Wrapper &&
+               N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
+      Base = N.getOperand(0);
+    }
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                       SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  // If the RHS is +/- imm8, fold into addr mode.
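+  // (AM5 offsets are in words: e.g. a byte offset of 40 is encoded as 10
+  // under the implicit scale of 4, which is what Scale=4 below checks.)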
+ int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, + -256 + 1, 256, RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } + + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = -RHSC; + } + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC), + SDLoc(N), MVT::i32); + return true; + } + + Base = N; + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), + SDLoc(N), MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr, + SDValue &Align) { + Addr = N; + + unsigned Alignment = 0; + + MemSDNode *MemN = cast<MemSDNode>(Parent); + + if (isa<LSBaseSDNode>(MemN) || + ((MemN->getOpcode() == ARMISD::VST1_UPD || + MemN->getOpcode() == ARMISD::VLD1_UPD) && + MemN->getConstantOperandVal(MemN->getNumOperands() - 1) == 1)) { + // This case occurs only for VLD1-lane/dup and VST1-lane instructions. + // The maximum alignment is equal to the memory size being referenced. + unsigned MMOAlign = MemN->getAlignment(); + unsigned MemSize = MemN->getMemoryVT().getSizeInBits() / 8; + if (MMOAlign >= MemSize && MemSize > 1) + Alignment = MemSize; + } else { + // All other uses of addrmode6 are for intrinsics. For now just record + // the raw alignment value; it will be refined later based on the legal + // alignment operands for the intrinsic. + Alignment = MemN->getAlignment(); + } + + Align = CurDAG->getTargetConstant(Alignment, SDLoc(N), MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode6Offset(SDNode *Op, SDValue N, + SDValue &Offset) { + LSBaseSDNode *LdSt = cast<LSBaseSDNode>(Op); + ISD::MemIndexedMode AM = LdSt->getAddressingMode(); + if (AM != ISD::POST_INC) + return false; + Offset = N; + if (ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N)) { + if (NC->getZExtValue() * 8 == LdSt->getMemoryVT().getSizeInBits()) + Offset = CurDAG->getRegister(0, MVT::i32); + } + return true; +} + +bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N, + SDValue &Offset, SDValue &Label) { + if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) { + Offset = N.getOperand(0); + SDValue N1 = N.getOperand(1); + Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(), + SDLoc(N), MVT::i32); + return true; + } + + return false; +} + + +//===----------------------------------------------------------------------===// +// Thumb Addressing Modes +//===----------------------------------------------------------------------===// + +bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N, + SDValue &Base, SDValue &Offset){ + if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) { + ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N); + if (!NC || !NC->isNullValue()) + return false; + + Base = Offset = N; + return true; + } + + Base = N.getOperand(0); + Offset = N.getOperand(1); + return true; +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, + SDValue &Base, SDValue &OffImm) { + if (!CurDAG->isBaseWithConstantOffset(N)) { + if (N.getOpcode() == ISD::ADD) { + return false; // We want to select register offset instead + } else if (N.getOpcode() == ARMISD::Wrapper && + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + Base = N.getOperand(0); + } else { + Base = N; + } + + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; 
+ } + + // If the RHS is + imm5 * scale, fold into addr mode. + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) { + Base = N.getOperand(0); + OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); + return true; + } + + // Offset is too large, so use register offset instead. + return false; +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base, + SDValue &OffImm) { + return SelectThumbAddrModeImm5S(N, 4, Base, OffImm); +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base, + SDValue &OffImm) { + return SelectThumbAddrModeImm5S(N, 2, Base, OffImm); +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base, + SDValue &OffImm) { + return SelectThumbAddrModeImm5S(N, 1, Base, OffImm); +} + +bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, + SDValue &Base, SDValue &OffImm) { + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + // Only multiples of 4 are allowed for the offset, so the frame object + // alignment must be at least 4. + MachineFrameInfo *MFI = MF->getFrameInfo(); + if (MFI->getObjectAlignment(FI) < 4) + MFI->setObjectAlignment(FI, 4); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; + } + + if (!CurDAG->isBaseWithConstantOffset(N)) + return false; + + RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); + if (N.getOperand(0).getOpcode() == ISD::FrameIndex || + (LHSR && LHSR->getReg() == ARM::SP)) { + // If the RHS is + imm8 * scale, fold into addr mode. + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + // For LHS+RHS to result in an offset that's a multiple of 4 the object + // indexed by the LHS must be 4-byte aligned. + MachineFrameInfo *MFI = MF->getFrameInfo(); + if (MFI->getObjectAlignment(FI) < 4) + MFI->setObjectAlignment(FI, 4); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } + OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); + return true; + } + } + + return false; +} + + +//===----------------------------------------------------------------------===// +// Thumb 2 Addressing Modes +//===----------------------------------------------------------------------===// + + +bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, + SDValue &Base, SDValue &OffImm) { + // Match simple R + imm12 operands. + + // Base only. + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + !CurDAG->isBaseWithConstantOffset(N)) { + if (N.getOpcode() == ISD::FrameIndex) { + // Match frame index. + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; + } + + if (N.getOpcode() == ARMISD::Wrapper && + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::TargetConstantPool) + return false; // We want to select t2LDRpci instead. 
+ } else + Base = N; + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; + } + + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + if (SelectT2AddrModeImm8(N, Base, OffImm)) + // Let t2LDRi8 handle (R - imm8). + return false; + + int RHSC = (int)RHS->getZExtValue(); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned) + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } + OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); + return true; + } + } + + // Base only. + Base = N; + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, + SDValue &Base, SDValue &OffImm) { + // Match simple R - imm8 operands. + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + !CurDAG->isBaseWithConstantOffset(N)) + return false; + + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getSExtValue(); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative) + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } + OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); + return true; + } + } + + return false; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, + SDValue &OffImm){ + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + int RHSC; + if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x100, RHSC)) { // 8 bits. + OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) + ? CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32) + : CurDAG->getTargetConstant(-RHSC, SDLoc(N), MVT::i32); + return true; + } + + return false; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, + SDValue &Base, + SDValue &OffReg, SDValue &ShImm) { + // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12. + if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) + return false; + + // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC >= 0 && RHSC < 0x1000) // 12 bits (unsigned) + return false; + else if (RHSC < 0 && RHSC >= -255) // 8 bits + return false; + } + + // Look for (R + R) or (R + (R << [1,2,3])). + unsigned ShAmt = 0; + Base = N.getOperand(0); + OffReg = N.getOperand(1); + + // Swap if it is ((R << c) + R). + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg.getOpcode()); + if (ShOpcVal != ARM_AM::lsl) { + ShOpcVal = ARM_AM::getShiftOpcForNode(Base.getOpcode()); + if (ShOpcVal == ARM_AM::lsl) + std::swap(Base, OffReg); + } + + if (ShOpcVal == ARM_AM::lsl) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. 
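+    // e.g. [r0, r1, lsl #2] is a valid Thumb2 addressing mode, but the shift
+    // amount must be a constant in the range 0-3, as enforced below.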
+ if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(OffReg.getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (ShAmt < 4 && isShifterOpProfitable(OffReg, ShOpcVal, ShAmt)) + OffReg = OffReg.getOperand(0); + else { + ShAmt = 0; + } + } + } + + // If OffReg is a multiply-by-constant and it's profitable to extract a shift + // and use it in a shifted operand do so. + if (OffReg.getOpcode() == ISD::MUL) { + unsigned PowerOfTwo = 0; + SDValue NewMulConst; + if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) { + replaceDAGValue(OffReg.getOperand(1), NewMulConst); + ShAmt = PowerOfTwo; + } + } + + ShImm = CurDAG->getTargetConstant(ShAmt, SDLoc(N), MVT::i32); + + return true; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base, + SDValue &OffImm) { + // This *must* succeed since it's used for the irreplaceable ldrex and strex + // instructions. + Base = N; + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + + if (N.getOpcode() != ISD::ADD || !CurDAG->isBaseWithConstantOffset(N)) + return true; + + ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!RHS) + return true; + + uint32_t RHSC = (int)RHS->getZExtValue(); + if (RHSC > 1020 || RHSC % 4 != 0) + return true; + + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } + + OffImm = CurDAG->getTargetConstant(RHSC/4, SDLoc(N), MVT::i32); + return true; +} + +//===--------------------------------------------------------------------===// + +/// getAL - Returns a ARMCC::AL immediate node. +static inline SDValue getAL(SelectionDAG *CurDAG, SDLoc dl) { + return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, dl, MVT::i32); +} + +SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) { + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return nullptr; + + EVT LoadedVT = LD->getMemoryVT(); + SDValue Offset, AMOpc; + bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + unsigned Opcode = 0; + bool Match = false; + if (LoadedVT == MVT::i32 && isPre && + SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) { + Opcode = ARM::LDR_PRE_IMM; + Match = true; + } else if (LoadedVT == MVT::i32 && !isPre && + SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) { + Opcode = ARM::LDR_POST_IMM; + Match = true; + } else if (LoadedVT == MVT::i32 && + SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) { + Opcode = isPre ? ARM::LDR_PRE_REG : ARM::LDR_POST_REG; + Match = true; + + } else if (LoadedVT == MVT::i16 && + SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = (LD->getExtensionType() == ISD::SEXTLOAD) + ? (isPre ? ARM::LDRSH_PRE : ARM::LDRSH_POST) + : (isPre ? ARM::LDRH_PRE : ARM::LDRH_POST); + } else if (LoadedVT == MVT::i8 || LoadedVT == MVT::i1) { + if (LD->getExtensionType() == ISD::SEXTLOAD) { + if (SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = isPre ? 
ARM::LDRSB_PRE : ARM::LDRSB_POST; + } + } else { + if (isPre && + SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = ARM::LDRB_PRE_IMM; + } else if (!isPre && + SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = ARM::LDRB_POST_IMM; + } else if (SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = isPre ? ARM::LDRB_PRE_REG : ARM::LDRB_POST_REG; + } + } + } + + if (Match) { + if (Opcode == ARM::LDR_PRE_IMM || Opcode == ARM::LDRB_PRE_IMM) { + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[]= { Base, AMOpc, getAL(CurDAG, SDLoc(N)), + CurDAG->getRegister(0, MVT::i32), Chain }; + return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, + MVT::i32, MVT::Other, Ops); + } else { + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG, SDLoc(N)), + CurDAG->getRegister(0, MVT::i32), Chain }; + return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, + MVT::i32, MVT::Other, Ops); + } + } + + return nullptr; +} + +SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return nullptr; + + EVT LoadedVT = LD->getMemoryVT(); + bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; + SDValue Offset; + bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + unsigned Opcode = 0; + bool Match = false; + if (SelectT2AddrModeImm8Offset(N, LD->getOffset(), Offset)) { + switch (LoadedVT.getSimpleVT().SimpleTy) { + case MVT::i32: + Opcode = isPre ? ARM::t2LDR_PRE : ARM::t2LDR_POST; + break; + case MVT::i16: + if (isSExtLd) + Opcode = isPre ? ARM::t2LDRSH_PRE : ARM::t2LDRSH_POST; + else + Opcode = isPre ? ARM::t2LDRH_PRE : ARM::t2LDRH_POST; + break; + case MVT::i8: + case MVT::i1: + if (isSExtLd) + Opcode = isPre ? ARM::t2LDRSB_PRE : ARM::t2LDRSB_POST; + else + Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST; + break; + default: + return nullptr; + } + Match = true; + } + + if (Match) { + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[]= { Base, Offset, getAL(CurDAG, SDLoc(N)), + CurDAG->getRegister(0, MVT::i32), Chain }; + return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32, + MVT::Other, Ops); + } + + return nullptr; +} + +/// \brief Form a GPRPair pseudo register from a pair of GPR regs. +SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) { + SDLoc dl(V0.getNode()); + SDValue RegClass = + CurDAG->getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::gsub_0, dl, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::gsub_1, dl, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); +} + +/// \brief Form a D register from a pair of S registers. 
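+/// e.g. S0 and S1 are tied into one DPR_VFP2 REG_SEQUENCE via the ssub_0 and
+/// ssub_1 indices, so the allocator assigns them to the two halves of a
+/// single D register.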
+SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) { + SDLoc dl(V0.getNode()); + SDValue RegClass = + CurDAG->getTargetConstant(ARM::DPR_VFP2RegClassID, dl, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, dl, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, dl, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); +} + +/// \brief Form a quad register from a pair of D registers. +SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) { + SDLoc dl(V0.getNode()); + SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, dl, + MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, dl, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, dl, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); +} + +/// \brief Form 4 consecutive D registers from a pair of Q registers. +SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) { + SDLoc dl(V0.getNode()); + SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, dl, + MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, dl, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, dl, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); +} + +/// \brief Form 4 consecutive S registers. +SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1, + SDValue V2, SDValue V3) { + SDLoc dl(V0.getNode()); + SDValue RegClass = + CurDAG->getTargetConstant(ARM::QPR_VFP2RegClassID, dl, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, dl, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, dl, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(ARM::ssub_2, dl, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, dl, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, + V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); +} + +/// \brief Form 4 consecutive D registers. +SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1, + SDValue V2, SDValue V3) { + SDLoc dl(V0.getNode()); + SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, dl, + MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, dl, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, dl, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, dl, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, dl, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, + V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); +} + +/// \brief Form 4 consecutive Q registers. 
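+/// e.g. Q0..Q3 are tied into one QQQQ super-register via the qsub_0..qsub_3
+/// indices, as the 4-register NEON load/store pseudos below require.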
+SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, + SDValue V2, SDValue V3) { + SDLoc dl(V0.getNode()); + SDValue RegClass = CurDAG->getTargetConstant(ARM::QQQQPRRegClassID, dl, + MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, dl, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, dl, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(ARM::qsub_2, dl, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(ARM::qsub_3, dl, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, + V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops); +} + +/// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand +/// of a NEON VLD or VST instruction. The supported values depend on the +/// number of registers being loaded. +SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, SDLoc dl, + unsigned NumVecs, bool is64BitVector) { + unsigned NumRegs = NumVecs; + if (!is64BitVector && NumVecs < 3) + NumRegs *= 2; + + unsigned Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); + if (Alignment >= 32 && NumRegs == 4) + Alignment = 32; + else if (Alignment >= 16 && (NumRegs == 2 || NumRegs == 4)) + Alignment = 16; + else if (Alignment >= 8) + Alignment = 8; + else + Alignment = 0; + + return CurDAG->getTargetConstant(Alignment, dl, MVT::i32); +} + +static bool isVLDfixed(unsigned Opc) +{ + switch (Opc) { + default: return false; + case ARM::VLD1d8wb_fixed : return true; + case ARM::VLD1d16wb_fixed : return true; + case ARM::VLD1d64Qwb_fixed : return true; + case ARM::VLD1d32wb_fixed : return true; + case ARM::VLD1d64wb_fixed : return true; + case ARM::VLD1d64TPseudoWB_fixed : return true; + case ARM::VLD1d64QPseudoWB_fixed : return true; + case ARM::VLD1q8wb_fixed : return true; + case ARM::VLD1q16wb_fixed : return true; + case ARM::VLD1q32wb_fixed : return true; + case ARM::VLD1q64wb_fixed : return true; + case ARM::VLD2d8wb_fixed : return true; + case ARM::VLD2d16wb_fixed : return true; + case ARM::VLD2d32wb_fixed : return true; + case ARM::VLD2q8PseudoWB_fixed : return true; + case ARM::VLD2q16PseudoWB_fixed : return true; + case ARM::VLD2q32PseudoWB_fixed : return true; + case ARM::VLD2DUPd8wb_fixed : return true; + case ARM::VLD2DUPd16wb_fixed : return true; + case ARM::VLD2DUPd32wb_fixed : return true; + } +} + +static bool isVSTfixed(unsigned Opc) +{ + switch (Opc) { + default: return false; + case ARM::VST1d8wb_fixed : return true; + case ARM::VST1d16wb_fixed : return true; + case ARM::VST1d32wb_fixed : return true; + case ARM::VST1d64wb_fixed : return true; + case ARM::VST1q8wb_fixed : return true; + case ARM::VST1q16wb_fixed : return true; + case ARM::VST1q32wb_fixed : return true; + case ARM::VST1q64wb_fixed : return true; + case ARM::VST1d64TPseudoWB_fixed : return true; + case ARM::VST1d64QPseudoWB_fixed : return true; + case ARM::VST2d8wb_fixed : return true; + case ARM::VST2d16wb_fixed : return true; + case ARM::VST2d32wb_fixed : return true; + case ARM::VST2q8PseudoWB_fixed : return true; + case ARM::VST2q16PseudoWB_fixed : return true; + case ARM::VST2q32PseudoWB_fixed : return true; + } +} + +// Get the register stride update opcode of a VLD/VST instruction that +// is otherwise equivalent to the given fixed stride updating instruction. 
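+// (A "_fixed" form post-increments the address by the access size, while a
+// "_register" form adds a general-purpose register, e.g.
+// "vld1.8 {d0}, [r0]!" vs. "vld1.8 {d0}, [r0], r1".)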
+static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { + assert((isVLDfixed(Opc) || isVSTfixed(Opc)) + && "Incorrect fixed stride updating instruction."); + switch (Opc) { + default: break; + case ARM::VLD1d8wb_fixed: return ARM::VLD1d8wb_register; + case ARM::VLD1d16wb_fixed: return ARM::VLD1d16wb_register; + case ARM::VLD1d32wb_fixed: return ARM::VLD1d32wb_register; + case ARM::VLD1d64wb_fixed: return ARM::VLD1d64wb_register; + case ARM::VLD1q8wb_fixed: return ARM::VLD1q8wb_register; + case ARM::VLD1q16wb_fixed: return ARM::VLD1q16wb_register; + case ARM::VLD1q32wb_fixed: return ARM::VLD1q32wb_register; + case ARM::VLD1q64wb_fixed: return ARM::VLD1q64wb_register; + case ARM::VLD1d64Twb_fixed: return ARM::VLD1d64Twb_register; + case ARM::VLD1d64Qwb_fixed: return ARM::VLD1d64Qwb_register; + case ARM::VLD1d64TPseudoWB_fixed: return ARM::VLD1d64TPseudoWB_register; + case ARM::VLD1d64QPseudoWB_fixed: return ARM::VLD1d64QPseudoWB_register; + + case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register; + case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register; + case ARM::VST1d32wb_fixed: return ARM::VST1d32wb_register; + case ARM::VST1d64wb_fixed: return ARM::VST1d64wb_register; + case ARM::VST1q8wb_fixed: return ARM::VST1q8wb_register; + case ARM::VST1q16wb_fixed: return ARM::VST1q16wb_register; + case ARM::VST1q32wb_fixed: return ARM::VST1q32wb_register; + case ARM::VST1q64wb_fixed: return ARM::VST1q64wb_register; + case ARM::VST1d64TPseudoWB_fixed: return ARM::VST1d64TPseudoWB_register; + case ARM::VST1d64QPseudoWB_fixed: return ARM::VST1d64QPseudoWB_register; + + case ARM::VLD2d8wb_fixed: return ARM::VLD2d8wb_register; + case ARM::VLD2d16wb_fixed: return ARM::VLD2d16wb_register; + case ARM::VLD2d32wb_fixed: return ARM::VLD2d32wb_register; + case ARM::VLD2q8PseudoWB_fixed: return ARM::VLD2q8PseudoWB_register; + case ARM::VLD2q16PseudoWB_fixed: return ARM::VLD2q16PseudoWB_register; + case ARM::VLD2q32PseudoWB_fixed: return ARM::VLD2q32PseudoWB_register; + + case ARM::VST2d8wb_fixed: return ARM::VST2d8wb_register; + case ARM::VST2d16wb_fixed: return ARM::VST2d16wb_register; + case ARM::VST2d32wb_fixed: return ARM::VST2d32wb_register; + case ARM::VST2q8PseudoWB_fixed: return ARM::VST2q8PseudoWB_register; + case ARM::VST2q16PseudoWB_fixed: return ARM::VST2q16PseudoWB_register; + case ARM::VST2q32PseudoWB_fixed: return ARM::VST2q32PseudoWB_register; + + case ARM::VLD2DUPd8wb_fixed: return ARM::VLD2DUPd8wb_register; + case ARM::VLD2DUPd16wb_fixed: return ARM::VLD2DUPd16wb_register; + case ARM::VLD2DUPd32wb_fixed: return ARM::VLD2DUPd32wb_register; + } + return Opc; // If not one we handle, return it unchanged. +} + +SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, + const uint16_t *QOpcodes1) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + SDLoc dl(N); + + SDValue MemAddr, Align; + unsigned AddrOpIdx = isUpdating ? 
1 : 2; + if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) + return nullptr; + + SDValue Chain = N->getOperand(0); + EVT VT = N->getValueType(0); + bool is64BitVector = VT.is64BitVector(); + Align = GetVLDSTAlign(Align, dl, NumVecs, is64BitVector); + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vld type"); + // Double-register operations: + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + case MVT::v1i64: OpcodeIndex = 3; break; + // Quad-register operations: + case MVT::v16i8: OpcodeIndex = 0; break; + case MVT::v8i16: OpcodeIndex = 1; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2f64: + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VLD1"); + break; + } + + EVT ResTy; + if (NumVecs == 1) + ResTy = VT; + else { + unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; + if (!is64BitVector) + ResTyElts *= 2; + ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts); + } + std::vector<EVT> ResTys; + ResTys.push_back(ResTy); + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + + SDValue Pred = getAL(CurDAG, dl); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDNode *VLd; + SmallVector<SDValue, 7> Ops; + + // Double registers and VLD1/VLD2 quad registers are directly supported. + if (is64BitVector || NumVecs <= 2) { + unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : + QOpcodes0[OpcodeIndex]); + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0 + // case entirely when the rest are updated to that form, too. + if ((NumVecs <= 2) && !isa<ConstantSDNode>(Inc.getNode())) + Opc = getVLDSTRegisterUpdateOpcode(Opc); + // FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so + // check for that explicitly too. Horribly hacky, but temporary. + if ((NumVecs > 2 && !isVLDfixed(Opc)) || + !isa<ConstantSDNode>(Inc.getNode())) + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + } + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + } else { + // Otherwise, quad registers are loaded with two separate instructions, + // where one loads the even registers and the other loads the odd registers. + EVT AddrTy = MemAddr.getValueType(); + + // Load the even subregs. This is always an updating load, so that it + // provides the address to the second load for the odd subregs. + SDValue ImplDef = + SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0); + const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain }; + SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, + ResTy, AddrTy, MVT::Other, OpsA); + Chain = SDValue(VLdA, 2); + + // Load the odd subregs. + Ops.push_back(SDValue(VLdA, 1)); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + assert(isa<ConstantSDNode>(Inc.getNode()) && + "only constant post-increment update allowed for VLD3/4"); + (void)Inc; + Ops.push_back(Reg0); + } + Ops.push_back(SDValue(VLdA, 0)); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, Ops); + } + + // Transfer memoperands. 
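+  // (This attaches the original memory operand to the machine node so alias
+  // analysis and the scheduler still see the access's memory reference.)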
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1); + + if (NumVecs == 1) + return VLd; + + // Extract out the subregisters. + SDValue SuperReg = SDValue(VLd, 0); + assert(ARM::dsub_7 == ARM::dsub_0+7 && + ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0); + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2)); + return nullptr; +} + +SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, + const uint16_t *DOpcodes, + const uint16_t *QOpcodes0, + const uint16_t *QOpcodes1) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); + SDLoc dl(N); + + SDValue MemAddr, Align; + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) + if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) + return nullptr; + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + + SDValue Chain = N->getOperand(0); + EVT VT = N->getOperand(Vec0Idx).getValueType(); + bool is64BitVector = VT.is64BitVector(); + Align = GetVLDSTAlign(Align, dl, NumVecs, is64BitVector); + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vst type"); + // Double-register operations: + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + case MVT::v1i64: OpcodeIndex = 3; break; + // Quad-register operations: + case MVT::v16i8: OpcodeIndex = 0; break; + case MVT::v8i16: OpcodeIndex = 1; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2f64: + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VST1"); + break; + } + + std::vector<EVT> ResTys; + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + + SDValue Pred = getAL(CurDAG, dl); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SmallVector<SDValue, 7> Ops; + + // Double registers and VST1/VST2 quad registers are directly supported. + if (is64BitVector || NumVecs <= 2) { + SDValue SrcReg; + if (NumVecs == 1) { + SrcReg = N->getOperand(Vec0Idx); + } else if (is64BitVector) { + // Form a REG_SEQUENCE to force register allocation. + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue V1 = N->getOperand(Vec0Idx + 1); + if (NumVecs == 2) + SrcReg = SDValue(createDRegPairNode(MVT::v2i64, V0, V1), 0); + else { + SDValue V2 = N->getOperand(Vec0Idx + 2); + // If it's a vst3, form a quad D-register and leave the last part as + // an undef. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : N->getOperand(Vec0Idx + 3); + SrcReg = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0); + } + } else { + // Form a QQ register. + SDValue Q0 = N->getOperand(Vec0Idx); + SDValue Q1 = N->getOperand(Vec0Idx + 1); + SrcReg = SDValue(createQRegPairNode(MVT::v4i64, Q0, Q1), 0); + } + + unsigned Opc = (is64BitVector ? 
DOpcodes[OpcodeIndex] : + QOpcodes0[OpcodeIndex]); + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + // FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0 + // case entirely when the rest are updated to that form, too. + if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode())) + Opc = getVLDSTRegisterUpdateOpcode(Opc); + // FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so + // check for that explicitly too. Horribly hacky, but temporary. + if (!isa<ConstantSDNode>(Inc.getNode())) + Ops.push_back(Inc); + else if (NumVecs > 2 && !isVSTfixed(Opc)) + Ops.push_back(Reg0); + } + Ops.push_back(SrcReg); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. + cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1); + + return VSt; + } + + // Otherwise, quad registers are stored with two separate instructions, + // where one stores the even registers and the other stores the odd registers. + + // Form the QQQQ REG_SEQUENCE. + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue V1 = N->getOperand(Vec0Idx + 1); + SDValue V2 = N->getOperand(Vec0Idx + 2); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) + : N->getOperand(Vec0Idx + 3); + SDValue RegSeq = SDValue(createQuadQRegsNode(MVT::v8i64, V0, V1, V2, V3), 0); + + // Store the even D registers. This is always an updating store, so that it + // provides the address to the second store for the odd subregs. + const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain }; + SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, + MemAddr.getValueType(), + MVT::Other, OpsA); + cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1); + Chain = SDValue(VStA, 1); + + // Store the odd D registers. + Ops.push_back(SDValue(VStA, 0)); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + assert(isa<ConstantSDNode>(Inc.getNode()) && + "only constant post-increment update allowed for VST3/4"); + (void)Inc; + Ops.push_back(Reg0); + } + Ops.push_back(RegSeq); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, + Ops); + cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1); + return VStB; +} + +SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, + bool isUpdating, unsigned NumVecs, + const uint16_t *DOpcodes, + const uint16_t *QOpcodes) { + assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); + SDLoc dl(N); + + SDValue MemAddr, Align; + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 
2 : 1) + if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) + return nullptr; + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + + SDValue Chain = N->getOperand(0); + unsigned Lane = + cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); + EVT VT = N->getOperand(Vec0Idx).getValueType(); + bool is64BitVector = VT.is64BitVector(); + + unsigned Alignment = 0; + if (NumVecs != 3) { + Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); + unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8; + if (Alignment > NumBytes) + Alignment = NumBytes; + if (Alignment < 8 && Alignment < NumBytes) + Alignment = 0; + // Alignment must be a power of two; make sure of that. + Alignment = (Alignment & -Alignment); + if (Alignment == 1) + Alignment = 0; + } + Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32); + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vld/vst lane type"); + // Double-register operations: + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + // Quad-register operations: + case MVT::v8i16: OpcodeIndex = 0; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 1; break; + } + + std::vector<EVT> ResTys; + if (IsLoad) { + unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; + if (!is64BitVector) + ResTyElts *= 2; + ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), + MVT::i64, ResTyElts)); + } + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + + SDValue Pred = getAL(CurDAG, dl); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + + SmallVector<SDValue, 8> Ops; + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + } + + SDValue SuperReg; + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue V1 = N->getOperand(Vec0Idx + 1); + if (NumVecs == 2) { + if (is64BitVector) + SuperReg = SDValue(createDRegPairNode(MVT::v2i64, V0, V1), 0); + else + SuperReg = SDValue(createQRegPairNode(MVT::v4i64, V0, V1), 0); + } else { + SDValue V2 = N->getOperand(Vec0Idx + 2); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) + : N->getOperand(Vec0Idx + 3); + if (is64BitVector) + SuperReg = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0); + else + SuperReg = SDValue(createQuadQRegsNode(MVT::v8i64, V0, V1, V2, V3), 0); + } + Ops.push_back(SuperReg); + Ops.push_back(getI32Imm(Lane, dl)); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + + unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : + QOpcodes[OpcodeIndex]); + SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1); + if (!IsLoad) + return VLdLn; + + // Extract the subregisters. + SuperReg = SDValue(VLdLn, 0); + assert(ARM::dsub_7 == ARM::dsub_0+7 && + ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + unsigned Sub0 = is64BitVector ? 
ARM::dsub_0 : ARM::qsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); + return nullptr; +} + +SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, + unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); + SDLoc dl(N); + + SDValue MemAddr, Align; + if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align)) + return nullptr; + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + + SDValue Chain = N->getOperand(0); + EVT VT = N->getValueType(0); + + unsigned Alignment = 0; + if (NumVecs != 3) { + Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); + unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8; + if (Alignment > NumBytes) + Alignment = NumBytes; + if (Alignment < 8 && Alignment < NumBytes) + Alignment = 0; + // Alignment must be a power of two; make sure of that. + Alignment = (Alignment & -Alignment); + if (Alignment == 1) + Alignment = 0; + } + Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32); + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vld-dup type"); + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + } + + SDValue Pred = getAL(CurDAG, dl); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue SuperReg; + unsigned Opc = Opcodes[OpcodeIndex]; + SmallVector<SDValue, 6> Ops; + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + // fixed-stride update instructions don't have an explicit writeback + // operand. It's implicit in the opcode itself. + SDValue Inc = N->getOperand(2); + if (!isa<ConstantSDNode>(Inc.getNode())) + Ops.push_back(Inc); + // FIXME: VLD3 and VLD4 haven't been updated to that form yet. + else if (NumVecs > 2) + Ops.push_back(Reg0); + } + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + + unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; + std::vector<EVT> ResTys; + ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts)); + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1); + SuperReg = SDValue(VLdDup, 0); + + // Extract the subregisters. + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + unsigned SubIdx = ARM::dsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); + return nullptr; +} + +SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, + unsigned Opc) { + assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range"); + SDLoc dl(N); + EVT VT = N->getValueType(0); + unsigned FirstTblReg = IsExt ? 2 : 1; + + // Form a REG_SEQUENCE to force register allocation. 
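+  // (VTBL reads its lookup table from consecutive D registers, e.g.
+  // {d0, d1} for a two-register table, so the sources must be tied
+  // together here.)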
+ SDValue RegSeq; + SDValue V0 = N->getOperand(FirstTblReg + 0); + SDValue V1 = N->getOperand(FirstTblReg + 1); + if (NumVecs == 2) + RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0); + else { + SDValue V2 = N->getOperand(FirstTblReg + 2); + // If it's a vtbl3, form a quad D-register and leave the last part as + // an undef. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) + : N->getOperand(FirstTblReg + 3); + RegSeq = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0); + } + + SmallVector<SDValue, 6> Ops; + if (IsExt) + Ops.push_back(N->getOperand(1)); + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(FirstTblReg + NumVecs)); + Ops.push_back(getAL(CurDAG, dl)); // predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register + return CurDAG->getMachineNode(Opc, dl, VT, Ops); +} + +SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, + bool isSigned) { + if (!Subtarget->hasV6T2Ops()) + return nullptr; + + unsigned Opc = isSigned + ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX) + : (Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX); + SDLoc dl(N); + + // For unsigned extracts, check for a shift right and mask + unsigned And_imm = 0; + if (N->getOpcode() == ISD::AND) { + if (isOpcWithIntImmediate(N, ISD::AND, And_imm)) { + + // The immediate is a mask of the low bits iff imm & (imm+1) == 0 + if (And_imm & (And_imm + 1)) + return nullptr; + + unsigned Srl_imm = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, + Srl_imm)) { + assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!"); + + // Note: The width operand is encoded as width-1. + unsigned Width = countTrailingOnes(And_imm) - 1; + unsigned LSB = Srl_imm; + + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + + if ((LSB + Width + 1) == N->getValueType(0).getSizeInBits()) { + // It's cheaper to use a right shift to extract the top bits. + if (Subtarget->isThumb()) { + Opc = isSigned ? ARM::t2ASRri : ARM::t2LSRri; + SDValue Ops[] = { N->getOperand(0).getOperand(0), + CurDAG->getTargetConstant(LSB, dl, MVT::i32), + getAL(CurDAG, dl), Reg0, Reg0 }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + } + + // ARM models shift instructions as MOVsi with shifter operand. + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(ISD::SRL); + SDValue ShOpc = + CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, LSB), dl, + MVT::i32); + SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc, + getAL(CurDAG, dl), Reg0, Reg0 }; + return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops); + } + + SDValue Ops[] = { N->getOperand(0).getOperand(0), + CurDAG->getTargetConstant(LSB, dl, MVT::i32), + CurDAG->getTargetConstant(Width, dl, MVT::i32), + getAL(CurDAG, dl), Reg0 }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + } + } + return nullptr; + } + + // Otherwise, we're looking for a shift of a shift + unsigned Shl_imm = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { + assert(Shl_imm > 0 && Shl_imm < 32 && "bad amount in shift node!"); + unsigned Srl_imm = 0; + if (isInt32Immediate(N->getOperand(1), Srl_imm)) { + assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!"); + // Note: The width operand is encoded as width-1. 
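+      // The matched (sra/srl (shl x, Shl), Srl) pair extracts the field of
+      // (32 - Srl) bits starting at bit (Srl - Shl); SBFX (for sra) or UBFX
+      // (for srl) does the same in one instruction. A standalone sketch of
+      // the signed identity (hypothetical helper, not part of this file;
+      // assumes arithmetic >> on int32_t, as on ARM):
+      //
+      //   int32_t sbfx(int32_t X, unsigned LSB, unsigned Width) {
+      //     return (int32_t)((uint32_t)X << (32 - LSB - Width))
+      //            >> (32 - Width);
+      //   }
+      //
+      //   e.g. for Shl = 8, Srl = 24: sbfx(X, 16, 8) is identical to
+      //   (int32_t)((uint32_t)X << 8) >> 24, and the instruction encodes
+      //   the width operand as 8 - 1 = 7.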
+ unsigned Width = 32 - Srl_imm - 1; + int LSB = Srl_imm - Shl_imm; + if (LSB < 0) + return nullptr; + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0).getOperand(0), + CurDAG->getTargetConstant(LSB, dl, MVT::i32), + CurDAG->getTargetConstant(Width, dl, MVT::i32), + getAL(CurDAG, dl), Reg0 }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + } + } + + if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) { + unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits(); + unsigned LSB = 0; + if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, LSB) && + !isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRA, LSB)) + return nullptr; + + if (LSB + Width > 32) + return nullptr; + + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0).getOperand(0), + CurDAG->getTargetConstant(LSB, dl, MVT::i32), + CurDAG->getTargetConstant(Width - 1, dl, MVT::i32), + getAL(CurDAG, dl), Reg0 }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + } + + return nullptr; +} + +/// Target-specific DAG combining for ISD::XOR. +/// Target-independent combining lowers SELECT_CC nodes of the form +/// select_cc setg[ge] X, 0, X, -X +/// select_cc setgt X, -1, X, -X +/// select_cc setl[te] X, 0, -X, X +/// select_cc setlt X, 1, -X, X +/// which represent Integer ABS into: +/// Y = sra (X, size(X)-1); xor (add (X, Y), Y) +/// ARM instruction selection detects the latter and matches it to +/// ARM::ABS or ARM::t2ABS machine node. +SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){ + SDValue XORSrc0 = N->getOperand(0); + SDValue XORSrc1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + if (Subtarget->isThumb1Only()) + return nullptr; + + if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA) + return nullptr; + + SDValue ADDSrc0 = XORSrc0.getOperand(0); + SDValue ADDSrc1 = XORSrc0.getOperand(1); + SDValue SRASrc0 = XORSrc1.getOperand(0); + SDValue SRASrc1 = XORSrc1.getOperand(1); + ConstantSDNode *SRAConstant = dyn_cast<ConstantSDNode>(SRASrc1); + EVT XType = SRASrc0.getValueType(); + unsigned Size = XType.getSizeInBits() - 1; + + if (ADDSrc1 == XORSrc1 && ADDSrc0 == SRASrc0 && + XType.isInteger() && SRAConstant != nullptr && + Size == SRAConstant->getZExtValue()) { + unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS; + return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0); + } + + return nullptr; +} + +SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) { + // The only time a CONCAT_VECTORS operation can have legal types is when + // two 64-bit vectors are concatenated to a 128-bit vector. + EVT VT = N->getValueType(0); + if (!VT.is128BitVector() || N->getNumOperands() != 2) + llvm_unreachable("unexpected CONCAT_VECTORS"); + return createDRegPairNode(VT, N->getOperand(0), N->getOperand(1)); +} + +SDNode *ARMDAGToDAGISel::Select(SDNode *N) { + SDLoc dl(N); + + if (N->isMachineOpcode()) { + N->setNodeId(-1); + return nullptr; // Already selected. 
+ } + + switch (N->getOpcode()) { + default: break; + case ISD::WRITE_REGISTER: { + SDNode *ResNode = SelectWriteRegister(N); + if (ResNode) + return ResNode; + break; + } + case ISD::READ_REGISTER: { + SDNode *ResNode = SelectReadRegister(N); + if (ResNode) + return ResNode; + break; + } + case ISD::INLINEASM: { + SDNode *ResNode = SelectInlineAsm(N); + if (ResNode) + return ResNode; + break; + } + case ISD::XOR: { + // Select special operations if XOR node forms integer ABS pattern + SDNode *ResNode = SelectABSOp(N); + if (ResNode) + return ResNode; + // Other cases are autogenerated. + break; + } + case ISD::Constant: { + unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); + // If we can't materialize the constant we need to use a literal pool + if (ConstantMaterializationCost(Val) > 2) { + SDValue CPIdx = CurDAG->getTargetConstantPool( + ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val), + TLI->getPointerTy(CurDAG->getDataLayout())); + + SDNode *ResNode; + if (Subtarget->isThumb()) { + SDValue Pred = getAL(CurDAG, dl); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() }; + ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other, + Ops); + } else { + SDValue Ops[] = { + CPIdx, + CurDAG->getTargetConstant(0, dl, MVT::i32), + getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getEntryNode() + }; + ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, + Ops); + } + ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0)); + return nullptr; + } + + // Other cases are autogenerated. + break; + } + case ISD::FrameIndex: { + // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm. + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + SDValue TFI = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + if (Subtarget->isThumb1Only()) { + // Set the alignment of the frame object to 4, to avoid having to generate + // more than one ADD + MachineFrameInfo *MFI = MF->getFrameInfo(); + if (MFI->getObjectAlignment(FI) < 4) + MFI->setObjectAlignment(FI, 4); + return CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI, + CurDAG->getTargetConstant(0, dl, MVT::i32)); + } else { + unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ? + ARM::t2ADDri : ARM::ADDri); + SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, dl, MVT::i32), + getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); + } + } + case ISD::SRL: + if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false)) + return I; + break; + case ISD::SIGN_EXTEND_INREG: + case ISD::SRA: + if (SDNode *I = SelectV6T2BitfieldExtractOp(N, true)) + return I; + break; + case ISD::MUL: + if (Subtarget->isThumb1Only()) + break; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + unsigned RHSV = C->getZExtValue(); + if (!RHSV) break; + if (isPowerOf2_32(RHSV-1)) { // 2^n+1? 
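+        // e.g. x * 9 == x + (x << 3), so the shifter-operand ADD does the
+        // multiply in one instruction; the RSB form below handles
+        // x * (2^n - 1) as (x << n) - x the same way.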
+        unsigned ShImm = Log2_32(RHSV-1);
+        if (ShImm >= 32)
+          break;
+        SDValue V = N->getOperand(0);
+        ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+        SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, dl, MVT::i32);
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+        if (Subtarget->isThumb()) {
+          SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops);
+        } else {
+          SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
+                            Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops);
+        }
+      }
+      if (isPowerOf2_32(RHSV+1)) {  // 2^n-1?
+        unsigned ShImm = Log2_32(RHSV+1);
+        if (ShImm >= 32)
+          break;
+        SDValue V = N->getOperand(0);
+        ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+        SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, dl, MVT::i32);
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+        if (Subtarget->isThumb()) {
+          SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops);
+        } else {
+          SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
+                            Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops);
+        }
+      }
+    }
+    break;
+  case ISD::AND: {
+    // Check for unsigned bitfield extract
+    if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
+      return I;
+
+    // (and (or x, c2), c1), where the top 16 bits of c1 and c2 match, the
+    // lower 16 bits of c1 are 0xffff, and the lower 16 bits of c2 are 0.
+    // That is, the top 16 bits are entirely contributed by c2 and the lower
+    // 16 bits are entirely contributed by x. That's equal to
+    // (or (and x, 0xffff), (and c2, 0xffff0000)).
+    // Select it to: "movt x, ((c2 & 0xffff0000) >> 16)".
+    EVT VT = N->getValueType(0);
+    if (VT != MVT::i32)
+      break;
+    unsigned Opc = (Subtarget->isThumb() && Subtarget->hasThumb2())
+      ? ARM::t2MOVTi16
+      : (Subtarget->hasV6T2Ops() ? ARM::MOVTi16 : 0);
+    if (!Opc)
+      break;
+    SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    if (!N1C)
+      break;
+    if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) {
+      SDValue N2 = N0.getOperand(1);
+      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+      if (!N2C)
+        break;
+      unsigned N1CVal = N1C->getZExtValue();
+      unsigned N2CVal = N2C->getZExtValue();
+      if ((N1CVal & 0xffff0000U) == (N2CVal & 0xffff0000U) &&
+          (N1CVal & 0xffffU) == 0xffffU &&
+          (N2CVal & 0xffffU) == 0x0U) {
+        SDValue Imm16 = CurDAG->getTargetConstant((N2CVal & 0xFFFF0000U) >> 16,
+                                                  dl, MVT::i32);
+        SDValue Ops[] = { N0.getOperand(0), Imm16,
+                          getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
+        return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+      }
+    }
+    break;
+  }
+  case ARMISD::VMOVRRD:
+    return CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
+                                  N->getOperand(0), getAL(CurDAG, dl),
+                                  CurDAG->getRegister(0, MVT::i32));
+  case ISD::UMUL_LOHI: {
+    if (Subtarget->isThumb1Only())
+      break;
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops);
+    } else {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+ ARM::UMULL : ARM::UMULLv5, + dl, MVT::i32, MVT::i32, Ops); + } + } + case ISD::SMUL_LOHI: { + if (Subtarget->isThumb1Only()) + break; + if (Subtarget->isThumb()) { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops); + } else { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? + ARM::SMULL : ARM::SMULLv5, + dl, MVT::i32, MVT::i32, Ops); + } + } + case ARMISD::UMLAL:{ + if (Subtarget->isThumb()) { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32)}; + return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops); + }else{ + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? + ARM::UMLAL : ARM::UMLALv5, + dl, MVT::i32, MVT::i32, Ops); + } + } + case ARMISD::SMLAL:{ + if (Subtarget->isThumb()) { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32)}; + return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops); + }else{ + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? + ARM::SMLAL : ARM::SMLALv5, + dl, MVT::i32, MVT::i32, Ops); + } + } + case ISD::LOAD: { + SDNode *ResNode = nullptr; + if (Subtarget->isThumb() && Subtarget->hasThumb2()) + ResNode = SelectT2IndexedLoad(N); + else + ResNode = SelectARMIndexedLoad(N); + if (ResNode) + return ResNode; + // Other cases are autogenerated. + break; + } + case ARMISD::BRCOND: { + // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) + // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + + // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) + // Emits: (tBcc:void (bb:Other):$dst, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + + // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) + // Emits: (t2Bcc:void (bb:Other):$dst, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + + unsigned Opc = Subtarget->isThumb() ? + ((Subtarget->hasThumb2()) ? 
ARM::t2Bcc : ARM::tBcc) : ARM::Bcc; + SDValue Chain = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + SDValue N3 = N->getOperand(3); + SDValue InFlag = N->getOperand(4); + assert(N1.getOpcode() == ISD::BasicBlock); + assert(N2.getOpcode() == ISD::Constant); + assert(N3.getOpcode() == ISD::Register); + + SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getZExtValue()), dl, + MVT::i32); + SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag }; + SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, + MVT::Glue, Ops); + Chain = SDValue(ResNode, 0); + if (N->getNumValues() == 2) { + InFlag = SDValue(ResNode, 1); + ReplaceUses(SDValue(N, 1), InFlag); + } + ReplaceUses(SDValue(N, 0), + SDValue(Chain.getNode(), Chain.getResNo())); + return nullptr; + } + case ARMISD::VZIP: { + unsigned Opc = 0; + EVT VT = N->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: return nullptr; + case MVT::v8i8: Opc = ARM::VZIPd8; break; + case MVT::v4i16: Opc = ARM::VZIPd16; break; + case MVT::v2f32: + // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. + case MVT::v2i32: Opc = ARM::VTRNd32; break; + case MVT::v16i8: Opc = ARM::VZIPq8; break; + case MVT::v8i16: Opc = ARM::VZIPq16; break; + case MVT::v4f32: + case MVT::v4i32: Opc = ARM::VZIPq32; break; + } + SDValue Pred = getAL(CurDAG, dl); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; + return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops); + } + case ARMISD::VUZP: { + unsigned Opc = 0; + EVT VT = N->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: return nullptr; + case MVT::v8i8: Opc = ARM::VUZPd8; break; + case MVT::v4i16: Opc = ARM::VUZPd16; break; + case MVT::v2f32: + // vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. 
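+    // With only two 32-bit lanes per D register, zip, unzip and transpose
+    // all degenerate to the same exchange of Dd's high lane with Dm's low
+    // lane, which is why VTRNd32 can stand in for the v2i32/v2f32 cases.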
+ case MVT::v2i32: Opc = ARM::VTRNd32; break; + case MVT::v16i8: Opc = ARM::VUZPq8; break; + case MVT::v8i16: Opc = ARM::VUZPq16; break; + case MVT::v4f32: + case MVT::v4i32: Opc = ARM::VUZPq32; break; + } + SDValue Pred = getAL(CurDAG, dl); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; + return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops); + } + case ARMISD::VTRN: { + unsigned Opc = 0; + EVT VT = N->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: return nullptr; + case MVT::v8i8: Opc = ARM::VTRNd8; break; + case MVT::v4i16: Opc = ARM::VTRNd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VTRNd32; break; + case MVT::v16i8: Opc = ARM::VTRNq8; break; + case MVT::v8i16: Opc = ARM::VTRNq16; break; + case MVT::v4f32: + case MVT::v4i32: Opc = ARM::VTRNq32; break; + } + SDValue Pred = getAL(CurDAG, dl); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; + return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops); + } + case ARMISD::BUILD_VECTOR: { + EVT VecVT = N->getValueType(0); + EVT EltVT = VecVT.getVectorElementType(); + unsigned NumElts = VecVT.getVectorNumElements(); + if (EltVT == MVT::f64) { + assert(NumElts == 2 && "unexpected type for BUILD_VECTOR"); + return createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)); + } + assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR"); + if (NumElts == 2) + return createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)); + assert(NumElts == 4 && "unexpected type for BUILD_VECTOR"); + return createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3)); + } + + case ARMISD::VLD2DUP: { + static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16, + ARM::VLD2DUPd32 }; + return SelectVLDDup(N, false, 2, Opcodes); + } + + case ARMISD::VLD3DUP: { + static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo, + ARM::VLD3DUPd16Pseudo, + ARM::VLD3DUPd32Pseudo }; + return SelectVLDDup(N, false, 3, Opcodes); + } + + case ARMISD::VLD4DUP: { + static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo, + ARM::VLD4DUPd16Pseudo, + ARM::VLD4DUPd32Pseudo }; + return SelectVLDDup(N, false, 4, Opcodes); + } + + case ARMISD::VLD2DUP_UPD: { + static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed, + ARM::VLD2DUPd16wb_fixed, + ARM::VLD2DUPd32wb_fixed }; + return SelectVLDDup(N, true, 2, Opcodes); + } + + case ARMISD::VLD3DUP_UPD: { + static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, + ARM::VLD3DUPd16Pseudo_UPD, + ARM::VLD3DUPd32Pseudo_UPD }; + return SelectVLDDup(N, true, 3, Opcodes); + } + + case ARMISD::VLD4DUP_UPD: { + static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, + ARM::VLD4DUPd16Pseudo_UPD, + ARM::VLD4DUPd32Pseudo_UPD }; + return SelectVLDDup(N, true, 4, Opcodes); + } + + case ARMISD::VLD1_UPD: { + static const uint16_t DOpcodes[] = { ARM::VLD1d8wb_fixed, + ARM::VLD1d16wb_fixed, + ARM::VLD1d32wb_fixed, + ARM::VLD1d64wb_fixed }; + static const uint16_t QOpcodes[] = { ARM::VLD1q8wb_fixed, + ARM::VLD1q16wb_fixed, + ARM::VLD1q32wb_fixed, + ARM::VLD1q64wb_fixed }; + return SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr); + } + + case ARMISD::VLD2_UPD: { + static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed, + ARM::VLD2d16wb_fixed, + ARM::VLD2d32wb_fixed, + ARM::VLD1q64wb_fixed}; + static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed, + ARM::VLD2q16PseudoWB_fixed, + 
ARM::VLD2q32PseudoWB_fixed }; + return SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr); + } + + case ARMISD::VLD3_UPD: { + static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, + ARM::VLD3d16Pseudo_UPD, + ARM::VLD3d32Pseudo_UPD, + ARM::VLD1d64TPseudoWB_fixed}; + static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, + ARM::VLD3q16Pseudo_UPD, + ARM::VLD3q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD, + ARM::VLD3q16oddPseudo_UPD, + ARM::VLD3q32oddPseudo_UPD }; + return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VLD4_UPD: { + static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, + ARM::VLD4d16Pseudo_UPD, + ARM::VLD4d32Pseudo_UPD, + ARM::VLD1d64QPseudoWB_fixed}; + static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, + ARM::VLD4q16oddPseudo_UPD, + ARM::VLD4q32oddPseudo_UPD }; + return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VLD2LN_UPD: { + static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, + ARM::VLD2LNd16Pseudo_UPD, + ARM::VLD2LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD, + ARM::VLD2LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes); + } + + case ARMISD::VLD3LN_UPD: { + static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, + ARM::VLD3LNd16Pseudo_UPD, + ARM::VLD3LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD, + ARM::VLD3LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes); + } + + case ARMISD::VLD4LN_UPD: { + static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, + ARM::VLD4LNd16Pseudo_UPD, + ARM::VLD4LNd32Pseudo_UPD }; + static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD, + ARM::VLD4LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes); + } + + case ARMISD::VST1_UPD: { + static const uint16_t DOpcodes[] = { ARM::VST1d8wb_fixed, + ARM::VST1d16wb_fixed, + ARM::VST1d32wb_fixed, + ARM::VST1d64wb_fixed }; + static const uint16_t QOpcodes[] = { ARM::VST1q8wb_fixed, + ARM::VST1q16wb_fixed, + ARM::VST1q32wb_fixed, + ARM::VST1q64wb_fixed }; + return SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr); + } + + case ARMISD::VST2_UPD: { + static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed, + ARM::VST2d16wb_fixed, + ARM::VST2d32wb_fixed, + ARM::VST1q64wb_fixed}; + static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed, + ARM::VST2q16PseudoWB_fixed, + ARM::VST2q32PseudoWB_fixed }; + return SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr); + } + + case ARMISD::VST3_UPD: { + static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo_UPD, + ARM::VST3d16Pseudo_UPD, + ARM::VST3d32Pseudo_UPD, + ARM::VST1d64TPseudoWB_fixed}; + static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD, + ARM::VST3q16Pseudo_UPD, + ARM::VST3q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD, + ARM::VST3q16oddPseudo_UPD, + ARM::VST3q32oddPseudo_UPD }; + return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VST4_UPD: { + static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD, + ARM::VST4d16Pseudo_UPD, + ARM::VST4d32Pseudo_UPD, + ARM::VST1d64QPseudoWB_fixed}; + static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, + ARM::VST4q16Pseudo_UPD, + ARM::VST4q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { 
ARM::VST4q8oddPseudo_UPD,
+                                           ARM::VST4q16oddPseudo_UPD,
+                                           ARM::VST4q32oddPseudo_UPD };
+    return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VST2LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD,
+                                         ARM::VST2LNd16Pseudo_UPD,
+                                         ARM::VST2LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
+                                         ARM::VST2LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VST3LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD,
+                                         ARM::VST3LNd16Pseudo_UPD,
+                                         ARM::VST3LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
+                                         ARM::VST3LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VST4LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD,
+                                         ARM::VST4LNd16Pseudo_UPD,
+                                         ARM::VST4LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
+                                         ARM::VST4LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
+  }
+
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+
+    case Intrinsic::arm_ldaexd:
+    case Intrinsic::arm_ldrexd: {
+      SDLoc dl(N);
+      SDValue Chain = N->getOperand(0);
+      SDValue MemAddr = N->getOperand(2);
+      bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
+
+      bool IsAcquire = IntNo == Intrinsic::arm_ldaexd;
+      unsigned NewOpc = isThumb ? (IsAcquire ? ARM::t2LDAEXD : ARM::t2LDREXD)
+                                : (IsAcquire ? ARM::LDAEXD : ARM::LDREXD);
+
+      // arm_ldrexd returns an i64 value in {i32, i32}
+      std::vector<EVT> ResTys;
+      if (isThumb) {
+        ResTys.push_back(MVT::i32);
+        ResTys.push_back(MVT::i32);
+      } else
+        ResTys.push_back(MVT::Untyped);
+      ResTys.push_back(MVT::Other);
+
+      // Place arguments in the right order.
+      SmallVector<SDValue, 7> Ops;
+      Ops.push_back(MemAddr);
+      Ops.push_back(getAL(CurDAG, dl));
+      Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      Ops.push_back(Chain);
+      SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+      cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+
+      // Remap uses.
+      SDValue OutChain = isThumb ?
SDValue(Ld, 2) : SDValue(Ld, 1);
+      if (!SDValue(N, 0).use_empty()) {
+        SDValue Result;
+        if (isThumb)
+          Result = SDValue(Ld, 0);
+        else {
+          SDValue SubRegIdx =
+            CurDAG->getTargetConstant(ARM::gsub_0, dl, MVT::i32);
+          SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+            dl, MVT::i32, SDValue(Ld, 0), SubRegIdx);
+          Result = SDValue(ResNode, 0);
+        }
+        ReplaceUses(SDValue(N, 0), Result);
+      }
+      if (!SDValue(N, 1).use_empty()) {
+        SDValue Result;
+        if (isThumb)
+          Result = SDValue(Ld, 1);
+        else {
+          SDValue SubRegIdx =
+            CurDAG->getTargetConstant(ARM::gsub_1, dl, MVT::i32);
+          SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+            dl, MVT::i32, SDValue(Ld, 0), SubRegIdx);
+          Result = SDValue(ResNode, 0);
+        }
+        ReplaceUses(SDValue(N, 1), Result);
+      }
+      ReplaceUses(SDValue(N, 2), OutChain);
+      return nullptr;
+    }
+    case Intrinsic::arm_stlexd:
+    case Intrinsic::arm_strexd: {
+      SDLoc dl(N);
+      SDValue Chain = N->getOperand(0);
+      SDValue Val0 = N->getOperand(2);
+      SDValue Val1 = N->getOperand(3);
+      SDValue MemAddr = N->getOperand(4);
+
+      // Store exclusive double returns an i32 value which is the return
+      // status of the issued store.
+      const EVT ResTys[] = {MVT::i32, MVT::Other};
+
+      bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
+      // Place arguments in the right order.
+      SmallVector<SDValue, 7> Ops;
+      if (isThumb) {
+        Ops.push_back(Val0);
+        Ops.push_back(Val1);
+      } else
+        // arm_strexd uses GPRPair.
+        Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, Val0, Val1), 0));
+      Ops.push_back(MemAddr);
+      Ops.push_back(getAL(CurDAG, dl));
+      Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      Ops.push_back(Chain);
+
+      bool IsRelease = IntNo == Intrinsic::arm_stlexd;
+      unsigned NewOpc = isThumb ? (IsRelease ? ARM::t2STLEXD : ARM::t2STREXD)
+                                : (IsRelease ? ARM::STLEXD : ARM::STREXD);
+
+      SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
+      // Transfer memoperands.
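+      // Re-attaching the intrinsic's MachineMemOperand keeps the size,
+      // alignment and aliasing information of the exclusive access visible
+      // to later machine passes (scheduling, machine AA, and so on).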
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); + + return St; + } + + case Intrinsic::arm_neon_vld1: { + static const uint16_t DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, + ARM::VLD1d32, ARM::VLD1d64 }; + static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, + ARM::VLD1q32, ARM::VLD1q64}; + return SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr); + } + + case Intrinsic::arm_neon_vld2: { + static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, + ARM::VLD2d32, ARM::VLD1q64 }; + static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo, + ARM::VLD2q32Pseudo }; + return SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr); + } + + case Intrinsic::arm_neon_vld3: { + static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo, + ARM::VLD3d16Pseudo, + ARM::VLD3d32Pseudo, + ARM::VLD1d64TPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, + ARM::VLD3q16Pseudo_UPD, + ARM::VLD3q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo, + ARM::VLD3q16oddPseudo, + ARM::VLD3q32oddPseudo }; + return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vld4: { + static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo, + ARM::VLD4d16Pseudo, + ARM::VLD4d32Pseudo, + ARM::VLD1d64QPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo, + ARM::VLD4q16oddPseudo, + ARM::VLD4q32oddPseudo }; + return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vld2lane: { + static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo, + ARM::VLD2LNd16Pseudo, + ARM::VLD2LNd32Pseudo }; + static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo, + ARM::VLD2LNq32Pseudo }; + return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes); + } + + case Intrinsic::arm_neon_vld3lane: { + static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo, + ARM::VLD3LNd16Pseudo, + ARM::VLD3LNd32Pseudo }; + static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo, + ARM::VLD3LNq32Pseudo }; + return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes); + } + + case Intrinsic::arm_neon_vld4lane: { + static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo, + ARM::VLD4LNd16Pseudo, + ARM::VLD4LNd32Pseudo }; + static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo, + ARM::VLD4LNq32Pseudo }; + return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes); + } + + case Intrinsic::arm_neon_vst1: { + static const uint16_t DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, + ARM::VST1d32, ARM::VST1d64 }; + static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, + ARM::VST1q32, ARM::VST1q64 }; + return SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr); + } + + case Intrinsic::arm_neon_vst2: { + static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, + ARM::VST2d32, ARM::VST1q64 }; + static uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo, + ARM::VST2q32Pseudo }; + return SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr); + } + + case Intrinsic::arm_neon_vst3: { + static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo, + ARM::VST3d16Pseudo, + ARM::VST3d32Pseudo, + ARM::VST1d64TPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD, + ARM::VST3q16Pseudo_UPD, + ARM::VST3q32Pseudo_UPD }; + static const uint16_t 
QOpcodes1[] = { ARM::VST3q8oddPseudo, + ARM::VST3q16oddPseudo, + ARM::VST3q32oddPseudo }; + return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vst4: { + static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo, + ARM::VST4d16Pseudo, + ARM::VST4d32Pseudo, + ARM::VST1d64QPseudo }; + static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, + ARM::VST4q16Pseudo_UPD, + ARM::VST4q32Pseudo_UPD }; + static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo, + ARM::VST4q16oddPseudo, + ARM::VST4q32oddPseudo }; + return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vst2lane: { + static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo, + ARM::VST2LNd16Pseudo, + ARM::VST2LNd32Pseudo }; + static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo, + ARM::VST2LNq32Pseudo }; + return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes); + } + + case Intrinsic::arm_neon_vst3lane: { + static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo, + ARM::VST3LNd16Pseudo, + ARM::VST3LNd32Pseudo }; + static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo, + ARM::VST3LNq32Pseudo }; + return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes); + } + + case Intrinsic::arm_neon_vst4lane: { + static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo, + ARM::VST4LNd16Pseudo, + ARM::VST4LNd32Pseudo }; + static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo, + ARM::VST4LNq32Pseudo }; + return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes); + } + } + break; + } + + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + break; + + case Intrinsic::arm_neon_vtbl2: + return SelectVTBL(N, false, 2, ARM::VTBL2); + case Intrinsic::arm_neon_vtbl3: + return SelectVTBL(N, false, 3, ARM::VTBL3Pseudo); + case Intrinsic::arm_neon_vtbl4: + return SelectVTBL(N, false, 4, ARM::VTBL4Pseudo); + + case Intrinsic::arm_neon_vtbx2: + return SelectVTBL(N, true, 2, ARM::VTBX2); + case Intrinsic::arm_neon_vtbx3: + return SelectVTBL(N, true, 3, ARM::VTBX3Pseudo); + case Intrinsic::arm_neon_vtbx4: + return SelectVTBL(N, true, 4, ARM::VTBX4Pseudo); + } + break; + } + + case ARMISD::VTBL1: { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SmallVector<SDValue, 6> Ops; + + Ops.push_back(N->getOperand(0)); + Ops.push_back(N->getOperand(1)); + Ops.push_back(getAL(CurDAG, dl)); // Predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register + return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops); + } + case ARMISD::VTBL2: { + SDLoc dl(N); + EVT VT = N->getValueType(0); + + // Form a REG_SEQUENCE to force register allocation. + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + SDValue RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0); + + SmallVector<SDValue, 6> Ops; + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(2)); + Ops.push_back(getAL(CurDAG, dl)); // Predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register + return CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops); + } + + case ISD::CONCAT_VECTORS: + return SelectConcatVector(N); + } + + return SelectCode(N); +} + +// Inspect a register string of the form +// cp<coprocessor>:<opc1>:c<CRn>:c<CRm>:<opc2> (32bit) or +// cp<coprocessor>:<opc1>:c<CRm> (64bit) inspect the fields of the string +// and obtain the integer operands from them, adding these operands to the +// provided vector. 
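+// For example (strings chosen for illustration, not taken from this file):
+// the 32-bit form "cp15:0:c13:c0:3" splits into five fields and, after the
+// 'c'/'p' prefixes are trimmed, yields the operands {15, 0, 13, 0, 3}; a
+// 64-bit form such as "cp15:1:c15" yields the three operands {15, 1, 15}.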
+static void getIntOperandsFromRegisterString(StringRef RegString, + SelectionDAG *CurDAG, SDLoc DL, + std::vector<SDValue>& Ops) { + SmallVector<StringRef, 5> Fields; + RegString.split(Fields, ':'); + + if (Fields.size() > 1) { + bool AllIntFields = true; + + for (StringRef Field : Fields) { + // Need to trim out leading 'cp' characters and get the integer field. + unsigned IntField; + AllIntFields &= !Field.trim("CPcp").getAsInteger(10, IntField); + Ops.push_back(CurDAG->getTargetConstant(IntField, DL, MVT::i32)); + } + + assert(AllIntFields && + "Unexpected non-integer value in special register string."); + } +} + +// Maps a Banked Register string to its mask value. The mask value returned is +// for use in the MRSbanked / MSRbanked instruction nodes as the Banked Register +// mask operand, which expresses which register is to be used, e.g. r8, and in +// which mode it is to be used, e.g. usr. Returns -1 to signify that the string +// was invalid. +static inline int getBankedRegisterMask(StringRef RegString) { + return StringSwitch<int>(RegString.lower()) + .Case("r8_usr", 0x00) + .Case("r9_usr", 0x01) + .Case("r10_usr", 0x02) + .Case("r11_usr", 0x03) + .Case("r12_usr", 0x04) + .Case("sp_usr", 0x05) + .Case("lr_usr", 0x06) + .Case("r8_fiq", 0x08) + .Case("r9_fiq", 0x09) + .Case("r10_fiq", 0x0a) + .Case("r11_fiq", 0x0b) + .Case("r12_fiq", 0x0c) + .Case("sp_fiq", 0x0d) + .Case("lr_fiq", 0x0e) + .Case("lr_irq", 0x10) + .Case("sp_irq", 0x11) + .Case("lr_svc", 0x12) + .Case("sp_svc", 0x13) + .Case("lr_abt", 0x14) + .Case("sp_abt", 0x15) + .Case("lr_und", 0x16) + .Case("sp_und", 0x17) + .Case("lr_mon", 0x1c) + .Case("sp_mon", 0x1d) + .Case("elr_hyp", 0x1e) + .Case("sp_hyp", 0x1f) + .Case("spsr_fiq", 0x2e) + .Case("spsr_irq", 0x30) + .Case("spsr_svc", 0x32) + .Case("spsr_abt", 0x34) + .Case("spsr_und", 0x36) + .Case("spsr_mon", 0x3c) + .Case("spsr_hyp", 0x3e) + .Default(-1); +} + +// Maps a MClass special register string to its value for use in the +// t2MRS_M / t2MSR_M instruction nodes as the SYSm value operand. +// Returns -1 to signify that the string was invalid. +static inline int getMClassRegisterSYSmValueMask(StringRef RegString) { + return StringSwitch<int>(RegString.lower()) + .Case("apsr", 0x0) + .Case("iapsr", 0x1) + .Case("eapsr", 0x2) + .Case("xpsr", 0x3) + .Case("ipsr", 0x5) + .Case("epsr", 0x6) + .Case("iepsr", 0x7) + .Case("msp", 0x8) + .Case("psp", 0x9) + .Case("primask", 0x10) + .Case("basepri", 0x11) + .Case("basepri_max", 0x12) + .Case("faultmask", 0x13) + .Case("control", 0x14) + .Default(-1); +} + +// The flags here are common to those allowed for apsr in the A class cores and +// those allowed for the special registers in the M class cores. Returns a +// value representing which flags were present, -1 if invalid. +static inline int getMClassFlagsMask(StringRef Flags, bool hasDSP) { + if (Flags.empty()) + return 0x2 | (int)hasDSP; + + return StringSwitch<int>(Flags) + .Case("g", 0x1) + .Case("nzcvq", 0x2) + .Case("nzcvqg", 0x3) + .Default(-1); +} + +static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead, + const ARMSubtarget *Subtarget) { + // Ensure that the register (without flags) was a valid M Class special + // register. + int SYSmvalue = getMClassRegisterSYSmValueMask(Reg); + if (SYSmvalue == -1) + return -1; + + // basepri, basepri_max and faultmask are only valid for V7m. 
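+  // (They occupy SYSm values 0x11-0x13 in the table above; ARMv6-M does not
+  // implement BASEPRI, BASEPRI_MAX or FAULTMASK.)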
+ if (!Subtarget->hasV7Ops() && SYSmvalue >= 0x11 && SYSmvalue <= 0x13) + return -1; + + // If it was a read then we won't be expecting flags and so at this point + // we can return the mask. + if (IsRead) { + assert (Flags.empty() && "Unexpected flags for reading M class register."); + return SYSmvalue; + } + + // We know we are now handling a write so need to get the mask for the flags. + int Mask = getMClassFlagsMask(Flags, Subtarget->hasDSP()); + + // Only apsr, iapsr, eapsr, xpsr can have flags. The other register values + // shouldn't have flags present. + if ((SYSmvalue < 0x4 && Mask == -1) || (SYSmvalue > 0x4 && !Flags.empty())) + return -1; + + // The _g and _nzcvqg versions are only valid if the DSP extension is + // available. + if (!Subtarget->hasDSP() && (Mask & 0x1)) + return -1; + + // The register was valid so need to put the mask in the correct place + // (the flags need to be in bits 11-10) and combine with the SYSmvalue to + // construct the operand for the instruction node. + if (SYSmvalue < 0x4) + return SYSmvalue | Mask << 10; + + return SYSmvalue; +} + +static int getARClassRegisterMask(StringRef Reg, StringRef Flags) { + // The mask operand contains the special register (R Bit) in bit 4, whether + // the register is spsr (R bit is 1) or one of cpsr/apsr (R bit is 0), and + // bits 3-0 contains the fields to be accessed in the special register, set by + // the flags provided with the register. + int Mask = 0; + if (Reg == "apsr") { + // The flags permitted for apsr are the same flags that are allowed in + // M class registers. We get the flag value and then shift the flags into + // the correct place to combine with the mask. + Mask = getMClassFlagsMask(Flags, true); + if (Mask == -1) + return -1; + return Mask << 2; + } + + if (Reg != "cpsr" && Reg != "spsr") { + return -1; + } + + // This is the same as if the flags were "fc" + if (Flags.empty() || Flags == "all") + return Mask | 0x9; + + // Inspect the supplied flags string and set the bits in the mask for + // the relevant and valid flags allowed for cpsr and spsr. + for (char Flag : Flags) { + int FlagVal; + switch (Flag) { + case 'c': + FlagVal = 0x1; + break; + case 'x': + FlagVal = 0x2; + break; + case 's': + FlagVal = 0x4; + break; + case 'f': + FlagVal = 0x8; + break; + default: + FlagVal = 0; + } + + // This avoids allowing strings where the same flag bit appears twice. + if (!FlagVal || (Mask & FlagVal)) + return -1; + Mask |= FlagVal; + } + + // If the register is spsr then we need to set the R bit. + if (Reg == "spsr") + Mask |= 0x10; + + return Mask; +} + +// Lower the read_register intrinsic to ARM specific DAG nodes +// using the supplied metadata string to select the instruction node to use +// and the registers/masks to construct as operands for the node. +SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){ + const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1)); + const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0)); + bool IsThumb2 = Subtarget->isThumb2(); + SDLoc DL(N); + + std::vector<SDValue> Ops; + getIntOperandsFromRegisterString(RegString->getString(), CurDAG, DL, Ops); + + if (!Ops.empty()) { + // If the special register string was constructed of fields (as defined + // in the ACLE) then need to lower to MRC node (32 bit) or + // MRRC node(64 bit), we can make the distinction based on the number of + // operands we have. + unsigned Opcode; + SmallVector<EVT, 3> ResTypes; + if (Ops.size() == 5){ + Opcode = IsThumb2 ? 
ARM::t2MRC : ARM::MRC; + ResTypes.append({ MVT::i32, MVT::Other }); + } else { + assert(Ops.size() == 3 && + "Invalid number of fields in special register string."); + Opcode = IsThumb2 ? ARM::t2MRRC : ARM::MRRC; + ResTypes.append({ MVT::i32, MVT::i32, MVT::Other }); + } + + Ops.push_back(getAL(CurDAG, DL)); + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); + Ops.push_back(N->getOperand(0)); + return CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops); + } + + std::string SpecialReg = RegString->getString().lower(); + + int BankedReg = getBankedRegisterMask(SpecialReg); + if (BankedReg != -1) { + Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32), + getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), + N->getOperand(0) }; + return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked, + DL, MVT::i32, MVT::Other, Ops); + } + + // The VFP registers are read by creating SelectionDAG nodes with opcodes + // corresponding to the register that is being read from. So we switch on the + // string to find which opcode we need to use. + unsigned Opcode = StringSwitch<unsigned>(SpecialReg) + .Case("fpscr", ARM::VMRS) + .Case("fpexc", ARM::VMRS_FPEXC) + .Case("fpsid", ARM::VMRS_FPSID) + .Case("mvfr0", ARM::VMRS_MVFR0) + .Case("mvfr1", ARM::VMRS_MVFR1) + .Case("mvfr2", ARM::VMRS_MVFR2) + .Case("fpinst", ARM::VMRS_FPINST) + .Case("fpinst2", ARM::VMRS_FPINST2) + .Default(0); + + // If an opcode was found then we can lower the read to a VFP instruction. + if (Opcode) { + if (!Subtarget->hasVFP2()) + return nullptr; + if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8()) + return nullptr; + + Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), + N->getOperand(0) }; + return CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops); + } + + // If the target is M Class then need to validate that the register string + // is an acceptable value, so check that a mask can be constructed from the + // string. + if (Subtarget->isMClass()) { + int SYSmValue = getMClassRegisterMask(SpecialReg, "", true, Subtarget); + if (SYSmValue == -1) + return nullptr; + + SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32), + getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), + N->getOperand(0) }; + return CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops); + } + + // Here we know the target is not M Class so we need to check if it is one + // of the remaining possible values which are apsr, cpsr or spsr. + if (SpecialReg == "apsr" || SpecialReg == "cpsr") { + Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), + N->getOperand(0) }; + return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRS_AR : ARM::MRS, DL, + MVT::i32, MVT::Other, Ops); + } + + if (SpecialReg == "spsr") { + Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), + N->getOperand(0) }; + return CurDAG->getMachineNode(IsThumb2 ? 
ARM::t2MRSsys_AR : ARM::MRSsys, + DL, MVT::i32, MVT::Other, Ops); + } + + return nullptr; +} + +// Lower the write_register intrinsic to ARM specific DAG nodes +// using the supplied metadata string to select the instruction node to use +// and the registers/masks to use in the nodes +SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){ + const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1)); + const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0)); + bool IsThumb2 = Subtarget->isThumb2(); + SDLoc DL(N); + + std::vector<SDValue> Ops; + getIntOperandsFromRegisterString(RegString->getString(), CurDAG, DL, Ops); + + if (!Ops.empty()) { + // If the special register string was constructed of fields (as defined + // in the ACLE) then need to lower to MCR node (32 bit) or + // MCRR node(64 bit), we can make the distinction based on the number of + // operands we have. + unsigned Opcode; + if (Ops.size() == 5) { + Opcode = IsThumb2 ? ARM::t2MCR : ARM::MCR; + Ops.insert(Ops.begin()+2, N->getOperand(2)); + } else { + assert(Ops.size() == 3 && + "Invalid number of fields in special register string."); + Opcode = IsThumb2 ? ARM::t2MCRR : ARM::MCRR; + SDValue WriteValue[] = { N->getOperand(2), N->getOperand(3) }; + Ops.insert(Ops.begin()+2, WriteValue, WriteValue+2); + } + + Ops.push_back(getAL(CurDAG, DL)); + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); + Ops.push_back(N->getOperand(0)); + + return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops); + } + + std::string SpecialReg = RegString->getString().lower(); + int BankedReg = getBankedRegisterMask(SpecialReg); + if (BankedReg != -1) { + Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32), N->getOperand(2), + getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32), + N->getOperand(0) }; + return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked, + DL, MVT::Other, Ops); + } + + // The VFP registers are written to by creating SelectionDAG nodes with + // opcodes corresponding to the register that is being written. So we switch + // on the string to find which opcode we need to use. + unsigned Opcode = StringSwitch<unsigned>(SpecialReg) + .Case("fpscr", ARM::VMSR) + .Case("fpexc", ARM::VMSR_FPEXC) + .Case("fpsid", ARM::VMSR_FPSID) + .Case("fpinst", ARM::VMSR_FPINST) + .Case("fpinst2", ARM::VMSR_FPINST2) + .Default(0); + + if (Opcode) { + if (!Subtarget->hasVFP2()) + return nullptr; + Ops = { N->getOperand(2), getAL(CurDAG, DL), + CurDAG->getRegister(0, MVT::i32), N->getOperand(0) }; + return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops); + } + + SmallVector<StringRef, 5> Fields; + StringRef(SpecialReg).split(Fields, '_', 1, false); + std::string Reg = Fields[0].str(); + StringRef Flags = Fields.size() == 2 ? Fields[1] : ""; + + // If the target was M Class then need to validate the special register value + // and retrieve the mask for use in the instruction node. + if (Subtarget->isMClass()) { + // basepri_max gets split so need to correct Reg and Flags. 
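+    // (The split on '_' above turned "basepri_max" into Reg = "basepri" with
+    // Flags = "max", which getMClassRegisterMask would reject, so the full
+    // name is restored and the flags cleared before the lookup.)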
+    if (SpecialReg == "basepri_max") {
+      Reg = SpecialReg;
+      Flags = "";
+    }
+    int SYSmValue = getMClassRegisterMask(Reg, Flags, false, Subtarget);
+    if (SYSmValue == -1)
+      return nullptr;
+
+    SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
+                      N->getOperand(2), getAL(CurDAG, DL),
+                      CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
+    return CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops);
+  }
+
+  // We then check to see if a valid mask can be constructed for one of the
+  // register string values permitted for the A and R class cores. These values
+  // are apsr, spsr and cpsr; these are also valid on older cores.
+  int Mask = getARClassRegisterMask(Reg, Flags);
+  if (Mask != -1) {
+    Ops = { CurDAG->getTargetConstant(Mask, DL, MVT::i32), N->getOperand(2),
+            getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSR_AR : ARM::MSR,
+                                  DL, MVT::Other, Ops);
+  }
+
+  return nullptr;
+}
+
+SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N) {
+  std::vector<SDValue> AsmNodeOperands;
+  unsigned Flag, Kind;
+  bool Changed = false;
+  unsigned NumOps = N->getNumOperands();
+
+  // Normally, i64 data is bound to two arbitrary GPRs for the "%r" constraint.
+  // However, some instructions (e.g. ldrexd/strexd in ARM mode) require
+  // (even/even+1) GPRs and use %n and %Hn to refer to the individual regs
+  // respectively. Since there is no constraint to explicitly specify a
+  // reg pair, we use GPRPair reg class for "%r" for 64-bit data. For Thumb,
+  // the 64-bit data may be referred to by H, Q, R modifiers, so we still pack
+  // them into a GPRPair.
+
+  SDLoc dl(N);
+  SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1)
+                                   : SDValue(nullptr, 0);
+
+  SmallVector<bool, 8> OpChanged;
+  // Glue node will be appended last.
+  for (unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e;
+       ++i) {
+    SDValue op = N->getOperand(i);
+    AsmNodeOperands.push_back(op);
+
+    if (i < InlineAsm::Op_FirstOperand)
+      continue;
+
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) {
+      Flag = C->getZExtValue();
+      Kind = InlineAsm::getKind(Flag);
+    }
+    else
+      continue;
+
+    // Immediate operands to inline asm in the SelectionDAG are modeled with
+    // two operands. The first is a constant of value InlineAsm::Kind_Imm, and
+    // the second is a constant with the value of the immediate. If we get here
+    // and we have a Kind_Imm, skip the next operand, and continue.
+    if (Kind == InlineAsm::Kind_Imm) {
+      SDValue op = N->getOperand(++i);
+      AsmNodeOperands.push_back(op);
+      continue;
+    }
+
+    unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag);
+    if (NumRegs)
+      OpChanged.push_back(false);
+
+    unsigned DefIdx = 0;
+    bool IsTiedToChangedOp = false;
+    // If it's a use that is tied with a previous def, it has no
+    // reg class constraint.
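+    // A tied use must end up in the same register as its def, so when that
+    // def has already been rewritten to a GPRPair the use has to follow suit
+    // even though its own flag word names no register class.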
+ if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx)) + IsTiedToChangedOp = OpChanged[DefIdx]; + + if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef + && Kind != InlineAsm::Kind_RegDefEarlyClobber) + continue; + + unsigned RC; + bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC); + if ((!IsTiedToChangedOp && (!HasRC || RC != ARM::GPRRegClassID)) + || NumRegs != 2) + continue; + + assert((i+2 < NumOps) && "Invalid number of operands in inline asm"); + SDValue V0 = N->getOperand(i+1); + SDValue V1 = N->getOperand(i+2); + unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg(); + unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg(); + SDValue PairedReg; + MachineRegisterInfo &MRI = MF->getRegInfo(); + + if (Kind == InlineAsm::Kind_RegDef || + Kind == InlineAsm::Kind_RegDefEarlyClobber) { + // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to + // the original GPRs. + + unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped); + SDValue Chain = SDValue(N,0); + + SDNode *GU = N->getGluedUser(); + SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::Untyped, + Chain.getValue(1)); + + // Extract values from a GPRPair reg and copy to the original GPR reg. + SDValue Sub0 = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32, + RegCopy); + SDValue Sub1 = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32, + RegCopy); + SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0, + RegCopy.getValue(1)); + SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1)); + + // Update the original glue user. + std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1); + Ops.push_back(T1.getValue(1)); + CurDAG->UpdateNodeOperands(GU, Ops); + } + else { + // For Kind == InlineAsm::Kind_RegUse, we first copy two GPRs into a + // GPRPair and then pass the GPRPair to the inline asm. + SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain]; + + // As REG_SEQ doesn't take RegisterSDNode, we copy them first. + SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32, + Chain.getValue(1)); + SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32, + T0.getValue(1)); + SDValue Pair = SDValue(createGPRPairNode(MVT::Untyped, T0, T1), 0); + + // Copy REG_SEQ into a GPRPair-typed VR and replace the original two + // i32 VRs of inline asm with it. + unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped); + Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1)); + + AsmNodeOperands[InlineAsm::Op_InputChain] = Chain; + Glue = Chain.getValue(1); + } + + Changed = true; + + if(PairedReg.getNode()) { + OpChanged[OpChanged.size() -1 ] = true; + Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/); + if (IsTiedToChangedOp) + Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx); + else + Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID); + // Replace the current flag. + AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant( + Flag, dl, MVT::i32); + // Add the new register node and skip the original two GPRs. + AsmNodeOperands.push_back(PairedReg); + // Skip the next two GPRs. 
+ i += 2; + } + } + + if (Glue.getNode()) + AsmNodeOperands.push_back(Glue); + if (!Changed) + return nullptr; + + SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N), + CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands); + New->setNodeId(-1); + return New.getNode(); +} + + +bool ARMDAGToDAGISel:: +SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, + std::vector<SDValue> &OutOps) { + switch(ConstraintID) { + default: + llvm_unreachable("Unexpected asm memory constraint"); + case InlineAsm::Constraint_i: + // FIXME: It seems strange that 'i' is needed here since it's supposed to + // be an immediate and not a memory constraint. + // Fallthrough. + case InlineAsm::Constraint_m: + case InlineAsm::Constraint_o: + case InlineAsm::Constraint_Q: + case InlineAsm::Constraint_Um: + case InlineAsm::Constraint_Un: + case InlineAsm::Constraint_Uq: + case InlineAsm::Constraint_Us: + case InlineAsm::Constraint_Ut: + case InlineAsm::Constraint_Uv: + case InlineAsm::Constraint_Uy: + // Require the address to be in a register. That is safe for all ARM + // variants and it is hard to do anything much smarter without knowing + // how the operand is used. + OutOps.push_back(Op); + return false; + } + return true; +} + +/// createARMISelDag - This pass converts a legalized DAG into a +/// ARM-specific DAG, ready for instruction scheduling. +/// +FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new ARMDAGToDAGISel(TM, OptLevel); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp new file mode 100644 index 0000000..9cfb06b --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -0,0 +1,12326 @@ +//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that ARM uses to lower LLVM code into a +// selection DAG. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMISelLowering.h"
+#include "ARMCallingConv.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMPerfectShuffle.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-isel"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
+STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
+
+static cl::opt<bool>
+ARMInterworking("arm-interworking", cl::Hidden,
+  cl::desc("Enable / disable ARM interworking (for debugging only)"),
+  cl::init(true));
+
+namespace {
+  class ARMCCState : public CCState {
+  public:
+    ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+               SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
+               ParmContext PC)
+        : CCState(CC, isVarArg, MF, locs, C) {
+      assert(((PC == Call) || (PC == Prologue)) &&
+             "ARMCCState users must specify whether their context is call "
+             "or prologue generation.");
+      CallOrPrologue = PC;
+    }
+  };
+}
+
+// The APCS parameter registers.
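For illustration only (this sketch is not part of the imported sources): under APCS/AAPCS the first four 32-bit integer arguments travel in r0-r3 and anything further goes to the stack, which is what the four-entry GPRArgRegs table below encodes. A minimal standalone model of that assignment, with assumed helper names:

    #include <cstdio>

    // Stand-in for the register-assignment decision; kNumGPRArgRegs mirrors
    // the four entries of GPRArgRegs (ARM::R0..ARM::R3).
    int assignGPR(unsigned argIndex) {
      const unsigned kNumGPRArgRegs = 4;
      return argIndex < kNumGPRArgRegs ? int(argIndex) : -1; // -1 => stack slot
    }

    int main() {
      for (unsigned i = 0; i < 6; ++i) {
        int r = assignGPR(i);
        if (r >= 0)
          std::printf("arg%u -> r%d\n", i, r);
        else
          std::printf("arg%u -> stack\n", i);
      }
    }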
+static const MCPhysReg GPRArgRegs[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3 +}; + +void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, + MVT PromotedBitwiseVT) { + if (VT != PromotedLdStVT) { + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT); + + setOperationAction(ISD::STORE, VT, Promote); + AddPromotedToType (ISD::STORE, VT, PromotedLdStVT); + } + + MVT ElemTy = VT.getVectorElementType(); + if (ElemTy != MVT::i64 && ElemTy != MVT::f64) + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + if (ElemTy == MVT::i32) { + setOperationAction(ISD::SINT_TO_FP, VT, Custom); + setOperationAction(ISD::UINT_TO_FP, VT, Custom); + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); + } else { + setOperationAction(ISD::SINT_TO_FP, VT, Expand); + setOperationAction(ISD::UINT_TO_FP, VT, Expand); + setOperationAction(ISD::FP_TO_SINT, VT, Expand); + setOperationAction(ISD::FP_TO_UINT, VT, Expand); + } + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); + if (VT.isInteger()) { + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + } + + // Promote all bit-wise operations. + if (VT.isInteger() && VT != PromotedBitwiseVT) { + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT); + } + + // Neon does not support vector divide/remainder operations. + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + + if (!VT.isFloatingPoint() && + VT != MVT::v2i64 && VT != MVT::v1i64) + for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) + setOperationAction(Opcode, VT, Legal); +} + +void ARMTargetLowering::addDRTypeForNEON(MVT VT) { + addRegisterClass(VT, &ARM::DPRRegClass); + addTypeForNEON(VT, MVT::f64, MVT::v2i32); +} + +void ARMTargetLowering::addQRTypeForNEON(MVT VT) { + addRegisterClass(VT, &ARM::DPairRegClass); + addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); +} + +ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, + const ARMSubtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { + RegInfo = Subtarget->getRegisterInfo(); + Itins = Subtarget->getInstrItineraryData(); + + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + if (Subtarget->isTargetMachO()) { + // Uses VFP for Thumb libfuncs if available. 
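A note on the Promote entries registered by addTypeForNEON above: operationally, a promoted operation is carried out on the wider type after a bitcast, so for example a v8i8 AND runs as a v2i32 AND. A standalone sketch of that equivalence (illustrative only, not LLVM code):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      uint8_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      uint8_t b[8] = {0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0};
      uint8_t r[8];
      uint32_t a32[2], b32[2], r32[2];
      std::memcpy(a32, a, 8);                      // bitcast v8i8 -> v2i32
      std::memcpy(b32, b, 8);
      for (int i = 0; i < 2; ++i)
        r32[i] = a32[i] & b32[i];                  // AND on the promoted type
      std::memcpy(r, r32, 8);                      // bitcast back to v8i8
      for (uint8_t x : r)
        std::printf("%u ", x);                     // same result as a bytewise AND
      std::printf("\n");
    }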
+ if (Subtarget->isThumb() && Subtarget->hasVFP2() && + Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const ISD::CondCode Cond; + } LibraryCalls[] = { + // Single-precision floating-point arithmetic. + { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, + + // Double-precision floating-point arithmetic. + { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, + { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, + { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, + { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, + + // Single-precision comparisons. + { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, + { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, + { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, + { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, + { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, + { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, + { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, + { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ }, + + // Double-precision comparisons. + { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, + { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, + { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, + { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, + { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, + { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, + { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, + { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ }, + + // Floating-point to integer conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, + + // Conversions between floating types. + { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, + { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, + + // Integer to floating-point conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + // FIXME: There appears to be some naming inconsistency in ARM libgcc: + // e.g., __floatunsidf vs. __floatunssidfvfp. + { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + } + + // Set the correct calling convention for ARMv7k WatchOS. It's just + // AAPCS_VFP for functions as simple as libcalls. + if (Subtarget->isTargetWatchOS()) { + for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) + setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP); + } + } + + // These libcalls are not available in 32-bit. 
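Regarding the comparison rows in the table above, the third column feeds setCmpLibcallCC: it tells the legalizer how to turn the helper's integer result back into a boolean. A hedged standalone model, with my_eqsf2vfp as a hypothetical stand-in for __eqsf2vfp:

    #include <cstdio>

    int my_eqsf2vfp(float a, float b) { return a == b ? 1 : 0; } // stand-in helper

    bool lower_oeq(float a, float b) {
      // { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE } reads as:
      // "a OEQ b" lowers to "__eqsf2vfp(a, b) SETNE 0".
      return my_eqsf2vfp(a, b) != 0;
    }

    int main() { std::printf("%d\n", lower_oeq(1.0f, 1.0f)); }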
+ setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + + // RTLIB + if (Subtarget->isAAPCS_ABI() && + (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || + Subtarget->isTargetAndroid())) { + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + const ISD::CondCode Cond; + } LibraryCalls[] = { + // Double-precision floating-point arithmetic helper functions + // RTABI chapter 4.1.2, Table 2 + { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Double-precision floating-point comparison helper functions + // RTABI chapter 4.1.2, Table 3 + { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, + { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, + + // Single-precision floating-point arithmetic helper functions + // RTABI chapter 4.1.2, Table 4 + { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Single-precision floating-point comparison helper functions + // RTABI chapter 4.1.2, Table 5 + { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, + { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, + + // Floating-point to integer conversions. 
+ // RTABI chapter 4.1.2, Table 6 + { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Conversions between floating types. + // RTABI chapter 4.1.2, Table 7 + { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Integer to floating-point conversions. + // RTABI chapter 4.1.2, Table 8 + { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Long long helper functions + // RTABI chapter 4.2, Table 9 + { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Integer division functions + // RTABI chapter 4.3.1 + { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + + // EABI dependent RTLIB + if (TM.Options.EABIVersion == EABI::EABI4 || + TM.Options.EABIVersion == EABI::EABI5) { + static const struct { + const RTLIB::Libcall Op; + const char *const Name; + const CallingConv::ID CC; + const ISD::CondCode Cond; + } MemOpsLibraryCalls[] = { + // Memory operations + // RTABI chapter 
4.3.4 + { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + }; + + for (const auto &LC : MemOpsLibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + } + } + + if (Subtarget->isTargetWindows()) { + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SDIV_I32, "__rt_sdiv", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::UDIV_I32, "__rt_udiv", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::UDIV_I64, "__rt_udiv64", CallingConv::ARM_AAPCS_VFP }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + } + } + + // Use divmod compiler-rt calls for iOS 5.0 and later. + if (Subtarget->isTargetWatchOS() || + (Subtarget->isTargetIOS() && + !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { + setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); + setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); + } + + // The half <-> float conversion functions are always soft-float, but are + // needed for some targets which use a hard-float calling convention by + // default. + if (Subtarget->isAAPCS_ABI()) { + setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); + } else { + setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); + setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); + setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); + } + + // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have + // a __gnu_ prefix (which is the default). 
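To make the prefix remark above concrete, here is a sketch of the name selection the next block performs; the __gnu_* string is, to this editor's reading of the generic RuntimeLibcalls defaults, the fallback name, and it is overridden with the __aeabi_* name on AEABI targets:

    #include <cstdio>

    const char *halfToFloatHelper(bool isAEABI) {
      return isAEABI ? "__aeabi_h2f" : "__gnu_h2f_ieee"; // assumed default name
    }

    int main() {
      std::printf("AEABI:   %s\n", halfToFloatHelper(true));
      std::printf("GNUEABI: %s\n", halfToFloatHelper(false));
    }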
+  if (Subtarget->isTargetAEABI()) {
+    setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h");
+    setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h");
+    setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f");
+  }
+
+  if (Subtarget->isThumb1Only())
+    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
+  else
+    addRegisterClass(MVT::i32, &ARM::GPRRegClass);
+  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
+      !Subtarget->isThumb1Only()) {
+    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
+    addRegisterClass(MVT::f64, &ARM::DPRRegClass);
+  }
+
+  for (MVT VT : MVT::vector_valuetypes()) {
+    for (MVT InnerVT : MVT::vector_valuetypes()) {
+      setTruncStoreAction(VT, InnerVT, Expand);
+      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+    }
+
+    setOperationAction(ISD::MULHS, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+    setOperationAction(ISD::BSWAP, VT, Expand);
+  }
+
+  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
+
+  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
+  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
+
+  if (Subtarget->hasNEON()) {
+    addDRTypeForNEON(MVT::v2f32);
+    addDRTypeForNEON(MVT::v8i8);
+    addDRTypeForNEON(MVT::v4i16);
+    addDRTypeForNEON(MVT::v2i32);
+    addDRTypeForNEON(MVT::v1i64);
+
+    addQRTypeForNEON(MVT::v4f32);
+    addQRTypeForNEON(MVT::v2f64);
+    addQRTypeForNEON(MVT::v16i8);
+    addQRTypeForNEON(MVT::v8i16);
+    addQRTypeForNEON(MVT::v4i32);
+    addQRTypeForNEON(MVT::v2i64);
+
+    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
+    // neither Neon nor VFP support any arithmetic operations on it.
+    // The same goes for v4f32, but keep in mind that vadd, vsub and vmul are
+    // natively supported for v4f32.
+    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
+    // FIXME: Code duplication: FDIV and FREM are expanded always, see
+    // ARMTargetLowering::addTypeForNEON method for details.
+    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
+    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
+    // FIXME: Create a unittest; in other words, find a case where "copysign"
+    // appears in the DAG with vector operands.
+    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
+    // FIXME: Code duplication: SETCC has custom operation action, see
+    // ARMTargetLowering::addTypeForNEON method for details.
+    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
+    // FIXME: Create unittest for FNEG and for FABS.
+    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
+    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
+    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
+    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
+    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
+ setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); + setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); + setOperationAction(ISD::FRINT, MVT::v2f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); + setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); + setOperationAction(ISD::FMA, MVT::v2f64, Expand); + + setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); + setOperationAction(ISD::FSIN, MVT::v4f32, Expand); + setOperationAction(ISD::FCOS, MVT::v4f32, Expand); + setOperationAction(ISD::FPOWI, MVT::v4f32, Expand); + setOperationAction(ISD::FPOW, MVT::v4f32, Expand); + setOperationAction(ISD::FLOG, MVT::v4f32, Expand); + setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); + setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); + setOperationAction(ISD::FEXP, MVT::v4f32, Expand); + setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); + setOperationAction(ISD::FCEIL, MVT::v4f32, Expand); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand); + setOperationAction(ISD::FRINT, MVT::v4f32, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); + + // Mark v2f32 intrinsics. + setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); + setOperationAction(ISD::FSIN, MVT::v2f32, Expand); + setOperationAction(ISD::FCOS, MVT::v2f32, Expand); + setOperationAction(ISD::FPOWI, MVT::v2f32, Expand); + setOperationAction(ISD::FPOW, MVT::v2f32, Expand); + setOperationAction(ISD::FLOG, MVT::v2f32, Expand); + setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); + setOperationAction(ISD::FLOG10, MVT::v2f32, Expand); + setOperationAction(ISD::FEXP, MVT::v2f32, Expand); + setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); + setOperationAction(ISD::FCEIL, MVT::v2f32, Expand); + setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand); + setOperationAction(ISD::FRINT, MVT::v2f32, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand); + setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand); + + // Neon does not support some operations on v1i64 and v2i64 types. + setOperationAction(ISD::MUL, MVT::v1i64, Expand); + // Custom handling for some quad-vector types to detect VMULL. + setOperationAction(ISD::MUL, MVT::v8i16, Custom); + setOperationAction(ISD::MUL, MVT::v4i32, Custom); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); + // Custom handling for some vector types to avoid expensive expansions + setOperationAction(ISD::SDIV, MVT::v4i16, Custom); + setOperationAction(ISD::SDIV, MVT::v8i8, Custom); + setOperationAction(ISD::UDIV, MVT::v4i16, Custom); + setOperationAction(ISD::UDIV, MVT::v8i8, Custom); + setOperationAction(ISD::SETCC, MVT::v1i64, Expand); + setOperationAction(ISD::SETCC, MVT::v2i64, Expand); + // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with + // a destination type that is wider than the source, and nor does + // it have a FP_TO_[SU]INT instruction with a narrower destination than + // source. + setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); + + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); + + // NEON does not have single instruction CTPOP for vectors with element + // types wider than 8-bits. However, custom lowering can leverage the + // v8i8/v16i8 vcnt instruction. 
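A sketch of the trick the comment above describes: NEON's vcnt counts bits per byte, and CTPOP for wider elements is assembled from those byte counts with pairwise widening adds (vpaddl). Scalar stand-in, illustrative only:

    #include <cstdint>
    #include <cstdio>

    static uint8_t bytePopcount(uint8_t b) {        // models one lane of vcnt.8
      uint8_t n = 0;
      for (; b; b &= b - 1)
        ++n;
      return n;
    }

    static uint32_t popcount32(uint32_t x) {        // models a vcnt.8 + vpaddl chain
      uint32_t sum = 0;
      for (int i = 0; i < 4; ++i)
        sum += bytePopcount((x >> (8 * i)) & 0xff); // widening adds fold bytes up
      return sum;
    }

    int main() { std::printf("%u\n", popcount32(0xF0F01234u)); }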
+    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
+    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
+    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
+    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
+
+    // NEON does not have single instruction CTTZ for vectors.
+    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
+    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
+    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
+    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
+
+    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
+    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
+    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
+    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
+
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
+
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
+
+    // NEON only has FMA instructions as of VFP4.
+    if (!Subtarget->hasVFP4()) {
+      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
+      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
+    }
+
+    setTargetDAGCombine(ISD::INTRINSIC_VOID);
+    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+    setTargetDAGCombine(ISD::SHL);
+    setTargetDAGCombine(ISD::SRL);
+    setTargetDAGCombine(ISD::SRA);
+    setTargetDAGCombine(ISD::SIGN_EXTEND);
+    setTargetDAGCombine(ISD::ZERO_EXTEND);
+    setTargetDAGCombine(ISD::ANY_EXTEND);
+    setTargetDAGCombine(ISD::BUILD_VECTOR);
+    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+    setTargetDAGCombine(ISD::STORE);
+    setTargetDAGCombine(ISD::FP_TO_SINT);
+    setTargetDAGCombine(ISD::FP_TO_UINT);
+    setTargetDAGCombine(ISD::FDIV);
+    setTargetDAGCombine(ISD::LOAD);
+
+    // It is legal to extload from v4i8 to v4i16 or v4i32.
+    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
+                   MVT::v2i32}) {
+      for (MVT VT : MVT::integer_vector_valuetypes()) {
+        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
+        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
+      }
+    }
+  }
+
+  // ARM and Thumb2 support UMLAL/SMLAL.
+  if (!Subtarget->isThumb1Only())
+    setTargetDAGCombine(ISD::ADDC);
+
+  if (Subtarget->isFPOnlySP()) {
+    // When targeting a floating-point unit with only single-precision
+    // operations, f64 is legal for the few double-precision instructions which
+    // are present. However, no double-precision operations other than moves,
+    // loads and stores are provided by the hardware.
+ setOperationAction(ISD::FADD, MVT::f64, Expand); + setOperationAction(ISD::FSUB, MVT::f64, Expand); + setOperationAction(ISD::FMUL, MVT::f64, Expand); + setOperationAction(ISD::FMA, MVT::f64, Expand); + setOperationAction(ISD::FDIV, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FGETSIGN, MVT::f64, Expand); + setOperationAction(ISD::FNEG, MVT::f64, Expand); + setOperationAction(ISD::FABS, MVT::f64, Expand); + setOperationAction(ISD::FSQRT, MVT::f64, Expand); + setOperationAction(ISD::FSIN, MVT::f64, Expand); + setOperationAction(ISD::FCOS, MVT::f64, Expand); + setOperationAction(ISD::FPOWI, MVT::f64, Expand); + setOperationAction(ISD::FPOW, MVT::f64, Expand); + setOperationAction(ISD::FLOG, MVT::f64, Expand); + setOperationAction(ISD::FLOG2, MVT::f64, Expand); + setOperationAction(ISD::FLOG10, MVT::f64, Expand); + setOperationAction(ISD::FEXP, MVT::f64, Expand); + setOperationAction(ISD::FEXP2, MVT::f64, Expand); + setOperationAction(ISD::FCEIL, MVT::f64, Expand); + setOperationAction(ISD::FTRUNC, MVT::f64, Expand); + setOperationAction(ISD::FRINT, MVT::f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); + setOperationAction(ISD::FFLOOR, MVT::f64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); + } + + computeRegisterProperties(Subtarget->getRegisterInfo()); + + // ARM does not have floating-point extending loads. + for (MVT VT : MVT::fp_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); + } + + // ... or truncating stores + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + + // ARM does not have i1 sign extending load. + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + + // ARM supports all 4 flavors of integer indexed load / store. + if (!Subtarget->isThumb1Only()) { + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, MVT::i1, Legal); + setIndexedLoadAction(im, MVT::i8, Legal); + setIndexedLoadAction(im, MVT::i16, Legal); + setIndexedLoadAction(im, MVT::i32, Legal); + setIndexedStoreAction(im, MVT::i1, Legal); + setIndexedStoreAction(im, MVT::i8, Legal); + setIndexedStoreAction(im, MVT::i16, Legal); + setIndexedStoreAction(im, MVT::i32, Legal); + } + } + + setOperationAction(ISD::SADDO, MVT::i32, Custom); + setOperationAction(ISD::UADDO, MVT::i32, Custom); + setOperationAction(ISD::SSUBO, MVT::i32, Custom); + setOperationAction(ISD::USUBO, MVT::i32, Custom); + + // i64 operation support. 
+ setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); + if (Subtarget->isThumb1Only()) { + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + } + if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() + || (Subtarget->isThumb2() && !Subtarget->hasDSP())) + setOperationAction(ISD::MULHS, MVT::i32, Expand); + + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL, MVT::i64, Custom); + setOperationAction(ISD::SRA, MVT::i64, Custom); + + if (!Subtarget->isThumb1Only()) { + // FIXME: We should do this for Thumb1 as well. + setOperationAction(ISD::ADDC, MVT::i32, Custom); + setOperationAction(ISD::ADDE, MVT::i32, Custom); + setOperationAction(ISD::SUBC, MVT::i32, Custom); + setOperationAction(ISD::SUBE, MVT::i32, Custom); + } + + if (!Subtarget->isThumb1Only()) + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + + // ARM does not have ROTL. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + } + setOperationAction(ISD::CTTZ, MVT::i32, Custom); + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) + setOperationAction(ISD::CTLZ, MVT::i32, Expand); + + // These just redirect to CTTZ and CTLZ on ARM. + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); + + // @llvm.readcyclecounter requires the Performance Monitors extension. + // Default to the 0 expansion on unsupported platforms. + // FIXME: Technically there are older ARM CPUs that have + // implementation-specific ways of obtaining this information. + if (Subtarget->hasPerfMon()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); + + // Only ARMv6 has BSWAP. + if (!Subtarget->hasV6Ops()) + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + + if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) && + !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) { + // These are expanded into libcalls if the cpu doesn't have HW divider. 
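As context for the libcall expansion mentioned above: per the ARM RTABI, __aeabi_idiv returns the quotient in r0, while __aeabi_idivmod returns quotient and remainder in r0/r1, which is why a combined divide-and-remainder can be lowered to a single call. A standalone model with a hypothetical stand-in name:

    #include <cstdio>

    struct QuotRem { int quot; int rem; };          // r0 = quot, r1 = rem in the RTABI

    QuotRem my_aeabi_idivmod(int num, int den) {    // stand-in for __aeabi_idivmod
      return { num / den, num % den };
    }

    int main() {
      QuotRem qr = my_aeabi_idivmod(7, 3);
      std::printf("7 / 3 = %d rem %d\n", qr.quot, qr.rem);
    }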
+ setOperationAction(ISD::SDIV, MVT::i32, LibCall); + setOperationAction(ISD::UDIV, MVT::i32, LibCall); + } + + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + // Register based DivRem for AEABI (RTABI 4.2) + if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) { + setOperationAction(ISD::SREM, MVT::i64, Custom); + setOperationAction(ISD::UREM, MVT::i64, Custom); + + setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod"); + setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod"); + setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod"); + setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod"); + setLibcallName(RTLIB::UDIVREM_I8, "__aeabi_uidivmod"); + setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod"); + setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod"); + setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod"); + + setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS); + + setOperationAction(ISD::SDIVREM, MVT::i32, Custom); + setOperationAction(ISD::UDIVREM, MVT::i32, Custom); + } else { + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + } + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool, MVT::i32, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); + setOperationAction(ISD::BlockAddress, MVT::i32, Custom); + + setOperationAction(ISD::TRAP, MVT::Other, Legal); + + // Use the default implementation. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + + if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + else + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + + // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use + // the default expansion. If we are targeting a single threaded system, + // then set them all for expand so we can lower them later into their + // non-atomic form. + if (TM.Options.ThreadModel == ThreadModel::Single) + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); + else if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) { + // ATOMIC_FENCE needs custom lowering; the others should have been expanded + // to ldrex/strex loops already. + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); + + // On v8, we have particularly efficient implementations of atomic fences + // if they can be combined with nearby atomic loads and stores. + if (!Subtarget->hasV8Ops()) { + // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. + setInsertFencesForAtomic(true); + } + } else { + // If there's anything we can use as a barrier, go through custom lowering + // for ATOMIC_FENCE. 
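For reference, the source-level construct this ATOMIC_FENCE handling covers; on cores with data barriers such a fence typically lowers to dmb ish, while the single-threaded and barrier-less paths above expand it away or fall back to a libcall:

    #include <atomic>
    #include <cstdio>

    std::atomic<int> flag{0};
    int data = 0;

    void publish() {
      data = 42;
      std::atomic_thread_fence(std::memory_order_release); // typically "dmb ish" on ARM
      flag.store(1, std::memory_order_relaxed);
    }

    int main() {
      publish();
      std::printf("%d\n", data);
    }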
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, + Subtarget->hasAnyDataBarrier() ? Custom : Expand); + + // Set them all for expansion, which will force libcalls. + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); + // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the + // Unordered/Monotonic case. + setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); + setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); + } + + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + + // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. + if (!Subtarget->hasV6Ops()) { + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); + } + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && + !Subtarget->isThumb1Only()) { + // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR + // iff target supports vfp2. + setOperationAction(ISD::BITCAST, MVT::i64, Custom); + setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); + } + + // We want to custom lower some of our intrinsics. 
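Returning to the f64 <-> i64 bitcast noted just above: VMOVRRD moves a double's bit pattern into two core registers. A standalone model of what that custom lowering computes (illustrative only, not LLVM code):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      double d = 1.0;
      uint64_t bits;
      std::memcpy(&bits, &d, sizeof bits);             // BITCAST f64 -> i64
      uint32_t lo = uint32_t(bits);                    // first GPR
      uint32_t hi = uint32_t(bits >> 32);              // second GPR
      std::printf("lo=%08x hi=%08x\n", lo, hi);        // 1.0 -> lo=0, hi=3ff00000
    }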
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
+  if (Subtarget->useSjLjEH())
+    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
+
+  setOperationAction(ISD::SETCC, MVT::i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f64, Expand);
+  setOperationAction(ISD::SELECT, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT, MVT::f64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+
+  // We don't support sin/cos/fmod/copysign/pow.
+  setOperationAction(ISD::FSIN, MVT::f64, Expand);
+  setOperationAction(ISD::FSIN, MVT::f32, Expand);
+  setOperationAction(ISD::FCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FREM, MVT::f64, Expand);
+  setOperationAction(ISD::FREM, MVT::f32, Expand);
+  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
+      !Subtarget->isThumb1Only()) {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+  }
+  setOperationAction(ISD::FPOW, MVT::f64, Expand);
+  setOperationAction(ISD::FPOW, MVT::f32, Expand);
+
+  if (!Subtarget->hasVFP4()) {
+    setOperationAction(ISD::FMA, MVT::f64, Expand);
+    setOperationAction(ISD::FMA, MVT::f32, Expand);
+  }
+
+  // Various VFP goodness
+  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
+    // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
+    if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
+      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+    }
+
+    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
+    if (!Subtarget->hasFP16()) {
+      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+    }
+  }
+
+  // Combine sin / cos into one node or libcall if possible.
+  if (Subtarget->hasSinCos()) {
+    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+    setLibcallName(RTLIB::SINCOS_F64, "sincos");
+    if (Subtarget->isTargetWatchOS()) {
+      setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
+      setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
+    }
+    if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) {
+      // For iOS, we don't want the normal expansion of a libcall to
+      // sincos. We want to issue a libcall to __sincos_stret.
+      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+    }
+  }
+
+  // FP-ARMv8 implements a lot of rounding-like FP operations.
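To ground the comment above: with FP-ARMv8 these rounding operations map to single vrint-family instructions rather than libcalls. A quick standalone demonstration of the operations being marked Legal below; the instruction names in the comments are the typical mappings, not verified against the tablegen patterns:

    #include <cmath>
    #include <cstdio>

    int main() {
      float x = 2.5f;
      std::printf("floor=%.1f ceil=%.1f trunc=%.1f rint=%.1f\n",
                  std::floor(x),   // ISD::FFLOOR -> vrintm.f32 (toward -inf)
                  std::ceil(x),    // ISD::FCEIL  -> vrintp.f32 (toward +inf)
                  std::trunc(x),   // ISD::FTRUNC -> vrintz.f32 (toward zero)
                  std::rint(x));   // ISD::FRINT  -> vrintx.f32 (current mode)
    }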
+  if (Subtarget->hasFPARMv8()) {
+    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+    setOperationAction(ISD::FROUND, MVT::f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
+    setOperationAction(ISD::FRINT, MVT::f32, Legal);
+    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
+    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
+
+    if (!Subtarget->isFPOnlySP()) {
+      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+      setOperationAction(ISD::FROUND, MVT::f64, Legal);
+      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+      setOperationAction(ISD::FRINT, MVT::f64, Legal);
+      setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+      setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+    }
+  }
+
+  if (Subtarget->hasNEON()) {
+    // vmin and vmax aren't available in a scalar form, so we use
+    // a NEON instruction with an undef lane instead.
+    setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
+    setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
+    setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
+    setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
+    setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
+  }
+
+  // We have target-specific dag combine patterns for the following nodes:
+  // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::SUB);
+  setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::XOR);
+
+  if (Subtarget->hasV6Ops())
+    setTargetDAGCombine(ISD::SRL);
+
+  setStackPointerRegisterToSaveRestore(ARM::SP);
+
+  if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
+      !Subtarget->hasVFP2())
+    setSchedulingPreference(Sched::RegPressure);
+  else
+    setSchedulingPreference(Sched::Hybrid);
+
+  //// temporary - rewrite interface to use type
+  MaxStoresPerMemset = 8;
+  MaxStoresPerMemsetOptSize = 4;
+  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
+  MaxStoresPerMemcpyOptSize = 2;
+  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
+  MaxStoresPerMemmoveOptSize = 2;
+
+  // On ARM arguments smaller than 4 bytes are extended, so all arguments
+  // are at least 4 bytes aligned.
+  setMinStackArgumentAlignment(4);
+
+  // Prefer likely predicted branches to selects on out-of-order cores.
+  PredictableSelectIsExpensive = Subtarget->isLikeA9();
+
+  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
+}
+
+bool ARMTargetLowering::useSoftFloat() const {
+  return Subtarget->useSoftFloat();
+}
+
+// FIXME: It might make sense to define the representative register class as the
+// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
+// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
+// SPR's representative would be DPR_VFP2. This should work well if register
+// pressure tracking were modified such that a register use would increment the
+// pressure of the register class's representative and all of its super
+// classes' representatives transitively. We have not implemented this because
+// of the difficulty prior to coalescing of modeling operand register classes
+// due to the common occurrence of cross class copies and subregister insertions
+// and extractions.
+std::pair<const TargetRegisterClass *, uint8_t>
+ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                           MVT VT) const {
+  const TargetRegisterClass *RRC = nullptr;
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(TRI, VT);
+  // Use DPR as representative register class for all floating point
+  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
+  // the cost is 1 for both f32 and f64.
+  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
+  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
+    RRC = &ARM::DPRRegClass;
+    // When NEON is used for SP, only half of the register file is available
+    // because operations that define both SP and DP results will be constrained
+    // to the VFP2 class (D0-D15). We currently model this constraint prior to
+    // coalescing by double-counting the SP regs. See the FIXME above.
+    if (Subtarget->useNEONForSinglePrecisionFP())
+      Cost = 2;
+    break;
+  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+  case MVT::v4f32: case MVT::v2f64:
+    RRC = &ARM::DPRRegClass;
+    Cost = 2;
+    break;
+  case MVT::v4i64:
+    RRC = &ARM::DPRRegClass;
+    Cost = 4;
+    break;
+  case MVT::v8i64:
+    RRC = &ARM::DPRRegClass;
+    Cost = 8;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
+const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((ARMISD::NodeType)Opcode) {
+  case ARMISD::FIRST_NUMBER: break;
+  case ARMISD::Wrapper: return "ARMISD::Wrapper";
+  case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC";
+  case ARMISD::WrapperJT: return "ARMISD::WrapperJT";
+  case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
+  case ARMISD::CALL: return "ARMISD::CALL";
+  case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
+  case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
+  case ARMISD::tCALL: return "ARMISD::tCALL";
+  case ARMISD::BRCOND: return "ARMISD::BRCOND";
+  case ARMISD::BR_JT: return "ARMISD::BR_JT";
+  case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
+  case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
+  case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG";
+  case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
+  case ARMISD::CMP: return "ARMISD::CMP";
+  case ARMISD::CMN: return "ARMISD::CMN";
+  case ARMISD::CMPZ: return "ARMISD::CMPZ";
+  case ARMISD::CMPFP: return "ARMISD::CMPFP";
+  case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0";
+  case ARMISD::BCC_i64: return "ARMISD::BCC_i64";
+  case ARMISD::FMSTAT: return "ARMISD::FMSTAT";
+
+  case ARMISD::CMOV: return "ARMISD::CMOV";
+
+  case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
+  case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
+  case ARMISD::RRX: return "ARMISD::RRX";
+
+  case ARMISD::ADDC: return "ARMISD::ADDC";
+  case ARMISD::ADDE: return "ARMISD::ADDE";
+  case ARMISD::SUBC: return "ARMISD::SUBC";
+  case ARMISD::SUBE: return "ARMISD::SUBE";
+
+  case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
+  case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
+
+  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
+  case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
+  case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
+
+  case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN";
+
+  case ARMISD::THREAD_POINTER: return "ARMISD::THREAD_POINTER";
+
+  case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC";
+
+  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
+
+  case ARMISD::PRELOAD: return "ARMISD::PRELOAD";
+
+  case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
+  case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";
+
+  case ARMISD::VCEQ: return "ARMISD::VCEQ";
+  case ARMISD::VCEQZ: return "ARMISD::VCEQZ";
+  case ARMISD::VCGE: return "ARMISD::VCGE";
+  case ARMISD::VCGEZ: return "ARMISD::VCGEZ";
+  case ARMISD::VCLEZ: return "ARMISD::VCLEZ";
+  case ARMISD::VCGEU: return "ARMISD::VCGEU";
+  case ARMISD::VCGT: return "ARMISD::VCGT";
+  case ARMISD::VCGTZ: return "ARMISD::VCGTZ";
+  case ARMISD::VCLTZ: return "ARMISD::VCLTZ";
+  case ARMISD::VCGTU: return "ARMISD::VCGTU";
+  case ARMISD::VTST: return "ARMISD::VTST";
+
+  case ARMISD::VSHL: return "ARMISD::VSHL";
+  case ARMISD::VSHRs: return "ARMISD::VSHRs";
+  case ARMISD::VSHRu: return "ARMISD::VSHRu";
+  case ARMISD::VRSHRs: return "ARMISD::VRSHRs";
+  case ARMISD::VRSHRu: return "ARMISD::VRSHRu";
+  case ARMISD::VRSHRN: return "ARMISD::VRSHRN";
+  case ARMISD::VQSHLs: return "ARMISD::VQSHLs";
+  case ARMISD::VQSHLu: return "ARMISD::VQSHLu";
+  case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu";
+  case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs";
+  case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu";
+  case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu";
+  case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs";
+  case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu";
+  case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu";
+  case ARMISD::VSLI: return "ARMISD::VSLI";
+  case ARMISD::VSRI: return "ARMISD::VSRI";
+  case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu";
+  case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs";
+  case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM";
+  case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM";
+  case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM";
+  case ARMISD::VDUP: return "ARMISD::VDUP";
+  case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE";
+  case ARMISD::VEXT: return "ARMISD::VEXT";
+  case ARMISD::VREV64: return "ARMISD::VREV64";
+  case ARMISD::VREV32: return "ARMISD::VREV32";
+  case ARMISD::VREV16: return "ARMISD::VREV16";
+  case ARMISD::VZIP: return "ARMISD::VZIP";
+  case ARMISD::VUZP: return "ARMISD::VUZP";
+  case ARMISD::VTRN: return "ARMISD::VTRN";
+  case ARMISD::VTBL1: return "ARMISD::VTBL1";
+  case ARMISD::VTBL2: return "ARMISD::VTBL2";
+  case ARMISD::VMULLs: return "ARMISD::VMULLs";
+  case ARMISD::VMULLu: return "ARMISD::VMULLu";
+  case ARMISD::UMLAL: return "ARMISD::UMLAL";
+  case ARMISD::SMLAL: return "ARMISD::SMLAL";
+  case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
+  case ARMISD::BFI: return "ARMISD::BFI";
+  case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
+  case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
+  case ARMISD::VBSL: return "ARMISD::VBSL";
+  case ARMISD::MEMCPY: return "ARMISD::MEMCPY";
+  case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
+  case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
+  case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
+  case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD";
+  case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD";
+  case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD";
+  case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD";
+  case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD";
+  case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD";
+  case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD";
+  case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD";
+  case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD";
+  case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD";
+  case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD";
+  case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD";
+  case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD";
+  case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD";
+  case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
+  case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
+  case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
+  }
+  return nullptr;
+}
+
+EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+                                          EVT VT) const {
+  if (!VT.isVector())
+    return getPointerTy(DL);
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// getRegClassFor - Return the register class that should be used for the
+/// specified value type.
+const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
+  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
+  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
+  // load / store 4 to 8 consecutive D registers.
+  if (Subtarget->hasNEON()) {
+    if (VT == MVT::v4i64)
+      return &ARM::QQPRRegClass;
+    if (VT == MVT::v8i64)
+      return &ARM::QQQQPRRegClass;
+  }
+  return TargetLowering::getRegClassFor(VT);
+}
+
+// memcpy and other memory intrinsics typically try to use LDM/STM if the
+// source/dest is aligned and the copy size is large enough. We therefore want
+// to align such objects passed to memory intrinsics.
+bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
+                                               unsigned &PrefAlign) const {
+  if (!isa<MemIntrinsic>(CI))
+    return false;
+  MinSize = 8;
+  // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
+  // cycle faster than 4-byte aligned LDM.
+  PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
+  return true;
+}
+
+// Create a fast isel object.
+FastISel *
+ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                  const TargetLibraryInfo *libInfo) const {
+  return ARM::createFastISel(funcInfo, libInfo);
+}
+
+Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
+  unsigned NumVals = N->getNumValues();
+  if (!NumVals)
+    return Sched::RegPressure;
+
+  for (unsigned i = 0; i != NumVals; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT == MVT::Glue || VT == MVT::Other)
+      continue;
+    if (VT.isFloatingPoint() || VT.isVector())
+      return Sched::ILP;
+  }
+
+  if (!N->isMachineOpcode())
+    return Sched::RegPressure;
+
+  // Loads are scheduled for latency even if the instruction itinerary
+  // is not available.
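Before the scheduling code below, a brief illustration of shouldAlignPointerArgs above: raising a copied object's alignment to 8 bytes lets the backend emit the faster 8-byte-aligned LDM/STM sequences on v6+ cores. A sketch under those assumptions, illustrative only:

    #include <cstdio>
    #include <cstring>

    struct alignas(8) Packet {                      // satisfies MinSize/PrefAlign = 8
      char bytes[24];
    };

    void copyPacket(Packet &dst, const Packet &src) {
      std::memcpy(&dst, &src, sizeof(Packet));      // eligible for LDM/STM copies
    }

    int main() {
      Packet a{}, b{{"hello"}};
      copyPacket(a, b);
      std::printf("%s\n", a.bytes);
    }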
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); + + if (MCID.getNumDefs() == 0) + return Sched::RegPressure; + if (!Itins->isEmpty() && + Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) + return Sched::ILP; + + return Sched::RegPressure; +} + +//===----------------------------------------------------------------------===// +// Lowering Code +//===----------------------------------------------------------------------===// + +/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC +static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Unknown condition code!"); + case ISD::SETNE: return ARMCC::NE; + case ISD::SETEQ: return ARMCC::EQ; + case ISD::SETGT: return ARMCC::GT; + case ISD::SETGE: return ARMCC::GE; + case ISD::SETLT: return ARMCC::LT; + case ISD::SETLE: return ARMCC::LE; + case ISD::SETUGT: return ARMCC::HI; + case ISD::SETUGE: return ARMCC::HS; + case ISD::SETULT: return ARMCC::LO; + case ISD::SETULE: return ARMCC::LS; + } +} + +/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. +static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, + ARMCC::CondCodes &CondCode2) { + CondCode2 = ARMCC::AL; + switch (CC) { + default: llvm_unreachable("Unknown FP condition!"); + case ISD::SETEQ: + case ISD::SETOEQ: CondCode = ARMCC::EQ; break; + case ISD::SETGT: + case ISD::SETOGT: CondCode = ARMCC::GT; break; + case ISD::SETGE: + case ISD::SETOGE: CondCode = ARMCC::GE; break; + case ISD::SETOLT: CondCode = ARMCC::MI; break; + case ISD::SETOLE: CondCode = ARMCC::LS; break; + case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; + case ISD::SETO: CondCode = ARMCC::VC; break; + case ISD::SETUO: CondCode = ARMCC::VS; break; + case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; + case ISD::SETUGT: CondCode = ARMCC::HI; break; + case ISD::SETUGE: CondCode = ARMCC::PL; break; + case ISD::SETLT: + case ISD::SETULT: CondCode = ARMCC::LT; break; + case ISD::SETLE: + case ISD::SETULE: CondCode = ARMCC::LE; break; + case ISD::SETNE: + case ISD::SETUNE: CondCode = ARMCC::NE; break; + } +} + +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// + +#include "ARMGenCallingConv.inc" + +/// getEffectiveCallingConv - Get the effective calling convention, taking into +/// account presence of floating point hardware and calling convention +/// limitations, such as support for variadic functions. +CallingConv::ID +ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, + bool isVarArg) const { + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention"); + case CallingConv::ARM_AAPCS: + case CallingConv::ARM_APCS: + case CallingConv::GHC: + return CC; + case CallingConv::ARM_AAPCS_VFP: + return isVarArg ? 
CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; + case CallingConv::C: + if (!Subtarget->isAAPCS_ABI()) + return CallingConv::ARM_APCS; + else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && + getTargetMachine().Options.FloatABIType == FloatABI::Hard && + !isVarArg) + return CallingConv::ARM_AAPCS_VFP; + else + return CallingConv::ARM_AAPCS; + case CallingConv::Fast: + if (!Subtarget->isAAPCS_ABI()) { + if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) + return CallingConv::Fast; + return CallingConv::ARM_APCS; + } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) + return CallingConv::ARM_AAPCS_VFP; + else + return CallingConv::ARM_AAPCS; + } +} + +/// CCAssignFnForNode - Selects the correct CCAssignFn for the given +/// CallingConvention. +CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, + bool Return, + bool isVarArg) const { + switch (getEffectiveCallingConv(CC, isVarArg)) { + default: + llvm_unreachable("Unsupported calling convention"); + case CallingConv::ARM_APCS: + return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); + case CallingConv::ARM_AAPCS: + return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); + case CallingConv::ARM_AAPCS_VFP: + return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); + case CallingConv::Fast: + return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); + case CallingConv::GHC: + return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); + } +} + +/// LowerCallResult - Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. +SDValue +ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals, + bool isThisReturn, SDValue ThisVal) const { + + // Assign locations to each value returned by this call. + SmallVector<CCValAssign, 16> RVLocs; + ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext(), Call); + CCInfo.AnalyzeCallResult(Ins, + CCAssignFnForNode(CallConv, /* Return*/ true, + isVarArg)); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + + // Pass 'this' value directly from the argument to return value, to avoid + // reg unit interference + if (i == 0 && isThisReturn) { + assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && + "unexpected return calling convention register assignment"); + InVals.push_back(ThisVal); + continue; + } + + SDValue Val; + if (VA.needsCustom()) { + // Handle f64 or half of a v2f64. 
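+ // (Editorial worked example, assuming integer return registers: an f64 + // result arrives as two i32 halves in consecutive GPRs such as R0/R1; + // the two copies below fetch the halves and VMOVDRR reassembles them, + // with the swap correcting the order on big-endian targets.)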
+ SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, + InFlag); + Chain = Lo.getValue(1); + InFlag = Lo.getValue(2); + VA = RVLocs[++i]; // skip ahead to next loc + SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, + InFlag); + Chain = Hi.getValue(1); + InFlag = Hi.getValue(2); + if (!Subtarget->isLittle()) + std::swap (Lo, Hi); + Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); + + if (VA.getLocVT() == MVT::v2f64) { + SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, + DAG.getConstant(0, dl, MVT::i32)); + + VA = RVLocs[++i]; // skip ahead to next loc + Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); + Chain = Lo.getValue(1); + InFlag = Lo.getValue(2); + VA = RVLocs[++i]; // skip ahead to next loc + Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); + Chain = Hi.getValue(1); + InFlag = Hi.getValue(2); + if (!Subtarget->isLittle()) + std::swap (Lo, Hi); + Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, + DAG.getConstant(1, dl, MVT::i32)); + } + } else { + Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), + InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + } + + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); + break; + } + + InVals.push_back(Val); + } + + return Chain; +} + +/// LowerMemOpCallTo - Store the argument to the stack. +SDValue +ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, + SDValue StackPtr, SDValue Arg, + SDLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, PtrOff); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + false, false, 0); +} + +void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, + SDValue Chain, SDValue &Arg, + RegsToPassVector &RegsToPass, + CCValAssign &VA, CCValAssign &NextVA, + SDValue &StackPtr, + SmallVectorImpl<SDValue> &MemOpChains, + ISD::ArgFlagsTy Flags) const { + + SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), Arg); + unsigned id = Subtarget->isLittle() ? 0 : 1; + RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); + + if (NextVA.isRegLoc()) + RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); + else { + assert(NextVA.isMemLoc()); + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, + getPointerTy(DAG.getDataLayout())); + + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), + dl, DAG, NextVA, + Flags)); + } +} + +/// LowerCall - Lowering a call into a callseq_start <- +/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter +/// nodes.
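+/// (Editorial sketch of the chain built below for a simple non-tail call: +/// callseq_start -> CopyToReg for each argument register -> ARMISD::CALL +/// -> callseq_end -> CopyFromReg for each result register.)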
+SDValue +ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &dl = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool doesNotRet = CLI.DoesNotReturn; + bool isVarArg = CLI.IsVarArg; + + MachineFunction &MF = DAG.getMachineFunction(); + bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool isThisReturn = false; + bool isSibCall = false; + auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); + + // Disable tail calls if they're not supported. + if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") + isTailCall = false; + + if (isTailCall) { + // Check if it's really possible to do a tail call. + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(), + Outs, OutVals, Ins, DAG); + if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); + // We don't support GuaranteedTailCallOpt for ARM, only automatically + // detected sibcalls. + if (isTailCall) { + ++NumTailCalls; + isSibCall = true; + } + } + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext(), Call); + CCInfo.AnalyzeCallOperands(Outs, + CCAssignFnForNode(CallConv, /* Return*/ false, + isVarArg)); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + + // For tail calls, memory operands are available in our caller's stack. + if (isSibCall) + NumBytes = 0; + + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog pass + if (!isSibCall) + Chain = DAG.getCALLSEQ_START(Chain, + DAG.getIntPtrConstant(NumBytes, dl, true), dl); + + SDValue StackPtr = + DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); + + RegsToPassVector RegsToPass; + SmallVector<SDValue, 8> MemOpChains; + + // Walk the register/memloc assignments, inserting copies/loads. In the case + // of tail call optimization, arguments are handled later. + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); + i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + bool isByVal = Flags.isByVal(); + + // Promote the value if needed. 
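+ // (Editorial example: an i8 argument assigned an i32 location is widened + // here via SIGN_EXTEND, ZERO_EXTEND or ANY_EXTEND, as dictated by the + // LocInfo the calling convention recorded in the CCValAssign.)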
+ switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + break; + } + + // f64 and v2f64 might be passed in i32 pairs and must be split into pieces + if (VA.needsCustom()) { + if (VA.getLocVT() == MVT::v2f64) { + SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(0, dl, MVT::i32)); + SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(1, dl, MVT::i32)); + + PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, + VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); + + VA = ArgLocs[++i]; // skip ahead to next loc + if (VA.isRegLoc()) { + PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, + VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); + } else { + assert(VA.isMemLoc()); + + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, + dl, DAG, VA, Flags)); + } + } else { + PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], + StackPtr, MemOpChains, Flags); + } + } else if (VA.isRegLoc()) { + if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) { + assert(VA.getLocVT() == MVT::i32 && + "unexpected calling convention register assignment"); + assert(!Ins.empty() && Ins[0].VT == MVT::i32 && + "unexpected use of 'returned'"); + isThisReturn = true; + } + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else if (isByVal) { + assert(VA.isMemLoc()); + unsigned offset = 0; + + // True if this byval aggregate will be split between registers + // and memory. + unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); + unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); + + if (CurByValIdx < ByValArgsCount) { + + unsigned RegBegin, RegEnd; + CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); + + EVT PtrVT = + DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + unsigned int i, j; + for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { + SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); + SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); + SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, + MachinePointerInfo(), + false, false, false, + DAG.InferPtrAlignment(AddArg)); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(j, Load)); + } + + // If the parameter size exceeds the register area, the "offset" value + // helps us to calculate the stack slot for the remaining part properly.
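+ // (Editorial worked example: a 24-byte byval with RegBegin = R2 and + // RegEnd = R4 loads two words into registers, so offset = 2 and the + // remaining 24 - 4*2 = 16 bytes are copied onto the stack below.)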
+ offset = RegEnd - RegBegin; + + CCInfo.nextInRegsParam(); + } + + if (Flags.getByValSize() > 4*offset) { + auto PtrVT = getPointerTy(DAG.getDataLayout()); + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); + SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); + SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); + SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); + SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, + MVT::i32); + SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, + MVT::i32); + + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; + MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, + Ops)); + } + } else if (!isSibCall) { + assert(VA.isMemLoc()); + + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, + dl, DAG, VA, Flags)); + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + // Tail call byval lowering might overwrite argument registers so in case of + // tail call optimization the copies to registers are lowered later. + if (!isTailCall) + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + // For tail calls lower the arguments to the 'real' stack slot. + if (isTailCall) { + // Force all the incoming stack arguments to be loaded from the stack + // before any new outgoing arguments are stored to the stack, because the + // outgoing stack slots may alias the incoming argument stack slots, and + // the alias isn't otherwise explicit. This is slightly more conservative + // than necessary, because it means that each store effectively depends + // on every argument instead of just those arguments it would clobber. + + // Do not flag preceding copytoreg stuff together with the following stuff. + InFlag = SDValue(); + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + InFlag = SDValue(); + } + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every + // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol + // node so that legalize doesn't hack it. + bool isDirect = false; + bool isARMFunc = false; + bool isLocalARMFunc = false; + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + auto PtrVt = getPointerTy(DAG.getDataLayout()); + + if (Subtarget->genLongCalls()) { + assert((Subtarget->isTargetWindows() || + getTargetMachine().getRelocationModel() == Reloc::Static) && + "long-calls with non-static relocation model!"); + // Handle a global address or an external symbol. If it's not one of + // those, the target's already in a register, so we don't need to do + // anything extra. 
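+ // (Editorial sketch: under long calls a direct callee is not reached + // with a plain BL; its address is parked in the constant pool, loaded + // into a register below, and the call proceeds indirectly through that + // register.)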
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + // Create a constant pool entry for the callee address + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); + + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); + } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { + const char *Sym = S->getSymbol(); + + // Create a constant pool entry for the callee address + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, + ARMPCLabelIndex, 0); + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); + } + } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + isDirect = true; + bool isDef = GV->isStrongDefinitionForLinker(); + bool isStub = (!isDef && Subtarget->isTargetMachO()) && + getTargetMachine().getRelocationModel() != Reloc::Static; + isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); + // ARM call to a local ARM function is predicable. + isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); + // tBX takes a register source operand. + if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { + assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); + Callee = DAG.getNode( + ARMISD::WrapperPIC, dl, PtrVt, + DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, true, 0); + } else if (Subtarget->isTargetCOFF()) { + assert(Subtarget->isTargetWindows() && + "Windows is the only supported COFF target"); + unsigned TargetFlags = GV->hasDLLImportStorageClass() + ? ARMII::MO_DLLIMPORT + : ARMII::MO_NO_FLAG; + Callee = + DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags); + if (GV->hasDLLImportStorageClass()) + Callee = + DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), + DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); + } else { + // On ELF targets for PIC code, direct calls should go through the PLT + unsigned OpFlags = 0; + if (Subtarget->isTargetELF() && + getTargetMachine().getRelocationModel() == Reloc::PIC_) + OpFlags = ARMII::MO_PLT; + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags); + } + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + isDirect = true; + bool isStub = Subtarget->isTargetMachO() && + getTargetMachine().getRelocationModel() != Reloc::Static; + isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); + // tBX takes a register source operand. 
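+ // (Editorial note: Thumb1 without v5T has no BLX, so the branch below + // materializes the callee address from a constant pool entry and, for + // PIC, folds in a PIC_ADD label before calling through the register.)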
+ const char *Sym = S->getSymbol(); + if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, + ARMPCLabelIndex, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); + Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); + } else { + unsigned OpFlags = 0; + // On ELF targets for PIC code, direct calls should go through the PLT + if (Subtarget->isTargetELF() && + getTargetMachine().getRelocationModel() == Reloc::PIC_) + OpFlags = ARMII::MO_PLT; + Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags); + } + } + + // FIXME: handle tail calls differently. + unsigned CallOpc; + if (Subtarget->isThumb()) { + if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) + CallOpc = ARMISD::CALL_NOLINK; + else + CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL; + } else { + if (!isDirect && !Subtarget->hasV5TOps()) + CallOpc = ARMISD::CALL_NOLINK; + else if (doesNotRet && isDirect && Subtarget->hasRAS() && + // Emit regular call when code size is the priority + !MF.getFunction()->optForMinSize()) + // "mov lr, pc; b _foo" to avoid confusing the RSP + CallOpc = ARMISD::CALL_NOLINK; + else + CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; + } + + std::vector<SDValue> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + // Add a register mask operand representing the call-preserved registers. + if (!isTailCall) { + const uint32_t *Mask; + const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); + if (isThisReturn) { + // For 'this' returns, use the R0-preserving mask if applicable + Mask = ARI->getThisReturnPreservedMask(MF, CallConv); + if (!Mask) { + // Set isThisReturn to false if the calling convention is not one that + // allows 'returned' to be modeled in this way, so LowerCallResult does + // not try to pass 'this' straight through + isThisReturn = false; + Mask = ARI->getCallPreservedMask(MF, CallConv); + } + } else + Mask = ARI->getCallPreservedMask(MF, CallConv); + + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + } + + if (InFlag.getNode()) + Ops.push_back(InFlag); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + if (isTailCall) { + MF.getFrameInfo()->setHasTailCall(); + return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); + } + + // Returns a chain and a flag for retval copy to use. + Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); + InFlag = Chain.getValue(1); + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), + DAG.getIntPtrConstant(0, dl, true), InFlag, dl); + if (!Ins.empty()) + InFlag = Chain.getValue(1); + + // Handle result values, copying them out of physregs into vregs that we + // return. + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, + InVals, isThisReturn, + isThisReturn ? 
OutVals[0] : SDValue()); +} + +/// HandleByVal - Every parameter *after* a byval parameter is passed +/// on the stack. Remember the next parameter register to allocate, +/// and then confiscate the rest of the parameter registers to ensure +/// this. +void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, + unsigned Align) const { + assert((State->getCallOrPrologue() == Prologue || + State->getCallOrPrologue() == Call) && + "unhandled ParmContext"); + + // Byval (as with any stack) slots are always at least 4 byte aligned. + Align = std::max(Align, 4U); + + unsigned Reg = State->AllocateReg(GPRArgRegs); + if (!Reg) + return; + + unsigned AlignInRegs = Align / 4; + unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; + for (unsigned i = 0; i < Waste; ++i) + Reg = State->AllocateReg(GPRArgRegs); + + if (!Reg) + return; + + unsigned Excess = 4 * (ARM::R4 - Reg); + + // Special case when NSAA != SP and the parameter size is greater than the + // size of all remaining GPR regs. In that case we can't split the parameter; + // we must send it to the stack. We also must set NCRN to R4, so we waste all + // remaining registers. + const unsigned NSAAOffset = State->getNextStackOffset(); + if (NSAAOffset != 0 && Size > Excess) { + while (State->AllocateReg(GPRArgRegs)) + ; + return; + } + + // The first register for the byval parameter is the first register that + // wasn't allocated before this method call, so it would be "reg". + // If the parameter is small enough to be saved in the range [reg, r4), then + // the end (first after last) register would be reg + param-size-in-regs; + // otherwise the parameter would be split between registers and stack, and + // the end register would be r4 in this case. + unsigned ByValRegBegin = Reg; + unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4); + State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); + // Note, the first register is already allocated at the beginning of the + // function; allocate the remaining registers we need. + for (unsigned i = Reg + 1; i != ByValRegEnd; ++i) + State->AllocateReg(GPRArgRegs); + // A byval parameter that is split between registers and memory needs its + // size truncated here. + // In the case where the entire structure fits in registers, we set the + // size in memory to zero. + Size = std::max<int>(Size - Excess, 0); +} + +/// MatchingStackOffset - Return true if the given stack call argument is +/// already available in the same position (relatively) of the caller's +/// incoming argument stack. +static +bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, +  MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, + const TargetInstrInfo *TII) { + unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + int FI = INT_MAX; + if (Arg.getOpcode() == ISD::CopyFromReg) { + unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(VR)) + return false; + MachineInstr *Def = MRI->getVRegDef(VR); + if (!Def) + return false; + if (!Flags.isByVal()) { + if (!TII->isLoadFromStackSlot(Def, FI)) + return false; + } else { + return false; + } + } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { + if (Flags.isByVal()) + // ByVal argument is passed in as a pointer but it's now being + // dereferenced. e.g.
+ // define @foo(%struct.X* %A) { + // tail call @bar(%struct.X* byval %A) + // } + return false; + SDValue Ptr = Ld->getBasePtr(); + FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); + if (!FINode) + return false; + FI = FINode->getIndex(); + } else + return false; + + assert(FI != INT_MAX); + if (!MFI->isFixedObjectIndex(FI)) + return false; + return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); +} + +/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// for tail call optimization. Targets which want to do tail call +/// optimization should implement this function. +bool +ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const { + const Function *CallerF = DAG.getMachineFunction().getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + assert(Subtarget->supportsTailCall()); + + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. + + // Do not sibcall optimize vararg calls unless the call site is not passing + // any arguments. + if (isVarArg && !Outs.empty()) + return false; + + // Exception-handling functions need a special set of instructions to indicate + // a return to the hardware. Tail-calling another function would probably + // break this. + if (CallerF->hasFnAttribute("interrupt")) + return false; + + // Also avoid sibcall optimization if either caller or callee uses struct + // return semantics. + if (isCalleeStructRet || isCallerStructRet) + return false; + + // Externally-defined functions with weak linkage should not be + // tail-called on ARM when the OS does not support dynamic + // pre-emption of symbols, as the AAELF spec requires normal calls + // to undefined weak functions to be replaced with a NOP or jump to the + // next instruction. The behaviour of branch instructions in this + // situation (as used for tail calls) is implementation-defined, so we + // cannot rely on the linker replacing the tail call with a return. + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + const Triple &TT = getTargetMachine().getTargetTriple(); + if (GV->hasExternalWeakLinkage() && + (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) + return false; + } + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. 
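+ // (Editorial example: a fastcc caller tail-calling a ccc callee is only + // admitted when both conventions put every return value in the same + // register or stack slot; the loop below compares the two sets of + // return-value locations element by element.)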
+ if (!CCMatch) { + SmallVector<CCValAssign, 16> RVLocs1; + ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, + *DAG.getContext(), Call); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg)); + + SmallVector<CCValAssign, 16> RVLocs2; + ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, + *DAG.getContext(), Call); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg)); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // If Caller's vararg or byval argument has been split between registers and + // stack, do not perform tail call, since part of the argument is in caller's + // local frame. + const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction(). + getInfo<ARMFunctionInfo>(); + if (AFI_Caller->getArgRegsSaveSize()) + return false; + + // If the callee takes no arguments then go on to check the results of the + // call. + if (!Outs.empty()) { + // Check if stack adjustment is needed. For now, do not do this if any + // argument is passed on the stack. + SmallVector<CCValAssign, 16> ArgLocs; + ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext(), Call); + CCInfo.AnalyzeCallOperands(Outs, + CCAssignFnForNode(CalleeCC, false, isVarArg)); + if (CCInfo.getNextStackOffset()) { + MachineFunction &MF = DAG.getMachineFunction(); + + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); + i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + EVT RegVT = VA.getLocVT(); + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + if (VA.getLocInfo() == CCValAssign::Indirect) + return false; + if (VA.needsCustom()) { + // f64 and vector types are split into multiple registers or + // register/stack-slot combinations. The types will not match + // the registers; give up on memory f64 refs until we figure + // out what to do about this. 
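+ // (Editorial note: an f64 occupies two consecutive i32 locations and a + // v2f64 occupies four; the checks below insist that every piece landed + // in a register, otherwise the tail call is rejected.)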
+ if (!VA.isRegLoc()) + return false; + if (!ArgLocs[++i].isRegLoc()) + return false; + if (RegVT == MVT::v2f64) { + if (!ArgLocs[++i].isRegLoc()) + return false; + if (!ArgLocs[++i].isRegLoc()) + return false; + } + } else if (!VA.isRegLoc()) { + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, + MFI, MRI, TII)) + return false; + } + } + } + } + + return true; +} + +bool +ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); + return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true, + isVarArg)); +} + +static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, + SDLoc DL, SelectionDAG &DAG) { + const MachineFunction &MF = DAG.getMachineFunction(); + const Function *F = MF.getFunction(); + + StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString(); + + // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset + // version of the "preferred return address". These offsets affect the return + // instruction if this is a return from PL1 without hypervisor extensions. + // IRQ/FIQ: +4 "subs pc, lr, #4" + // SWI: 0 "subs pc, lr, #0" + // ABORT: +4 "subs pc, lr, #4" + // UNDEF: +4/+2 "subs pc, lr, #0" + // UNDEF varies depending on whether the exception came from ARM or Thumb + // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. + + int64_t LROffset; + if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || + IntKind == "ABORT") + LROffset = 4; + else if (IntKind == "SWI" || IntKind == "UNDEF") + LROffset = 0; + else + report_fatal_error("Unsupported interrupt attribute. If present, value " + "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); + + RetOps.insert(RetOps.begin() + 1, + DAG.getConstant(LROffset, DL, MVT::i32, false)); + + return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); +} + +SDValue +ARMTargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc dl, SelectionDAG &DAG) const { + + // CCValAssign - represent the assignment of the return value to a location. + SmallVector<CCValAssign, 16> RVLocs; + + // CCState - Info about the registers and stack slots. + ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext(), Call); + + // Analyze outgoing return values. + CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true, + isVarArg)); + + SDValue Flag; + SmallVector<SDValue, 4> RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + bool isLittleEndian = Subtarget->isLittle(); + + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + AFI->setReturnRegsCount(RVLocs.size()); + + // Copy the result values into the output registers.
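+ // (Editorial example: returning an f64 through integer return registers + // emits one VMOVRRD plus two glued CopyToReg nodes targeting consecutive + // GPRs; the custom case below repeats that per half for v2f64.)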
+ for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + SDValue Arg = OutVals[realRVLocIdx]; + + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + break; + } + + if (VA.needsCustom()) { + if (VA.getLocVT() == MVT::v2f64) { + // Extract the first half and return it in two registers. + SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(0, dl, MVT::i32)); + SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), Half); + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(isLittleEndian ? 0 : 1), + Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + VA = RVLocs[++i]; // skip ahead to next loc + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(isLittleEndian ? 1 : 0), + Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + VA = RVLocs[++i]; // skip ahead to next loc + + // Extract the 2nd half and fall through to handle it as an f64 value. + Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(1, dl, MVT::i32)); + } + // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is + // available. + SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), Arg); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + fmrrd.getValue(isLittleEndian ? 0 : 1), + Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + VA = RVLocs[++i]; // skip ahead to next loc + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + fmrrd.getValue(isLittleEndian ? 1 : 0), + Flag); + } else + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); + + // Guarantee that all emitted copies are + // stuck together by the glue chain, so nothing is scheduled between them. + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + // Update chain and glue. + RetOps[0] = Chain; + if (Flag.getNode()) + RetOps.push_back(Flag); + + // CPUs which aren't M-class use a special sequence to return from + // exceptions (roughly, any instruction setting pc and cpsr simultaneously, + // though we use "subs pc, lr, #N"). + // + // M-class CPUs actually use a normal return sequence with a special + // (hardware-provided) value in LR, so the normal code path works. + if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") && + !Subtarget->isMClass()) { + if (Subtarget->isThumb1Only()) + report_fatal_error("interrupt attribute is not supported in Thumb1"); + return LowerInterruptReturn(RetOps, dl, DAG); + } + + return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); +} + +bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDValue TCChain = Chain; + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call.
+ if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) + return false; + TCChain = Copy->getOperand(0); + } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { + SDNode *VMov = Copy; + // f64 returned in a pair of GPRs. + SmallPtrSet<SDNode*, 2> Copies; + for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != ISD::CopyToReg) + return false; + Copies.insert(*UI); + } + if (Copies.size() > 2) + return false; + + for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); + UI != UE; ++UI) { + SDValue UseChain = UI->getOperand(0); + if (Copies.count(UseChain.getNode())) + // Second CopyToReg + Copy = *UI; + else { + // We are at the top of this chain. + // If the copy has a glue operand, we conservatively assume it + // isn't safe to perform a tail call. + if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) + return false; + // First CopyToReg + TCChain = UseChain; + } + } + } else if (Copy->getOpcode() == ISD::BITCAST) { + // f32 returned in a single GPR. + if (!Copy->hasOneUse()) + return false; + Copy = *Copy->use_begin(); + if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) + return false; + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. + if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) + return false; + TCChain = Copy->getOperand(0); + } else { + return false; + } + + bool HasRet = false; + for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != ARMISD::RET_FLAG && + UI->getOpcode() != ARMISD::INTRET_FLAG) + return false; + HasRet = true; + } + + if (!HasRet) + return false; + + Chain = TCChain; + return true; +} + +bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + if (!Subtarget->supportsTailCall()) + return false; + + auto Attr = + CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); + if (!CI->isTailCall() || Attr.getValueAsString() == "true") + return false; + + return true; +} + +// To write a 64-bit value we need to split it into two 32-bit values first, +// and pass the low and high parts through. +static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue WriteValue = Op->getOperand(2); + + // This function is only supposed to be called for an i64 type argument. + assert(WriteValue.getValueType() == MVT::i64 + && "LowerWRITE_REGISTER called for non-i64 type argument."); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, + DAG.getConstant(0, DL, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, + DAG.getConstant(1, DL, MVT::i32)); + SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; + return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); +} + +// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as +// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is +// one of the above mentioned nodes. It has to be wrapped because otherwise +// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only +// be used to form an addressing mode. These wrapped nodes will be selected +// into MOVi.
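+// (Editorial sketch: ISD::ConstantPool becomes +// ARMISD::Wrapper(TargetConstantPool) below, giving Select() a distinct +// node to match instead of the raw target node it would otherwise return +// unchanged.)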
+static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { + EVT PtrVT = Op.getValueType(); + // FIXME there is no actual debug info here + SDLoc dl(Op); + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + SDValue Res; + if (CP->isMachineConstantPoolEntry()) + Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, + CP->getAlignment()); + else + Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, + CP->getAlignment()); + return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); +} + +unsigned ARMTargetLowering::getJumpTableEncoding() const { + return MachineJumpTableInfo::EK_Inline; +} + +SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = 0; + SDLoc DL(Op); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + SDValue CPAddr; + if (RelocM == Reloc::Static) { + CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); + } else { + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, + ARMCP::CPBlockAddress, PCAdj); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + } + CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); + SDValue Result = + DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 0); + if (RelocM == Reloc::Static) + return Result; + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); + return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); +} + +// Lower ISD::GlobalTLSAddress using the "general dynamic" model +SDValue +ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, + SelectionDAG &DAG) const { + SDLoc dl(GA); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, + ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); + SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); + Argument = + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 0); + SDValue Chain = Argument.getValue(1); + + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); + Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); + + // call __tls_get_addr. + ArgListTy Args; + ArgListEntry Entry; + Entry.Node = Argument; + Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); + Args.push_back(Entry); + + // FIXME: is there useful debug info available here? 
+ TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()), + DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args), + 0); + + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + return CallResult.first; +} + +// Lower ISD::GlobalTLSAddress using the "initial exec" or +// "local exec" model. +SDValue +ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, + SelectionDAG &DAG, + TLSModel::Model model) const { + const GlobalValue *GV = GA->getGlobal(); + SDLoc dl(GA); + SDValue Offset; + SDValue Chain = DAG.getEntryNode(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + // Get the Thread Pointer + SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); + + if (model == TLSModel::InitialExec) { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + // Initial exec model. + unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, + ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, + true); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); + Chain = Offset.getValue(1); + + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); + Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); + + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); + } else { + // local exec model + assert(model == TLSModel::LocalExec); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); + } + + // The address of the thread local variable is the add of the thread + // pointer with the offset of the variable. 
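+ // (Editorial example: for initial-exec the offset is loaded through a + // GOTTPOFF constant-pool entry, while for local-exec a TPOFF constant is + // loaded directly; either way the final address is ThreadPointer + Offset.)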
+ return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); +} + +SDValue +ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + // TODO: implement the "local dynamic" model + assert(Subtarget->isTargetELF() && + "TLS not implemented for non-ELF targets"); + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + + TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); + + switch (model) { + case TLSModel::GeneralDynamic: + case TLSModel::LocalDynamic: + return LowerToTLSGeneralDynamicModel(GA, DAG); + case TLSModel::InitialExec: + case TLSModel::LocalExec: + return LowerToTLSExecModels(GA, DAG, model); + } + llvm_unreachable("bogus TLS model"); +} + +SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { + bool UseGOT_PREL = + !(GV->hasHiddenVisibility() || GV->hasLocalLinkage()); + + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( + GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, + UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier, + /*AddCurrentAddress=*/UseGOT_PREL); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + SDValue Result = DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); + SDValue Chain = Result.getValue(1); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); + Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + if (UseGOT_PREL) + Result = DAG.getLoad(PtrVT, dl, Chain, Result, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); + return Result; + } + + // If we have T2 ops, we can materialize the address directly via movt/movw + // pair. This is always cheaper. + if (Subtarget->useMovt(DAG.getMachineFunction())) { + ++NumMovwMovt; + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes. + return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, + DAG.getTargetGlobalAddress(GV, dl, PtrVT)); + } else { + SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + return DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); + } +} + +SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + + if (Subtarget->useMovt(DAG.getMachineFunction())) + ++NumMovwMovt; + + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into multiple nodes + unsigned Wrapper = + RelocM == Reloc::PIC_ ? 
ARMISD::WrapperPIC : ARMISD::Wrapper; + + SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); + SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); + + if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) + Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); + return Result; +} + +SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); + assert(Subtarget->useMovt(DAG.getMachineFunction()) && + "Windows on ARM expects to use movw/movt"); + + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + const ARMII::TOF TargetFlags = + (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result; + SDLoc DL(Op); + + ++NumMovwMovt; + + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes. + Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, + DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, + TargetFlags)); + if (GV->hasDLLImportStorageClass()) + Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); + return Result; +} + +SDValue +ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue Val = DAG.getConstant(0, dl, MVT::i32); + return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, + DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), + Op.getOperand(1), Val); +} + +SDValue +ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), + Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); +} + +SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, + Op.getOperand(0)); +} + +SDValue +ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) const { + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDLoc dl(Op); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::arm_rbit: { + assert(Op.getOperand(1).getValueType() == MVT::i32 && + "RBIT intrinsic must have i32 type!"); + return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1)); + } + case Intrinsic::arm_thread_pointer: { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); + } + case Intrinsic::eh_sjlj_lsda: { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + SDValue CPAddr; + unsigned PCAdj = (RelocM != Reloc::PIC_) + ? 0 : (Subtarget->isThumb() ? 
4 : 8); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex, + ARMCP::CPLSDA, PCAdj); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + SDValue Result = DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); + + if (RelocM == Reloc::PIC_) { + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); + Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + } + return Result; + } + case Intrinsic::arm_neon_vmulls: + case Intrinsic::arm_neon_vmullu: { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) + ? ARMISD::VMULLs : ARMISD::VMULLu; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::arm_neon_vminnm: + case Intrinsic::arm_neon_vmaxnm: { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) + ? ISD::FMINNUM : ISD::FMAXNUM; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::arm_neon_vminu: + case Intrinsic::arm_neon_vmaxu: { + if (Op.getValueType().isFloatingPoint()) + return SDValue(); + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) + ? ISD::UMIN : ISD::UMAX; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::arm_neon_vmins: + case Intrinsic::arm_neon_vmaxs: { + // v{min,max}s is overloaded between signed integers and floats. + if (!Op.getValueType().isFloatingPoint()) { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) + ? ISD::SMIN : ISD::SMAX; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) + ? ISD::FMINNAN : ISD::FMAXNAN; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + } +} + +static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + // FIXME: handle "fence singlethread" more efficiently. + SDLoc dl(Op); + if (!Subtarget->hasDataBarrier()) { + // Some ARMv6 cpus can support data barriers with an mcr instruction. + // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get + // here. + assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && + "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); + return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(0, dl, MVT::i32)); + } + + ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); + AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); + ARM_MB::MemBOpt Domain = ARM_MB::ISH; + if (Subtarget->isMClass()) { + // Only a full system barrier exists in the M-class architectures. + Domain = ARM_MB::SY; + } else if (Subtarget->isSwift() && Ord == Release) { + // Swift happens to implement ISHST barriers in a way that's compatible with + // Release semantics but weaker than ISH so we'd be fools not to use + // it. Beware: other processors probably don't! 
+ Domain = ARM_MB::ISHST; + } + + return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), + DAG.getConstant(Domain, dl, MVT::i32)); +} + +static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + // ARM pre-v5TE and Thumb1 do not have preload instructions. + if (!(Subtarget->isThumb2() || + (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) + // Just preserve the chain. + return Op.getOperand(0); + + SDLoc dl(Op); + unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; + if (!isRead && + (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) + // ARMv7 with MP extension has PLDW. + return Op.getOperand(0); + + unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); + if (Subtarget->isThumb()) { + // Invert the bits. + isRead = ~isRead & 1; + isData = ~isData & 1; + } + + return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), + Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), + DAG.getConstant(isData, dl, MVT::i32)); +} + +static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); + + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. + SDLoc dl(Op); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); +} + +SDValue +ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, + SDValue &Root, SelectionDAG &DAG, + SDLoc dl) const { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + const TargetRegisterClass *RC; + if (AFI->isThumb1OnlyFunction()) + RC = &ARM::tGPRRegClass; + else + RC = &ARM::GPRRegClass; + + // Transform the arguments stored in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); + + SDValue ArgValue2; + if (NextVA.isMemLoc()) { + MachineFrameInfo *MFI = MF.getFrameInfo(); + int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true); + + // Create load node to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + ArgValue2 = DAG.getLoad( + MVT::i32, dl, Root, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, false, 0); + } else { + Reg = MF.addLiveIn(NextVA.getLocReg(), RC); + ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); + } + if (!Subtarget->isLittle()) + std::swap (ArgValue, ArgValue2); + return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); +} + +// The remaining GPRs hold either the beginning of variable-argument +// data, or the beginning of an aggregate passed by value (usually +// byval). Either way, we allocate stack slots adjacent to the data +// provided by our caller, and store the unallocated registers there. +// If this is a variadic function, the va_list pointer will begin with +// these values; otherwise, this reassembles a (byval) structure that +// was split between registers and memory.
+// Returns the frame index the registers were stored into.
+int
+ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
+                                  SDLoc dl, SDValue &Chain,
+                                  const Value *OrigArg,
+                                  unsigned InRegsParamRecordIdx,
+                                  int ArgOffset,
+                                  unsigned ArgSize) const {
+  // Currently, two use cases are possible:
+  // Case #1. Non-var-args function, and we meet the first byval parameter.
+  //          Set up the first unallocated register as the first byval
+  //          register; consume all remaining registers
+  //          (these two actions are performed by the HandleByVal method).
+  //          Then, here, we initialize the stack frame with
+  //          "store-reg" instructions.
+  // Case #2. Var-args function that doesn't contain byval parameters.
+  //          The same: consume all remaining unallocated registers and
+  //          initialize the stack frame.
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned RBegin, REnd;
+  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
+    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
+  } else {
+    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
+    RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
+    REnd = ARM::R4;
+  }
+
+  if (REnd != RBegin)
+    ArgOffset = -4 * (ARM::R4 - RBegin);
+
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false);
+  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
+
+  SmallVector<SDValue, 4> MemOps;
+  const TargetRegisterClass *RC =
+      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
+
+  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
+    unsigned VReg = MF.addLiveIn(Reg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+    SDValue Store =
+        DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                     MachinePointerInfo(OrigArg, 4 * i), false, false, 0);
+    MemOps.push_back(Store);
+    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+  return FrameIndex;
+}
+
+// Set up the stack frame that the va_list pointer will start from.
+void
+ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                                        SDLoc dl, SDValue &Chain,
+                                        unsigned ArgOffset,
+                                        unsigned TotalArgRegsSaveSize,
+                                        bool ForceMutable) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Try to store any remaining integer argument regs
+  // to their spots on the stack so that they may be loaded by dereferencing
+  // the result of va_next.
+  // If there are no regs to be stored, just point the address after the last
+  // argument passed via the stack.
+  int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
+                                  CCInfo.getInRegsParamsCount(),
+                                  CCInfo.getNextStackOffset(), 4);
+  AFI->setVarArgsFrameIndex(FrameIndex);
+}
+
+SDValue
+ARMTargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv, bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                          &Ins,
+                                        SDLoc dl, SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals)
+                                          const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs; + ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext(), Prologue); + CCInfo.AnalyzeFormalArguments(Ins, + CCAssignFnForNode(CallConv, /* Return*/ false, + isVarArg)); + + SmallVector<SDValue, 16> ArgValues; + SDValue ArgValue; + Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); + unsigned CurArgIdx = 0; + + // Initially ArgRegsSaveSize is zero. + // Then we increase this value each time we meet byval parameter. + // We also increase this value in case of varargs function. + AFI->setArgRegsSaveSize(0); + + // Calculate the amount of stack space that we need to allocate to store + // byval and variadic arguments that are passed in registers. + // We need to know this before we allocate the first byval or variadic + // argument, as they will be allocated a stack slot below the CFA (Canonical + // Frame Address, the stack pointer at entry to the function). + unsigned ArgRegBegin = ARM::R4; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) + break; + + CCValAssign &VA = ArgLocs[i]; + unsigned Index = VA.getValNo(); + ISD::ArgFlagsTy Flags = Ins[Index].Flags; + if (!Flags.isByVal()) + continue; + + assert(VA.isMemLoc() && "unexpected byval pointer in reg"); + unsigned RBegin, REnd; + CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); + ArgRegBegin = std::min(ArgRegBegin, RBegin); + + CCInfo.nextInRegsParam(); + } + CCInfo.rewindByValRegsInfo(); + + int lastInsIndex = -1; + if (isVarArg && MFI->hasVAStart()) { + unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); + if (RegIdx != array_lengthof(GPRArgRegs)) + ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); + } + + unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); + AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (Ins[VA.getValNo()].isOrigArg()) { + std::advance(CurOrigArg, + Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); + CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); + } + // Arguments stored in registers. + if (VA.isRegLoc()) { + EVT RegVT = VA.getLocVT(); + + if (VA.needsCustom()) { + // f64 and vector types are split up into multiple registers or + // combinations of registers and stack slots. 
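+        // Illustrative note (ours, not from the source): an f64 assigned to
+        // GPRs arrives as two i32 halves (the second may even live on the
+        // stack), which GetF64FormalArgument reassembles with VMOVDRR; a
+        // v2f64 is handled as two such f64 halves inserted into lanes 0
+        // and 1 below.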
+ if (VA.getLocVT() == MVT::v2f64) { + SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + VA = ArgLocs[++i]; // skip ahead to next loc + SDValue ArgValue2; + if (VA.isMemLoc()) { + int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + ArgValue2 = DAG.getLoad( + MVT::f64, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0); + } else { + ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + } + ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue1, + DAG.getIntPtrConstant(0, dl)); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue2, + DAG.getIntPtrConstant(1, dl)); + } else + ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); + + } else { + const TargetRegisterClass *RC; + + if (RegVT == MVT::f32) + RC = &ARM::SPRRegClass; + else if (RegVT == MVT::f64) + RC = &ARM::DPRRegClass; + else if (RegVT == MVT::v2f64) + RC = &ARM::QPRRegClass; + else if (RegVT == MVT::i32) + RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass + : &ARM::GPRRegClass; + else + llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); + + // Transform the arguments in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + } + + // If this is an 8 or 16-bit value, it is really passed promoted + // to 32 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::BCvt: + ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); + break; + case CCValAssign::SExt: + ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + break; + case CCValAssign::ZExt: + ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + break; + } + + InVals.push_back(ArgValue); + + } else { // VA.isRegLoc() + + // sanity check + assert(VA.isMemLoc()); + assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); + + int index = VA.getValNo(); + + // Some Ins[] entries become multiple ArgLoc[] entries. + // Process them only once. + if (index != lastInsIndex) + { + ISD::ArgFlagsTy Flags = Ins[index].Flags; + // FIXME: For now, all byval parameter objects are marked mutable. + // This can be changed with more analysis. + // In case of tail call optimization mark all arguments mutable. + // Since they could be overwritten by lowering of arguments in case of + // a tail call. + if (Flags.isByVal()) { + assert(Ins[index].isOrigArg() && + "Byval arguments cannot be implicit"); + unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); + + int FrameIndex = StoreByValRegs( + CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, + VA.getLocMemOffset(), Flags.getByValSize()); + InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); + CCInfo.nextInRegsParam(); + } else { + unsigned FIOffset = VA.getLocMemOffset(); + int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, + FIOffset, true); + + // Create load nodes to retrieve arguments from the stack. 
+          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+          InVals.push_back(DAG.getLoad(
+              VA.getValVT(), dl, Chain, FIN,
+              MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+              false, false, false, 0));
+        }
+        lastInsIndex = index;
+      }
+    }
+  }
+
+  // varargs
+  if (isVarArg && MFI->hasVAStart())
+    VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
+                         CCInfo.getNextStackOffset(),
+                         TotalArgRegsSaveSize);
+
+  AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
+
+  return Chain;
+}
+
+/// isFloatingPointZero - Return true if this is +0.0.
+static bool isFloatingPointZero(SDValue Op) {
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+    return CFP->getValueAPF().isPosZero();
+  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
+    // Maybe this has already been legalized into the constant pool?
+    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
+      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
+      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
+        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+          return CFP->getValueAPF().isPosZero();
+    }
+  } else if (Op->getOpcode() == ISD::BITCAST &&
+             Op->getValueType(0) == MVT::f64) {
+    // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
+    // created by LowerConstantFP().
+    SDValue BitcastOp = Op->getOperand(0);
+    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
+        isNullConstant(BitcastOp->getOperand(0)))
+      return true;
+  }
+  return false;
+}
+
+/// Returns an appropriate ARM CMP (cmp) and corresponding condition code for
+/// the given operands.
+SDValue
+ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                             SDValue &ARMcc, SelectionDAG &DAG,
+                             SDLoc dl) const {
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+    unsigned C = RHSC->getZExtValue();
+    if (!isLegalICmpImmediate(C)) {
+      // The constant does not fit as an immediate, so try adjusting it by one
+      // and compensating in the condition code.
+      switch (CC) {
+      default: break;
+      case ISD::SETLT:
+      case ISD::SETGE:
+        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
+          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
+        }
+        break;
+      case ISD::SETULT:
+      case ISD::SETUGE:
+        if (C != 0 && isLegalICmpImmediate(C-1)) {
+          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
+        }
+        break;
+      case ISD::SETLE:
+      case ISD::SETGT:
+        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
+          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
+        }
+        break;
+      case ISD::SETULE:
+      case ISD::SETUGT:
+        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
+          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
+        }
+        break;
+      }
+    }
+  }
+
+  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+  ARMISD::NodeType CompareType;
+  switch (CondCode) {
+  default:
+    CompareType = ARMISD::CMP;
+    break;
+  case ARMCC::EQ:
+  case ARMCC::NE:
+    // Uses only the Z flag.
+    CompareType = ARMISD::CMPZ;
+    break;
+  }
+  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
+  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
+}
+
+/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
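+/// (Our note: a compare against +0.0 is emitted as CMPFPw0, which encodes the
+/// zero in the VCMP instruction itself instead of materializing a constant;
+/// see isFloatingPointZero above.)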
+SDValue +ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, + SDLoc dl) const { + assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64); + SDValue Cmp; + if (!isFloatingPointZero(RHS)) + Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); + else + Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS); + return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); +} + +/// duplicateCmp - Glue values can have only one use, so this function +/// duplicates a comparison node. +SDValue +ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { + unsigned Opc = Cmp.getOpcode(); + SDLoc DL(Cmp); + if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) + return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); + + assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); + Cmp = Cmp.getOperand(0); + Opc = Cmp.getOpcode(); + if (Opc == ARMISD::CMPFP) + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); + else { + assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); + } + return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); +} + +std::pair<SDValue, SDValue> +ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, + SDValue &ARMcc) const { + assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); + + SDValue Value, OverflowCmp; + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDLoc dl(Op); + + // FIXME: We are currently always generating CMPs because we don't support + // generating CMN through the backend. This is not as good as the natural + // CMP case because it causes a register dependency and cannot be folded + // later. + + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown overflow instruction!"); + case ISD::SADDO: + ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); + Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); + OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); + break; + case ISD::UADDO: + ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); + Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); + OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); + break; + case ISD::SSUBO: + ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); + Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); + OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); + break; + case ISD::USUBO: + ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); + Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); + OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); + break; + } // switch (...) + + return std::make_pair(Value, OverflowCmp); +} + + +SDValue +ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) + return SDValue(); + + SDValue Value, OverflowCmp; + SDValue ARMcc; + std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDLoc dl(Op); + // We use 0 and 1 as false and true values. 
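+  // Illustrative sketch (ours): for (res, ovf) = ISD::SADDO(a, b), the pair
+  // computed by getARMXALUOOp above is res = add(a, b) plus a CMP that sets
+  // the overflow flag; the CMOV below yields 1 exactly when the "no
+  // overflow" condition (VC in that case) fails, materializing ovf as 0/1.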
+ SDValue TVal = DAG.getConstant(1, dl, MVT::i32); + SDValue FVal = DAG.getConstant(0, dl, MVT::i32); + EVT VT = Op.getValueType(); + + SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, + ARMcc, CCR, OverflowCmp); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); +} + + +SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + SDValue Cond = Op.getOperand(0); + SDValue SelectTrue = Op.getOperand(1); + SDValue SelectFalse = Op.getOperand(2); + SDLoc dl(Op); + unsigned Opc = Cond.getOpcode(); + + if (Cond.getResNo() == 1 && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO)) { + if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) + return SDValue(); + + SDValue Value, OverflowCmp; + SDValue ARMcc; + std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + EVT VT = Op.getValueType(); + + return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, + OverflowCmp, DAG); + } + + // Convert: + // + // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) + // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) + // + if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { + const ConstantSDNode *CMOVTrue = + dyn_cast<ConstantSDNode>(Cond.getOperand(0)); + const ConstantSDNode *CMOVFalse = + dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + + if (CMOVTrue && CMOVFalse) { + unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); + unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); + + SDValue True; + SDValue False; + if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { + True = SelectTrue; + False = SelectFalse; + } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { + True = SelectFalse; + False = SelectTrue; + } + + if (True.getNode() && False.getNode()) { + EVT VT = Op.getValueType(); + SDValue ARMcc = Cond.getOperand(2); + SDValue CCR = Cond.getOperand(3); + SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); + assert(True.getValueType() == VT); + return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); + } + } + } + + // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the + // undefined bits before doing a full-word comparison with zero. + Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, + DAG.getConstant(1, dl, Cond.getValueType())); + + return DAG.getSelectCC(dl, Cond, + DAG.getConstant(0, dl, Cond.getValueType()), + SelectTrue, SelectFalse, ISD::SETNE); +} + +static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, + bool &swpCmpOps, bool &swpVselOps) { + // Start by selecting the GE condition code for opcodes that return true for + // 'equality' + if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || + CC == ISD::SETULE) + CondCode = ARMCC::GE; + + // and GT for opcodes that return false for 'equality'. + else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || + CC == ISD::SETULT) + CondCode = ARMCC::GT; + + // Since we are constrained to GE/GT, if the opcode contains 'less', we need + // to swap the compare operands. + if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || + CC == ISD::SETULT) + swpCmpOps = true; + + // Both GT and GE are ordered comparisons, and return false for 'unordered'. + // If we have an unordered opcode, we need to swap the operands to the VSEL + // instruction (effectively negating the condition). 
+ // + // This also has the effect of swapping which one of 'less' or 'greater' + // returns true, so we also swap the compare operands. It also switches + // whether we return true for 'equality', so we compensate by picking the + // opposite condition code to our original choice. + if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || + CC == ISD::SETUGT) { + swpCmpOps = !swpCmpOps; + swpVselOps = !swpVselOps; + CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; + } + + // 'ordered' is 'anything but unordered', so use the VS condition code and + // swap the VSEL operands. + if (CC == ISD::SETO) { + CondCode = ARMCC::VS; + swpVselOps = true; + } + + // 'unordered or not equal' is 'anything but equal', so use the EQ condition + // code and swap the VSEL operands. + if (CC == ISD::SETUNE) { + CondCode = ARMCC::EQ; + swpVselOps = true; + } +} + +SDValue ARMTargetLowering::getCMOV(SDLoc dl, EVT VT, SDValue FalseVal, + SDValue TrueVal, SDValue ARMcc, SDValue CCR, + SDValue Cmp, SelectionDAG &DAG) const { + if (Subtarget->isFPOnlySP() && VT == MVT::f64) { + FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), FalseVal); + TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), TrueVal); + + SDValue TrueLow = TrueVal.getValue(0); + SDValue TrueHigh = TrueVal.getValue(1); + SDValue FalseLow = FalseVal.getValue(0); + SDValue FalseHigh = FalseVal.getValue(1); + + SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, + ARMcc, CCR, Cmp); + SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, + ARMcc, CCR, duplicateCmp(Cmp, DAG)); + + return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); + } else { + return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, + Cmp); + } +} + +SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); + SDValue TrueVal = Op.getOperand(2); + SDValue FalseVal = Op.getOperand(3); + SDLoc dl(Op); + + if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { + DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, + dl); + + // If softenSetCCOperands only returned one value, we should compare it to + // zero. + if (!RHS.getNode()) { + RHS = DAG.getConstant(0, dl, LHS.getValueType()); + CC = ISD::SETNE; + } + } + + if (LHS.getValueType() == MVT::i32) { + // Try to generate VSEL on ARMv8. + // The VSEL instruction can't use all the usual ARM condition + // codes: it only has two bits to select the condition code, so it's + // constrained to use only GE, GT, VS and EQ. 
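+    // (Our note: the missing codes LT, LE, VC and NE are therefore reached
+    // by inverting the compare, as described next.)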
+ // + // To implement all the various ISD::SETXXX opcodes, we sometimes need to + // swap the operands of the previous compare instruction (effectively + // inverting the compare condition, swapping 'less' and 'greater') and + // sometimes need to swap the operands to the VSEL (which inverts the + // condition in the sense of firing whenever the previous condition didn't) + if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { + ARMCC::CondCodes CondCode = IntCCToARMCC(CC); + if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || + CondCode == ARMCC::VC || CondCode == ARMCC::NE) { + CC = ISD::getSetCCInverse(CC, true); + std::swap(TrueVal, FalseVal); + } + } + + SDValue ARMcc; + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); + } + + ARMCC::CondCodes CondCode, CondCode2; + FPCCToARMCC(CC, CondCode, CondCode2); + + // Try to generate VMAXNM/VMINNM on ARMv8. + if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { + bool swpCmpOps = false; + bool swpVselOps = false; + checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); + + if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || + CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { + if (swpCmpOps) + std::swap(LHS, RHS); + if (swpVselOps) + std::swap(TrueVal, FalseVal); + } + } + + SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); + if (CondCode2 != ARMCC::AL) { + SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); + // FIXME: Needs another CMP because flag can have but one use. + SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); + Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); + } + return Result; +} + +/// canChangeToInt - Given the fp compare operand, return true if it is suitable +/// to morph to an integer compare sequence. +static bool canChangeToInt(SDValue Op, bool &SeenZero, + const ARMSubtarget *Subtarget) { + SDNode *N = Op.getNode(); + if (!N->hasOneUse()) + // Otherwise it requires moving the value from fp to integer registers. + return false; + if (!N->getNumValues()) + return false; + EVT VT = Op.getValueType(); + if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) + // f32 case is generally profitable. f64 case only makes sense when vcmpe + + // vmrs are very slow, e.g. cortex-a8. 
+ return false; + + if (isFloatingPointZero(Op)) { + SeenZero = true; + return true; + } + return ISD::isNormalLoad(N); +} + +static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { + if (isFloatingPointZero(Op)) + return DAG.getConstant(0, SDLoc(Op), MVT::i32); + + if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) + return DAG.getLoad(MVT::i32, SDLoc(Op), + Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + + llvm_unreachable("Unknown VFP cmp argument!"); +} + +static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, + SDValue &RetVal1, SDValue &RetVal2) { + SDLoc dl(Op); + + if (isFloatingPointZero(Op)) { + RetVal1 = DAG.getConstant(0, dl, MVT::i32); + RetVal2 = DAG.getConstant(0, dl, MVT::i32); + return; + } + + if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { + SDValue Ptr = Ld->getBasePtr(); + RetVal1 = DAG.getLoad(MVT::i32, dl, + Ld->getChain(), Ptr, + Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + + EVT PtrType = Ptr.getValueType(); + unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); + SDValue NewPtr = DAG.getNode(ISD::ADD, dl, + PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); + RetVal2 = DAG.getLoad(MVT::i32, dl, + Ld->getChain(), NewPtr, + Ld->getPointerInfo().getWithOffset(4), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), NewAlign); + return; + } + + llvm_unreachable("Unknown VFP cmp argument!"); +} + +/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some +/// f32 and even f64 comparisons to integer ones. +SDValue +ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); + SDLoc dl(Op); + + bool LHSSeenZero = false; + bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); + bool RHSSeenZero = false; + bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); + if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { + // If unsafe fp math optimization is enabled and there are no other uses of + // the CMP operands, and the condition code is EQ or NE, we can optimize it + // to an integer comparison. 
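+    // Worked example (illustrative): for "a == 0.0f" under unsafe FP math,
+    // the f32 path below turns the branch into an integer test of
+    // ((bitcast a) & 0x7fffffff) == 0; masking off the sign bit keeps -0.0
+    // equal to +0.0, as IEEE equality requires.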
+ if (CC == ISD::SETOEQ) + CC = ISD::SETEQ; + else if (CC == ISD::SETUNE) + CC = ISD::SETNE; + + SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); + SDValue ARMcc; + if (LHS.getValueType() == MVT::f32) { + LHS = DAG.getNode(ISD::AND, dl, MVT::i32, + bitcastf32Toi32(LHS, DAG), Mask); + RHS = DAG.getNode(ISD::AND, dl, MVT::i32, + bitcastf32Toi32(RHS, DAG), Mask); + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, + Chain, Dest, ARMcc, CCR, Cmp); + } + + SDValue LHS1, LHS2; + SDValue RHS1, RHS2; + expandf64Toi32(LHS, DAG, LHS1, LHS2); + expandf64Toi32(RHS, DAG, RHS1, RHS2); + LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); + RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); + ARMCC::CondCodes CondCode = IntCCToARMCC(CC); + ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); + SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; + return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); + } + + return SDValue(); +} + +SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); + SDLoc dl(Op); + + if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { + DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, + dl); + + // If softenSetCCOperands only returned one value, we should compare it to + // zero. + if (!RHS.getNode()) { + RHS = DAG.getConstant(0, dl, LHS.getValueType()); + CC = ISD::SETNE; + } + } + + if (LHS.getValueType() == MVT::i32) { + SDValue ARMcc; + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, + Chain, Dest, ARMcc, CCR, Cmp); + } + + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + + if (getTargetMachine().Options.UnsafeFPMath && + (CC == ISD::SETEQ || CC == ISD::SETOEQ || + CC == ISD::SETNE || CC == ISD::SETUNE)) { + SDValue Result = OptimizeVFPBrcond(Op, DAG); + if (Result.getNode()) + return Result; + } + + ARMCC::CondCodes CondCode, CondCode2; + FPCCToARMCC(CC, CondCode, CondCode2); + + SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; + SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); + if (CondCode2 != ARMCC::AL) { + ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); + SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; + Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); + } + return Res; +} + +SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Table = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + SDLoc dl(Op); + + EVT PTy = getPointerTy(DAG.getDataLayout()); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); + SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); + Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); + Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); + SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table); + if 
(Subtarget->isThumb2()) { + // Thumb2 uses a two-level jump. That is, it jumps into the jump table + // which does another jump to the destination. This also makes it easier + // to translate it to TBB / TBH later. + // FIXME: This might not work if the function is extremely large. + return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, + Addr, Op.getOperand(2), JTI); + } + if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { + Addr = + DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + false, false, false, 0); + Chain = Addr.getValue(1); + Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); + return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); + } else { + Addr = + DAG.getLoad(PTy, dl, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + false, false, false, 0); + Chain = Addr.getValue(1); + return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); + } +} + +static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + SDLoc dl(Op); + + if (Op.getValueType().getVectorElementType() == MVT::i32) { + if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) + return Op; + return DAG.UnrollVectorOp(Op.getNode()); + } + + assert(Op.getOperand(0).getValueType() == MVT::v4f32 && + "Invalid type for custom lowering!"); + if (VT != MVT::v4i16) + return DAG.UnrollVectorOp(Op.getNode()); + + Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); +} + +SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + if (VT.isVector()) + return LowerVectorFP_TO_INT(Op, DAG); + if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::FP_TO_SINT) + LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), + Op.getValueType()); + else + LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), + Op.getValueType()); + return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), + /*isSigned*/ false, SDLoc(Op)).first; + } + + return Op; +} + +static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + SDLoc dl(Op); + + if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { + if (VT.getVectorElementType() == MVT::f32) + return Op; + return DAG.UnrollVectorOp(Op.getNode()); + } + + assert(Op.getOperand(0).getValueType() == MVT::v4i16 && + "Invalid type for custom lowering!"); + if (VT != MVT::v4f32) + return DAG.UnrollVectorOp(Op.getNode()); + + unsigned CastOpc; + unsigned Opc; + switch (Op.getOpcode()) { + default: llvm_unreachable("Invalid opcode!"); + case ISD::SINT_TO_FP: + CastOpc = ISD::SIGN_EXTEND; + Opc = ISD::SINT_TO_FP; + break; + case ISD::UINT_TO_FP: + CastOpc = ISD::ZERO_EXTEND; + Opc = ISD::UINT_TO_FP; + break; + } + + Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0)); + return DAG.getNode(Opc, dl, VT, Op); +} + +SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + if (VT.isVector()) + return LowerVectorINT_TO_FP(Op, DAG); + if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::SINT_TO_FP) + LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), + Op.getValueType()); + else + LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), + Op.getValueType()); 
+ return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), + /*isSigned*/ false, SDLoc(Op)).first; + } + + return Op; +} + +SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { + // Implement fcopysign with a fabs and a conditional fneg. + SDValue Tmp0 = Op.getOperand(0); + SDValue Tmp1 = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT SrcVT = Tmp1.getValueType(); + bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || + Tmp0.getOpcode() == ARMISD::VMOVDRR; + bool UseNEON = !InGPR && Subtarget->hasNEON(); + + if (UseNEON) { + // Use VBSL to copy the sign bit. + unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); + SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, + DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); + EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; + if (VT == MVT::f64) + Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, + DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), + DAG.getConstant(32, dl, MVT::i32)); + else /*if (VT == MVT::f32)*/ + Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); + if (SrcVT == MVT::f32) { + Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); + if (VT == MVT::f64) + Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, + DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), + DAG.getConstant(32, dl, MVT::i32)); + } else if (VT == MVT::f32) + Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), + DAG.getConstant(32, dl, MVT::i32)); + Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); + Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); + + SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), + dl, MVT::i32); + AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); + SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, + DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); + + SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, + DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), + DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); + if (VT == MVT::f32) { + Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, + DAG.getConstant(0, dl, MVT::i32)); + } else { + Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); + } + + return Res; + } + + // Bitcast operand 1 to i32. + if (SrcVT == MVT::f64) + Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), + Tmp1).getValue(1); + Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); + + // Or in the signbit with integer operations. + SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); + SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); + Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); + if (VT == MVT::f32) { + Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, + DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); + return DAG.getNode(ISD::BITCAST, dl, MVT::f32, + DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); + } + + // f64: Or the high part with signbit and then combine two parts. 
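+  // In effect (our sketch): result = VMOVDRR(lo(Tmp0),
+  //   (hi(Tmp0) & 0x7fffffff) | (sign bit isolated from Tmp1 above)).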
+  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                     Tmp0);
+  SDValue Lo = Tmp0.getValue(0);
+  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
+  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+}
+
+SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
+    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+}
+
+SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  const ARMBaseRegisterInfo &ARI =
+      *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned FrameReg = ARI.getFrameRegister(MF);
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(),
+                            false, false, false, 0);
+  return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                              SelectionDAG &DAG) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                     .Case("sp", ARM::SP)
+                     .Default(0);
+  if (Reg)
+    return Reg;
+  report_fatal_error(Twine("Invalid register name \""
+                           + StringRef(RegName) + "\"."));
+}
+
+// The result is a 64-bit value, so split it into two 32-bit values and
+// return them as a pair of values.
+static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                SelectionDAG &DAG) {
+  SDLoc DL(N);
+
+  // This function is only supposed to be called for i64 result types.
+  assert(N->getValueType(0) == MVT::i64
+          && "ExpandREAD_REGISTER called for non-i64 type result.");
+
+  SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
+                             DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
+                             N->getOperand(0),
+                             N->getOperand(1));
+
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
+                                Read.getValue(1)));
+  Results.push_back(Read.getOperand(0));
+}
+
+/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
+/// When \p DstVT, the destination type of \p BC, is on the vector
+/// register bank and the source of the bitcast, \p Op, operates on the same
+/// bank, it might be possible to combine them, such that everything stays on
+/// the vector register bank.
+/// \returns The node that would replace \p BC, if the combine
+/// is possible.
+static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, + SelectionDAG &DAG) { + SDValue Op = BC->getOperand(0); + EVT DstVT = BC->getValueType(0); + + // The only vector instruction that can produce a scalar (remember, + // since the bitcast was about to be turned into VMOVDRR, the source + // type is i64) from a vector is EXTRACT_VECTOR_ELT. + // Moreover, we can do this combine only if there is one use. + // Finally, if the destination type is not a vector, there is not + // much point on forcing everything on the vector bank. + if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !Op.hasOneUse()) + return SDValue(); + + // If the index is not constant, we will introduce an additional + // multiply that will stick. + // Give up in that case. + ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Index) + return SDValue(); + unsigned DstNumElt = DstVT.getVectorNumElements(); + + // Compute the new index. + const APInt &APIntIndex = Index->getAPIntValue(); + APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); + NewIndex *= APIntIndex; + // Check if the new constant index fits into i32. + if (NewIndex.getBitWidth() > 32) + return SDValue(); + + // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> + // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) + SDLoc dl(Op); + SDValue ExtractSrc = Op.getOperand(0); + EVT VecVT = EVT::getVectorVT( + *DAG.getContext(), DstVT.getScalarType(), + ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); + SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, + DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); +} + +/// ExpandBITCAST - If the target supports VFP, this function is called to +/// expand a bit convert where either the source or destination type is i64 to +/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 +/// operand type is illegal (e.g., v2f32 for a target that doesn't support +/// vectors), since the legalizer won't know what to do with that. +static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc dl(N); + SDValue Op = N->getOperand(0); + + // This function is only supposed to be called for i64 types, either as the + // source or destination of the bit convert. + EVT SrcVT = Op.getValueType(); + EVT DstVT = N->getValueType(0); + assert((SrcVT == MVT::i64 || DstVT == MVT::i64) && + "ExpandBITCAST called for non-i64 type"); + + // Turn i64->f64 into VMOVDRR. + if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { + // Do not force values to GPRs (this is what VMOVDRR does for the inputs) + // if we can combine the bitcast with its source. + if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) + return Val; + + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, + DAG.getConstant(0, dl, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, + DAG.getConstant(1, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, DstVT, + DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); + } + + // Turn f64->i64 into VMOVRRD. 
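+  // (Our note: on big-endian targets the two GPR halves of a multi-element
+  // vector would come back in the wrong order, hence the VREV64 inserted
+  // below before the VMOVRRD.)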
+ if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { + SDValue Cvt; + if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && + SrcVT.getVectorNumElements() > 1) + Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), + DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); + else + Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), Op); + // Merge the pieces into a single i64 value. + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); + } + + return SDValue(); +} + +/// getZeroVector - Returns a vector of specified type with all zero elements. +/// Zero vectors are used to represent vector negation and in those cases +/// will be implemented with the NEON VNEG instruction. However, VNEG does +/// not support i64 elements, so sometimes the zero vectors will need to be +/// explicitly constructed. Regardless, use a canonical VMOV to create the +/// zero vector. +static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) { + assert(VT.isVector() && "Expected a vector type"); + // The canonical modified immediate encoding of a zero vector is....0! + SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); + EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; + SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); + return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); +} + +/// LowerShiftRightParts - Lower SRA_PARTS, which returns two +/// i32 values and take a 2 x i32 value to shift plus a shift amount. +SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue ARMcc; + unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; + + assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, dl, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), + ISD::SETGE, ARMcc, DAG, dl); + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, + CCR, Cmp); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); +} + +/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two +/// i32 values and take a 2 x i32 value to shift plus a shift amount. 
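+/// A sketch of what is selected below (our illustration), for {lo, hi} << n:
+///   n <  32:  Hi = (hi << n) | (lo >> (32 - n))
+///   n >= 32:  Hi = lo << (n - 32)
+/// with the CMOV choosing between the two based on the sign of n - 32.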
+SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue ARMcc; + + assert(Op.getOpcode() == ISD::SHL_PARTS); + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, dl, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), + ISD::SETGE, ARMcc, DAG, dl); + SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc, + CCR, Cmp); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); +} + +SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, + SelectionDAG &DAG) const { + // The rounding mode is in bits 23:22 of the FPSCR. + // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 + // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) + // so that the shift + and get folded into a bitfield extract. + SDLoc dl(Op); + SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, + DAG.getConstant(Intrinsic::arm_get_fpscr, dl, + MVT::i32)); + SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, + DAG.getConstant(1U << 22, dl, MVT::i32)); + SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, + DAG.getConstant(22, dl, MVT::i32)); + return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, + DAG.getConstant(3, dl, MVT::i32)); +} + +static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + if (VT.isVector()) { + assert(ST->hasNEON()); + + // Compute the least significant set bit: LSB = X & -X + SDValue X = N->getOperand(0); + SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); + + EVT ElemTy = VT.getVectorElementType(); + + if (ElemTy == MVT::i8) { + // Compute with: cttz(x) = ctpop(lsb - 1) + SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(1, dl, ElemTy)); + SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); + return DAG.getNode(ISD::CTPOP, dl, VT, Bits); + } + + if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && + (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { + // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 + unsigned NumBits = ElemTy.getSizeInBits(); + SDValue WidthMinus1 = + DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); + SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); + return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); + } + + // Compute with: cttz(x) = ctpop(lsb - 1) + + // Since we can only compute the number of bits in a byte with vcnt.8, we + // have to gather the result with pairwise addition (vpaddl) for i16, i32, + // and i64. + + // Compute LSB - 1. 
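+    // (Explanatory note, ours: 64-bit lanes have no NEON modified-immediate
+    // encoding for a splat of 1, so the i64 path below adds a splat of
+    // all-ones instead -- 0x1eff encodes Op=1/Cmode=0xe with imm8=0xff --
+    // which is the same as subtracting 1.)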
+ SDValue Bits; + if (ElemTy == MVT::i64) { + // Load constant 0xffff'ffff'ffff'ffff to register. + SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(0x1eff, dl, MVT::i32)); + Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); + } else { + SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(1, dl, ElemTy)); + Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); + } + + // Count #bits with vcnt.8. + EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; + SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); + SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); + + // Gather the #bits with vpaddl (pairwise add.) + EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; + SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt8); + if (ElemTy == MVT::i16) + return Cnt16; + + EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32; + SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt16); + if (ElemTy == MVT::i32) + return Cnt32; + + assert(ElemTy == MVT::i64); + SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt32); + return Cnt64; + } + + if (!ST->hasV6T2Ops()) + return SDValue(); + + SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); + return DAG.getNode(ISD::CTLZ, dl, VT, rbit); +} + +/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count +/// for each 16-bit element from operand, repeated. The basic idea is to +/// leverage vcnt to get the 8-bit counts, gather and add the results. +/// +/// Trace for v4i16: +/// input = [v0 v1 v2 v3 ] (vi 16-bit element) +/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element) +/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi) +/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] +/// [b0 b1 b2 b3 b4 b5 b6 b7] +/// +[b1 b0 b3 b2 b5 b4 b7 b6] +/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0, +/// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) +static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDLoc DL(N); + + EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; + SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); + SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0); + SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1); + SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2); + return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3); +} + +/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the +/// bit-count for each 16-bit element from the operand. We need slightly +/// different sequencing for v4i16 and v8i16 to stay within NEON's available +/// 64/128-bit registers. 
+/// +/// Trace for v4i16: +/// input = [v0 v1 v2 v3 ] (vi 16-bit element) +/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi) +/// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ] +/// v4i16:Extracted = [k0 k1 k2 k3 ] +static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDLoc DL(N); + + SDValue BitCounts = getCTPOP16BitCounts(N, DAG); + if (VT.is64BitVector()) { + SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended, + DAG.getIntPtrConstant(0, DL)); + } else { + SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, + BitCounts, DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted); + } +} + +/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the +/// bit-count for each 32-bit element from the operand. The idea here is +/// to split the vector into 16-bit elements, leverage the 16-bit count +/// routine, and then combine the results. +/// +/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged): +/// input = [v0 v1 ] (vi: 32-bit elements) +/// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1]) +/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi) +/// vrev: N0 = [k1 k0 k3 k2 ] +/// [k0 k1 k2 k3 ] +/// N1 =+[k1 k0 k3 k2 ] +/// [k0 k2 k1 k3 ] +/// N2 =+[k1 k3 k0 k2 ] +/// [k0 k2 k1 k3 ] +/// Extended =+[k1 k3 k0 k2 ] +/// [k0 k2 ] +/// Extracted=+[k1 k3 ] +/// +static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDLoc DL(N); + + EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; + + SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0)); + SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG); + SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16); + SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0); + SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1); + + if (VT.is64BitVector()) { + SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended, + DAG.getIntPtrConstant(0, DL)); + } else { + SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted); + } +} + +static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + + assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); + assert((VT == MVT::v2i32 || VT == MVT::v4i32 || + VT == MVT::v4i16 || VT == MVT::v8i16) && + "Unexpected type for custom ctpop lowering"); + + if (VT.getVectorElementType() == MVT::i32) + return lowerCTPOP32BitElements(N, DAG); + else + return lowerCTPOP16BitElements(N, DAG); +} + +static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + SDLoc dl(N); + + if (!VT.isVector()) + return SDValue(); + + // Lower vector shifts on NEON to use VSHL. + assert(ST->hasNEON() && "unexpected vector shift"); + + // Left shifts translate directly to the vshiftu intrinsic. 
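+  // (Illustrative, ours: NEON's VSHL shifts a lane left for a positive count
+  // and right for a negative one, so (shl x, y) maps directly to
+  // vshiftu(x, y), while the SRA/SRL cases below negate the counts first.)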
+ if (N->getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, + MVT::i32), + N->getOperand(0), N->getOperand(1)); + + assert((N->getOpcode() == ISD::SRA || + N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); + + // NEON uses the same intrinsics for both left and right shifts. For + // right shifts, the shift amounts are negative, so negate the vector of + // shift amounts. + EVT ShiftVT = N->getOperand(1).getValueType(); + SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, + getZeroVector(ShiftVT, DAG, dl), + N->getOperand(1)); + Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? + Intrinsic::arm_neon_vshifts : + Intrinsic::arm_neon_vshiftu); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(vshiftInt, dl, MVT::i32), + N->getOperand(0), NegatedCount); +} + +static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + SDLoc dl(N); + + // We can get here for a node like i32 = ISD::SHL i32, i64 + if (VT != MVT::i64) + return SDValue(); + + assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && + "Unknown shift to lower!"); + + // We only lower SRA, SRL of 1 here, all others use generic lowering. + if (!isOneConstant(N->getOperand(1))) + return SDValue(); + + // If we are in thumb mode, we don't have RRX. + if (ST->isThumb1Only()) return SDValue(); + + // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), + DAG.getConstant(0, dl, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), + DAG.getConstant(1, dl, MVT::i32)); + + // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and + // captures the result into a carry flag. + unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; + Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); + + // The low part is an ARMISD::RRX operand, which shifts the carry in. + Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); + + // Merge the pieces into a single i64 value. + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); +} + +static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { + SDValue TmpOp0, TmpOp1; + bool Invert = false; + bool Swap = false; + unsigned Opc = 0; + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); + EVT VT = Op.getValueType(); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + SDLoc dl(Op); + + if (CmpVT.getVectorElementType() == MVT::i64) + // 64-bit comparisons are not legal. We've marked SETCC as non-Custom, + // but it's possible that our operands are 64-bit but our result is 32-bit. + // Bail in this case. 
+ return SDValue(); + + if (Op1.getValueType().isFloatingPoint()) { + switch (SetCCOpcode) { + default: llvm_unreachable("Illegal FP comparison"); + case ISD::SETUNE: + case ISD::SETNE: Invert = true; // Fallthrough + case ISD::SETOEQ: + case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETOLT: + case ISD::SETLT: Swap = true; // Fallthrough + case ISD::SETOGT: + case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETOLE: + case ISD::SETLE: Swap = true; // Fallthrough + case ISD::SETOGE: + case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETUGE: Swap = true; // Fallthrough + case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; + case ISD::SETUGT: Swap = true; // Fallthrough + case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; + case ISD::SETUEQ: Invert = true; // Fallthrough + case ISD::SETONE: + // Expand this to (OLT | OGT). + TmpOp0 = Op0; + TmpOp1 = Op1; + Opc = ISD::OR; + Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); + Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); + break; + case ISD::SETUO: Invert = true; // Fallthrough + case ISD::SETO: + // Expand this to (OLT | OGE). + TmpOp0 = Op0; + TmpOp1 = Op1; + Opc = ISD::OR; + Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); + Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); + break; + } + } else { + // Integer comparisons. + switch (SetCCOpcode) { + default: llvm_unreachable("Illegal integer comparison"); + case ISD::SETNE: Invert = true; + case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETLT: Swap = true; + case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETLE: Swap = true; + case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETULT: Swap = true; + case ISD::SETUGT: Opc = ARMISD::VCGTU; break; + case ISD::SETULE: Swap = true; + case ISD::SETUGE: Opc = ARMISD::VCGEU; break; + } + + // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). + if (Opc == ARMISD::VCEQ) { + + SDValue AndOp; + if (ISD::isBuildVectorAllZeros(Op1.getNode())) + AndOp = Op0; + else if (ISD::isBuildVectorAllZeros(Op0.getNode())) + AndOp = Op1; + + // Ignore bitconvert. + if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) + AndOp = AndOp.getOperand(0); + + if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { + Opc = ARMISD::VTST; + Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); + Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); + Invert = !Invert; + } + } + } + + if (Swap) + std::swap(Op0, Op1); + + // If one of the operands is a constant vector zero, attempt to fold the + // comparison to a specialized compare-against-zero form. 
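+  // For example, (VCGE x, 0) becomes VCGEZ x, while (VCGE 0, x) becomes
+  // VCLEZ x once it has been reduced to a single operand below.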
+  SDValue SingleOp;
+  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+    SingleOp = Op0;
+  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
+    if (Opc == ARMISD::VCGE)
+      Opc = ARMISD::VCLEZ;
+    else if (Opc == ARMISD::VCGT)
+      Opc = ARMISD::VCLTZ;
+    SingleOp = Op1;
+  }
+
+  SDValue Result;
+  if (SingleOp.getNode()) {
+    switch (Opc) {
+    case ARMISD::VCEQ:
+      Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
+    case ARMISD::VCGE:
+      Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
+    case ARMISD::VCLEZ:
+      Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
+    case ARMISD::VCGT:
+      Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
+    case ARMISD::VCLTZ:
+      Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
+    default:
+      Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
+    }
+  } else {
+    Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
+  }
+
+  Result = DAG.getSExtOrTrunc(Result, dl, VT);
+
+  if (Invert)
+    Result = DAG.getNOT(dl, Result, VT);
+
+  return Result;
+}
+
+/// isNEONModifiedImm - Check if the specified splat value corresponds to a
+/// valid vector constant for a NEON instruction with a "modified immediate"
+/// operand (e.g., VMOV). If so, return the encoded value.
+static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+                                 unsigned SplatBitSize, SelectionDAG &DAG,
+                                 SDLoc dl, EVT &VT, bool is128Bits,
+                                 NEONModImmType type) {
+  unsigned OpCmode, Imm;
+
+  // SplatBitSize is set to the smallest size that splats the vector, so a
+  // zero vector will always have SplatBitSize == 8. However, NEON modified
+  // immediate instructions other than VMOV do not support the 8-bit encoding
+  // of a zero vector, and the default encoding of zero is supposed to be the
+  // 32-bit version.
+  if (SplatBits == 0)
+    SplatBitSize = 32;
+
+  switch (SplatBitSize) {
+  case 8:
+    if (type != VMOVModImm)
+      return SDValue();
+    // Any 1-byte value is OK. Op=0, Cmode=1110.
+    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
+    OpCmode = 0xe;
+    Imm = SplatBits;
+    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
+    break;
+
+  case 16:
+    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
+    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x00nn: Op=x, Cmode=100x.
+      OpCmode = 0x8;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0xnn00: Op=x, Cmode=101x.
+      OpCmode = 0xa;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    return SDValue();
+
+  case 32:
+    // NEON's 32-bit VMOV supports splat values where:
+    // * only one byte is nonzero, or
+    // * the least significant byte is 0xff and the second byte is nonzero, or
+    // * the least significant 2 bytes are 0xff and the third is nonzero.
+    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x000000nn: Op=x, Cmode=000x.
+      OpCmode = 0;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0x0000nn00: Op=x, Cmode=001x.
+      OpCmode = 0x2;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    if ((SplatBits & ~0xff0000) == 0) {
+      // Value = 0x00nn0000: Op=x, Cmode=010x.
+      OpCmode = 0x4;
+      Imm = SplatBits >> 16;
+      break;
+    }
+    if ((SplatBits & ~0xff000000) == 0) {
+      // Value = 0xnn000000: Op=x, Cmode=011x.
+ OpCmode = 0x6; + Imm = SplatBits >> 24; + break; + } + + // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC + if (type == OtherModImm) return SDValue(); + + if ((SplatBits & ~0xffff) == 0 && + ((SplatBits | SplatUndef) & 0xff) == 0xff) { + // Value = 0x0000nnff: Op=x, Cmode=1100. + OpCmode = 0xc; + Imm = SplatBits >> 8; + break; + } + + if ((SplatBits & ~0xffffff) == 0 && + ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { + // Value = 0x00nnffff: Op=x, Cmode=1101. + OpCmode = 0xd; + Imm = SplatBits >> 16; + break; + } + + // Note: there are a few 32-bit splat values (specifically: 00ffff00, + // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not + // VMOV.I32. A (very) minor optimization would be to replicate the value + // and fall through here to test for a valid 64-bit splat. But, then the + // caller would also need to check and handle the change in size. + return SDValue(); + + case 64: { + if (type != VMOVModImm) + return SDValue(); + // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. + uint64_t BitMask = 0xff; + uint64_t Val = 0; + unsigned ImmMask = 1; + Imm = 0; + for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { + if (((SplatBits | SplatUndef) & BitMask) == BitMask) { + Val |= BitMask; + Imm |= ImmMask; + } else if ((SplatBits & BitMask) != 0) { + return SDValue(); + } + BitMask <<= 8; + ImmMask <<= 1; + } + + if (DAG.getDataLayout().isBigEndian()) + // swap higher and lower 32 bit word + Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); + + // Op=1, Cmode=1110. + OpCmode = 0x1e; + VT = is128Bits ? MVT::v2i64 : MVT::v1i64; + break; + } + + default: + llvm_unreachable("unexpected size for isNEONModifiedImm"); + } + + unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); + return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); +} + +SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const { + if (!ST->hasVFP3()) + return SDValue(); + + bool IsDouble = Op.getValueType() == MVT::f64; + ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); + + // Use the default (constant pool) lowering for double constants when we have + // an SP-only FPU + if (IsDouble && Subtarget->isFPOnlySP()) + return SDValue(); + + // Try splatting with a VMOV.f32... + APFloat FPVal = CFP->getValueAPF(); + int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); + + if (ImmVal != -1) { + if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { + // We have code in place to select a valid ConstantFP already, no need to + // do any mangling. + return Op; + } + + // It's a float and we are trying to use NEON operations where + // possible. Lower it to a splat followed by an extract. + SDLoc DL(Op); + SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); + SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, + NewVal); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, + DAG.getConstant(0, DL, MVT::i32)); + } + + // The rest of our options are NEON only, make sure that's allowed before + // proceeding.. + if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) + return SDValue(); + + EVT VMovVT; + uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); + + // It wouldn't really be worth bothering for doubles except for one very + // important value, which does happen to match: 0.0. So make sure we don't do + // anything stupid. 
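+  // (A VMOV.i32 immediate replicates a single 32-bit pattern across the
+  // register, so a double is only a candidate when its two words are equal.)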
+ if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) + return SDValue(); + + // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). + SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), + VMovVT, false, VMOVModImm); + if (NewVal != SDValue()) { + SDLoc DL(Op); + SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, + NewVal); + if (IsDouble) + return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); + + // It's a float: cast and extract a vector element. + SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, + VecConstant); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, + DAG.getConstant(0, DL, MVT::i32)); + } + + // Finally, try a VMVN.i32 + NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, + false, VMVNModImm); + if (NewVal != SDValue()) { + SDLoc DL(Op); + SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); + + if (IsDouble) + return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); + + // It's a float: cast and extract a vector element. + SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, + VecConstant); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, + DAG.getConstant(0, DL, MVT::i32)); + } + + return SDValue(); +} + +// check if an VEXT instruction can handle the shuffle mask when the +// vector sources of the shuffle are the same. +static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { + unsigned NumElts = VT.getVectorNumElements(); + + // Assume that the first shuffle index is not UNDEF. Fail if it is. + if (M[0] < 0) + return false; + + Imm = M[0]; + + // If this is a VEXT shuffle, the immediate value is the index of the first + // element. The other shuffle indices must be the successive elements after + // the first one. + unsigned ExpectedElt = Imm; + for (unsigned i = 1; i < NumElts; ++i) { + // Increment the expected index. If it wraps around, just follow it + // back to index zero and keep going. + ++ExpectedElt; + if (ExpectedElt == NumElts) + ExpectedElt = 0; + + if (M[i] < 0) continue; // ignore UNDEF indices + if (ExpectedElt != static_cast<unsigned>(M[i])) + return false; + } + + return true; +} + + +static bool isVEXTMask(ArrayRef<int> M, EVT VT, + bool &ReverseVEXT, unsigned &Imm) { + unsigned NumElts = VT.getVectorNumElements(); + ReverseVEXT = false; + + // Assume that the first shuffle index is not UNDEF. Fail if it is. + if (M[0] < 0) + return false; + + Imm = M[0]; + + // If this is a VEXT shuffle, the immediate value is the index of the first + // element. The other shuffle indices must be the successive elements after + // the first one. + unsigned ExpectedElt = Imm; + for (unsigned i = 1; i < NumElts; ++i) { + // Increment the expected index. If it wraps around, it may still be + // a VEXT but the source vectors must be swapped. + ExpectedElt += 1; + if (ExpectedElt == NumElts * 2) { + ExpectedElt = 0; + ReverseVEXT = true; + } + + if (M[i] < 0) continue; // ignore UNDEF indices + if (ExpectedElt != static_cast<unsigned>(M[i])) + return false; + } + + // Adjust the index value if the source operands will be swapped. + if (ReverseVEXT) + Imm -= NumElts; + + return true; +} + +/// isVREVMask - Check if a vector shuffle corresponds to a VREV +/// instruction with the specified blocksize. (The order of the elements +/// within each block of the vector is reversed.) 
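+/// For example, VREV64 on v4i16 corresponds to the mask <3, 2, 1, 0>, and
+/// VREV32 on v8i8 to the mask <3, 2, 1, 0, 7, 6, 5, 4>.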
+static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { + assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && + "Only possible block sizes for VREV are: 16, 32, 64"); + + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned BlockElts = M[0] + 1; + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSz; + + if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + if (M[i] < 0) continue; // ignore UNDEF indices + if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) + return false; + } + + return true; +} + +static bool isVTBLMask(ArrayRef<int> M, EVT VT) { + // We can handle <8 x i8> vector shuffles. If the index in the mask is out of + // range, then 0 is placed into the resulting vector. So pretty much any mask + // of 8 elements can work here. + return VT == MVT::v8i8 && M.size() == 8; +} + +// Checks whether the shuffle mask represents a vector transpose (VTRN) by +// checking that pairs of elements in the shuffle mask represent the same index +// in each vector, incrementing the expected index by 2 at each step. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} +// v2={e,f,g,h} +// WhichResult gives the offset for each element in the mask based on which +// of the two results it belongs to. +// +// The transpose can be represented either as: +// result1 = shufflevector v1, v2, result1_shuffle_mask +// result2 = shufflevector v1, v2, result2_shuffle_mask +// where v1/v2 and the shuffle masks have the same number of elements +// (here WhichResult (see below) indicates which result is being checked) +// +// or as: +// results = shufflevector v1, v2, shuffle_mask +// where both results are returned in one vector and the shuffle mask has twice +// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we +// want to check the low half and high half of the shuffle mask as if it were +// the other case +static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + // If the mask is twice as long as the input vector then we need to check the + // upper and lower parts of the mask with a matching value for WhichResult + // FIXME: A mask with only even values will be rejected in case the first + // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only + // M[0] is used to determine WhichResult + for (unsigned i = 0; i < M.size(); i += NumElts) { + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) + return false; + } + } + + if (M.size() == NumElts*2) + WhichResult = 0; + + return true; +} + +/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 
+static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    if (M.size() == NumElts * 2)
+      WhichResult = i / NumElts;
+    else
+      WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
+        return false;
+    }
+  }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
+  return true;
+}
+
+// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
+// that the mask elements are either all even and in steps of size 2 or all odd
+// and in steps of size 2.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
+//  v2={e,f,g,h}
+// Requires similar checks to that of isVTRNMask with
+// respect to how results are returned.
+static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; ++j) {
+      if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
+        return false;
+    }
+  }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
+static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  unsigned Half = NumElts / 2;
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; j += Half) {
+      unsigned Idx = WhichResult;
+      for (unsigned k = 0; k < Half; ++k) {
+        int MIdx = M[i + j + k];
+        if (MIdx >= 0 && (unsigned) MIdx != Idx)
+          return false;
+        Idx += 2;
+      }
+    }
+  }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
+// that pairs of elements of the shufflemask represent the same index in each
+// vector incrementing sequentially through the vectors.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
+//  v2={e,f,g,h}
+// Requires similar checks to that of isVTRNMask with respect to how results
+// are returned.
+static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    unsigned Idx = WhichResult * NumElts / 2;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
+        return false;
+      Idx += 1;
+    }
+  }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    unsigned Idx = WhichResult * NumElts / 2;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
+        return false;
+      Idx += 1;
+    }
+  }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
+/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
+static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
+                                           unsigned &WhichResult,
+                                           bool &isV_UNDEF) {
+  isV_UNDEF = false;
+  if (isVTRNMask(ShuffleMask, VT, WhichResult))
+    return ARMISD::VTRN;
+  if (isVUZPMask(ShuffleMask, VT, WhichResult))
+    return ARMISD::VUZP;
+  if (isVZIPMask(ShuffleMask, VT, WhichResult))
+    return ARMISD::VZIP;
+
+  isV_UNDEF = true;
+  if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
+    return ARMISD::VTRN;
+  if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+    return ARMISD::VUZP;
+  if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+    return ARMISD::VZIP;
+
+  return 0;
+}
+
+/// \return true if this is a reverse operation on a vector.
+static bool isReverseMask(ArrayRef<int> M, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  // Make sure the mask has the right size.
+  if (NumElts != M.size())
+    return false;
+
+  // Look for <15, ..., 3, -1, 1, 0>.
+  for (unsigned i = 0; i != NumElts; ++i)
+    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
+      return false;
+
+  return true;
+}
+
+// If N is an integer constant that can be moved into a register in one
+// instruction, return an SDValue of such a constant (will become a MOV
+// instruction). Otherwise return null.
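+// For example, in ARM mode 0x00ab0000 is a single MOV (an 8-bit value
+// rotated right by an even amount), and 0xffffff00 is a single MVN.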
+static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, + const ARMSubtarget *ST, SDLoc dl) { + uint64_t Val; + if (!isa<ConstantSDNode>(N)) + return SDValue(); + Val = cast<ConstantSDNode>(N)->getZExtValue(); + + if (ST->isThumb1Only()) { + if (Val <= 255 || ~Val <= 255) + return DAG.getConstant(Val, dl, MVT::i32); + } else { + if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) + return DAG.getConstant(Val, dl, MVT::i32); + } + return SDValue(); +} + +// If this is a case we can't handle, return null and let the default +// expansion code take care of it. +SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const { + BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + // Check if an immediate VMOV works. + EVT VmovVT; + SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, dl, VmovVT, VT.is128BitVector(), + VMOVModImm); + if (Val.getNode()) { + SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); + } + + // Try an immediate VMVN. + uint64_t NegatedImm = (~SplatBits).getZExtValue(); + Val = isNEONModifiedImm(NegatedImm, + SplatUndef.getZExtValue(), SplatBitSize, + DAG, dl, VmovVT, VT.is128BitVector(), + VMVNModImm); + if (Val.getNode()) { + SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); + } + + // Use vmov.f32 to materialize other v2f32 and v4f32 splats. + if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { + int ImmVal = ARM_AM::getFP32Imm(SplatBits); + if (ImmVal != -1) { + SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); + return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); + } + } + } + } + + // Scan through the operands to see if only one value is used. + // + // As an optimisation, even if more than one value is used it may be more + // profitable to splat with one value then change some lanes. + // + // Heuristically we decide to do this if the vector has a "dominant" value, + // defined as splatted to more than half of the lanes. + unsigned NumElts = VT.getVectorNumElements(); + bool isOnlyLowElement = true; + bool usesOnlyOneValue = true; + bool hasDominantValue = false; + bool isConstant = true; + + // Map of the number of times a particular SDValue appears in the + // element list. + DenseMap<SDValue, unsigned> ValueCounts; + SDValue Value; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (i > 0) + isOnlyLowElement = false; + if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) + isConstant = false; + + ValueCounts.insert(std::make_pair(V, 0)); + unsigned &Count = ValueCounts[V]; + + // Is this value dominant? (takes up more than half of the lanes) + if (++Count > (NumElts / 2)) { + hasDominantValue = true; + Value = V; + } + } + if (ValueCounts.size() != 1) + usesOnlyOneValue = false; + if (!Value.getNode() && ValueCounts.size() > 0) + Value = ValueCounts.begin()->first; + + if (ValueCounts.size() == 0) + return DAG.getUNDEF(VT); + + // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. + // Keep going if we are hitting this case. 
+ if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + + // Use VDUP for non-constant splats. For f32 constant splats, reduce to + // i32 and try again. + if (hasDominantValue && EltSize <= 32) { + if (!isConstant) { + SDValue N; + + // If we are VDUPing a value that comes directly from a vector, that will + // cause an unnecessary move to and from a GPR, where instead we could + // just use VDUPLANE. We can only do this if the lane being extracted + // is at a constant index, as the VDUP from lane instructions only have + // constant-index forms. + ConstantSDNode *constIndex; + if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { + // We need to create a new undef vector to use for the VDUPLANE if the + // size of the vector from which we get the value is different than the + // size of the vector that we need to create. We will insert the element + // such that the register coalescer will remove unnecessary copies. + if (VT != Value->getOperand(0).getValueType()) { + unsigned index = constIndex->getAPIntValue().getLimitedValue() % + VT.getVectorNumElements(); + N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), + Value, DAG.getConstant(index, dl, MVT::i32)), + DAG.getConstant(index, dl, MVT::i32)); + } else + N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, + Value->getOperand(0), Value->getOperand(1)); + } else + N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); + + if (!usesOnlyOneValue) { + // The dominant value was splatted as 'N', but we now have to insert + // all differing elements. + for (unsigned I = 0; I < NumElts; ++I) { + if (Op.getOperand(I) == Value) + continue; + SmallVector<SDValue, 3> Ops; + Ops.push_back(N); + Ops.push_back(Op.getOperand(I)); + Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); + N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); + } + } + return N; + } + if (VT.getVectorElementType().isFloatingPoint()) { + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, + Op.getOperand(i))); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); + Val = LowerBUILD_VECTOR(Val, DAG, ST); + if (Val.getNode()) + return DAG.getNode(ISD::BITCAST, dl, VT, Val); + } + if (usesOnlyOneValue) { + SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); + if (isConstant && Val.getNode()) + return DAG.getNode(ARMISD::VDUP, dl, VT, Val); + } + } + + // If all elements are constants and the case above didn't get hit, fall back + // to the default expansion, which will generate a load from the constant + // pool. + if (isConstant) + return SDValue(); + + // Empirical tests suggest this is rarely worth it for vectors of length <= 2. + if (NumElts >= 4) { + SDValue shuffle = ReconstructShuffle(Op, DAG); + if (shuffle != SDValue()) + return shuffle; + } + + // Vectors with 32- or 64-bit elements can be built by directly assigning + // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands + // will be legalized. + if (EltSize >= 32) { + // Do the expansion with floating-point types, since that is what the VFP + // registers are defined to use, and since i64 is not legal. 
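+    // (For example, a v2i64 BUILD_VECTOR becomes an ARMISD::BUILD_VECTOR of
+    // two f64 bitcasts, and the v2f64 result is bitcast back to v2i64.)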
+ EVT EltVT = EVT::getFloatingPointVT(EltSize); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); + return DAG.getNode(ISD::BITCAST, dl, VT, Val); + } + + // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we + // know the default expansion would otherwise fall back on something even + // worse. For a vector with one or two non-undef values, that's + // scalar_to_vector for the elements followed by a shuffle (provided the + // shuffle is valid for the target) and materialization element by element + // on the stack followed by a load for everything else. + if (!isConstant && !usesOnlyOneValue) { + SDValue Vec = DAG.getUNDEF(VT); + for (unsigned i = 0 ; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); + } + return Vec; + } + + return SDValue(); +} + +// Gather data to see if the operation can be modelled as a +// shuffle in combination with VEXTs. +SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + + struct ShuffleSourceInfo { + SDValue Vec; + unsigned MinElt; + unsigned MaxElt; + + // We may insert some combination of BITCASTs and VEXT nodes to force Vec to + // be compatible with the shuffle we intend to construct. As a result + // ShuffleVec will be some sliding window into the original Vec. + SDValue ShuffleVec; + + // Code should guarantee that element i in Vec starts at element "WindowBase + // + i * WindowScale in ShuffleVec". + int WindowBase; + int WindowScale; + + bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } + ShuffleSourceInfo(SDValue Vec) + : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), + WindowScale(1) {} + }; + + // First gather all vectors used as an immediate source for this BUILD_VECTOR + // node. + SmallVector<ShuffleSourceInfo, 2> Sources; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { + // A shuffle can only come from building a vector from various + // elements of other vectors. + return SDValue(); + } else if (!isa<ConstantSDNode>(V.getOperand(1))) { + // Furthermore, shuffles require a constant mask, whereas extractelts + // accept variable indices. + return SDValue(); + } + + // Add this element source to the list if it's not already there. + SDValue SourceVec = V.getOperand(0); + auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); + if (Source == Sources.end()) + Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); + + // Update the minimum and maximum lane number seen. + unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); + Source->MinElt = std::min(Source->MinElt, EltNo); + Source->MaxElt = std::max(Source->MaxElt, EltNo); + } + + // Currently only do something sane when at most two source vectors + // are involved. 
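+  // (A BUILD_VECTOR drawing on three or more distinct vectors cannot be
+  // expressed as one two-input shuffle, so we bail out below.)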
+ if (Sources.size() > 2) + return SDValue(); + + // Find out the smallest element size among result and two sources, and use + // it as element size to build the shuffle_vector. + EVT SmallestEltTy = VT.getVectorElementType(); + for (auto &Source : Sources) { + EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); + if (SrcEltTy.bitsLT(SmallestEltTy)) + SmallestEltTy = SrcEltTy; + } + unsigned ResMultiplier = + VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); + NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); + EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); + + // If the source vector is too wide or too narrow, we may nevertheless be able + // to construct a compatible shuffle either by concatenating it with UNDEF or + // extracting a suitable range of elements. + for (auto &Src : Sources) { + EVT SrcVT = Src.ShuffleVec.getValueType(); + + if (SrcVT.getSizeInBits() == VT.getSizeInBits()) + continue; + + // This stage of the search produces a source with the same element type as + // the original, but with a total width matching the BUILD_VECTOR output. + EVT EltVT = SrcVT.getVectorElementType(); + unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); + EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); + + if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { + if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) + return SDValue(); + // We can pad out the smaller vector for free, so if it's part of a + // shuffle... + Src.ShuffleVec = + DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, + DAG.getUNDEF(Src.ShuffleVec.getValueType())); + continue; + } + + if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) + return SDValue(); + + if (Src.MaxElt - Src.MinElt >= NumSrcElts) { + // Span too large for a VEXT to cope + return SDValue(); + } + + if (Src.MinElt >= NumSrcElts) { + // The extraction can just take the second half + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i32)); + Src.WindowBase = -NumSrcElts; + } else if (Src.MaxElt < NumSrcElts) { + // The extraction can just take the first half + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i32)); + } else { + // An actual VEXT is needed + SDValue VEXTSrc1 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i32)); + SDValue VEXTSrc2 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i32)); + + Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, + VEXTSrc2, + DAG.getConstant(Src.MinElt, dl, MVT::i32)); + Src.WindowBase = -Src.MinElt; + } + } + + // Another possible incompatibility occurs from the vector element types. We + // can fix this by bitcasting the source vectors to the same type we intend + // for the shuffle. + for (auto &Src : Sources) { + EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); + if (SrcEltTy == SmallestEltTy) + continue; + assert(ShuffleVT.getVectorElementType() == SmallestEltTy); + Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); + Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); + Src.WindowBase *= Src.WindowScale; + } + + // Final sanity check before we try to actually produce a shuffle. 
+ DEBUG( + for (auto Src : Sources) + assert(Src.ShuffleVec.getValueType() == ShuffleVT); + ); + + // The stars all align, our next step is to produce the mask for the shuffle. + SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); + int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); + for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { + SDValue Entry = Op.getOperand(i); + if (Entry.getOpcode() == ISD::UNDEF) + continue; + + auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); + int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); + + // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit + // trunc. So only std::min(SrcBits, DestBits) actually get defined in this + // segment. + EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); + int BitsDefined = std::min(OrigEltTy.getSizeInBits(), + VT.getVectorElementType().getSizeInBits()); + int LanesDefined = BitsDefined / BitsPerShuffleLane; + + // This source is expected to fill ResMultiplier lanes of the final shuffle, + // starting at the appropriate offset. + int *LaneMask = &Mask[i * ResMultiplier]; + + int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; + ExtractBase += NumElts * (Src - Sources.begin()); + for (int j = 0; j < LanesDefined; ++j) + LaneMask[j] = ExtractBase + j; + } + + // Final check before we try to produce nonsense... + if (!isShuffleMaskLegal(Mask, ShuffleVT)) + return SDValue(); + + // We can't handle more than two sources. This should have already + // been checked before this point. + assert(Sources.size() <= 2 && "Too many sources!"); + + SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; + for (unsigned i = 0; i < Sources.size(); ++i) + ShuffleOps[i] = Sources[i].ShuffleVec; + + SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], + ShuffleOps[1], &Mask[0]); + return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); +} + +/// isShuffleMaskLegal - Targets can use this to indicate that they only +/// support *some* VECTOR_SHUFFLE operations, those with specific masks. +/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values +/// are assumed to be legal. +bool +ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, + EVT VT) const { + if (VT.getVectorNumElements() == 4 && + (VT.is128BitVector() || VT.is64BitVector())) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (M[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = M[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = + PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return true; + } + + bool ReverseVEXT, isV_UNDEF; + unsigned Imm, WhichResult; + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + return (EltSize >= 32 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + isVREVMask(M, VT, 64) || + isVREVMask(M, VT, 32) || + isVREVMask(M, VT, 16) || + isVEXTMask(M, VT, ReverseVEXT, Imm) || + isVTBLMask(M, VT) || + isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || + ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); +} + +/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit +/// the specified operations to build the shuffle. 
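+/// Each table entry packs a cost in its top two bits, an opcode in the next
+/// four, and two 13-bit operand IDs; an ID encodes a four-lane mask as
+/// base-9 digits with 8 standing for undef, so the identity <0,1,2,3> is
+/// (1*9+2)*9+3.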
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, + SDValue RHS, SelectionDAG &DAG, + SDLoc dl) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); + + enum { + OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> + OP_VREV, + OP_VDUP0, + OP_VDUP1, + OP_VDUP2, + OP_VDUP3, + OP_VEXT1, + OP_VEXT2, + OP_VEXT3, + OP_VUZPL, // VUZP, left result + OP_VUZPR, // VUZP, right result + OP_VZIPL, // VZIP, left result + OP_VZIPR, // VZIP, right result + OP_VTRNL, // VTRN, left result + OP_VTRNR // VTRN, right result + }; + + if (OpNum == OP_COPY) { + if (LHSID == (1*9+2)*9+3) return LHS; + assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); + return RHS; + } + + SDValue OpLHS, OpRHS; + OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + EVT VT = OpLHS.getValueType(); + + switch (OpNum) { + default: llvm_unreachable("Unknown shuffle opcode!"); + case OP_VREV: + // VREV divides the vector in half and swaps within the half. + if (VT.getVectorElementType() == MVT::i32 || + VT.getVectorElementType() == MVT::f32) + return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); + // vrev <4 x i16> -> VREV32 + if (VT.getVectorElementType() == MVT::i16) + return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); + // vrev <4 x i8> -> VREV16 + assert(VT.getVectorElementType() == MVT::i8); + return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); + case OP_VDUP0: + case OP_VDUP1: + case OP_VDUP2: + case OP_VDUP3: + return DAG.getNode(ARMISD::VDUPLANE, dl, VT, + OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); + case OP_VEXT1: + case OP_VEXT2: + case OP_VEXT3: + return DAG.getNode(ARMISD::VEXT, dl, VT, + OpLHS, OpRHS, + DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); + case OP_VUZPL: + case OP_VUZPR: + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); + case OP_VZIPL: + case OP_VZIPR: + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); + case OP_VTRNL: + case OP_VTRNR: + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); + } +} + +static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, + ArrayRef<int> ShuffleMask, + SelectionDAG &DAG) { + // Check to see if we can use the VTBL instruction. + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc DL(Op); + + SmallVector<SDValue, 8> VTBLMask; + for (ArrayRef<int>::iterator + I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) + VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); + + if (V2.getNode()->getOpcode() == ISD::UNDEF) + return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask)); + + return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask)); +} + +static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, + SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue OpLHS = Op.getOperand(0); + EVT VT = OpLHS.getValueType(); + + assert((VT == MVT::v8i16 || VT == MVT::v16i8) && + "Expect an v8i16/v16i8 type"); + OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); + // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. 
Now,
+  // extract the first 8 bytes into the top double word and the last 8 bytes
+  // into the bottom double word. The v8i16 case is similar.
+  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
+  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
+                     DAG.getConstant(ExtractNum, DL, MVT::i32));
+}
+
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+  // Convert shuffles that are directly supported on NEON to target-specific
+  // DAG nodes, instead of keeping them as shuffles and matching them again
+  // during code selection. This is more efficient and avoids the possibility
+  // of inconsistencies between legalization and selection.
+  // FIXME: floating-point vectors should be canonicalized to integer vectors
+  // of the same size so that they get CSEd properly.
+  ArrayRef<int> ShuffleMask = SVN->getMask();
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (EltSize <= 32) {
+    if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
+      int Lane = SVN->getSplatIndex();
+      // If this is undef splat, generate it via "just" vdup, if possible.
+      if (Lane == -1) Lane = 0;
+
+      // Test if V1 is a SCALAR_TO_VECTOR.
+      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+      }
+      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
+      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
+      // reaches it).
+      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
+          !isa<ConstantSDNode>(V1.getOperand(0))) {
+        bool IsScalarToVector = true;
+        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
+          if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
+            IsScalarToVector = false;
+            break;
+          }
+        if (IsScalarToVector)
+          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+      }
+      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
+                         DAG.getConstant(Lane, dl, MVT::i32));
+    }
+
+    bool ReverseVEXT;
+    unsigned Imm;
+    if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
+      if (ReverseVEXT)
+        std::swap(V1, V2);
+      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
+                         DAG.getConstant(Imm, dl, MVT::i32));
+    }
+
+    if (isVREVMask(ShuffleMask, VT, 64))
+      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
+    if (isVREVMask(ShuffleMask, VT, 32))
+      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
+    if (isVREVMask(ShuffleMask, VT, 16))
+      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
+
+    if (V2->getOpcode() == ISD::UNDEF &&
+        isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
+                         DAG.getConstant(Imm, dl, MVT::i32));
+    }
+
+    // Check for Neon shuffles that modify both input vectors in place.
+    // If both results are used, i.e., if there are two shuffles with the same
+    // source operands and with masks corresponding to both results of one of
+    // these operations, DAG memoization will ensure that a single node is
+    // used for both shuffles.
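+    // (For v4i32, the masks <0, 4, 2, 6> and <1, 5, 3, 7> both map to one
+    // VTRN node; the first shuffle takes result value 0, the second value 1.)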
+ unsigned WhichResult; + bool isV_UNDEF; + if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( + ShuffleMask, VT, WhichResult, isV_UNDEF)) { + if (isV_UNDEF) + V2 = V1; + return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) + .getValue(WhichResult); + } + + // Also check for these shuffles through CONCAT_VECTORS: we canonicalize + // shuffles that produce a result larger than their operands with: + // shuffle(concat(v1, undef), concat(v2, undef)) + // -> + // shuffle(concat(v1, v2), undef) + // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). + // + // This is useful in the general case, but there are special cases where + // native shuffles produce larger results: the two-result ops. + // + // Look through the concat when lowering them: + // shuffle(concat(v1, v2), undef) + // -> + // concat(VZIP(v1, v2):0, :1) + // + if (V1->getOpcode() == ISD::CONCAT_VECTORS && + V2->getOpcode() == ISD::UNDEF) { + SDValue SubV1 = V1->getOperand(0); + SDValue SubV2 = V1->getOperand(1); + EVT SubVT = SubV1.getValueType(); + + // We expect these to have been canonicalized to -1. + assert(std::all_of(ShuffleMask.begin(), ShuffleMask.end(), [&](int i) { + return i < (int)VT.getVectorNumElements(); + }) && "Unexpected shuffle index into UNDEF operand!"); + + if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( + ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { + if (isV_UNDEF) + SubV2 = SubV1; + assert((WhichResult == 0) && + "In-place shuffle of concat can only have one result!"); + SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), + SubV1, SubV2); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), + Res.getValue(1)); + } + } + } + + // If the shuffle is not directly supported and it has 4 elements, use + // the PerfectShuffle-generated table to synthesize it from other shuffles. + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 4) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (ShuffleMask[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = ShuffleMask[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = + PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + + // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. + if (EltSize >= 32) { + // Do the expansion with floating-point types, since that is what the VFP + // registers are defined to use, and since i64 is not legal. + EVT EltVT = EVT::getFloatingPointVT(EltSize); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); + V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < NumElts; ++i) { + if (ShuffleMask[i] < 0) + Ops.push_back(DAG.getUNDEF(EltVT)); + else + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, + ShuffleMask[i] < (int)NumElts ? 
V1 : V2, + DAG.getConstant(ShuffleMask[i] & (NumElts-1), + dl, MVT::i32))); + } + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); + return DAG.getNode(ISD::BITCAST, dl, VT, Val); + } + + if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) + return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); + + if (VT == MVT::v8i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG); + if (NewOp.getNode()) + return NewOp; + } + + return SDValue(); +} + +static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { + // INSERT_VECTOR_ELT is legal only for immediate indexes. + SDValue Lane = Op.getOperand(2); + if (!isa<ConstantSDNode>(Lane)) + return SDValue(); + + return Op; +} + +static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { + // EXTRACT_VECTOR_ELT is legal only for immediate indexes. + SDValue Lane = Op.getOperand(1); + if (!isa<ConstantSDNode>(Lane)) + return SDValue(); + + SDValue Vec = Op.getOperand(0); + if (Op.getValueType() == MVT::i32 && + Vec.getValueType().getVectorElementType().getSizeInBits() < 32) { + SDLoc dl(Op); + return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); + } + + return Op; +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { + // The only time a CONCAT_VECTORS operation can have legal types is when + // two 64-bit vectors are concatenated to a 128-bit vector. + assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && + "unexpected CONCAT_VECTORS"); + SDLoc dl(Op); + SDValue Val = DAG.getUNDEF(MVT::v2f64); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + if (Op0.getOpcode() != ISD::UNDEF) + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, + DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), + DAG.getIntPtrConstant(0, dl)); + if (Op1.getOpcode() != ISD::UNDEF) + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, + DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), + DAG.getIntPtrConstant(1, dl)); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); +} + +/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each +/// element has been zero/sign-extended, depending on the isSigned parameter, +/// from an integer type half its size. +static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, + bool isSigned) { + // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. + EVT VT = N->getValueType(0); + if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { + SDNode *BVN = N->getOperand(0).getNode(); + if (BVN->getValueType(0) != MVT::v4i32 || + BVN->getOpcode() != ISD::BUILD_VECTOR) + return false; + unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 
1 : 0; + unsigned HiElt = 1 - LoElt; + ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); + ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); + ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); + ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); + if (!Lo0 || !Hi0 || !Lo1 || !Hi1) + return false; + if (isSigned) { + if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && + Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) + return true; + } else { + if (Hi0->isNullValue() && Hi1->isNullValue()) + return true; + } + return false; + } + + if (N->getOpcode() != ISD::BUILD_VECTOR) + return false; + + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + SDNode *Elt = N->getOperand(i).getNode(); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + unsigned HalfSize = EltSize / 2; + if (isSigned) { + if (!isIntN(HalfSize, C->getSExtValue())) + return false; + } else { + if (!isUIntN(HalfSize, C->getZExtValue())) + return false; + } + continue; + } + return false; + } + + return true; +} + +/// isSignExtended - Check if a node is a vector value that is sign-extended +/// or a constant BUILD_VECTOR with sign-extended elements. +static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) + return true; + if (isExtendedBUILD_VECTOR(N, DAG, true)) + return true; + return false; +} + +/// isZeroExtended - Check if a node is a vector value that is zero-extended +/// or a constant BUILD_VECTOR with zero-extended elements. +static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) + return true; + if (isExtendedBUILD_VECTOR(N, DAG, false)) + return true; + return false; +} + +static EVT getExtensionTo64Bits(const EVT &OrigVT) { + if (OrigVT.getSizeInBits() >= 64) + return OrigVT; + + assert(OrigVT.isSimple() && "Expecting a simple value type"); + + MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; + switch (OrigSimpleTy) { + default: llvm_unreachable("Unexpected Vector Type"); + case MVT::v2i8: + case MVT::v2i16: + return MVT::v2i32; + case MVT::v4i8: + return MVT::v4i16; + } +} + +/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total +/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. +/// We insert the required extension here to get the vector to fill a D register. +static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, + const EVT &OrigTy, + const EVT &ExtTy, + unsigned ExtOpcode) { + // The vector originally had a size of OrigTy. It was then extended to ExtTy. + // We expect the ExtTy to be 128-bits total. If the OrigTy is less than + // 64-bits we need to insert a new extension so that it will be 64-bits. + assert(ExtTy.is128BitVector() && "Unexpected extension size"); + if (OrigTy.getSizeInBits() >= 64) + return N; + + // Must extend size to at least 64 bits to be used as an operand for VMULL. + EVT NewVT = getExtensionTo64Bits(OrigTy); + + return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); +} + +/// SkipLoadExtensionForVMULL - return a load of the original vector size that +/// does not do any sign/zero extension. If the original vector is less +/// than 64 bits, an appropriate extension will be added after the load to +/// reach a total size of 64 bits. 
We have to add the extension separately
+/// because ARM does not have a sign/zero extending load for vectors.
+static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
+  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
+
+  // The load already has the right type.
+  if (ExtendedTy == LD->getMemoryVT())
+    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
+                       LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
+                       LD->isNonTemporal(), LD->isInvariant(),
+                       LD->getAlignment());
+
+  // We need to create a zextload/sextload. We cannot just create a load
+  // followed by a zext/sext node because LowerMUL is also run during normal
+  // operation legalization where we can't create illegal types.
+  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
+                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
+                        LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(),
+                        LD->isInvariant(), LD->getAlignment());
+}
+
+/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
+/// extending load, or BUILD_VECTOR with extended elements, return the
+/// unextended value. The unextended vector should be 64 bits so that it can
+/// be used as an operand to a VMULL instruction. If the original vector size
+/// before extension is less than 64 bits we add an extension to resize
+/// the vector to 64 bits.
+static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
+                                        N->getOperand(0)->getValueType(0),
+                                        N->getValueType(0),
+                                        N->getOpcode());
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return SkipLoadExtensionForVMULL(LD, DAG);
+
+  // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
+  // have been legalized as a BITCAST from v4i32.
+  if (N->getOpcode() == ISD::BITCAST) {
+    SDNode *BVN = N->getOperand(0).getNode();
+    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
+           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
+    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
+    return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32,
+                       BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
+  }
+  // Construct a new BUILD_VECTOR with elements truncated to half the size.
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+  EVT VT = N->getValueType(0);
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT TruncVT = MVT::getIntegerVT(EltSize);
+  SmallVector<SDValue, 8> Ops;
+  SDLoc dl(N);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+    const APInt &CInt = C->getAPIntValue();
+    // Element types smaller than 32 bits are not legal, so use i32 elements.
+    // The values are implicitly truncated so sext vs. zext doesn't matter.
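+    // (For a v8i16 source, TruncVT is i8: each constant is carried as an
+    // i32 whose low byte is significant, and the vector built below is v8i8.)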
+ Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); + } + return DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::getVectorVT(TruncVT, NumElts), Ops); +} + +static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::ADD || Opcode == ISD::SUB) { + SDNode *N0 = N->getOperand(0).getNode(); + SDNode *N1 = N->getOperand(1).getNode(); + return N0->hasOneUse() && N1->hasOneUse() && + isSignExtended(N0, DAG) && isSignExtended(N1, DAG); + } + return false; +} + +static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::ADD || Opcode == ISD::SUB) { + SDNode *N0 = N->getOperand(0).getNode(); + SDNode *N1 = N->getOperand(1).getNode(); + return N0->hasOneUse() && N1->hasOneUse() && + isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); + } + return false; +} + +static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { + // Multiplications are only custom-lowered for 128-bit vectors so that + // VMULL can be detected. Otherwise v2i64 multiplications are not legal. + EVT VT = Op.getValueType(); + assert(VT.is128BitVector() && VT.isInteger() && + "unexpected type for custom-lowering ISD::MUL"); + SDNode *N0 = Op.getOperand(0).getNode(); + SDNode *N1 = Op.getOperand(1).getNode(); + unsigned NewOpc = 0; + bool isMLA = false; + bool isN0SExt = isSignExtended(N0, DAG); + bool isN1SExt = isSignExtended(N1, DAG); + if (isN0SExt && isN1SExt) + NewOpc = ARMISD::VMULLs; + else { + bool isN0ZExt = isZeroExtended(N0, DAG); + bool isN1ZExt = isZeroExtended(N1, DAG); + if (isN0ZExt && isN1ZExt) + NewOpc = ARMISD::VMULLu; + else if (isN1SExt || isN1ZExt) { + // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these + // into (s/zext A * s/zext C) + (s/zext B * s/zext C) + if (isN1SExt && isAddSubSExt(N0, DAG)) { + NewOpc = ARMISD::VMULLs; + isMLA = true; + } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { + NewOpc = ARMISD::VMULLu; + isMLA = true; + } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { + std::swap(N0, N1); + NewOpc = ARMISD::VMULLu; + isMLA = true; + } + } + + if (!NewOpc) { + if (VT == MVT::v2i64) + // Fall through to expand this. It is not legal. + return SDValue(); + else + // Other vector multiplications are legal. + return Op; + } + } + + // Legalize to a VMULL instruction. + SDLoc DL(Op); + SDValue Op0; + SDValue Op1 = SkipExtensionForVMULL(N1, DAG); + if (!isMLA) { + Op0 = SkipExtensionForVMULL(N0, DAG); + assert(Op0.getValueType().is64BitVector() && + Op1.getValueType().is64BitVector() && + "unexpected types for extended operands to VMULL"); + return DAG.getNode(NewOpc, DL, VT, Op0, Op1); + } + + // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during + // isel lowering to take advantage of no-stall back to back vmul + vmla. + // vmull q0, d4, d6 + // vmlal q0, d5, d6 + // is faster than + // vaddl q0, d4, d5 + // vmovl q1, d6 + // vmul q0, q0, q1 + SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); + SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); + EVT Op1VT = Op1.getValueType(); + return DAG.getNode(N0->getOpcode(), DL, VT, + DAG.getNode(NewOpc, DL, VT, + DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), + DAG.getNode(NewOpc, DL, VT, + DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); +} + +static SDValue +LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? 
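+  // (Added sketch, not in the original:) the sequence below evaluates
+  //   x / y  ~=  trunc(fp_to_sint(as_float(as_int(xf * vrecpe(yf)) + bias)))
+  // i.e. a single reciprocal-estimate multiply whose bit pattern is nudged
+  // by a fixed bias (0xb000) rather than refined with a Newton step; the
+  // comments below note this was validated exhaustively for the i8 range.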
+ + // Convert to float + // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); + // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); + X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); + Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); + X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); + Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); + // Get reciprocal estimate. + // float4 recip = vrecpeq_f32(yf); + Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), + Y); + // Because char has a smaller range than uchar, we can actually get away + // without any newton steps. This requires that we use a weird bias + // of 0xb000, however (again, this has been exhaustively tested). + // float4 result = as_float4(as_int4(xf*recip) + 0xb000); + X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); + X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); + Y = DAG.getConstant(0xb000, dl, MVT::i32); + Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y); + X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); + X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); + // Convert back to short. + X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); + X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); + return X; +} + +static SDValue +LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? + + SDValue N2; + // Convert to float. + // float4 yf = vcvt_f32_s32(vmovl_s16(y)); + // float4 xf = vcvt_f32_s32(vmovl_s16(x)); + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); + N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); + N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); + + // Use reciprocal estimate and one refinement step. + // float4 recip = vrecpeq_f32(yf); + // recip *= vrecpsq_f32(yf, recip); + N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), + N1); + N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), + N1, N2); + N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); + // Because short has a smaller range than ushort, we can actually get away + // with only a single newton step. This requires that we use a weird bias + // of 89, however (again, this has been exhaustively tested). + // float4 result = as_float4(as_int4(xf*recip) + 0x89); + N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); + N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); + N1 = DAG.getConstant(0x89, dl, MVT::i32); + N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); + N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); + N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); + // Convert back to integer and return. 
+  // return vmovn_s32(vcvt_s32_f32(result));
+  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+  return N0;
+}
+
+static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+         "unexpected type for custom-lowering ISD::SDIV");
+
+  SDLoc dl(Op);
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2, N3;
+
+  if (VT == MVT::v8i8) {
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
+    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
+
+    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(4, dl));
+    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(4, dl));
+    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(0, dl));
+    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(0, dl));
+
+    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
+    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
+
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+    N0 = LowerCONCAT_VECTORS(N0, DAG);
+
+    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
+    return N0;
+  }
+  return LowerSDIV_v4i16(N0, N1, dl, DAG);
+}
+
+static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
+  // TODO: Should this propagate fast-math-flags?
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+         "unexpected type for custom-lowering ISD::UDIV");
+
+  SDLoc dl(Op);
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2, N3;
+
+  if (VT == MVT::v8i8) {
+    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
+    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
+
+    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(4, dl));
+    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(4, dl));
+    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(0, dl));
+    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(0, dl));
+
+    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
+    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
+
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+    N0 = LowerCONCAT_VECTORS(N0, DAG);
+
+    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
+                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
+                                     MVT::i32),
+                     N0);
+    return N0;
+  }
+
+  // v4i16 udiv ... Convert to float.
+  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
+  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
+  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
+  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
+  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+
+  // Use reciprocal estimate and two refinement steps.
+ // float4 recip = vrecpeq_f32(yf); + // recip *= vrecpsq_f32(yf, recip); + // recip *= vrecpsq_f32(yf, recip); + N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), + BN1); + N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), + BN1, N2); + N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); + N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), + BN1, N2); + N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); + // Simply multiplying by the reciprocal estimate can leave us a few ulps + // too low, so we add 2 ulps (exhaustive testing shows that this is enough, + // and that it will never cause us to return an answer too large). + // float4 result = as_float4(as_int4(xf*recip) + 2); + N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); + N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); + N1 = DAG.getConstant(2, dl, MVT::i32); + N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); + N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); + N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); + // Convert back to integer and return. + // return vmovn_u32(vcvt_s32_f32(result)); + N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); + N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); + return N0; +} + +static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getNode()->getValueType(0); + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + unsigned Opc; + bool ExtraOp = false; + switch (Op.getOpcode()) { + default: llvm_unreachable("Invalid code"); + case ISD::ADDC: Opc = ARMISD::ADDC; break; + case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break; + case ISD::SUBC: Opc = ARMISD::SUBC; break; + case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break; + } + + if (!ExtraOp) + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), + Op.getOperand(1), Op.getOperand(2)); +} + +SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin()); + + // For iOS, we want to call an alternative entry point: __sincos_stret, + // return values are passed via sret. + SDLoc dl(Op); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Pair of floats / doubles used to pass the result. + Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr); + auto &DL = DAG.getDataLayout(); + + ArgListTy Args; + bool ShouldUseSRet = Subtarget->isAPCS_ABI(); + SDValue SRet; + if (ShouldUseSRet) { + // Create stack object for sret. 
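+    // (Added note:) under the APCS ABI the {sin, cos} aggregate is returned
+    // indirectly, so a stack slot is created here and its address passed as
+    // an sret pointer; on other ABIs the pair presumably comes back in the
+    // call's ordinary return registers and CallResult.first is used as-is.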
+ const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); + const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); + int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); + + ArgListEntry Entry; + Entry.Node = SRet; + Entry.Ty = RetTy->getPointerTo(); + Entry.isSExt = false; + Entry.isZExt = false; + Entry.isSRet = true; + Args.push_back(Entry); + RetTy = Type::getVoidTy(*DAG.getContext()); + } + + ArgListEntry Entry; + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + const char *LibcallName = + (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; + RTLIB::Libcall LC = + (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32; + CallingConv::ID CC = getLibcallCallingConv(LC); + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setCallee(CC, RetTy, Callee, std::move(Args), 0) + .setDiscardResult(ShouldUseSRet); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + + if (!ShouldUseSRet) + return CallResult.first; + + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, + MachinePointerInfo(), false, false, false, 0); + + // Address of cos field. + SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, + DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); + SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, + MachinePointerInfo(), false, false, false, 0); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, + LoadSin.getValue(0), LoadCos.getValue(0)); +} + +SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, + bool Signed, + SDValue &Chain) const { + EVT VT = Op.getValueType(); + assert((VT == MVT::i32 || VT == MVT::i64) && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + const auto &DL = DAG.getDataLayout(); + const auto &TLI = DAG.getTargetLoweringInfo(); + + const char *Name = nullptr; + if (Signed) + Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; + else + Name = (VT == MVT::i32) ? 
"__rt_udiv" : "__rt_udiv64"; + + SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); + + ARMTargetLowering::ArgListTy Args; + + for (auto AI : {1, 0}) { + ArgListEntry Arg; + Arg.Node = Op.getOperand(AI); + Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); + Args.push_back(Arg); + } + + CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), + ES, std::move(Args), 0); + + return LowerCallTo(CLI).first; +} + +SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + assert(Op.getValueType() == MVT::i32 && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, + DAG.getEntryNode(), Op.getOperand(1)); + + return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); +} + +void ARMTargetLowering::ExpandDIV_Windows( + SDValue Op, SelectionDAG &DAG, bool Signed, + SmallVectorImpl<SDValue> &Results) const { + const auto &DL = DAG.getDataLayout(); + const auto &TLI = DAG.getTargetLoweringInfo(); + + assert(Op.getValueType() == MVT::i64 && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), + DAG.getConstant(0, dl, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), + DAG.getConstant(1, dl, MVT::i32)); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, Lo, Hi); + + SDValue DBZCHK = + DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Or); + + SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); + + SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); + SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, + DAG.getConstant(32, dl, TLI.getPointerTy(DL))); + Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); + + Results.push_back(Lower); + Results.push_back(Upper); +} + +static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { + // Monotonic load/store is legal for all targets + if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic) + return Op; + + // Acquire/Release load/store is not legal for targets without a + // dmb or equivalent available. 
+ return SDValue(); +} + +static void ReplaceREADCYCLECOUNTER(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + SDLoc DL(N); + // Under Power Management extensions, the cycle-count is: + // mrc p15, #0, <Rt>, c9, c13, #0 + SDValue Ops[] = { N->getOperand(0), // Chain + DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getConstant(15, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(9, DL, MVT::i32), + DAG.getConstant(13, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32) + }; + + SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, + DAG.getVTList(MVT::i32, MVT::Other), Ops); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, + DAG.getConstant(0, DL, MVT::i32))); + Results.push_back(Cycles32.getValue(1)); +} + +SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: llvm_unreachable("Don't know how to custom lower this!"); + case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); + case ISD::GlobalAddress: + switch (Subtarget->getTargetTriple().getObjectFormat()) { + default: llvm_unreachable("unknown object format"); + case Triple::COFF: + return LowerGlobalAddressWindows(Op, DAG); + case Triple::ELF: + return LowerGlobalAddressELF(Op, DAG); + case Triple::MachO: + return LowerGlobalAddressDarwin(Op, DAG); + } + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::BR_CC: return LowerBR_CC(Op, DAG); + case ISD::BR_JT: return LowerBR_JT(Op, DAG); + case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); + case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); + case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, + Subtarget); + case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); + case ISD::SREM: return LowerREM(Op.getNode(), DAG); + case ISD::UREM: return LowerREM(Op.getNode(), DAG); + case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); + case ISD::SRL_PARTS: + case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); + case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); + case ISD::SETCC: return LowerVSETCC(Op, DAG); + case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return 
LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); + case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::UDIV: return LowerUDIV(Op, DAG); + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + return LowerXALUO(Op, DAG); + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); + case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::SDIVREM: + case ISD::UDIVREM: return LowerDivRem(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: + if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) + return LowerDYNAMIC_STACKALLOC(Op, DAG); + llvm_unreachable("Don't know how to custom lower this!"); + case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); + case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ARMISD::WIN__DBZCHK: return SDValue(); + } +} + +/// ReplaceNodeResults - Replace the results of node with an illegal result +/// type with new values built out of custom code. +void ARMTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + SDValue Res; + switch (N->getOpcode()) { + default: + llvm_unreachable("Don't know how to custom expand this!"); + case ISD::READ_REGISTER: + ExpandREAD_REGISTER(N, Results, DAG); + break; + case ISD::BITCAST: + Res = ExpandBITCAST(N, DAG); + break; + case ISD::SRL: + case ISD::SRA: + Res = Expand64BitShift(N, DAG, Subtarget); + break; + case ISD::SREM: + case ISD::UREM: + Res = LowerREM(N, DAG); + break; + case ISD::READCYCLECOUNTER: + ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); + return; + case ISD::UDIV: + case ISD::SDIV: + assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); + return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, + Results); + } + if (Res.getNode()) + Results.push_back(Res); +} + +//===----------------------------------------------------------------------===// +// ARM Scheduler Hooks +//===----------------------------------------------------------------------===// + +/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and +/// registers the function context. +void ARMTargetLowering:: +SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, + MachineBasicBlock *DispatchBB, int FI) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + MachineConstantPool *MCP = MF->getConstantPool(); + ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); + const Function *F = MF->getFunction(); + + bool isThumb = Subtarget->isThumb(); + bool isThumb2 = Subtarget->isThumb2(); + + unsigned PCLabelId = AFI->createPICLabelUId(); + unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; + ARMConstantPoolValue *CPV = + ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj); + unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); + + const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass + : &ARM::GPRRegClass; + + // Grab constant pool and fixed stack memory operands. 
+ MachineMemOperand *CPMMO = + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 4, 4); + + MachineMemOperand *FIMMOSt = + MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), + MachineMemOperand::MOStore, 4, 4); + + // Load the address of the dispatch MBB into the jump buffer. + if (isThumb2) { + // Incoming value: jbuf + // ldr.n r5, LCPI1_1 + // orr r5, r5, #1 + // add r5, pc + // str r5, [$jbuf, #+4] ; &jbuf[1] + unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) + .addConstantPoolIndex(CPI) + .addMemOperand(CPMMO)); + // Set the low bit because of thumb mode. + unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + AddDefaultCC( + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) + .addReg(NewVReg1, RegState::Kill) + .addImm(0x01))); + unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) + .addReg(NewVReg2, RegState::Kill) + .addImm(PCLabelId); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) + .addReg(NewVReg3, RegState::Kill) + .addFrameIndex(FI) + .addImm(36) // &jbuf[1] :: pc + .addMemOperand(FIMMOSt)); + } else if (isThumb) { + // Incoming value: jbuf + // ldr.n r1, LCPI1_4 + // add r1, pc + // mov r2, #1 + // orrs r1, r2 + // add r2, $jbuf, #+4 ; &jbuf[1] + // str r1, [r2] + unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) + .addConstantPoolIndex(CPI) + .addMemOperand(CPMMO)); + unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) + .addReg(NewVReg1, RegState::Kill) + .addImm(PCLabelId); + // Set the low bit because of thumb mode. 
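+    // (Added note:) on ARM, bit 0 of an interworking branch target selects
+    // Thumb state, so the resume address stored into the jump buffer must
+    // have its low bit set; the runtime presumably re-enters via a bx-style
+    // jump.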
+ unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) + .addReg(ARM::CPSR, RegState::Define) + .addImm(1)); + unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) + .addReg(ARM::CPSR, RegState::Define) + .addReg(NewVReg2, RegState::Kill) + .addReg(NewVReg3, RegState::Kill)); + unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) + .addFrameIndex(FI) + .addImm(36); // &jbuf[1] :: pc + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) + .addReg(NewVReg4, RegState::Kill) + .addReg(NewVReg5, RegState::Kill) + .addImm(0) + .addMemOperand(FIMMOSt)); + } else { + // Incoming value: jbuf + // ldr r1, LCPI1_1 + // add r1, pc, r1 + // str r1, [$jbuf, #+4] ; &jbuf[1] + unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) + .addConstantPoolIndex(CPI) + .addImm(0) + .addMemOperand(CPMMO)); + unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) + .addReg(NewVReg1, RegState::Kill) + .addImm(PCLabelId)); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) + .addReg(NewVReg2, RegState::Kill) + .addFrameIndex(FI) + .addImm(36) // &jbuf[1] :: pc + .addMemOperand(FIMMOSt)); + } +} + +void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, + MachineBasicBlock *MBB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + MachineFrameInfo *MFI = MF->getFrameInfo(); + int FI = MFI->getFunctionContextIndex(); + + const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass + : &ARM::GPRnopcRegClass; + + // Get a mapping of the call site numbers to all of the landing pads they're + // associated with. + DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad; + unsigned MaxCSNum = 0; + MachineModuleInfo &MMI = MF->getMMI(); + for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; + ++BB) { + if (!BB->isEHPad()) continue; + + // FIXME: We should assert that the EH_LABEL is the first MI in the landing + // pad. + for (MachineBasicBlock::iterator + II = BB->begin(), IE = BB->end(); II != IE; ++II) { + if (!II->isEHLabel()) continue; + + MCSymbol *Sym = II->getOperand(0).getMCSymbol(); + if (!MMI.hasCallSiteLandingPad(Sym)) continue; + + SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym); + for (SmallVectorImpl<unsigned>::iterator + CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); + CSI != CSE; ++CSI) { + CallSiteNumToLPad[*CSI].push_back(&*BB); + MaxCSNum = std::max(MaxCSNum, *CSI); + } + break; + } + } + + // Get an ordered list of the machine basic blocks for the jump table. 
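+  // (Added note:) call sites are numbered 1..MaxCSNum, and each number's
+  // landing pads are appended in that order, so the jump table built below
+  // is laid out in call-site order.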
+  std::vector<MachineBasicBlock*> LPadList;
+  SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs;
+  LPadList.reserve(CallSiteNumToLPad.size());
+  for (unsigned I = 1; I <= MaxCSNum; ++I) {
+    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
+    for (SmallVectorImpl<MachineBasicBlock*>::iterator
+           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
+      LPadList.push_back(*II);
+      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
+    }
+  }
+
+  assert(!LPadList.empty() &&
+         "No landing pad destinations for the dispatch jump table!");
+
+  // Create the jump table and associated information.
+  MachineJumpTableInfo *JTI =
+    MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
+  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+
+  // Create the MBBs for the dispatch code.
+
+  // Shove the dispatch's address into the return slot in the function context.
+  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+  DispatchBB->setIsEHPad();
+
+  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+  unsigned trap_opcode;
+  if (Subtarget->isThumb())
+    trap_opcode = ARM::tTRAP;
+  else
+    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
+
+  BuildMI(TrapBB, dl, TII->get(trap_opcode));
+  DispatchBB->addSuccessor(TrapBB);
+
+  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+  DispatchBB->addSuccessor(DispContBB);
+
+  // Insert the MBBs.
+  MF->insert(MF->end(), DispatchBB);
+  MF->insert(MF->end(), DispContBB);
+  MF->insert(MF->end(), TrapBB);
+
+  // Insert code into the entry block that creates and registers the function
+  // context.
+  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
+
+  MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(*MF, FI),
+      MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
+
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+
+  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+
+  // Add a register mask with no preserved registers. This results in all
+  // registers being marked as clobbered.
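+  // (Added rationale:) the dispatch block can be reached, via the SjLj
+  // runtime, from any invoke in the function, so no register contents can
+  // be assumed live across it; a mask preserving nothing tells the register
+  // allocator exactly that.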
+ MIB.addRegMask(RI.getNoPreservedMask()); + + unsigned NumLPads = LPadList.size(); + if (Subtarget->isThumb2()) { + unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) + .addFrameIndex(FI) + .addImm(4) + .addMemOperand(FIMMOLd)); + + if (NumLPads < 256) { + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) + .addReg(NewVReg1) + .addImm(LPadList.size())); + } else { + unsigned VReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) + .addImm(NumLPads & 0xFFFF)); + + unsigned VReg2 = VReg1; + if ((NumLPads & 0xFFFF0000) != 0) { + VReg2 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) + .addReg(VReg1) + .addImm(NumLPads >> 16)); + } + + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) + .addReg(NewVReg1) + .addReg(VReg2)); + } + + BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) + .addMBB(TrapBB) + .addImm(ARMCC::HI) + .addReg(ARM::CPSR); + + unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3) + .addJumpTableIndex(MJTI)); + + unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + AddDefaultCC( + AddDefaultPred( + BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) + .addReg(NewVReg3, RegState::Kill) + .addReg(NewVReg1) + .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); + + BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) + .addReg(NewVReg4, RegState::Kill) + .addReg(NewVReg1) + .addJumpTableIndex(MJTI); + } else if (Subtarget->isThumb()) { + unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) + .addFrameIndex(FI) + .addImm(1) + .addMemOperand(FIMMOLd)); + + if (NumLPads < 256) { + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) + .addReg(NewVReg1) + .addImm(NumLPads)); + } else { + MachineConstantPool *ConstantPool = MF->getConstantPool(); + Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); + const Constant *C = ConstantInt::get(Int32Ty, NumLPads); + + // MachineConstantPool wants an explicit alignment. 
+ unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); + if (Align == 0) + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); + + unsigned VReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) + .addReg(VReg1, RegState::Define) + .addConstantPoolIndex(Idx)); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) + .addReg(NewVReg1) + .addReg(VReg1)); + } + + BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) + .addMBB(TrapBB) + .addImm(ARMCC::HI) + .addReg(ARM::CPSR); + + unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) + .addReg(ARM::CPSR, RegState::Define) + .addReg(NewVReg1) + .addImm(2)); + + unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) + .addJumpTableIndex(MJTI)); + + unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) + .addReg(ARM::CPSR, RegState::Define) + .addReg(NewVReg2, RegState::Kill) + .addReg(NewVReg3)); + + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); + + unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) + .addReg(NewVReg4, RegState::Kill) + .addImm(0) + .addMemOperand(JTMMOLd)); + + unsigned NewVReg6 = NewVReg5; + if (RelocM == Reloc::PIC_) { + NewVReg6 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) + .addReg(ARM::CPSR, RegState::Define) + .addReg(NewVReg5, RegState::Kill) + .addReg(NewVReg3)); + } + + BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) + .addReg(NewVReg6, RegState::Kill) + .addJumpTableIndex(MJTI); + } else { + unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) + .addFrameIndex(FI) + .addImm(4) + .addMemOperand(FIMMOLd)); + + if (NumLPads < 256) { + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) + .addReg(NewVReg1) + .addImm(NumLPads)); + } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { + unsigned VReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) + .addImm(NumLPads & 0xFFFF)); + + unsigned VReg2 = VReg1; + if ((NumLPads & 0xFFFF0000) != 0) { + VReg2 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) + .addReg(VReg1) + .addImm(NumLPads >> 16)); + } + + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) + .addReg(NewVReg1) + .addReg(VReg2)); + } else { + MachineConstantPool *ConstantPool = MF->getConstantPool(); + Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); + const Constant *C = ConstantInt::get(Int32Ty, NumLPads); + + // MachineConstantPool wants an explicit alignment. 
+ unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); + if (Align == 0) + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); + + unsigned VReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) + .addReg(VReg1, RegState::Define) + .addConstantPoolIndex(Idx) + .addImm(0)); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) + .addReg(NewVReg1) + .addReg(VReg1, RegState::Kill)); + } + + BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) + .addMBB(TrapBB) + .addImm(ARMCC::HI) + .addReg(ARM::CPSR); + + unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + AddDefaultCC( + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) + .addReg(NewVReg1) + .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); + unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) + .addJumpTableIndex(MJTI)); + + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); + unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + AddDefaultPred( + BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) + .addReg(NewVReg3, RegState::Kill) + .addReg(NewVReg4) + .addImm(0) + .addMemOperand(JTMMOLd)); + + if (RelocM == Reloc::PIC_) { + BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) + .addReg(NewVReg5, RegState::Kill) + .addReg(NewVReg4) + .addJumpTableIndex(MJTI); + } else { + BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) + .addReg(NewVReg5, RegState::Kill) + .addJumpTableIndex(MJTI); + } + } + + // Add the jump table entries as successors to the MBB. + SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; + for (std::vector<MachineBasicBlock*>::iterator + I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { + MachineBasicBlock *CurMBB = *I; + if (SeenMBBs.insert(CurMBB).second) + DispContBB->addSuccessor(CurMBB); + } + + // N.B. the order the invoke BBs are processed in doesn't matter here. + const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); + SmallVector<MachineBasicBlock*, 64> MBBLPads; + for (MachineBasicBlock *BB : InvokeBBs) { + + // Remove the landing pad successor from the invoke block and replace it + // with the new dispatch block. + SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), + BB->succ_end()); + while (!Successors.empty()) { + MachineBasicBlock *SMBB = Successors.pop_back_val(); + if (SMBB->isEHPad()) { + BB->removeSuccessor(SMBB); + MBBLPads.push_back(SMBB); + } + } + + BB->addSuccessor(DispatchBB, BranchProbability::getZero()); + BB->normalizeSuccProbs(); + + // Find the invoke call and mark all of the callee-saved registers as + // 'implicit defined' so that they're spilled. This prevents code from + // moving instructions to before the EH block, where they will never be + // executed. 
+ for (MachineBasicBlock::reverse_iterator + II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { + if (!II->isCall()) continue; + + DenseMap<unsigned, bool> DefRegs; + for (MachineInstr::mop_iterator + OI = II->operands_begin(), OE = II->operands_end(); + OI != OE; ++OI) { + if (!OI->isReg()) continue; + DefRegs[OI->getReg()] = true; + } + + MachineInstrBuilder MIB(*MF, &*II); + + for (unsigned i = 0; SavedRegs[i] != 0; ++i) { + unsigned Reg = SavedRegs[i]; + if (Subtarget->isThumb2() && + !ARM::tGPRRegClass.contains(Reg) && + !ARM::hGPRRegClass.contains(Reg)) + continue; + if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) + continue; + if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) + continue; + if (!DefRegs[Reg]) + MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); + } + + break; + } + } + + // Mark all former landing pads as non-landing pads. The dispatch is the only + // landing pad now. + for (SmallVectorImpl<MachineBasicBlock*>::iterator + I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) + (*I)->setIsEHPad(false); + + // The instruction is gone now. + MI->eraseFromParent(); +} + +static +MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) + if (*I != Succ) + return *I; + llvm_unreachable("Expecting a BB with two successors!"); +} + +/// Return the load opcode for a given load size. If load size >= 8, +/// neon opcode will be returned. +static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { + if (LdSize >= 8) + return LdSize == 16 ? ARM::VLD1q32wb_fixed + : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; + if (IsThumb1) + return LdSize == 4 ? ARM::tLDRi + : LdSize == 2 ? ARM::tLDRHi + : LdSize == 1 ? ARM::tLDRBi : 0; + if (IsThumb2) + return LdSize == 4 ? ARM::t2LDR_POST + : LdSize == 2 ? ARM::t2LDRH_POST + : LdSize == 1 ? ARM::t2LDRB_POST : 0; + return LdSize == 4 ? ARM::LDR_POST_IMM + : LdSize == 2 ? ARM::LDRH_POST + : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; +} + +/// Return the store opcode for a given store size. If store size >= 8, +/// neon opcode will be returned. +static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { + if (StSize >= 8) + return StSize == 16 ? ARM::VST1q32wb_fixed + : StSize == 8 ? ARM::VST1d32wb_fixed : 0; + if (IsThumb1) + return StSize == 4 ? ARM::tSTRi + : StSize == 2 ? ARM::tSTRHi + : StSize == 1 ? ARM::tSTRBi : 0; + if (IsThumb2) + return StSize == 4 ? ARM::t2STR_POST + : StSize == 2 ? ARM::t2STRH_POST + : StSize == 1 ? ARM::t2STRB_POST : 0; + return StSize == 4 ? ARM::STR_POST_IMM + : StSize == 2 ? ARM::STRH_POST + : StSize == 1 ? ARM::STRB_POST_IMM : 0; +} + +/// Emit a post-increment load operation with given size. The instructions +/// will be added to BB at Pos. 
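+/// (Illustrative usage, added; this mirrors EmitStructByval below:)
+///   emitPostLd(BB, MI, TII, dl, /*LdSize=*/4, Scratch, SrcIn, SrcOut,
+///              IsThumb1, IsThumb2);
+/// in ARM mode selects LDR_POST_IMM, leaving the loaded word in Scratch and
+/// SrcIn + 4 in SrcOut.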
+static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos, + const TargetInstrInfo *TII, DebugLoc dl, + unsigned LdSize, unsigned Data, unsigned AddrIn, + unsigned AddrOut, bool IsThumb1, bool IsThumb2) { + unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); + assert(LdOpc != 0 && "Should have a load opcode"); + if (LdSize >= 8) { + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrOut, RegState::Define).addReg(AddrIn) + .addImm(0)); + } else if (IsThumb1) { + // load + update AddrIn + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrIn).addImm(0)); + MachineInstrBuilder MIB = + BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut); + MIB = AddDefaultT1CC(MIB); + MIB.addReg(AddrIn).addImm(LdSize); + AddDefaultPred(MIB); + } else if (IsThumb2) { + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrOut, RegState::Define).addReg(AddrIn) + .addImm(LdSize)); + } else { // arm + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) + .addReg(AddrOut, RegState::Define).addReg(AddrIn) + .addReg(0).addImm(LdSize)); + } +} + +/// Emit a post-increment store operation with given size. The instructions +/// will be added to BB at Pos. +static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos, + const TargetInstrInfo *TII, DebugLoc dl, + unsigned StSize, unsigned Data, unsigned AddrIn, + unsigned AddrOut, bool IsThumb1, bool IsThumb2) { + unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); + assert(StOpc != 0 && "Should have a store opcode"); + if (StSize >= 8) { + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) + .addReg(AddrIn).addImm(0).addReg(Data)); + } else if (IsThumb1) { + // store + update AddrIn + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data) + .addReg(AddrIn).addImm(0)); + MachineInstrBuilder MIB = + BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut); + MIB = AddDefaultT1CC(MIB); + MIB.addReg(AddrIn).addImm(StSize); + AddDefaultPred(MIB); + } else if (IsThumb2) { + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) + .addReg(Data).addReg(AddrIn).addImm(StSize)); + } else { // arm + AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) + .addReg(Data).addReg(AddrIn).addReg(0) + .addImm(StSize)); + } +} + +MachineBasicBlock * +ARMTargetLowering::EmitStructByval(MachineInstr *MI, + MachineBasicBlock *BB) const { + // This pseudo instruction has 3 operands: dst, src, size + // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). + // Otherwise, we will generate unrolled scalar copies. + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = ++BB->getIterator(); + + unsigned dest = MI->getOperand(0).getReg(); + unsigned src = MI->getOperand(1).getReg(); + unsigned SizeVal = MI->getOperand(2).getImm(); + unsigned Align = MI->getOperand(3).getImm(); + DebugLoc dl = MI->getDebugLoc(); + + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned UnitSize = 0; + const TargetRegisterClass *TRC = nullptr; + const TargetRegisterClass *VecTRC = nullptr; + + bool IsThumb1 = Subtarget->isThumb1Only(); + bool IsThumb2 = Subtarget->isThumb2(); + + if (Align & 1) { + UnitSize = 1; + } else if (Align & 2) { + UnitSize = 2; + } else { + // Check whether we can use NEON instructions. 
+ if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) && + Subtarget->hasNEON()) { + if ((Align % 16 == 0) && SizeVal >= 16) + UnitSize = 16; + else if ((Align % 8 == 0) && SizeVal >= 8) + UnitSize = 8; + } + // Can't use NEON instructions. + if (UnitSize == 0) + UnitSize = 4; + } + + // Select the correct opcode and register class for unit size load/store + bool IsNeon = UnitSize >= 8; + TRC = (IsThumb1 || IsThumb2) ? &ARM::tGPRRegClass : &ARM::GPRRegClass; + if (IsNeon) + VecTRC = UnitSize == 16 ? &ARM::DPairRegClass + : UnitSize == 8 ? &ARM::DPRRegClass + : nullptr; + + unsigned BytesLeft = SizeVal % UnitSize; + unsigned LoopSize = SizeVal - BytesLeft; + + if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { + // Use LDR and STR to copy. + // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) + // [destOut] = STR_POST(scratch, destIn, UnitSize) + unsigned srcIn = src; + unsigned destIn = dest; + for (unsigned i = 0; i < LoopSize; i+=UnitSize) { + unsigned srcOut = MRI.createVirtualRegister(TRC); + unsigned destOut = MRI.createVirtualRegister(TRC); + unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); + emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, + IsThumb1, IsThumb2); + emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, + IsThumb1, IsThumb2); + srcIn = srcOut; + destIn = destOut; + } + + // Handle the leftover bytes with LDRB and STRB. + // [scratch, srcOut] = LDRB_POST(srcIn, 1) + // [destOut] = STRB_POST(scratch, destIn, 1) + for (unsigned i = 0; i < BytesLeft; i++) { + unsigned srcOut = MRI.createVirtualRegister(TRC); + unsigned destOut = MRI.createVirtualRegister(TRC); + unsigned scratch = MRI.createVirtualRegister(TRC); + emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, + IsThumb1, IsThumb2); + emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, + IsThumb1, IsThumb2); + srcIn = srcOut; + destIn = destOut; + } + MI->eraseFromParent(); // The instruction is gone now. + return BB; + } + + // Expand the pseudo op to a loop. + // thisMBB: + // ... + // movw varEnd, # --> with thumb2 + // movt varEnd, # + // ldrcp varEnd, idx --> without thumb2 + // fallthrough --> loopMBB + // loopMBB: + // PHI varPhi, varEnd, varLoop + // PHI srcPhi, src, srcLoop + // PHI destPhi, dst, destLoop + // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) + // [destLoop] = STR_POST(scratch, destPhi, UnitSize) + // subs varLoop, varPhi, #UnitSize + // bne loopMBB + // fallthrough --> exitMBB + // exitMBB: + // epilogue to handle left-over bytes + // [scratch, srcOut] = LDRB_POST(srcLoop, 1) + // [destOut] = STRB_POST(scratch, destLoop, 1) + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, loopMBB); + MF->insert(It, exitMBB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Load an immediate to varEnd. + unsigned varEnd = MRI.createVirtualRegister(TRC); + if (Subtarget->useMovt(*MF)) { + unsigned Vtmp = varEnd; + if ((LoopSize & 0xFFFF0000) != 0) + Vtmp = MRI.createVirtualRegister(TRC); + AddDefaultPred(BuildMI(BB, dl, + TII->get(IsThumb2 ? ARM::t2MOVi16 : ARM::MOVi16), + Vtmp).addImm(LoopSize & 0xFFFF)); + + if ((LoopSize & 0xFFFF0000) != 0) + AddDefaultPred(BuildMI(BB, dl, + TII->get(IsThumb2 ? 
ARM::t2MOVTi16 : ARM::MOVTi16),
+                             varEnd)
+                         .addReg(Vtmp)
+                         .addImm(LoopSize >> 16));
+  } else {
+    MachineConstantPool *ConstantPool = MF->getConstantPool();
+    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
+
+    // MachineConstantPool wants an explicit alignment.
+    unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
+    if (Align == 0)
+      Align = MF->getDataLayout().getTypeAllocSize(C->getType());
+    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+    if (IsThumb1)
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
+          varEnd, RegState::Define).addConstantPoolIndex(Idx));
+    else
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
+          varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
+  }
+  BB->addSuccessor(loopMBB);
+
+  // Generate the loop body:
+  //   varPhi = PHI(varLoop, varEnd)
+  //   srcPhi = PHI(srcLoop, src)
+  //   destPhi = PHI(destLoop, dst)
+  MachineBasicBlock *entryBB = BB;
+  BB = loopMBB;
+  unsigned varLoop = MRI.createVirtualRegister(TRC);
+  unsigned varPhi = MRI.createVirtualRegister(TRC);
+  unsigned srcLoop = MRI.createVirtualRegister(TRC);
+  unsigned srcPhi = MRI.createVirtualRegister(TRC);
+  unsigned destLoop = MRI.createVirtualRegister(TRC);
+  unsigned destPhi = MRI.createVirtualRegister(TRC);
+
+  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
+    .addReg(varLoop).addMBB(loopMBB)
+    .addReg(varEnd).addMBB(entryBB);
+  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
+    .addReg(srcLoop).addMBB(loopMBB)
+    .addReg(src).addMBB(entryBB);
+  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
+    .addReg(destLoop).addMBB(loopMBB)
+    .addReg(dest).addMBB(entryBB);
+
+  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
+  unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
+             IsThumb1, IsThumb2);
+  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
+             IsThumb1, IsThumb2);
+
+  // Decrement loop variable by UnitSize.
+  if (IsThumb1) {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(varPhi).addImm(UnitSize);
+    AddDefaultPred(MIB);
+  } else {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, BB->end(), dl,
+                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
+    AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+    MIB->getOperand(5).setReg(ARM::CPSR);
+    MIB->getOperand(5).setIsDef(true);
+  }
+  BuildMI(*BB, BB->end(), dl,
+          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
+      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  // loopMBB can loop back to loopMBB or fall through to exitMBB.
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  // Add epilogue to handle BytesLeft.
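+  // (Worked example, added:) SizeVal = 70 with UnitSize = 16 gives
+  // LoopSize = 64 and BytesLeft = 6: the loop above moves four 16-byte NEON
+  // units and the epilogue below copies the remaining six bytes with
+  // LDRB/STRB pairs.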
+  BB = exitMBB;
+  MachineInstr *StartOfExit = exitMBB->begin();
+
+  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+  //   [destOut] = STRB_POST(scratch, destLoop, 1)
+  unsigned srcIn = srcLoop;
+  unsigned destIn = destLoop;
+  for (unsigned i = 0; i < BytesLeft; i++) {
+    unsigned srcOut = MRI.createVirtualRegister(TRC);
+    unsigned destOut = MRI.createVirtualRegister(TRC);
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
+               IsThumb1, IsThumb2);
+    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
+               IsThumb1, IsThumb2);
+    srcIn = srcOut;
+    destIn = destOut;
+  }
+
+  MI->eraseFromParent();   // The instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
+                                       MachineBasicBlock *MBB) const {
+  const TargetMachine &TM = getTargetMachine();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  assert(Subtarget->isTargetWindows() &&
+         "__chkstk is only supported on Windows");
+  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
+
+  // __chkstk takes the number of words to allocate on the stack in R4, and
+  // returns the stack adjustment in number of bytes in R4. This will not
+  // clobber any other registers (other than the obvious lr).
+  //
+  // Although, technically, IP should be considered a register which may be
+  // clobbered, the call itself will not touch it. Windows on ARM is a pure
+  // thumb-2 environment, so there is no interworking required. As a result, we
+  // do not expect a veneer to be emitted by the linker, clobbering IP.
+  //
+  // Each module receives its own copy of __chkstk, so no import thunk is
+  // required, again, ensuring that IP is not clobbered.
+  //
+  // Finally, although some linkers may theoretically provide a trampoline for
+  // out of range calls (which is quite common due to a 32M range limitation of
+  // branches for Thumb), we can generate the long-call version via
+  // -mcmodel=large, alleviating the need for the trampoline which may clobber
+  // IP.
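+  // (Illustrative contract, added, per the comment above:)
+  //   mov  r4, #(bytes / 4)   ; word count, set up before this pseudo
+  //   bl   __chkstk           ; probes the stack; byte adjustment in r4
+  //   sub  sp, sp, r4         ; emitted at the end of this function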
+ + switch (TM.getCodeModel()) { + case CodeModel::Small: + case CodeModel::Medium: + case CodeModel::Default: + case CodeModel::Kernel: + BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) + .addImm((unsigned)ARMCC::AL).addReg(0) + .addExternalSymbol("__chkstk") + .addReg(ARM::R4, RegState::Implicit | RegState::Kill) + .addReg(ARM::R4, RegState::Implicit | RegState::Define) + .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); + break; + case CodeModel::Large: + case CodeModel::JITDefault: { + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + + BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) + .addExternalSymbol("__chkstk"); + BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) + .addImm((unsigned)ARMCC::AL).addReg(0) + .addReg(Reg, RegState::Kill) + .addReg(ARM::R4, RegState::Implicit | RegState::Kill) + .addReg(ARM::R4, RegState::Implicit | RegState::Define) + .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); + break; + } + } + + AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), + ARM::SP) + .addReg(ARM::SP).addReg(ARM::R4))); + + MI->eraseFromParent(); + return MBB; +} + +MachineBasicBlock * +ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); + MF->push_back(ContBB); + ContBB->splice(ContBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + MBB->addSuccessor(ContBB); + + MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); + MF->push_back(TrapBB); + BuildMI(TrapBB, DL, TII->get(ARM::t2UDF)).addImm(249); + MBB->addSuccessor(TrapBB); + + BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ)) + .addReg(MI->getOperand(0).getReg()) + .addMBB(TrapBB); + + MI->eraseFromParent(); + return ContBB; +} + +MachineBasicBlock * +ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + bool isThumb2 = Subtarget->isThumb2(); + switch (MI->getOpcode()) { + default: { + MI->dump(); + llvm_unreachable("Unexpected instr type to insert"); + } + // The Thumb2 pre-indexed stores have the same MI operands, they just + // define them differently in the .td files from the isel patterns, so + // they need pseudos. + case ARM::t2STR_preidx: + MI->setDesc(TII->get(ARM::t2STR_PRE)); + return BB; + case ARM::t2STRB_preidx: + MI->setDesc(TII->get(ARM::t2STRB_PRE)); + return BB; + case ARM::t2STRH_preidx: + MI->setDesc(TII->get(ARM::t2STRH_PRE)); + return BB; + + case ARM::STRi_preidx: + case ARM::STRBi_preidx: { + unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ? + ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM; + // Decode the offset. 
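+    // (Added note:) the AM2 immediate packs the add/sub direction together
+    // with the offset magnitude; getAM2Op/getAM2Offset below unpack it, and
+    // a 'sub' form becomes a negative immediate for the _PRE_IMM opcode.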
+ unsigned Offset = MI->getOperand(4).getImm(); + bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; + Offset = ARM_AM::getAM2Offset(Offset); + if (isSub) + Offset = -Offset; + + MachineMemOperand *MMO = *MI->memoperands_begin(); + BuildMI(*BB, MI, dl, TII->get(NewOpc)) + .addOperand(MI->getOperand(0)) // Rn_wb + .addOperand(MI->getOperand(1)) // Rt + .addOperand(MI->getOperand(2)) // Rn + .addImm(Offset) // offset (skip GPR==zero_reg) + .addOperand(MI->getOperand(5)) // pred + .addOperand(MI->getOperand(6)) + .addMemOperand(MMO); + MI->eraseFromParent(); + return BB; + } + case ARM::STRr_preidx: + case ARM::STRBr_preidx: + case ARM::STRH_preidx: { + unsigned NewOpc; + switch (MI->getOpcode()) { + default: llvm_unreachable("unexpected opcode!"); + case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; + case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; + case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; + } + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); + for (unsigned i = 0; i < MI->getNumOperands(); ++i) + MIB.addOperand(MI->getOperand(i)); + MI->eraseFromParent(); + return BB; + } + + case ARM::tMOVCCr_pseudo: { + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = ++BB->getIterator(); + + // thisMBB: + // ... + // TrueVal = ... + // cmpTY ccX, r1, r2 + // bCC copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) + .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + BB = sinkMBB; + BuildMI(*BB, BB->begin(), dl, + TII->get(ARM::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; + } + + case ARM::BCCi64: + case ARM::BCCZi64: { + // If there is an unconditional branch to the other successor, remove it. + BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); + + // Compare both parts that make up the double comparison separately for + // equality. + bool RHSisZero = MI->getOpcode() == ARM::BCCZi64; + + unsigned LHS1 = MI->getOperand(1).getReg(); + unsigned LHS2 = MI->getOperand(2).getReg(); + if (RHSisZero) { + AddDefaultPred(BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(LHS1).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? 
ARM::t2CMPri : ARM::CMPri)) + .addReg(LHS2).addImm(0) + .addImm(ARMCC::EQ).addReg(ARM::CPSR); + } else { + unsigned RHS1 = MI->getOperand(3).getReg(); + unsigned RHS2 = MI->getOperand(4).getReg(); + AddDefaultPred(BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(LHS1).addReg(RHS1)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(LHS2).addReg(RHS2) + .addImm(ARMCC::EQ).addReg(ARM::CPSR); + } + + MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB(); + MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); + if (MI->getOperand(0).getImm() == ARMCC::NE) + std::swap(destMBB, exitMBB); + + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); + if (isThumb2) + AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB)); + else + BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; + } + + case ARM::Int_eh_sjlj_setjmp: + case ARM::Int_eh_sjlj_setjmp_nofp: + case ARM::tInt_eh_sjlj_setjmp: + case ARM::t2Int_eh_sjlj_setjmp: + case ARM::t2Int_eh_sjlj_setjmp_nofp: + return BB; + + case ARM::Int_eh_sjlj_setup_dispatch: + EmitSjLjDispatchBlock(MI, BB); + return BB; + + case ARM::ABS: + case ARM::t2ABS: { + // To insert an ABS instruction, we have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // source vreg to test against 0, the destination vreg to set, + // the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + // It transforms + // V1 = ABS V0 + // into + // V2 = MOVS V0 + // BCC (branch to SinkBB if V0 >= 0) + // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) + // SinkBB: V1 = PHI(V2, V3) + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator BBI = ++BB->getIterator(); + MachineFunction *Fn = BB->getParent(); + MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); + Fn->insert(BBI, RSBBB); + Fn->insert(BBI, SinkBB); + + unsigned int ABSSrcReg = MI->getOperand(1).getReg(); + unsigned int ABSDstReg = MI->getOperand(0).getReg(); + bool ABSSrcKIll = MI->getOperand(1).isKill(); + bool isThumb2 = Subtarget->isThumb2(); + MachineRegisterInfo &MRI = Fn->getRegInfo(); + // In Thumb mode S must not be specified if source register is the SP or + // PC and if destination register is the SP, so restrict register class + unsigned NewRsbDstReg = + MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + SinkBB->splice(SinkBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + SinkBB->transferSuccessorsAndUpdatePHIs(BB); + + BB->addSuccessor(RSBBB); + BB->addSuccessor(SinkBB); + + // fall through to SinkMBB + RSBBB->addSuccessor(SinkBB); + + // insert a cmp at the end of BB + AddDefaultPred(BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(ABSSrcReg).addImm(0)); + + // insert a bcc with opposite CC to ARMCC::MI at the end of BB + BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) + .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); + + // insert rsbri in RSBBB + // Note: BCC and rsbri will be converted into predicated rsbmi + // by if-conversion pass + BuildMI(*RSBBB, RSBBB->begin(), dl, + TII->get(isThumb2 ? 
ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) + .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) + .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + + // insert PHI in SinkBB, + // reuse ABSDstReg to not change uses of ABS instruction + BuildMI(*SinkBB, SinkBB->begin(), dl, + TII->get(ARM::PHI), ABSDstReg) + .addReg(NewRsbDstReg).addMBB(RSBBB) + .addReg(ABSSrcReg).addMBB(BB); + + // remove ABS instruction + MI->eraseFromParent(); + + // return last added BB + return SinkBB; + } + case ARM::COPY_STRUCT_BYVAL_I32: + ++NumLoopByVals; + return EmitStructByval(MI, BB); + case ARM::WIN__CHKSTK: + return EmitLowered__chkstk(MI, BB); + case ARM::WIN__DBZCHK: + return EmitLowered__dbzchk(MI, BB); + } +} + +/// \brief Attaches vregs to MEMCPY that it will use as scratch registers +/// when it is expanded into LDM/STM. This is done as a post-isel lowering +/// instead of as a custom inserter because we need the use list from the SDNode. +static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, + MachineInstr *MI, const SDNode *Node) { + bool isThumb1 = Subtarget->isThumb1Only(); + + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MI->getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineInstrBuilder MIB(*MF, MI); + + // If the new dst/src is unused mark it as dead. + if (!Node->hasAnyUseOfValue(0)) { + MI->getOperand(0).setIsDead(true); + } + if (!Node->hasAnyUseOfValue(1)) { + MI->getOperand(1).setIsDead(true); + } + + // The MEMCPY both defines and kills the scratch registers. + for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) { + unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass + : &ARM::GPRRegClass); + MIB.addReg(TmpReg, RegState::Define|RegState::Dead); + } +} + +void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const { + if (MI->getOpcode() == ARM::MEMCPY) { + attachMEMCPYScratchRegs(Subtarget, MI, Node); + return; + } + + const MCInstrDesc *MCID = &MI->getDesc(); + // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, + // RSC. Coming out of isel, they have an implicit CPSR def, but the optional + // operand is still set to noreg. If needed, set the optional operand's + // register to CPSR, and remove the redundant implicit def. + // + // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>). + + // Rename pseudo opcodes. + unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode()); + if (NewOpc) { + const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); + MCID = &TII->get(NewOpc); + + assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 && + "converted opcode should be the same except for cc_out"); + + MI->setDesc(*MCID); + + // Add the optional cc_out operand + MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); + } + unsigned ccOutIdx = MCID->getNumOperands() - 1; + + // Any ARM instruction that sets the 's' bit should specify an optional + // "cc_out" operand in the last operand position. + if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { + assert(!NewOpc && "Optional cc_out operand required"); + return; + } + // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it + // since we already have an optional CPSR def. 
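+  // (Concrete case, sketched under the assumption that the pseudo is
+  //  t2RSBSri: isel emits it with CPSR<imp-def>; convertAddSubFlagsOpcode
+  //  renames it to t2RSBri above, the loop below strips the implicit CPSR
+  //  def, and the tail of this function turns the trailing optional cc_out
+  //  operand into a real CPSR def when the flags are live.)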
+ bool definesCPSR = false; + bool deadCPSR = false; + for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands(); + i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { + definesCPSR = true; + if (MO.isDead()) + deadCPSR = true; + MI->RemoveOperand(i); + break; + } + } + if (!definesCPSR) { + assert(!NewOpc && "Optional cc_out operand required"); + return; + } + assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); + if (deadCPSR) { + assert(!MI->getOperand(ccOutIdx).getReg() && + "expect uninitialized optional cc_out operand"); + return; + } + + // If this instruction was defined with an optional CPSR def and its dag node + // had a live implicit CPSR def, then activate the optional CPSR def. + MachineOperand &MO = MI->getOperand(ccOutIdx); + MO.setReg(ARM::CPSR); + MO.setIsDef(true); +} + +//===----------------------------------------------------------------------===// +// ARM Optimization Hooks +//===----------------------------------------------------------------------===// + +// Helper function that checks if N is a null or all ones constant. +static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { + return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); +} + +// Return true if N is conditionally 0 or all ones. +// Detects these expressions where cc is an i1 value: +// +// (select cc 0, y) [AllOnes=0] +// (select cc y, 0) [AllOnes=0] +// (zext cc) [AllOnes=0] +// (sext cc) [AllOnes=0/1] +// (select cc -1, y) [AllOnes=1] +// (select cc y, -1) [AllOnes=1] +// +// Invert is set when N is the null/all ones constant when CC is false. +// OtherOp is set to the alternative value of N. +static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, + SDValue &CC, bool &Invert, + SDValue &OtherOp, + SelectionDAG &DAG) { + switch (N->getOpcode()) { + default: return false; + case ISD::SELECT: { + CC = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + if (isZeroOrAllOnes(N1, AllOnes)) { + Invert = false; + OtherOp = N2; + return true; + } + if (isZeroOrAllOnes(N2, AllOnes)) { + Invert = true; + OtherOp = N1; + return true; + } + return false; + } + case ISD::ZERO_EXTEND: + // (zext cc) can never be the all ones value. + if (AllOnes) + return false; + // Fall through. + case ISD::SIGN_EXTEND: { + SDLoc dl(N); + EVT VT = N->getValueType(0); + CC = N->getOperand(0); + if (CC.getValueType() != MVT::i1) + return false; + Invert = !AllOnes; + if (AllOnes) + // When looking for an AllOnes constant, N is an sext, and the 'other' + // value is 0. + OtherOp = DAG.getConstant(0, dl, VT); + else if (N->getOpcode() == ISD::ZERO_EXTEND) + // When looking for a 0 constant, N can be zext or sext. + OtherOp = DAG.getConstant(1, dl, VT); + else + OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, + VT); + return true; + } + } +} + +// Combine a constant select operand into its use: +// +// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) +// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) +// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] +// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) +// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) +// +// The transform is rejected if the select doesn't have a constant operand that +// is null, or all ones when AllOnes is set. 
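+//
+// For example, with an i1 condition cc and c = 42 (hypothetical values):
+//   (add (select cc, 0, 42), x) -> (select cc, x, (add x, 42))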
+//
+// Also recognize sext/zext from i1:
+//
+// (add (zext cc), x) -> (select cc (add x, 1), x)
+// (add (sext cc), x) -> (select cc (add x, -1), x)
+//
+// These transformations eventually create predicated instructions.
+//
+// @param N       The node to transform.
+// @param Slct    The N operand that is a select.
+// @param OtherOp The other N operand (x above).
+// @param DCI     Context.
+// @param AllOnes Require the select constant to be all ones instead of null.
+// @returns The new node, or SDValue() on failure.
+static
+SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            bool AllOnes = false) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDValue NonConstantVal;
+  SDValue CCOp;
+  bool SwapSelectOps;
+  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
+                                  NonConstantVal, DAG))
+    return SDValue();
+
+  // Slct is now known to be the desired identity constant when CC is true.
+  SDValue TrueVal = OtherOp;
+  SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
+                                 OtherOp, NonConstantVal);
+  // Unless SwapSelectOps says CC should be false.
+  if (SwapSelectOps)
+    std::swap(TrueVal, FalseVal);
+
+  return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
+                     CCOp, TrueVal, FalseVal);
+}
+
+// Attempt combineSelectAndUse on each operand of a commutative operator N.
+static
+SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
+                                       TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  if (N0.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes);
+    if (Result.getNode())
+      return Result;
+  }
+  if (N1.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes);
+    if (Result.getNode())
+      return Result;
+  }
+  return SDValue();
+}
+
+// AddCombineToVPADDL - For pair-wise add on NEON, use the vpaddl instruction
+// (only after legalization).
+static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+
+  // Only perform the optimization after legalize, and if NEON is available.
+  // We also expect both operands to be BUILD_VECTORs.
+  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
+      || N0.getOpcode() != ISD::BUILD_VECTOR
+      || N1.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
+  EVT VT = N->getValueType(0);
+  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  // Check that the vector operands are of the right form.
+  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
+  // operands, where N is the size of the formed vector.
+  // Each EXTRACT_VECTOR should have the same input vector and an odd or even
+  // index such that we have a pair-wise add pattern.
+
+  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
+  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+  SDValue Vec = N0->getOperand(0)->getOperand(0);
+  SDNode *V = Vec.getNode();
+  unsigned nextIndex = 0;
+
+  // For each operand of the ADD that is a BUILD_VECTOR, check whether each of
+  // its operands is an EXTRACT_VECTOR with the same input vector and the
+  // appropriate index.
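+  // For example, for a hypothetical <2 x i32> result the expected shape is:
+  //   N0 = build_vector (extract_elt v, 0), (extract_elt v, 2)
+  //   N1 = build_vector (extract_elt v, 1), (extract_elt v, 3)
+  // so that N0 + N1 adds adjacent lanes of v, which is exactly vpaddl(v).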
+  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
+    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
+        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+
+      SDValue ExtVec0 = N0->getOperand(i);
+      SDValue ExtVec1 = N1->getOperand(i);
+
+      // First operand is the vector, verify it's the same.
+      if (V != ExtVec0->getOperand(0).getNode() ||
+          V != ExtVec1->getOperand(0).getNode())
+        return SDValue();
+
+      // Second is the constant, verify it's correct.
+      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
+      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
+
+      // For the constant, we want to see all the even or all the odd.
+      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
+          || C1->getZExtValue() != nextIndex+1)
+        return SDValue();
+
+      // Increment index.
+      nextIndex += 2;
+    } else
+      return SDValue();
+  }
+
+  // Create VPADDL node.
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  SDLoc dl(N);
+
+  // Build operand list.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
+                                TLI.getPointerTy(DAG.getDataLayout())));
+
+  // Input is the vector.
+  Ops.push_back(Vec);
+
+  // Get widened type and narrowed type.
+  MVT widenType;
+  unsigned numElem = VT.getVectorNumElements();
+
+  EVT inputLaneType = Vec.getValueType().getVectorElementType();
+  switch (inputLaneType.getSimpleVT().SimpleTy) {
+    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
+    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
+    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
+    default:
+      llvm_unreachable("Invalid vector element type for padd optimization.");
+  }
+
+  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
+  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
+  return DAG.getNode(ExtOp, dl, VT, tmp);
+}
+
+static SDValue findMUL_LOHI(SDValue V) {
+  if (V->getOpcode() == ISD::UMUL_LOHI ||
+      V->getOpcode() == ISD::SMUL_LOHI)
+    return V;
+  return SDValue();
+}
+
+static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const ARMSubtarget *Subtarget) {
+
+  if (Subtarget->isThumb1Only()) return SDValue();
+
+  // Only perform the checks after legalize when the pattern is available.
+  if (DCI.isBeforeLegalize()) return SDValue();
+
+  // Look for multiply add opportunities.
+  // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
+  // each add node consumes a value from ISD::UMUL_LOHI and there is
+  // a glue link from the first add to the second add.
+  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE
+  // by an S/UMLAL instruction.
+  //                  UMUL_LOHI
+  //                 / :lo    \ :hi
+  //                /          \          [no multiline comment]
+  //    loAdd ->  ADDE         |
+  //                 \ :glue  /
+  //                  \      /
+  //                    ADDC   <- hiAdd
+  //
+  assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
+  SDValue AddcOp0 = AddcNode->getOperand(0);
+  SDValue AddcOp1 = AddcNode->getOperand(1);
+
+  // Check if the two operands are from the same mul_lohi node.
+  if (AddcOp0.getNode() == AddcOp1.getNode())
+    return SDValue();
+
+  assert(AddcNode->getNumValues() == 2 &&
+         AddcNode->getValueType(0) == MVT::i32 &&
+         "Expect ADDC with two result values. First: i32");
+
+  // Check that we have a glued ADDC node.
+  if (AddcNode->getValueType(1) != MVT::Glue)
+    return SDValue();
+
+  // Check that the ADDC adds the low result of the S/UMUL_LOHI.
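+  // (In C terms the matched shape computes, for hypothetical 32-bit a, b
+  //  and 64-bit acc:
+  //    uint64_t r = (uint64_t)a * b + acc;
+  //  the ADDC produces the low 32 bits and its carry feeds the glued ADDE
+  //  producing the high 32 bits; together they become one UMLAL/SMLAL.)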
+ if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && + AddcOp0->getOpcode() != ISD::SMUL_LOHI && + AddcOp1->getOpcode() != ISD::UMUL_LOHI && + AddcOp1->getOpcode() != ISD::SMUL_LOHI) + return SDValue(); + + // Look for the glued ADDE. + SDNode* AddeNode = AddcNode->getGluedUser(); + if (!AddeNode) + return SDValue(); + + // Make sure it is really an ADDE. + if (AddeNode->getOpcode() != ISD::ADDE) + return SDValue(); + + assert(AddeNode->getNumOperands() == 3 && + AddeNode->getOperand(2).getValueType() == MVT::Glue && + "ADDE node has the wrong inputs"); + + // Check for the triangle shape. + SDValue AddeOp0 = AddeNode->getOperand(0); + SDValue AddeOp1 = AddeNode->getOperand(1); + + // Make sure that the ADDE operands are not coming from the same node. + if (AddeOp0.getNode() == AddeOp1.getNode()) + return SDValue(); + + // Find the MUL_LOHI node walking up ADDE's operands. + bool IsLeftOperandMUL = false; + SDValue MULOp = findMUL_LOHI(AddeOp0); + if (MULOp == SDValue()) + MULOp = findMUL_LOHI(AddeOp1); + else + IsLeftOperandMUL = true; + if (MULOp == SDValue()) + return SDValue(); + + // Figure out the right opcode. + unsigned Opc = MULOp->getOpcode(); + unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; + + // Figure out the high and low input values to the MLAL node. + SDValue* HiAdd = nullptr; + SDValue* LoMul = nullptr; + SDValue* LowAdd = nullptr; + + // Ensure that ADDE is from high result of ISD::SMUL_LOHI. + if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1))) + return SDValue(); + + if (IsLeftOperandMUL) + HiAdd = &AddeOp1; + else + HiAdd = &AddeOp0; + + + // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node + // whose low result is fed to the ADDC we are checking. + + if (AddcOp0 == MULOp.getValue(0)) { + LoMul = &AddcOp0; + LowAdd = &AddcOp1; + } + if (AddcOp1 == MULOp.getValue(0)) { + LoMul = &AddcOp1; + LowAdd = &AddcOp0; + } + + if (!LoMul) + return SDValue(); + + // Create the merged node. + SelectionDAG &DAG = DCI.DAG; + + // Build operand list. + SmallVector<SDValue, 8> Ops; + Ops.push_back(LoMul->getOperand(0)); + Ops.push_back(LoMul->getOperand(1)); + Ops.push_back(*LowAdd); + Ops.push_back(*HiAdd); + + SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode), + DAG.getVTList(MVT::i32, MVT::i32), Ops); + + // Replace the ADDs' nodes uses by the MLA node's values. + SDValue HiMLALResult(MLALNode.getNode(), 1); + DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); + + SDValue LoMLALResult(MLALNode.getNode(), 0); + DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); + + // Return original node to notify the driver to stop replacing. + SDValue resNode(AddcNode, 0); + return resNode; +} + +/// PerformADDCCombine - Target-specific dag combine transform from +/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL. +static SDValue PerformADDCCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + return AddCombineTo64bitMLAL(N, DCI, Subtarget); + +} + +/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with +/// operands N0 and N1. This is a helper for PerformADDCombine that is +/// called with the default operands, and if that fails, with commuted +/// operands. +static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget){ + + // Attempt to create vpaddl for this add. 
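+  // (vpaddl pairwise-adds and widens: e.g. a hypothetical <4 x i16>
+  //  {a,b,c,d} becomes the <2 x i32> {a+b, c+d}.)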
+ SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget); + if (Result.getNode()) + return Result; + + // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) + if (N0.getNode()->hasOneUse()) { + SDValue Result = combineSelectAndUse(N, N0, N1, DCI); + if (Result.getNode()) return Result; + } + return SDValue(); +} + +/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. +/// +static SDValue PerformADDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // First try with the default operand order. + SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget); + if (Result.getNode()) + return Result; + + // If that didn't work, try again with the operands commuted. + return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); +} + +/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. +/// +static SDValue PerformSUBCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) + if (N1.getNode()->hasOneUse()) { + SDValue Result = combineSelectAndUse(N, N1, N0, DCI); + if (Result.getNode()) return Result; + } + + return SDValue(); +} + +/// PerformVMULCombine +/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the +/// special multiplier accumulator forwarding. +/// vmul d3, d0, d2 +/// vmla d3, d1, d2 +/// is faster than +/// vadd d3, d0, d1 +/// vmul d3, d3, d2 +// However, for (A + B) * (A + B), +// vadd d2, d0, d1 +// vmul d3, d0, d2 +// vmla d3, d1, d2 +// is slower than +// vadd d2, d0, d1 +// vmul d3, d2, d2 +static SDValue PerformVMULCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + if (!Subtarget->hasVMLxForwarding()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + unsigned Opcode = N0.getOpcode(); + if (Opcode != ISD::ADD && Opcode != ISD::SUB && + Opcode != ISD::FADD && Opcode != ISD::FSUB) { + Opcode = N1.getOpcode(); + if (Opcode != ISD::ADD && Opcode != ISD::SUB && + Opcode != ISD::FADD && Opcode != ISD::FSUB) + return SDValue(); + std::swap(N0, N1); + } + + if (N0 == N1) + return SDValue(); + + EVT VT = N->getValueType(0); + SDLoc DL(N); + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + return DAG.getNode(Opcode, DL, VT, + DAG.getNode(ISD::MUL, DL, VT, N00, N1), + DAG.getNode(ISD::MUL, DL, VT, N01, N1)); +} + +static SDValue PerformMULCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + + if (Subtarget->isThumb1Only()) + return SDValue(); + + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT.is64BitVector() || VT.is128BitVector()) + return PerformVMULCombine(N, DCI, Subtarget); + if (VT != MVT::i32) + return SDValue(); + + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!C) + return SDValue(); + + int64_t MulAmt = C->getSExtValue(); + unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); + + ShiftAmt = ShiftAmt & (32 - 1); + SDValue V = N->getOperand(0); + SDLoc DL(N); + + SDValue Res; + MulAmt >>= ShiftAmt; + + if (MulAmt >= 0) { + if (isPowerOf2_32(MulAmt - 1)) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + Res = DAG.getNode(ISD::ADD, DL, 
VT, + V, + DAG.getNode(ISD::SHL, DL, VT, + V, + DAG.getConstant(Log2_32(MulAmt - 1), DL, + MVT::i32))); + } else if (isPowerOf2_32(MulAmt + 1)) { + // (mul x, 2^N - 1) => (sub (shl x, N), x) + Res = DAG.getNode(ISD::SUB, DL, VT, + DAG.getNode(ISD::SHL, DL, VT, + V, + DAG.getConstant(Log2_32(MulAmt + 1), DL, + MVT::i32)), + V); + } else + return SDValue(); + } else { + uint64_t MulAmtAbs = -MulAmt; + if (isPowerOf2_32(MulAmtAbs + 1)) { + // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) + Res = DAG.getNode(ISD::SUB, DL, VT, + V, + DAG.getNode(ISD::SHL, DL, VT, + V, + DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, + MVT::i32))); + } else if (isPowerOf2_32(MulAmtAbs - 1)) { + // (mul x, -(2^N + 1)) => - (add (shl x, N), x) + Res = DAG.getNode(ISD::ADD, DL, VT, + V, + DAG.getNode(ISD::SHL, DL, VT, + V, + DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, + MVT::i32))); + Res = DAG.getNode(ISD::SUB, DL, VT, + DAG.getConstant(0, DL, MVT::i32), Res); + + } else + return SDValue(); + } + + if (ShiftAmt != 0) + Res = DAG.getNode(ISD::SHL, DL, VT, + Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + return SDValue(); +} + +static SDValue PerformANDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + // Attempt to use immediate-form VBIC + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); + SDLoc dl(N); + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + + if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN && + BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + EVT VbicVT; + SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, dl, VbicVT, VT.is128BitVector(), + OtherModImm); + if (Val.getNode()) { + SDValue Input = + DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); + SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); + } + } + } + + if (!Subtarget->isThumb1Only()) { + // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) + SDValue Result = combineSelectAndUseCommutative(N, true, DCI); + if (Result.getNode()) + return Result; + } + + return SDValue(); +} + +/// PerformORCombine - Target-specific dag combine xforms for ISD::OR +static SDValue PerformORCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // Attempt to use immediate-form VORR + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); + SDLoc dl(N); + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + + if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN && Subtarget->hasNEON() && + BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + EVT VorrVT; + SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, dl, VorrVT, VT.is128BitVector(), + OtherModImm); + if (Val.getNode()) { + SDValue Input = + DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); + SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); + } + } + } + + if (!Subtarget->isThumb1Only()) { 
+ // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) + SDValue Result = combineSelectAndUseCommutative(N, false, DCI); + if (Result.getNode()) + return Result; + } + + // The code below optimizes (or (and X, Y), Z). + // The AND operand needs to have a single user to make these optimizations + // profitable. + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) + return SDValue(); + SDValue N1 = N->getOperand(1); + + // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. + if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && + DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + APInt SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + + APInt SplatBits0, SplatBits1; + BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); + BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); + // Ensure that the second operand of both ands are constants + if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, + HasAnyUndefs) && !HasAnyUndefs) { + if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, + HasAnyUndefs) && !HasAnyUndefs) { + // Ensure that the bit width of the constants are the same and that + // the splat arguments are logical inverses as per the pattern we + // are trying to simplify. + if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && + SplatBits0 == ~SplatBits1) { + // Canonicalize the vector type to make instruction selection + // simpler. + EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; + SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, + N0->getOperand(1), + N0->getOperand(0), + N1->getOperand(0)); + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } + } + } + } + + // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when + // reasonable. + + // BFI is only available on V6T2+ + if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) + return SDValue(); + + SDLoc DL(N); + // 1) or (and A, mask), val => ARMbfi A, val, mask + // iff (val & mask) == val + // + // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask + // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) + // && mask == ~mask2 + // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) + // && ~mask == mask2 + // (i.e., copy a bitfield value into another bitfield of the same width) + + if (VT != MVT::i32) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + + // The value and the mask need to be constants so we can verify this is + // actually a bitfield set. If the mask is 0xffff, we can do better + // via a movt instruction, so don't use BFI in that case. + SDValue MaskOp = N0.getOperand(1); + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp); + if (!MaskC) + return SDValue(); + unsigned Mask = MaskC->getZExtValue(); + if (Mask == 0xffff) + return SDValue(); + SDValue Res; + // Case (1): or (and A, mask), val => ARMbfi A, val, mask + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + if (N1C) { + unsigned Val = N1C->getZExtValue(); + if ((Val & ~Mask) != Val) + return SDValue(); + + if (ARM::isBitFieldInvertedMask(Mask)) { + Val >>= countTrailingZeros(~Mask); + + Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, + DAG.getConstant(Val, DL, MVT::i32), + DAG.getConstant(Mask, DL, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. 
+ DCI.CombineTo(N, Res, false); + return SDValue(); + } + } else if (N1.getOpcode() == ISD::AND) { + // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask + ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); + if (!N11C) + return SDValue(); + unsigned Mask2 = N11C->getZExtValue(); + + // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern + // as is to match. + if (ARM::isBitFieldInvertedMask(Mask) && + (Mask == ~Mask2)) { + // The pack halfword instruction works better for masks that fit it, + // so use that when it's available. + if (Subtarget->hasT2ExtractPack() && + (Mask == 0xffff || Mask == 0xffff0000)) + return SDValue(); + // 2a + unsigned amt = countTrailingZeros(Mask2); + Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), + DAG.getConstant(amt, DL, MVT::i32)); + Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, + DAG.getConstant(Mask, DL, MVT::i32)); + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + return SDValue(); + } else if (ARM::isBitFieldInvertedMask(~Mask) && + (~Mask == Mask2)) { + // The pack halfword instruction works better for masks that fit it, + // so use that when it's available. + if (Subtarget->hasT2ExtractPack() && + (Mask2 == 0xffff || Mask2 == 0xffff0000)) + return SDValue(); + // 2b + unsigned lsb = countTrailingZeros(Mask); + Res = DAG.getNode(ISD::SRL, DL, VT, N00, + DAG.getConstant(lsb, DL, MVT::i32)); + Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, + DAG.getConstant(Mask2, DL, MVT::i32)); + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + return SDValue(); + } + } + + if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) && + N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) && + ARM::isBitFieldInvertedMask(~Mask)) { + // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask + // where lsb(mask) == #shamt and masked bits of B are known zero. + SDValue ShAmt = N00.getOperand(1); + unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); + unsigned LSB = countTrailingZeros(Mask); + if (ShAmtC != LSB) + return SDValue(); + + Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), + DAG.getConstant(~Mask, DL, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + } + + return SDValue(); +} + +static SDValue PerformXORCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + + if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + if (!Subtarget->isThumb1Only()) { + // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) + SDValue Result = combineSelectAndUseCommutative(N, false, DCI); + if (Result.getNode()) + return Result; + } + + return SDValue(); +} + +// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, +// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and +// their position in "to" (Rd). +static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { + assert(N->getOpcode() == ARMISD::BFI); + + SDValue From = N->getOperand(1); + ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); + FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); + + // If the Base came from a SHR #C, we can deduce that it is really testing bit + // #C in the base of the SHR. 
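+  // (E.g., hypothetically, for (bfi Rd, (srl Rn, 7), mask) the copied bits
+  //  really start at bit 7 of Rn, so FromMask is shifted left by the SRL
+  //  amount below.)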
+ if (From->getOpcode() == ISD::SRL && + isa<ConstantSDNode>(From->getOperand(1))) { + APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); + assert(Shift.getLimitedValue() < 32 && "Shift too large!"); + FromMask <<= Shift.getLimitedValue(31); + From = From->getOperand(0); + } + + return From; +} + +// If A and B contain one contiguous set of bits, does A | B == A . B? +// +// Neither A nor B must be zero. +static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { + unsigned LastActiveBitInA = A.countTrailingZeros(); + unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; + return LastActiveBitInA - 1 == FirstActiveBitInB; +} + +static SDValue FindBFIToCombineWith(SDNode *N) { + // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, + // if one exists. + APInt ToMask, FromMask; + SDValue From = ParseBFI(N, ToMask, FromMask); + SDValue To = N->getOperand(0); + + // Now check for a compatible BFI to merge with. We can pass through BFIs that + // aren't compatible, but not if they set the same bit in their destination as + // we do (or that of any BFI we're going to combine with). + SDValue V = To; + APInt CombinedToMask = ToMask; + while (V.getOpcode() == ARMISD::BFI) { + APInt NewToMask, NewFromMask; + SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); + if (NewFrom != From) { + // This BFI has a different base. Keep going. + CombinedToMask |= NewToMask; + V = V.getOperand(0); + continue; + } + + // Do the written bits conflict with any we've seen so far? + if ((NewToMask & CombinedToMask).getBoolValue()) + // Conflicting bits - bail out because going further is unsafe. + return SDValue(); + + // Are the new bits contiguous when combined with the old bits? + if (BitsProperlyConcatenate(ToMask, NewToMask) && + BitsProperlyConcatenate(FromMask, NewFromMask)) + return V; + if (BitsProperlyConcatenate(NewToMask, ToMask) && + BitsProperlyConcatenate(NewFromMask, FromMask)) + return V; + + // We've seen a write to some bits, so track it. + CombinedToMask |= NewToMask; + // Keep going... + V = V.getOperand(0); + } + + return SDValue(); +} + +static SDValue PerformBFICombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() == ISD::AND) { + // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff + // the bits being cleared by the AND are not demanded by the BFI. + ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); + if (!N11C) + return SDValue(); + unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); + unsigned LSB = countTrailingZeros(~InvMask); + unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; + assert(Width < + static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && + "undefined behavior"); + unsigned Mask = (1u << Width) - 1; + unsigned Mask2 = N11C->getZExtValue(); + if ((Mask & (~Mask2)) == 0) + return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), + N->getOperand(0), N1.getOperand(0), + N->getOperand(2)); + } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { + // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. + // Keep track of any consecutive bits set that all come from the same base + // value. We can combine these together into a single BFI. + SDValue CombineBFI = FindBFIToCombineWith(N); + if (CombineBFI == SDValue()) + return SDValue(); + + // We've found a BFI. 
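+    // (Sketch with hypothetical masks: this BFI writes bits [3:0] of the
+    //  destination from From, and CombineBFI writes bits [7:4] from
+    //  (srl From, 4); having passed the contiguity checks in
+    //  FindBFIToCombineWith, they merge below into one BFI writing bits
+    //  [7:0] directly from From.)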
+ APInt ToMask1, FromMask1; + SDValue From1 = ParseBFI(N, ToMask1, FromMask1); + + APInt ToMask2, FromMask2; + SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); + assert(From1 == From2); + (void)From2; + + // First, unlink CombineBFI. + DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); + // Then create a new BFI, combining the two together. + APInt NewFromMask = FromMask1 | FromMask2; + APInt NewToMask = ToMask1 | ToMask2; + + EVT VT = N->getValueType(0); + SDLoc dl(N); + + if (NewFromMask[0] == 0) + From1 = DCI.DAG.getNode( + ISD::SRL, dl, VT, From1, + DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); + return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, + DCI.DAG.getConstant(~NewToMask, dl, VT)); + } + return SDValue(); +} + +/// PerformVMOVRRDCombine - Target-specific dag combine xforms for +/// ARMISD::VMOVRRD. +static SDValue PerformVMOVRRDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // vmovrrd(vmovdrr x, y) -> x,y + SDValue InDouble = N->getOperand(0); + if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP()) + return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); + + // vmovrrd(load f64) -> (load i32), (load i32) + SDNode *InNode = InDouble.getNode(); + if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && + InNode->getValueType(0) == MVT::f64 && + InNode->getOperand(1).getOpcode() == ISD::FrameIndex && + !cast<LoadSDNode>(InNode)->isVolatile()) { + // TODO: Should this be done for non-FrameIndex operands? + LoadSDNode *LD = cast<LoadSDNode>(InNode); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(LD); + SDValue BasePtr = LD->getBasePtr(); + SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, + LD->getPointerInfo(), LD->isVolatile(), + LD->isNonTemporal(), LD->isInvariant(), + LD->getAlignment()); + + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, + DAG.getConstant(4, DL, MVT::i32)); + SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, + LD->getPointerInfo(), LD->isVolatile(), + LD->isNonTemporal(), LD->isInvariant(), + std::min(4U, LD->getAlignment() / 2)); + + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); + if (DCI.DAG.getDataLayout().isBigEndian()) + std::swap (NewLD1, NewLD2); + SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); + return Result; + } + + return SDValue(); +} + +/// PerformVMOVDRRCombine - Target-specific dag combine xforms for +/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. +static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { + // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Op0.getOpcode() == ISD::BITCAST) + Op0 = Op0.getOperand(0); + if (Op1.getOpcode() == ISD::BITCAST) + Op1 = Op1.getOperand(0); + if (Op0.getOpcode() == ARMISD::VMOVRRD && + Op0.getNode() == Op1.getNode() && + Op0.getResNo() == 0 && Op1.getResNo() == 1) + return DAG.getNode(ISD::BITCAST, SDLoc(N), + N->getValueType(0), Op0.getOperand(0)); + return SDValue(); +} + +/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node +/// are normal, non-volatile loads. If so, it is profitable to bitcast an +/// i64 vector to have f64 elements, since the value can then be loaded +/// directly into a VFP register. 
+static bool hasNormalLoadOperand(SDNode *N) { + unsigned NumElts = N->getValueType(0).getVectorNumElements(); + for (unsigned i = 0; i < NumElts; ++i) { + SDNode *Elt = N->getOperand(i).getNode(); + if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) + return true; + } + return false; +} + +/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for +/// ISD::BUILD_VECTOR. +static SDValue PerformBUILD_VECTORCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): + // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value + // into a pair of GPRs, which is fine when the value is used as a scalar, + // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. + SelectionDAG &DAG = DCI.DAG; + if (N->getNumOperands() == 2) { + SDValue RV = PerformVMOVDRRCombine(N, DAG); + if (RV.getNode()) + return RV; + } + + // Load i64 elements as f64 values so that type legalization does not split + // them up into i32 values. + EVT VT = N->getValueType(0); + if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) + return SDValue(); + SDLoc dl(N); + SmallVector<SDValue, 8> Ops; + unsigned NumElts = VT.getVectorNumElements(); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); + Ops.push_back(V); + // Make the DAGCombiner fold the bitcast. + DCI.AddToWorklist(V.getNode()); + } + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops); + return DAG.getNode(ISD::BITCAST, dl, VT, BV); +} + +/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. +static SDValue +PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. + // At that time, we may have inserted bitcasts from integer to float. + // If these bitcasts have survived DAGCombine, change the lowering of this + // BUILD_VECTOR in something more vector friendly, i.e., that does not + // force to use floating point types. + + // Make sure we can change the type of the vector. + // This is possible iff: + // 1. The vector is only used in a bitcast to a integer type. I.e., + // 1.1. Vector is used only once. + // 1.2. Use is a bit convert to an integer type. + // 2. The size of its operands are 32-bits (64-bits are not legal). + EVT VT = N->getValueType(0); + EVT EltVT = VT.getVectorElementType(); + + // Check 1.1. and 2. + if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) + return SDValue(); + + // By construction, the input type must be float. + assert(EltVT == MVT::f32 && "Unexpected type!"); + + // Check 1.2. + SDNode *Use = *N->use_begin(); + if (Use->getOpcode() != ISD::BITCAST || + Use->getValueType(0).isFloatingPoint()) + return SDValue(); + + // Check profitability. + // Model is, if more than half of the relevant operands are bitcast from + // i32, turn the build_vector into a sequence of insert_vector_elt. + // Relevant operands are everything that is not statically + // (i.e., at compile time) bitcasted. + unsigned NumOfBitCastedElts = 0; + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumOfRelevantElts = NumElts; + for (unsigned Idx = 0; Idx < NumElts; ++Idx) { + SDValue Elt = N->getOperand(Idx); + if (Elt->getOpcode() == ISD::BITCAST) { + // Assume only bit cast to i32 will go away. 
+ if (Elt->getOperand(0).getValueType() == MVT::i32) + ++NumOfBitCastedElts; + } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt)) + // Constants are statically casted, thus do not count them as + // relevant operands. + --NumOfRelevantElts; + } + + // Check if more than half of the elements require a non-free bitcast. + if (NumOfBitCastedElts <= NumOfRelevantElts / 2) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + // Create the new vector type. + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + // Check if the type is legal. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(VecVT)) + return SDValue(); + + // Combine: + // ARMISD::BUILD_VECTOR E1, E2, ..., EN. + // => BITCAST INSERT_VECTOR_ELT + // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), + // (BITCAST EN), N. + SDValue Vec = DAG.getUNDEF(VecVT); + SDLoc dl(N); + for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { + SDValue V = N->getOperand(Idx); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (V.getOpcode() == ISD::BITCAST && + V->getOperand(0).getValueType() == MVT::i32) + // Fold obvious case. + V = V.getOperand(0); + else { + V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(V.getNode()); + } + SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); + } + Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(Vec.getNode()); + return Vec; +} + +/// PerformInsertEltCombine - Target-specific dag combine xforms for +/// ISD::INSERT_VECTOR_ELT. +static SDValue PerformInsertEltCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // Bitcast an i64 load inserted into a vector to f64. + // Otherwise, the i64 value will be legalized to a pair of i32 values. + EVT VT = N->getValueType(0); + SDNode *Elt = N->getOperand(1).getNode(); + if (VT.getVectorElementType() != MVT::i64 || + !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(N); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, + VT.getVectorNumElements()); + SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); + SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(Vec.getNode()); + DCI.AddToWorklist(V.getNode()); + SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, + Vec, V, N->getOperand(2)); + return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); +} + +/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for +/// ISD::VECTOR_SHUFFLE. +static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { + // The LLVM shufflevector instruction does not require the shuffle mask + // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does + // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the + // operands do not match the mask length, they are extended by concatenating + // them with undef vectors. That is probably the right thing for other + // targets, but for NEON it is better to concatenate two double-register + // size vector operands into a single quad-register size vector. 
Do that + // transformation here: + // shuffle(concat(v1, undef), concat(v2, undef)) -> + // shuffle(concat(v1, v2), undef) + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Op0.getOpcode() != ISD::CONCAT_VECTORS || + Op1.getOpcode() != ISD::CONCAT_VECTORS || + Op0.getNumOperands() != 2 || + Op1.getNumOperands() != 2) + return SDValue(); + SDValue Concat0Op1 = Op0.getOperand(1); + SDValue Concat1Op1 = Op1.getOperand(1); + if (Concat0Op1.getOpcode() != ISD::UNDEF || + Concat1Op1.getOpcode() != ISD::UNDEF) + return SDValue(); + // Skip the transformation if any of the types are illegal. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = N->getValueType(0); + if (!TLI.isTypeLegal(VT) || + !TLI.isTypeLegal(Concat0Op1.getValueType()) || + !TLI.isTypeLegal(Concat1Op1.getValueType())) + return SDValue(); + + SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, + Op0.getOperand(0), Op1.getOperand(0)); + // Translate the shuffle mask. + SmallVector<int, 16> NewMask; + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfElts = NumElts/2; + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); + for (unsigned n = 0; n < NumElts; ++n) { + int MaskElt = SVN->getMaskElt(n); + int NewElt = -1; + if (MaskElt < (int)HalfElts) + NewElt = MaskElt; + else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) + NewElt = HalfElts + MaskElt - NumElts; + NewMask.push_back(NewElt); + } + return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, + DAG.getUNDEF(VT), NewMask.data()); +} + +/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, +/// NEON load/store intrinsics, and generic vector load/stores, to merge +/// base address updates. +/// For generic load/stores, the memory type is assumed to be a vector. +/// The caller is assumed to have checked legality. +static SDValue CombineBaseUpdate(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || + N->getOpcode() == ISD::INTRINSIC_W_CHAIN); + const bool isStore = N->getOpcode() == ISD::STORE; + const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); + SDValue Addr = N->getOperand(AddrOpIdx); + MemSDNode *MemN = cast<MemSDNode>(N); + SDLoc dl(N); + + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), + UE = Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD || + UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load/store. Otherwise, folding + // it would create a cycle. + if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) + continue; + + // Find the new opcode for the updating load/store. 
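+    // (Illustrative IR for the generic-load case, hypothetical names: given
+    //    %v = load <4 x i32>, <4 x i32>* %p
+    //    %p.inc = getelementptr i8, i8* %p, i32 16
+    //  the add of 16 matches the access size computed further down, and the
+    //  load is rewritten as a VLD1_UPD that also yields %p.inc.)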
+ bool isLoadOp = true; + bool isLaneOp = false; + unsigned NewOpc = 0; + unsigned NumVecs = 0; + if (isIntrinsic) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: llvm_unreachable("unexpected intrinsic for Neon base update"); + case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; break; + case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; + NumVecs = 2; break; + case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; + NumVecs = 3; break; + case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; + NumVecs = 4; break; + case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; + NumVecs = 2; isLaneOp = true; break; + case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; + NumVecs = 3; isLaneOp = true; break; + case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; + NumVecs = 4; isLaneOp = true; break; + case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLoadOp = false; break; + case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; + NumVecs = 2; isLoadOp = false; break; + case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; + NumVecs = 3; isLoadOp = false; break; + case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; + NumVecs = 4; isLoadOp = false; break; + case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; + NumVecs = 2; isLoadOp = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; + NumVecs = 3; isLoadOp = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; + NumVecs = 4; isLoadOp = false; isLaneOp = true; break; + } + } else { + isLaneOp = true; + switch (N->getOpcode()) { + default: llvm_unreachable("unexpected opcode for Neon base update"); + case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; + case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; + case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; + case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; isLaneOp = false; break; + case ISD::STORE: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLaneOp = false; isLoadOp = false; break; + } + } + + // Find the size of memory referenced by the load/store. + EVT VecTy; + if (isLoadOp) { + VecTy = N->getValueType(0); + } else if (isIntrinsic) { + VecTy = N->getOperand(AddrOpIdx+1).getValueType(); + } else { + assert(isStore && "Node has to be a load, a store, or an intrinsic!"); + VecTy = N->getOperand(1).getValueType(); + } + + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (isLaneOp) + NumBytes /= VecTy.getVectorNumElements(); + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { + uint64_t IncVal = CInc->getZExtValue(); + if (IncVal != NumBytes) + continue; + } else if (NumBytes >= 3 * 16) { + // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two + // separate instructions that make it harder to use a non-constant update. + continue; + } + + // OK, we found an ADD we can fold into the base update. + // Now, create a _UPD node, taking care of not breaking alignment. + + EVT AlignedVecTy = VecTy; + unsigned Alignment = MemN->getAlignment(); + + // If this is a less-than-standard-aligned load/store, change the type to + // match the standard alignment. 
+ // The alignment is overlooked when selecting _UPD variants; and it's
+ // easier to introduce bitcasts here than fix that.
+ // There are 3 ways to get to this base-update combine:
+ // - intrinsics: they are assumed to be properly aligned (to the standard
+ // alignment of the memory type), so we don't need to do anything.
+ // - ARMISD::VLDx nodes: they are only generated from the aforementioned
+ // intrinsics, so, likewise, there's nothing to do.
+ // - generic load/store instructions: the alignment is specified as an
+ // explicit operand, rather than implicitly as the standard alignment
+ // of the memory type (like the intrinsics). We need to change the
+ // memory type to match the explicit alignment. That way, we don't
+ // generate non-standard-aligned ARMISD::VLDx nodes.
+ if (isa<LSBaseSDNode>(N)) {
+ if (Alignment == 0)
+ Alignment = 1;
+ if (Alignment < VecTy.getScalarSizeInBits() / 8) {
+ MVT EltTy = MVT::getIntegerVT(Alignment * 8);
+ assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
+ assert(!isLaneOp && "Unexpected generic load/store lane.");
+ unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
+ AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
+ }
+ // Don't set an explicit alignment on regular load/stores that we want
+ // to transform to VLD/VST 1_UPD nodes.
+ // This matches the behavior of regular load/stores, which only get an
+ // explicit alignment if the MMO alignment is larger than the standard
+ // alignment of the memory type.
+ // Intrinsics, however, always get an explicit alignment, set to the
+ // alignment of the MMO.
+ Alignment = 1;
+ }
+
+ // Create the new updating load/store node.
+ // First, create an SDVTList for the new updating node's results.
+ EVT Tys[6];
+ unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
+ unsigned n;
+ for (n = 0; n < NumResultVecs; ++n)
+ Tys[n] = AlignedVecTy;
+ Tys[n++] = MVT::i32;
+ Tys[n] = MVT::Other;
+ SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
+
+ // Then, gather the new node's operands.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(N->getOperand(0)); // incoming chain
+ Ops.push_back(N->getOperand(AddrOpIdx));
+ Ops.push_back(Inc);
+
+ if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
+ // Try to match the intrinsic's signature.
+ Ops.push_back(StN->getValue());
+ } else {
+ // Loads (and of course intrinsics) match the intrinsics' signature,
+ // so just add all but the alignment operand.
+ for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
+ Ops.push_back(N->getOperand(i));
+ }
+
+ // For all node types, the alignment operand is always the last one.
+ Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
+
+ // If this is a non-standard-aligned STORE, the penultimate operand is the
+ // stored value. Bitcast it to the aligned type.
+ if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
+ SDValue &StVal = Ops[Ops.size()-2];
+ StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
+ }
+
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys,
+ Ops, AlignedVecTy,
+ MemN->getMemOperand());
+
+ // Update the uses.
+ SmallVector<SDValue, 5> NewResults;
+ for (unsigned i = 0; i < NumResultVecs; ++i)
+ NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+ // If this is a non-standard-aligned LOAD, the first result is the loaded
+ // value. Bitcast it to the expected result type.
+ if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { + SDValue &LdVal = NewResults[0]; + LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); + } + + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + + break; + } + return SDValue(); +} + +static SDValue PerformVLDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + return CombineBaseUpdate(N, DCI); +} + +/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a +/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic +/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and +/// return true. +static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + // vldN-dup instructions only support 64-bit vectors for N > 1. + if (!VT.is64BitVector()) + return false; + + // Check if the VDUPLANE operand is a vldN-dup intrinsic. + SDNode *VLD = N->getOperand(0).getNode(); + if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return false; + unsigned NumVecs = 0; + unsigned NewOpc = 0; + unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); + if (IntNo == Intrinsic::arm_neon_vld2lane) { + NumVecs = 2; + NewOpc = ARMISD::VLD2DUP; + } else if (IntNo == Intrinsic::arm_neon_vld3lane) { + NumVecs = 3; + NewOpc = ARMISD::VLD3DUP; + } else if (IntNo == Intrinsic::arm_neon_vld4lane) { + NumVecs = 4; + NewOpc = ARMISD::VLD4DUP; + } else { + return false; + } + + // First check that all the vldN-lane uses are VDUPLANEs and that the lane + // numbers match the load. + unsigned VLDLaneNo = + cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue(); + for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); + UI != UE; ++UI) { + // Ignore uses of the chain result. + if (UI.getUse().getResNo() == NumVecs) + continue; + SDNode *User = *UI; + if (User->getOpcode() != ARMISD::VDUPLANE || + VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) + return false; + } + + // Create the vldN-dup node. + EVT Tys[5]; + unsigned n; + for (n = 0; n < NumVecs; ++n) + Tys[n] = VT; + Tys[n] = MVT::Other; + SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); + SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; + MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); + SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, + Ops, VLDMemInt->getMemoryVT(), + VLDMemInt->getMemOperand()); + + // Update the uses. + for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); + UI != UE; ++UI) { + unsigned ResNo = UI.getUse().getResNo(); + // Ignore uses of the chain result. + if (ResNo == NumVecs) + continue; + SDNode *User = *UI; + DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); + } + + // Now the vldN-lane intrinsic is dead except for its chain result. + // Update uses of the chain. + std::vector<SDValue> VLDDupResults; + for (unsigned n = 0; n < NumVecs; ++n) + VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); + VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); + DCI.CombineTo(VLD, VLDDupResults); + + return true; +} + +/// PerformVDUPLANECombine - Target-specific dag combine xforms for +/// ARMISD::VDUPLANE. 
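+/// As a rough example of the CombineVLDDUP path above: when a vld2lane
+/// intrinsic is used only to broadcast the loaded lane, i.e. every use is a
+/// VDUPLANE of that same lane, the whole group can be rebuilt as a VLD2DUP.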
+static SDValue PerformVDUPLANECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue Op = N->getOperand(0); + + // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses + // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. + if (CombineVLDDUP(N, DCI)) + return SDValue(N, 0); + + // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is + // redundant. Ignore bit_converts for now; element sizes are checked below. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) + return SDValue(); + + // Make sure the VMOV element size is not bigger than the VDUPLANE elements. + unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits(); + // The canonical VMOV for a zero vector uses a 32-bit element size. + unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned EltBits; + if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) + EltSize = 8; + EVT VT = N->getValueType(0); + if (EltSize > VT.getVectorElementType().getSizeInBits()) + return SDValue(); + + return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); +} + +static SDValue PerformLOADCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + + // If this is a legal vector load, try to combine it into a VLD1_UPD. + if (ISD::isNormalLoad(N) && VT.isVector() && + DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return CombineBaseUpdate(N, DCI); + + return SDValue(); +} + +/// PerformSTORECombine - Target-specific dag combine xforms for +/// ISD::STORE. +static SDValue PerformSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + StoreSDNode *St = cast<StoreSDNode>(N); + if (St->isVolatile()) + return SDValue(); + + // Optimize trunc store (of multiple scalars) to shuffle and store. First, + // pack all of the elements in one place. Next, store to memory in fewer + // chunks. + SDValue StVal = St->getValue(); + EVT VT = StVal.getValueType(); + if (St->isTruncatingStore() && VT.isVector()) { + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT StVT = St->getMemoryVT(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromEltSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); + + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); + + unsigned SizeRatio = FromEltSz / ToEltSz; + assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle. + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), + NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDLoc DL(St); + SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i < NumElems; ++i) + ShuffleVec[i] = DAG.getDataLayout().isBigEndian() + ? (i + 1) * SizeRatio - 1 + : i * SizeRatio; + + // Can't shuffle using an illegal type. 
+ if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, + DAG.getUNDEF(WideVec.getValueType()), + ShuffleVec.data()); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. + + // Find the largest store unit + MVT StoreType = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) + StoreType = Tp; + } + // Didn't find a legal store type. + if (!TLI.isTypeLegal(StoreType)) + return SDValue(); + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); + SmallVector<SDValue, 8> Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, + TLI.getPointerTy(DAG.getDataLayout())); + SDValue BasePtr = St->getBasePtr(); + + // Perform one or more big stores into memory. + unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); + for (unsigned I = 0; I < E; I++) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + StoreType, ShuffWide, + DAG.getIntPtrConstant(I, DL)); + SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, + Increment); + Chains.push_back(Ch); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (!ISD::isNormalStore(St)) + return SDValue(); + + // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and + // ARM stores of arguments in the same cache line. + if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && + StVal.getNode()->hasOneUse()) { + SelectionDAG &DAG = DCI.DAG; + bool isBigEndian = DAG.getDataLayout().isBigEndian(); + SDLoc DL(St); + SDValue BasePtr = St->getBasePtr(); + SDValue NewST1 = DAG.getStore(St->getChain(), DL, + StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ), + BasePtr, St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, + DAG.getConstant(4, DL, MVT::i32)); + return DAG.getStore(NewST1.getValue(0), DL, + StVal.getNode()->getOperand(isBigEndian ? 0 : 1), + OffsetPtr, St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), + std::min(4U, St->getAlignment() / 2)); + } + + if (StVal.getValueType() == MVT::i64 && + StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + + // Bitcast an i64 store extracted from a vector to f64. + // Otherwise, the i64 value will be legalized to a pair of i32 values. + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(StVal); + SDValue IntVec = StVal.getOperand(0); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, + IntVec.getValueType().getVectorNumElements()); + SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); + SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + Vec, StVal.getOperand(1)); + dl = SDLoc(N); + SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); + // Make the DAGCombiner fold the bitcasts. 
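+ // (Adding them to the worklist lets folds such as
+ // (bitcast (bitcast x)) -> x fire on the newly created nodes; an
+ // illustrative note.)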
+ DCI.AddToWorklist(Vec.getNode());
+ DCI.AddToWorklist(ExtElt.getNode());
+ DCI.AddToWorklist(V.getNode());
+ return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment(),
+ St->getAAInfo());
+ }
+
+ // If this is a legal vector store, try to combine it into a VST1_UPD.
+ if (ISD::isNormalStore(N) && VT.isVector() &&
+ DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return CombineBaseUpdate(N, DCI);
+
+ return SDValue();
+}
+
+/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
+/// can replace combinations of VMUL and VCVT (floating-point to integer)
+/// when the VMUL has a constant operand that is a power of 2.
+///
+/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
+/// vmul.f32 d16, d17, d16
+/// vcvt.s32.f32 d16, d16
+/// becomes:
+/// vcvt.s32.f32 d16, d16, #3
+static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
+ SDValue Op = N->getOperand(0);
+ if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
+ return SDValue();
+
+ SDValue ConstVec = Op->getOperand(1);
+ if (!isa<BuildVectorSDNode>(ConstVec))
+ return SDValue();
+
+ MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+ uint32_t FloatBits = FloatTy.getSizeInBits();
+ MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+ uint32_t IntBits = IntTy.getSizeInBits();
+ unsigned NumLanes = Op.getValueType().getVectorNumElements();
+ if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
+ // These instructions only exist converting from f32 to i32. We can handle
+ // smaller integers by generating an extra truncate, but larger ones would
+ // be lossy. We also can't handle more than 4 lanes, since these
+ // instructions only support v2i32/v4i32 types.
+ return SDValue();
+ }
+
+ BitVector UndefElements;
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+ int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
+ if (C == -1 || C == 0 || C > 32)
+ return SDValue();
+
+ SDLoc dl(N);
+ bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
+ unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
+ Intrinsic::arm_neon_vcvtfp2fxu;
+ SDValue FixConv = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+ DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
+ DAG.getConstant(C, dl, MVT::i32));
+
+ if (IntBits < FloatBits)
+ FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
+
+ return FixConv;
+}
+
+/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
+/// can replace combinations of VCVT (integer to floating-point) and VDIV
+/// when the VDIV has a constant operand that is a power of 2.
+///
+/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
+/// vcvt.f32.s32 d16, d16
+/// vdiv.f32 d16, d17, d16
+/// becomes:
+/// vcvt.f32.s32 d16, d16, #3
+static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
+ SDValue Op = N->getOperand(0);
+ unsigned OpOpcode = Op.getNode()->getOpcode();
+ if (!N->getValueType(0).isVector() ||
+ (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
+ return SDValue();
+
+ SDValue ConstVec = N->getOperand(1);
+ if (!isa<BuildVectorSDNode>(ConstVec))
+ return SDValue();
+
+ MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+ uint32_t FloatBits = FloatTy.getSizeInBits();
+ MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
+ uint32_t IntBits = IntTy.getSizeInBits();
+ unsigned NumLanes = Op.getValueType().getVectorNumElements();
+ if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
+ // These instructions only exist converting from i32 to f32. We can handle
+ // smaller integers by generating an extra extend, but larger ones would
+ // be lossy. We also can't handle more than 4 lanes, since these
+ // instructions only support v2i32/v4i32 types.
+ return SDValue();
+ }
+
+ BitVector UndefElements;
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+ int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
+ if (C == -1 || C == 0 || C > 32)
+ return SDValue();
+
+ SDLoc dl(N);
+ bool isSigned = OpOpcode == ISD::SINT_TO_FP;
+ SDValue ConvInput = Op.getOperand(0);
+ if (IntBits < FloatBits)
+ ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+ dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+ ConvInput);
+
+ unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
+ Intrinsic::arm_neon_vcvtfxu2fp;
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
+ Op.getValueType(),
+ DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
+ ConvInput, DAG.getConstant(C, dl, MVT::i32));
+}
+
+/// getVShiftImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+ // Ignore bit_converts.
+ while (Op.getOpcode() == ISD::BITCAST)
+ Op = Op.getOperand(0);
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+ HasAnyUndefs, ElementBits) ||
+ SplatBitSize > ElementBits)
+ return false;
+ Cnt = SplatBits.getSExtValue();
+ return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation. That value must be in the range:
+/// 0 <= Value < ElementBits for a left shift; or
+/// 0 <= Value <= ElementBits for a long left shift.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
+ if (! getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation. For a shift opcode, the value
+/// is positive, but for an intrinsic the value must be negative. The
+/// absolute value must be in the range:
+/// 1 <= |Value| <= ElementBits for a right shift; or
+/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+ int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
+ if (! getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ if (!isIntrinsic)
+ return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
+ if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
+ Cnt = -Cnt;
+ return true;
+ }
+ return false;
+}
+
+/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
+static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default:
+ // Don't do anything for most intrinsics.
+ break;
+
+ // Vector shifts: check for immediate versions and lower them.
+ // Note: This is done during DAG combining instead of DAG legalizing because
+ // the build_vectors for 64-bit vector element shift counts are generally
+ // not legal, and it is hard to see their values after they get legalized to
+ // loads from a constant pool.
+ case Intrinsic::arm_neon_vshifts:
+ case Intrinsic::arm_neon_vshiftu:
+ case Intrinsic::arm_neon_vrshifts:
+ case Intrinsic::arm_neon_vrshiftu:
+ case Intrinsic::arm_neon_vrshiftn:
+ case Intrinsic::arm_neon_vqshifts:
+ case Intrinsic::arm_neon_vqshiftu:
+ case Intrinsic::arm_neon_vqshiftsu:
+ case Intrinsic::arm_neon_vqshiftns:
+ case Intrinsic::arm_neon_vqshiftnu:
+ case Intrinsic::arm_neon_vqshiftnsu:
+ case Intrinsic::arm_neon_vqrshiftns:
+ case Intrinsic::arm_neon_vqrshiftnu:
+ case Intrinsic::arm_neon_vqrshiftnsu: {
+ EVT VT = N->getOperand(1).getValueType();
+ int64_t Cnt;
+ unsigned VShiftOpc = 0;
+
+ switch (IntNo) {
+ case Intrinsic::arm_neon_vshifts:
+ case Intrinsic::arm_neon_vshiftu:
+ if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
+ VShiftOpc = ARMISD::VSHL;
+ break;
+ }
+ if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
+ VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
+ ARMISD::VSHRs : ARMISD::VSHRu);
+ break;
+ }
+ return SDValue();
+
+ case Intrinsic::arm_neon_vrshifts:
+ case Intrinsic::arm_neon_vrshiftu:
+ if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
+ break;
+ return SDValue();
+
+ case Intrinsic::arm_neon_vqshifts:
+ case Intrinsic::arm_neon_vqshiftu:
+ if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
+ break;
+ return SDValue();
+
+ case Intrinsic::arm_neon_vqshiftsu:
+ if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
+ break;
+ llvm_unreachable("invalid shift count for vqshlu intrinsic");
+
+ case Intrinsic::arm_neon_vrshiftn:
+ case Intrinsic::arm_neon_vqshiftns:
+ case Intrinsic::arm_neon_vqshiftnu:
+ case Intrinsic::arm_neon_vqshiftnsu:
+ case Intrinsic::arm_neon_vqrshiftns:
+ case Intrinsic::arm_neon_vqrshiftnu:
+ case Intrinsic::arm_neon_vqrshiftnsu:
+ // Narrowing shifts require an immediate right shift.
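+ // (e.g. for vqshrn.s32, which narrows 32-bit lanes, the per-lane shift
+ // count must lie in [1, 16]; illustrative.)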
+ if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) + break; + llvm_unreachable("invalid shift count for narrowing vector shift " + "intrinsic"); + + default: + llvm_unreachable("unhandled vector shift"); + } + + switch (IntNo) { + case Intrinsic::arm_neon_vshifts: + case Intrinsic::arm_neon_vshiftu: + // Opcode already set above. + break; + case Intrinsic::arm_neon_vrshifts: + VShiftOpc = ARMISD::VRSHRs; break; + case Intrinsic::arm_neon_vrshiftu: + VShiftOpc = ARMISD::VRSHRu; break; + case Intrinsic::arm_neon_vrshiftn: + VShiftOpc = ARMISD::VRSHRN; break; + case Intrinsic::arm_neon_vqshifts: + VShiftOpc = ARMISD::VQSHLs; break; + case Intrinsic::arm_neon_vqshiftu: + VShiftOpc = ARMISD::VQSHLu; break; + case Intrinsic::arm_neon_vqshiftsu: + VShiftOpc = ARMISD::VQSHLsu; break; + case Intrinsic::arm_neon_vqshiftns: + VShiftOpc = ARMISD::VQSHRNs; break; + case Intrinsic::arm_neon_vqshiftnu: + VShiftOpc = ARMISD::VQSHRNu; break; + case Intrinsic::arm_neon_vqshiftnsu: + VShiftOpc = ARMISD::VQSHRNsu; break; + case Intrinsic::arm_neon_vqrshiftns: + VShiftOpc = ARMISD::VQRSHRNs; break; + case Intrinsic::arm_neon_vqrshiftnu: + VShiftOpc = ARMISD::VQRSHRNu; break; + case Intrinsic::arm_neon_vqrshiftnsu: + VShiftOpc = ARMISD::VQRSHRNsu; break; + } + + SDLoc dl(N); + return DAG.getNode(VShiftOpc, dl, N->getValueType(0), + N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); + } + + case Intrinsic::arm_neon_vshiftins: { + EVT VT = N->getOperand(1).getValueType(); + int64_t Cnt; + unsigned VShiftOpc = 0; + + if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) + VShiftOpc = ARMISD::VSLI; + else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) + VShiftOpc = ARMISD::VSRI; + else { + llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); + } + + SDLoc dl(N); + return DAG.getNode(VShiftOpc, dl, N->getValueType(0), + N->getOperand(1), N->getOperand(2), + DAG.getConstant(Cnt, dl, MVT::i32)); + } + + case Intrinsic::arm_neon_vqrshifts: + case Intrinsic::arm_neon_vqrshiftu: + // No immediate versions of these to check for. + break; + } + + return SDValue(); +} + +/// PerformShiftCombine - Checks for immediate versions of vector shifts and +/// lowers them. As with the vector shift intrinsics, this is done during DAG +/// combining instead of DAG legalizing because the build_vectors for 64-bit +/// vector element shift counts are generally not legal, and it is hard to see +/// their values after they get legalized to loads from a constant pool. +static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { + // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high + // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. + SDValue N1 = N->getOperand(1); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { + SDValue N0 = N->getOperand(0); + if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && + DAG.MaskedValueIsZero(N0.getOperand(0), + APInt::getHighBitsSet(32, 16))) + return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); + } + } + + // Nothing to be done for scalar shifts. 
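+ // For vectors, e.g. (shl v4i32:x, (build_vector 3,3,3,3)) is rewritten
+ // below as an ARMISD::VSHL of x by the immediate 3 (illustrative).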
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!VT.isVector() || !TLI.isTypeLegal(VT)) + return SDValue(); + + assert(ST->hasNEON() && "unexpected vector shift"); + int64_t Cnt; + + switch (N->getOpcode()) { + default: llvm_unreachable("unexpected shift opcode"); + + case ISD::SHL: + if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { + SDLoc dl(N); + return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0), + DAG.getConstant(Cnt, dl, MVT::i32)); + } + break; + + case ISD::SRA: + case ISD::SRL: + if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { + unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? + ARMISD::VSHRs : ARMISD::VSHRu); + SDLoc dl(N); + return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), + DAG.getConstant(Cnt, dl, MVT::i32)); + } + } + return SDValue(); +} + +/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, +/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. +static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue N0 = N->getOperand(0); + + // Check for sign- and zero-extensions of vector extract operations of 8- + // and 16-bit vector elements. NEON supports these directly. They are + // handled during DAG combining because type legalization will promote them + // to 32-bit types and it is messy to recognize the operations after that. + if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + SDValue Vec = N0.getOperand(0); + SDValue Lane = N0.getOperand(1); + EVT VT = N->getValueType(0); + EVT EltVT = N0.getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (VT == MVT::i32 && + (EltVT == MVT::i8 || EltVT == MVT::i16) && + TLI.isTypeLegal(Vec.getValueType()) && + isa<ConstantSDNode>(Lane)) { + + unsigned Opc = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case ISD::SIGN_EXTEND: + Opc = ARMISD::VGETLANEs; + break; + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + Opc = ARMISD::VGETLANEu; + break; + } + return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); + } + } + + return SDValue(); +} + +static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero, + APInt &KnownOne) { + if (Op.getOpcode() == ARMISD::BFI) { + // Conservatively, we can recurse down the first operand + // and just mask out all affected bits. + computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne); + + // The operand to BFI is already a mask suitable for removing the bits it + // sets. + ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); + APInt Mask = CI->getAPIntValue(); + KnownZero &= Mask; + KnownOne &= Mask; + return; + } + if (Op.getOpcode() == ARMISD::CMOV) { + APInt KZ2(KnownZero.getBitWidth(), 0); + APInt KO2(KnownOne.getBitWidth(), 0); + computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne); + computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2); + + KnownZero &= KZ2; + KnownOne &= KO2; + return; + } + return DAG.computeKnownBits(Op, KnownZero, KnownOne); +} + +SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { + // If we have a CMOV, OR and AND combination such as: + // if (x & CN) + // y |= CM; + // + // And: + // * CN is a single bit; + // * All bits covered by CM are known zero in y + // + // Then we can convert this into a sequence of BFI instructions. 
This will + // always be a win if CM is a single bit, will always be no worse than the + // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is + // three bits (due to the extra IT instruction). + + SDValue Op0 = CMOV->getOperand(0); + SDValue Op1 = CMOV->getOperand(1); + auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); + auto CC = CCNode->getAPIntValue().getLimitedValue(); + SDValue CmpZ = CMOV->getOperand(4); + + // The compare must be against zero. + if (!isNullConstant(CmpZ->getOperand(1))) + return SDValue(); + + assert(CmpZ->getOpcode() == ARMISD::CMPZ); + SDValue And = CmpZ->getOperand(0); + if (And->getOpcode() != ISD::AND) + return SDValue(); + ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1)); + if (!AndC || !AndC->getAPIntValue().isPowerOf2()) + return SDValue(); + SDValue X = And->getOperand(0); + + if (CC == ARMCC::EQ) { + // We're performing an "equal to zero" compare. Swap the operands so we + // canonicalize on a "not equal to zero" compare. + std::swap(Op0, Op1); + } else { + assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); + } + + if (Op1->getOpcode() != ISD::OR) + return SDValue(); + + ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); + if (!OrC) + return SDValue(); + SDValue Y = Op1->getOperand(0); + + if (Op0 != Y) + return SDValue(); + + // Now, is it profitable to continue? + APInt OrCI = OrC->getAPIntValue(); + unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; + if (OrCI.countPopulation() > Heuristic) + return SDValue(); + + // Lastly, can we determine that the bits defined by OrCI + // are zero in Y? + APInt KnownZero, KnownOne; + computeKnownBits(DAG, Y, KnownZero, KnownOne); + if ((OrCI & KnownZero) != OrCI) + return SDValue(); + + // OK, we can do the combine. + SDValue V = Y; + SDLoc dl(X); + EVT VT = X.getValueType(); + unsigned BitInX = AndC->getAPIntValue().logBase2(); + + if (BitInX != 0) { + // We must shift X first. + X = DAG.getNode(ISD::SRL, dl, VT, X, + DAG.getConstant(BitInX, dl, VT)); + } + + for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); + BitInY < NumActiveBits; ++BitInY) { + if (OrCI[BitInY] == 0) + continue; + APInt Mask(VT.getSizeInBits(), 0); + Mask.setBit(BitInY); + V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, + // Confusingly, the operand is an *inverted* mask. + DAG.getConstant(~Mask, dl, VT)); + } + + return V; +} + +/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. +SDValue +ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { + SDValue Cmp = N->getOperand(4); + if (Cmp.getOpcode() != ARMISD::CMPZ) + // Only looking at EQ and NE cases. + return SDValue(); + + EVT VT = N->getValueType(0); + SDLoc dl(N); + SDValue LHS = Cmp.getOperand(0); + SDValue RHS = Cmp.getOperand(1); + SDValue FalseVal = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue ARMcc = N->getOperand(2); + ARMCC::CondCodes CC = + (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); + + // BFI is only available on V6T2+. + if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { + SDValue R = PerformCMOVToBFICombine(N, DAG); + if (R) + return R; + } + + // Simplify + // mov r1, r0 + // cmp r1, x + // mov r0, y + // moveq r0, x + // to + // cmp r0, x + // movne r0, y + // + // mov r1, r0 + // cmp r1, x + // mov r0, x + // movne r0, y + // to + // cmp r0, x + // movne r0, y + /// FIXME: Turn this into a target neutral optimization? 
+ SDValue Res; + if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { + Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, + N->getOperand(3), Cmp); + } else if (CC == ARMCC::EQ && TrueVal == RHS) { + SDValue ARMcc; + SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); + Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, + N->getOperand(3), NewCmp); + } + + if (Res.getNode()) { + APInt KnownZero, KnownOne; + DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne); + // Capture demanded bits information that would be otherwise lost. + if (KnownZero == 0xfffffffe) + Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, + DAG.getValueType(MVT::i1)); + else if (KnownZero == 0xffffff00) + Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, + DAG.getValueType(MVT::i8)); + else if (KnownZero == 0xffff0000) + Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, + DAG.getValueType(MVT::i16)); + } + + return Res; +} + +SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + switch (N->getOpcode()) { + default: break; + case ISD::ADDC: return PerformADDCCombine(N, DCI, Subtarget); + case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); + case ISD::SUB: return PerformSUBCombine(N, DCI); + case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); + case ISD::OR: return PerformORCombine(N, DCI, Subtarget); + case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); + case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); + case ARMISD::BFI: return PerformBFICombine(N, DCI); + case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); + case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); + case ISD::STORE: return PerformSTORECombine(N, DCI); + case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); + case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); + case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); + case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return PerformVCVTCombine(N, DCI.DAG, Subtarget); + case ISD::FDIV: + return PerformVDIVCombine(N, DCI.DAG, Subtarget); + case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget); + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); + case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); + case ISD::LOAD: return PerformLOADCombine(N, DCI); + case ARMISD::VLD2DUP: + case ARMISD::VLD3DUP: + case ARMISD::VLD4DUP: + return PerformVLDCombine(N, DCI); + case ARMISD::BUILD_VECTOR: + return PerformARMBUILD_VECTORCombine(N, DCI); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: + return PerformVLDCombine(N, DCI); + default: break; + } + break; + } + return SDValue(); +} + +bool 
ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
+ EVT VT) const {
+ return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
+}
+
+bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned,
+ unsigned,
+ bool *Fast) const {
+ // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs
+ bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32: {
+ // Unaligned access can use (for example) LDRB, LDRH, LDR
+ if (AllowsUnaligned) {
+ if (Fast)
+ *Fast = Subtarget->hasV7Ops();
+ return true;
+ }
+ return false;
+ }
+ case MVT::f64:
+ case MVT::v2f64: {
+ // For any little-endian targets with NEON, we can support unaligned ld/st
+ // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
+ // A big-endian target may also explicitly support unaligned accesses
+ if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
+ return false;
+ }
+ }
+}
+
+static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
+ unsigned AlignCheck) {
+ return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
+ (DstAlign == 0 || DstAlign % AlignCheck == 0));
+}
+
+EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
+ unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const {
+ const Function *F = MF.getFunction();
+
+ // See if we can use NEON instructions for this...
+ if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
+ !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ bool Fast;
+ if (Size >= 16 &&
+ (memOpAlign(SrcAlign, DstAlign, 16) ||
+ (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
+ return MVT::v2f64;
+ } else if (Size >= 8 &&
+ (memOpAlign(SrcAlign, DstAlign, 8) ||
+ (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
+ Fast))) {
+ return MVT::f64;
+ }
+ }
+
+ // Lowering to i32/i16 if the size permits.
+ if (Size >= 4)
+ return MVT::i32;
+ else if (Size >= 2)
+ return MVT::i16;
+
+ // Let the target-independent logic figure it out.
+ return MVT::Other;
+}
+
+bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ if (Val.getOpcode() != ISD::LOAD)
+ return false;
+
+ EVT VT1 = Val.getValueType();
+ if (!VT1.isSimple() || !VT1.isInteger() ||
+ !VT2.isSimple() || !VT2.isInteger())
+ return false;
+
+ switch (VT1.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
+ return true;
+ }
+
+ return false;
+}
+
+bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+ EVT VT = ExtVal.getValueType();
+
+ if (!isTypeLegal(VT))
+ return false;
+
+ // Don't create a loadext if we can fold the extension into a wide/long
+ // instruction.
+ // If there's more than one user instruction, the loadext is desirable no
+ // matter what. There can be two uses by the same instruction.
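+ // (Illustratively: an extending vector load whose only user is an ADD can
+ // typically fold into a widening VADDL/VADDW-style instruction, so the
+ // loadext is reported as undesirable below.)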
+ if (ExtVal->use_empty() || + !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) + return true; + + SDNode *U = *ExtVal->use_begin(); + if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || + U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) + return false; + + return true; +} + +bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + + if (!isTypeLegal(EVT::getEVT(Ty1))) + return false; + + assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); + + // Assuming the caller doesn't have a zeroext or signext return parameter, + // truncation all the way down to i1 is valid. + return true; +} + + +static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { + if (V < 0) + return false; + + unsigned Scale = 1; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + // Scale == 1; + break; + case MVT::i16: + // Scale == 2; + Scale = 2; + break; + case MVT::i32: + // Scale == 4; + Scale = 4; + break; + } + + if ((V & (Scale - 1)) != 0) + return false; + V /= Scale; + return V == (V & ((1LL << 5) - 1)); +} + +static bool isLegalT2AddressImmediate(int64_t V, EVT VT, + const ARMSubtarget *Subtarget) { + bool isNeg = false; + if (V < 0) { + isNeg = true; + V = - V; + } + + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + // + imm12 or - imm8 + if (isNeg) + return V == (V & ((1LL << 8) - 1)); + return V == (V & ((1LL << 12) - 1)); + case MVT::f32: + case MVT::f64: + // Same as ARM mode. FIXME: NEON? + if (!Subtarget->hasVFP2()) + return false; + if ((V & 3) != 0) + return false; + V >>= 2; + return V == (V & ((1LL << 8) - 1)); + } +} + +/// isLegalAddressImmediate - Return true if the integer value can be used +/// as the offset of the target addressing mode for load / store of the +/// given type. +static bool isLegalAddressImmediate(int64_t V, EVT VT, + const ARMSubtarget *Subtarget) { + if (V == 0) + return true; + + if (!VT.isSimple()) + return false; + + if (Subtarget->isThumb1Only()) + return isLegalT1AddressImmediate(V, VT); + else if (Subtarget->isThumb2()) + return isLegalT2AddressImmediate(V, VT, Subtarget); + + // ARM mode. + if (V < 0) + V = - V; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i32: + // +- imm12 + return V == (V & ((1LL << 12) - 1)); + case MVT::i16: + // +- imm8 + return V == (V & ((1LL << 8) - 1)); + case MVT::f32: + case MVT::f64: + if (!Subtarget->hasVFP2()) // FIXME: NEON? + return false; + if ((V & 3) != 0) + return false; + V >>= 2; + return V == (V & ((1LL << 8) - 1)); + } +} + +bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, + EVT VT) const { + int Scale = AM.Scale; + if (Scale < 0) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + if (Scale == 1) + return true; + // r + r << imm + Scale = Scale & ~1; + return Scale == 2 || Scale == 4 || Scale == 8; + case MVT::i64: + // r + r + if (((unsigned)AM.HasBaseReg + Scale) <= 2) + return true; + return false; + case MVT::isVoid: + // Note, we allow "void" uses (basically, uses that aren't loads or + // stores), because arm allows folding a scale into many arithmetic + // operations. This should be made more precise and revisited later. 
+ + // Allow r << imm, but the imm has to be a multiple of two. + if (Scale & 1) return false; + return isPowerOf2_32(Scale); + } +} + +/// isLegalAddressingMode - Return true if the addressing mode represented +/// by AM is legal for this target, for a load/store of the specified type. +bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { + EVT VT = getValueType(DL, Ty, true); + if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) + return false; + + // Can never fold addr of global into load/store. + if (AM.BaseGV) + return false; + + switch (AM.Scale) { + case 0: // no scale reg, must be "r+i" or "r", or "i". + break; + case 1: + if (Subtarget->isThumb1Only()) + return false; + // FALL THROUGH. + default: + // ARM doesn't support any R+R*scale+imm addr modes. + if (AM.BaseOffs) + return false; + + if (!VT.isSimple()) + return false; + + if (Subtarget->isThumb2()) + return isLegalT2ScaledAddressingMode(AM, VT); + + int Scale = AM.Scale; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i32: + if (Scale < 0) Scale = -Scale; + if (Scale == 1) + return true; + // r + r << imm + return isPowerOf2_32(Scale & ~1); + case MVT::i16: + case MVT::i64: + // r + r + if (((unsigned)AM.HasBaseReg + Scale) <= 2) + return true; + return false; + + case MVT::isVoid: + // Note, we allow "void" uses (basically, uses that aren't loads or + // stores), because arm allows folding a scale into many arithmetic + // operations. This should be made more precise and revisited later. + + // Allow r << imm, but the imm has to be a multiple of two. + if (Scale & 1) return false; + return isPowerOf2_32(Scale); + } + } + return true; +} + +/// isLegalICmpImmediate - Return true if the specified immediate is legal +/// icmp immediate, that is the target has icmp instructions which can compare +/// a register against the immediate without having to materialize the +/// immediate into a register. +bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + // Thumb2 and ARM modes can use cmn for negative immediates. + if (!Subtarget->isThumb()) + return ARM_AM::getSOImmVal(std::abs(Imm)) != -1; + if (Subtarget->isThumb2()) + return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1; + // Thumb1 doesn't have cmn, and only 8-bit immediates. + return Imm >= 0 && Imm <= 255; +} + +/// isLegalAddImmediate - Return true if the specified immediate is a legal add +/// *or sub* immediate, that is the target has add or sub instructions which can +/// add a register with the immediate without having to materialize the +/// immediate into a register. +bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { + // Same encoding for add/sub, just flip the sign. + int64_t AbsImm = std::abs(Imm); + if (!Subtarget->isThumb()) + return ARM_AM::getSOImmVal(AbsImm) != -1; + if (Subtarget->isThumb2()) + return ARM_AM::getT2SOImmVal(AbsImm) != -1; + // Thumb1 only has 8-bit unsigned immediate. 
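+ // (e.g. "adds r0, #255" is encodable in Thumb1, "adds r0, #256" is not;
+ // illustrative.)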
+ return AbsImm >= 0 && AbsImm <= 255; +} + +static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, + bool isSEXTLoad, SDValue &Base, + SDValue &Offset, bool &isInc, + SelectionDAG &DAG) { + if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) + return false; + + if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { + // AddressingMode 3 + Base = Ptr->getOperand(0); + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC < 0 && RHSC > -256) { + assert(Ptr->getOpcode() == ISD::ADD); + isInc = false; + Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); + return true; + } + } + isInc = (Ptr->getOpcode() == ISD::ADD); + Offset = Ptr->getOperand(1); + return true; + } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { + // AddressingMode 2 + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC < 0 && RHSC > -0x1000) { + assert(Ptr->getOpcode() == ISD::ADD); + isInc = false; + Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); + Base = Ptr->getOperand(0); + return true; + } + } + + if (Ptr->getOpcode() == ISD::ADD) { + isInc = true; + ARM_AM::ShiftOpc ShOpcVal= + ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); + if (ShOpcVal != ARM_AM::no_shift) { + Base = Ptr->getOperand(1); + Offset = Ptr->getOperand(0); + } else { + Base = Ptr->getOperand(0); + Offset = Ptr->getOperand(1); + } + return true; + } + + isInc = (Ptr->getOpcode() == ISD::ADD); + Base = Ptr->getOperand(0); + Offset = Ptr->getOperand(1); + return true; + } + + // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. + return false; +} + +static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, + bool isSEXTLoad, SDValue &Base, + SDValue &Offset, bool &isInc, + SelectionDAG &DAG) { + if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) + return false; + + Base = Ptr->getOperand(0); + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC < 0 && RHSC > -0x100) { // 8 bits. + assert(Ptr->getOpcode() == ISD::ADD); + isInc = false; + Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); + return true; + } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. + isInc = Ptr->getOpcode() == ISD::ADD; + Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); + return true; + } + } + + return false; +} + +/// getPreIndexedAddressParts - returns true by value, base pointer and +/// offset pointer and addressing mode by reference if the node's address +/// can be legally represented as pre-indexed load / store address. 
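+/// For example, a store to address (add r, #4) may instead be emitted as
+/// the pre-indexed "str rX, [r, #4]!", which also updates the base register
+/// (illustrative).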
+bool +ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const { + if (Subtarget->isThumb1Only()) + return false; + + EVT VT; + SDValue Ptr; + bool isSEXTLoad = false; + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + Ptr = LD->getBasePtr(); + VT = LD->getMemoryVT(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + Ptr = ST->getBasePtr(); + VT = ST->getMemoryVT(); + } else + return false; + + bool isInc; + bool isLegal = false; + if (Subtarget->isThumb2()) + isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, + Offset, isInc, DAG); + else + isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, + Offset, isInc, DAG); + if (!isLegal) + return false; + + AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC; + return true; +} + +/// getPostIndexedAddressParts - returns true by value, base pointer and +/// offset pointer and addressing mode by reference if this node can be +/// combined with a load / store to form a post-indexed load / store. +bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, + SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const { + if (Subtarget->isThumb1Only()) + return false; + + EVT VT; + SDValue Ptr; + bool isSEXTLoad = false; + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + } else + return false; + + bool isInc; + bool isLegal = false; + if (Subtarget->isThumb2()) + isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + else + isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + if (!isLegal) + return false; + + if (Ptr != Base) { + // Swap base ptr and offset to catch more post-index load / store when + // it's legal. In Thumb2 mode, offset must be an immediate. + if (Ptr == Offset && Op->getOpcode() == ISD::ADD && + !Subtarget->isThumb2()) + std::swap(Base, Offset); + + // Post-indexed load / store update the base pointer. + if (Ptr != Base) + return false; + } + + AM = isInc ? ISD::POST_INC : ISD::POST_DEC; + return true; +} + +void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + unsigned BitWidth = KnownOne.getBitWidth(); + KnownZero = KnownOne = APInt(BitWidth, 0); + switch (Op.getOpcode()) { + default: break; + case ARMISD::ADDC: + case ARMISD::ADDE: + case ARMISD::SUBC: + case ARMISD::SUBE: + // These nodes' second result is a boolean + if (Op.getResNo() == 0) + break; + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + break; + case ARMISD::CMOV: { + // Bits are known zero/one if known on the LHS and RHS. 
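+ // (e.g. if both arms of the CMOV have their top 16 bits known zero, so
+ // does the result; illustrative.)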
+ DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + if (KnownZero == 0 && KnownOne == 0) return; + + APInt KnownZeroRHS, KnownOneRHS; + DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1); + KnownZero &= KnownZeroRHS; + KnownOne &= KnownOneRHS; + return; + } + case ISD::INTRINSIC_W_CHAIN: { + ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); + Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); + switch (IntID) { + default: return; + case Intrinsic::arm_ldaex: + case Intrinsic::arm_ldrex: { + EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); + unsigned MemBits = VT.getScalarType().getSizeInBits(); + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); + return; + } + } + } + } +} + +//===----------------------------------------------------------------------===// +// ARM Inline Assembly Support +//===----------------------------------------------------------------------===// + +bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { + // Looking for "rev" which is V6+. + if (!Subtarget->hasV6Ops()) + return false; + + InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); + std::string AsmStr = IA->getAsmString(); + SmallVector<StringRef, 4> AsmPieces; + SplitString(AsmStr, AsmPieces, ";\n"); + + switch (AsmPieces.size()) { + default: return false; + case 1: + AsmStr = AsmPieces[0]; + AsmPieces.clear(); + SplitString(AsmStr, AsmPieces, " \t,"); + + // rev $0, $1 + if (AsmPieces.size() == 3 && + AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && + IA->getConstraintString().compare(0, 4, "=l,l") == 0) { + IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (Ty && Ty->getBitWidth() == 32) + return IntrinsicLowering::LowerToByteSwap(CI); + } + break; + } + + return false; +} + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +ARMTargetLowering::ConstraintType +ARMTargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 'l': return C_RegisterClass; + case 'w': return C_RegisterClass; + case 'h': return C_RegisterClass; + case 'x': return C_RegisterClass; + case 't': return C_RegisterClass; + case 'j': return C_Other; // Constant for movw. + // An address with a single base register. Due to the way we + // currently handle addresses it is the same as an 'r' memory constraint. + case 'Q': return C_Memory; + } + } else if (Constraint.size() == 2) { + switch (Constraint[0]) { + default: break; + // All 'U+' constraints are addresses. + case 'U': return C_Memory; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +ARMTargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (!CallOperandVal) + return CW_Default; + Type *type = CallOperandVal->getType(); + // Look at the constraint type. 
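+ // (e.g. an integer operand under the 'l' constraint on a Thumb target is
+ // scored as a specific register class below; illustrative.)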
+ switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + case 'l': + if (type->isIntegerTy()) { + if (Subtarget->isThumb()) + weight = CW_SpecificReg; + else + weight = CW_Register; + } + break; + case 'w': + if (type->isFloatingPointTy()) + weight = CW_Register; + break; + } + return weight; +} + +typedef std::pair<unsigned, const TargetRegisterClass*> RCPair; +RCPair ARMTargetLowering::getRegForInlineAsmConstraint( + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { + if (Constraint.size() == 1) { + // GCC ARM Constraint Letters + switch (Constraint[0]) { + case 'l': // Low regs or general regs. + if (Subtarget->isThumb()) + return RCPair(0U, &ARM::tGPRRegClass); + return RCPair(0U, &ARM::GPRRegClass); + case 'h': // High regs or no regs. + if (Subtarget->isThumb()) + return RCPair(0U, &ARM::hGPRRegClass); + break; + case 'r': + if (Subtarget->isThumb1Only()) + return RCPair(0U, &ARM::tGPRRegClass); + return RCPair(0U, &ARM::GPRRegClass); + case 'w': + if (VT == MVT::Other) + break; + if (VT == MVT::f32) + return RCPair(0U, &ARM::SPRRegClass); + if (VT.getSizeInBits() == 64) + return RCPair(0U, &ARM::DPRRegClass); + if (VT.getSizeInBits() == 128) + return RCPair(0U, &ARM::QPRRegClass); + break; + case 'x': + if (VT == MVT::Other) + break; + if (VT == MVT::f32) + return RCPair(0U, &ARM::SPR_8RegClass); + if (VT.getSizeInBits() == 64) + return RCPair(0U, &ARM::DPR_8RegClass); + if (VT.getSizeInBits() == 128) + return RCPair(0U, &ARM::QPR_8RegClass); + break; + case 't': + if (VT == MVT::f32) + return RCPair(0U, &ARM::SPRRegClass); + break; + } + } + if (StringRef("{cc}").equals_lower(Constraint)) + return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); + + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} + +/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops +/// vector. If it is invalid, don't add anything to Ops. +void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue>&Ops, + SelectionDAG &DAG) const { + SDValue Result; + + // Currently only support length 1 constraints. + if (Constraint.length() != 1) return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: break; + case 'j': + case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); + if (!C) + return; + + int64_t CVal64 = C->getSExtValue(); + int CVal = (int) CVal64; + // None of these constraints allow values larger than 32 bits. Check + // that the value fits in an int. + if (CVal != CVal64) + return; + + switch (ConstraintLetter) { + case 'j': + // Constant suitable for movw, must be between 0 and + // 65535. + if (Subtarget->hasV6T2Ops()) + if (CVal >= 0 && CVal <= 65535) + break; + return; + case 'I': + if (Subtarget->isThumb1Only()) { + // This must be a constant between 0 and 255, for ADD + // immediates. + if (CVal >= 0 && CVal <= 255) + break; + } else if (Subtarget->isThumb2()) { + // A constant that can be used as an immediate value in a + // data-processing instruction. + if (ARM_AM::getT2SOImmVal(CVal) != -1) + break; + } else { + // A constant that can be used as an immediate value in a + // data-processing instruction. 
+ if (ARM_AM::getSOImmVal(CVal) != -1) + break; + } + return; + + case 'J': + if (Subtarget->isThumb()) { // FIXME thumb2 + // This must be a constant between -255 and -1, for negated ADD + // immediates. This can be used in GCC with an "n" modifier that + // prints the negated value, for use with SUB instructions. It is + // not useful otherwise but is implemented for compatibility. + if (CVal >= -255 && CVal <= -1) + break; + } else { + // This must be a constant between -4095 and 4095. It is not clear + // what this constraint is intended for. Implemented for + // compatibility with GCC. + if (CVal >= -4095 && CVal <= 4095) + break; + } + return; + + case 'K': + if (Subtarget->isThumb1Only()) { + // A 32-bit value where only one byte has a nonzero value. Exclude + // zero to match GCC. This constraint is used by GCC internally for + // constants that can be loaded with a move/shift combination. + // It is not useful otherwise but is implemented for compatibility. + if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) + break; + } else if (Subtarget->isThumb2()) { + // A constant whose bitwise inverse can be used as an immediate + // value in a data-processing instruction. This can be used in GCC + // with a "B" modifier that prints the inverted value, for use with + // BIC and MVN instructions. It is not useful otherwise but is + // implemented for compatibility. + if (ARM_AM::getT2SOImmVal(~CVal) != -1) + break; + } else { + // A constant whose bitwise inverse can be used as an immediate + // value in a data-processing instruction. This can be used in GCC + // with a "B" modifier that prints the inverted value, for use with + // BIC and MVN instructions. It is not useful otherwise but is + // implemented for compatibility. + if (ARM_AM::getSOImmVal(~CVal) != -1) + break; + } + return; + + case 'L': + if (Subtarget->isThumb1Only()) { + // This must be a constant between -7 and 6 inclusive, + // for 3-operand ADD/SUB immediate instructions. + if (CVal >= -7 && CVal < 7) + break; + } else if (Subtarget->isThumb2()) { + // A constant whose negation can be used as an immediate value in a + // data-processing instruction. This can be used in GCC with an "n" + // modifier that prints the negated value, for use with SUB + // instructions. It is not useful otherwise but is implemented for + // compatibility. + if (ARM_AM::getT2SOImmVal(-CVal) != -1) + break; + } else { + // A constant whose negation can be used as an immediate value in a + // data-processing instruction. This can be used in GCC with an "n" + // modifier that prints the negated value, for use with SUB + // instructions. It is not useful otherwise but is implemented for + // compatibility. + if (ARM_AM::getSOImmVal(-CVal) != -1) + break; + } + return; + + case 'M': + if (Subtarget->isThumb()) { // FIXME thumb2 + // This must be a multiple of 4 between 0 and 1020, for + // ADD sp + immediate. + if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) + break; + } else { + // A power of two or a constant between 0 and 32. This is used in + // GCC for the shift amount on shifted register operands, but it is + // useful in general for any shift amounts. + if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) + break; + } + return; + + case 'N': + if (Subtarget->isThumb()) { // FIXME thumb2 + // This must be a constant between 0 and 31, for shift amounts.
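+ // (The immediate shift field in these instructions is 5 bits wide, + // hence the 0-31 range.)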
+ if (CVal >= 0 && CVal <= 31) + break; + } + return; + + case 'O': + if (Subtarget->isThumb()) { // FIXME thumb2 + // This must be a multiple of 4 between -508 and 508, for + // ADD/SUB sp = sp + immediate. + if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) + break; + } + return; + } + Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); + break; + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +static RTLIB::Libcall getDivRemLibcall( + const SDNode *N, MVT::SimpleValueType SVT) { + assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || + N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && + "Unhandled Opcode in getDivRemLibcall"); + bool isSigned = N->getOpcode() == ISD::SDIVREM || + N->getOpcode() == ISD::SREM; + RTLIB::Libcall LC; + switch (SVT) { + default: llvm_unreachable("Unexpected request for libcall!"); + case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; + case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; + case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; + case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; + } + return LC; +} + +static TargetLowering::ArgListTy getDivRemArgList( + const SDNode *N, LLVMContext *Context) { + assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || + N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && + "Unhandled Opcode in getDivRemArgList"); + bool isSigned = N->getOpcode() == ISD::SDIVREM || + N->getOpcode() == ISD::SREM; + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + EVT ArgVT = N->getOperand(i).getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*Context); + Entry.Node = N->getOperand(i); + Entry.Ty = ArgTy; + Entry.isSExt = isSigned; + Entry.isZExt = !isSigned; + Args.push_back(Entry); + } + return Args; +} + +SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { + assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) && + "Register-based DivRem lowering only"); + unsigned Opcode = Op->getOpcode(); + assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && + "Invalid opcode for Div/Rem lowering"); + bool isSigned = (Opcode == ISD::SDIVREM); + EVT VT = Op->getValueType(0); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + + RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), + VT.getSimpleVT().SimpleTy); + SDValue InChain = DAG.getEntryNode(); + + TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), + DAG.getContext()); + + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy(DAG.getDataLayout())); + + Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr); + + SDLoc dl(Op); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(InChain) + .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0) + .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); + + std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); + return CallInfo.first; +} + +// Lowers REM using divmod helpers +// see RTABI section 4.2/4.3 +SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { + // Build return types (div and rem) + std::vector<Type*> RetTyParams; + Type *RetTyElement; + + switch (N->getValueType(0).getSimpleVT().SimpleTy) { + 
default: llvm_unreachable("Unexpected request for libcall!"); + case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; + case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; + case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; + case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; + } + + RetTyParams.push_back(RetTyElement); + RetTyParams.push_back(RetTyElement); + ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); + Type *RetTy = StructType::get(*DAG.getContext(), ret); + + RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). + SimpleTy); + SDValue InChain = DAG.getEntryNode(); + TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext()); + bool isSigned = N->getOpcode() == ISD::SREM; + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy(DAG.getDataLayout())); + + // Lower call + CallLoweringInfo CLI(DAG); + CLI.setChain(InChain) + .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args), 0) + .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + + // Return second (rem) result operand (first contains div) + SDNode *ResNode = CallResult.first.getNode(); + assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); + return ResNode->getOperand(1); +} + +SDValue +ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetWindows() && "unsupported target platform"); + SDLoc DL(Op); + + // Get the inputs. + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + + SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, + DAG.getConstant(2, DL, MVT::i32)); + + SDValue Flag; + Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); + Flag = Chain.getValue(1); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); + + SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); + Chain = NewSP.getValue(1); + + SDValue Ops[2] = { NewSP, Chain }; + return DAG.getMergeValues(Ops, DL); +} + +SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && + "Unexpected type for custom-lowering FP_EXTEND"); + + RTLIB::Libcall LC; + LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); + + SDValue SrcVal = Op.getOperand(0); + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; +} + +SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getOperand(0).getValueType() == MVT::f64 && + Subtarget->isFPOnlySP() && + "Unexpected type for custom-lowering FP_ROUND"); + + RTLIB::Libcall LC; + LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); + + SDValue SrcVal = Op.getOperand(0); + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; +} + +bool +ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + // The ARM target isn't yet aware of offsets. 
+ return false; +} + +bool ARM::isBitFieldInvertedMask(unsigned v) { + if (v == 0xffffffff) + return false; + + // There can be 1's on either or both "outsides"; all the "inside" + // bits must be 0's. + return isShiftedMask_32(~v); +} + +/// isFPImmLegal - Returns true if the target can instruction select the +/// specified FP immediate natively. If false, the legalizer will +/// materialize the FP immediate as a load from a constant pool. +bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + if (!Subtarget->hasVFP3()) + return false; + if (VT == MVT::f32) + return ARM_AM::getFP32Imm(Imm) != -1; + if (VT == MVT::f64 && !Subtarget->isFPOnlySP()) + return ARM_AM::getFP64Imm(Imm) != -1; + return false; +} + +/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as +/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment +/// specified in the intrinsic calls. +bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + switch (Intrinsic) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); + Info.vol = false; // volatile loads with NEON intrinsics not supported + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored.
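+ // For example, a vst3 of three <4 x i32> vectors accumulates NumElts = 6 + // below, so the access is modelled as a v6i64 (48-byte) store.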
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); + Info.vol = false; // volatile stores with NEON intrinsics not supported + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::arm_ldaex: + case Intrinsic::arm_ldrex: { + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); + PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::arm_stlex: + case Intrinsic::arm_strex: { + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); + PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(1); + Info.offset = 0; + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::arm_stlexd: + case Intrinsic::arm_strexd: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i64; + Info.ptrVal = I.getArgOperand(2); + Info.offset = 0; + Info.align = 8; + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::arm_ldaexd: + case Intrinsic::arm_ldrexd: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i64; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = 8; + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + default: + break; + } + + return false; +} + +/// \brief Returns true if it is beneficial to convert a load of a constant +/// to just the constant itself. +bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned Bits = Ty->getPrimitiveSizeInBits(); + if (Bits == 0 || Bits > 32) + return false; + return true; +} + +Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, + ARM_MB::MemBOpt Domain) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + + // First, if the target has no DMB, see what fallback we can use. + if (!Subtarget->hasDataBarrier()) { + // Some ARMv6 cpus can support data barriers with an mcr instruction. + // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get + // here. + if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { + Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); + Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), + Builder.getInt32(0), Builder.getInt32(7), + Builder.getInt32(10), Builder.getInt32(5)}; + return Builder.CreateCall(MCR, args); + } else { + // Instead of using barriers, atomic accesses on these subtargets use + // libcalls. 
+ llvm_unreachable("makeDMB on a target so old that it has no barriers"); + } + } else { + Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); + // Only a full system barrier exists in the M-class architectures. + Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; + Constant *CDomain = Builder.getInt32(Domain); + return Builder.CreateCall(DMB, CDomain); + } +} + +// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html +Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, + AtomicOrdering Ord, bool IsStore, + bool IsLoad) const { + if (!getInsertFencesForAtomic()) + return nullptr; + + switch (Ord) { + case NotAtomic: + case Unordered: + llvm_unreachable("Invalid fence: unordered/non-atomic"); + case Monotonic: + case Acquire: + return nullptr; // Nothing to do + case SequentiallyConsistent: + if (!IsStore) + return nullptr; // Nothing to do + /*FALLTHROUGH*/ + case Release: + case AcquireRelease: + if (Subtarget->isSwift()) + return makeDMB(Builder, ARM_MB::ISHST); + // FIXME: add a comment with a link to documentation justifying this. + else + return makeDMB(Builder, ARM_MB::ISH); + } + llvm_unreachable("Unknown fence ordering in emitLeadingFence"); +} + +Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, + AtomicOrdering Ord, bool IsStore, + bool IsLoad) const { + if (!getInsertFencesForAtomic()) + return nullptr; + + switch (Ord) { + case NotAtomic: + case Unordered: + llvm_unreachable("Invalid fence: unordered/not-atomic"); + case Monotonic: + case Release: + return nullptr; // Nothing to do + case Acquire: + case AcquireRelease: + case SequentiallyConsistent: + return makeDMB(Builder, ARM_MB::ISH); + } + llvm_unreachable("Unknown fence ordering in emitTrailingFence"); +} + +// Loads and stores less than 64-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit +// anything for those. +bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); + return (Size == 64) && !Subtarget->isMClass(); +} + +// Loads and stores less than 64-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit +// anything for those. +// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that +// guarantee, see DDI0406C ARM architecture reference manual, +// sections A8.8.72-74 LDRD) +TargetLowering::AtomicExpansionKind +ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + unsigned Size = LI->getType()->getPrimitiveSizeInBits(); + return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly + : AtomicExpansionKind::None; +} + +// For the real atomic operations, we have ldrex/strex up to 32 bits, +// and up to 64 bits on the non-M profiles +TargetLowering::AtomicExpansionKind +ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + unsigned Size = AI->getType()->getPrimitiveSizeInBits(); + return (Size <= (Subtarget->isMClass() ? 32U : 64U)) + ? AtomicExpansionKind::LLSC + : AtomicExpansionKind::None; +} + +bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR( + AtomicCmpXchgInst *AI) const { + return true; +} + +// This has so far only been implemented for MachO. 
+bool ARMTargetLowering::useLoadStackGuardNode() const { + return Subtarget->isTargetMachO(); +} + +bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, + unsigned &Cost) const { + // If we do not have NEON, vector types are not natively supported. + if (!Subtarget->hasNEON()) + return false; + + // Floating point values and vector values map to the same register file. + // Therefore, although we could do a store extract of a vector type, this is + // better to leave at float as we have more freedom in the addressing mode for + // those. + if (VectorTy->isFPOrFPVectorTy()) + return false; + + // If the index is unknown at compile time, this is very expensive to lower + // and it is not possible to combine the store with the extract. + if (!isa<ConstantInt>(Idx)) + return false; + + assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); + unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth(); + // We can do a store + vector extract on any vector that fits perfectly in a D + // or Q register. + if (BitWidth == 64 || BitWidth == 128) { + Cost = 0; + return true; + } + return false; +} + +bool ARMTargetLowering::isCheapToSpeculateCttz() const { + return Subtarget->hasV6T2Ops(); +} + +bool ARMTargetLowering::isCheapToSpeculateCtlz() const { + return Subtarget->hasV6T2Ops(); +} + +Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); + bool IsAcquire = isAtLeastAcquire(Ord); + + // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd + // intrinsic must return {i32, i32} and we have to recombine them into a + // single i64 here. + if (ValTy->getPrimitiveSizeInBits() == 64) { + Intrinsic::ID Int = + IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; + Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int); + + Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); + Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); + + Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); + Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); + if (!Subtarget->isLittle()) + std::swap (Lo, Hi); + Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); + Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); + return Builder.CreateOr( + Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64"); + } + + Type *Tys[] = { Addr->getType() }; + Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; + Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateTruncOrBitCast( + Builder.CreateCall(Ldrex, Addr), + cast<PointerType>(Addr->getType())->getElementType()); +} + +void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( + IRBuilder<> &Builder) const { + if (!Subtarget->hasV7Ops()) + return; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); +} + +Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, + AtomicOrdering Ord) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + bool IsRelease = isAtLeastRelease(Ord); + + // Since the intrinsics must have legal type, the i64 intrinsics take two + // parameters: "i32, i32". We must marshal Val into the appropriate form + // before the call. 
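+ // For example, an i64 store-conditional below becomes: truncate Val to a + // lo i32 half, shift and truncate to a hi half (swapped on big-endian), + // then call the strexd/stlexd intrinsic as (lo, hi, addr).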
+ if (Val->getType()->getPrimitiveSizeInBits() == 64) { + Intrinsic::ID Int = + IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; + Function *Strex = Intrinsic::getDeclaration(M, Int); + Type *Int32Ty = Type::getInt32Ty(M->getContext()); + + Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); + Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); + if (!Subtarget->isLittle()) + std::swap (Lo, Hi); + Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); + return Builder.CreateCall(Strex, {Lo, Hi, Addr}); + } + + Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; + Type *Tys[] = { Addr->getType() }; + Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateCall( + Strex, {Builder.CreateZExtOrBitCast( + Val, Strex->getFunctionType()->getParamType(0)), + Addr}); +} + +/// \brief Lower an interleaved load into a vldN intrinsic. +/// +/// E.g. Lower an interleaved load (Factor = 2): +/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 +/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements +/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements +/// +/// Into: +/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) +/// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %vld2, i32 0 +/// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %vld2, i32 1 +bool ARMTargetLowering::lowerInterleavedLoad( + LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + ArrayRef<unsigned> Indices, unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + assert(!Shuffles.empty() && "Empty shufflevector input"); + assert(Shuffles.size() == Indices.size() && + "Unmatched number of shufflevectors and indices"); + + VectorType *VecTy = Shuffles[0]->getType(); + Type *EltTy = VecTy->getVectorElementType(); + + const DataLayout &DL = LI->getModule()->getDataLayout(); + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; + + // Skip if we do not have NEON and skip illegal vector types and vector types + // with i64/f64 elements (vldN doesn't support i64/f64 elements). + if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits) + return false; + + // A pointer vector cannot be the return type of the ldN intrinsics. Need to + // load integer vectors first and then convert to pointer vectors. + if (EltTy->isPointerTy()) + VecTy = + VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + + static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, + Intrinsic::arm_neon_vld3, + Intrinsic::arm_neon_vld4}; + + IRBuilder<> Builder(LI); + SmallVector<Value *, 2> Ops; + + Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); + Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); + Ops.push_back(Builder.getInt32(LI->getAlignment())); + + Type *Tys[] = { VecTy, Int8Ptr }; + Function *VldnFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); + CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); + + // Replace uses of each shufflevector with the corresponding vector loaded + // by ldN. + for (unsigned i = 0; i < Shuffles.size(); i++) { + ShuffleVectorInst *SV = Shuffles[i]; + unsigned Index = Indices[i]; + + Value *SubVec = Builder.CreateExtractValue(VldN, Index); + + // Convert the integer vector to pointer vector if the element is pointer.
+ if (EltTy->isPointerTy()) + SubVec = Builder.CreateIntToPtr(SubVec, SV->getType()); + + SV->replaceAllUsesWith(SubVec); + } + + return true; +} + +/// \brief Get a mask consisting of sequential integers starting from \p Start. +/// +/// I.e. <Start, Start + 1, ..., Start + NumElts - 1> +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, + unsigned NumElts) { + SmallVector<Constant *, 16> Mask; + for (unsigned i = 0; i < NumElts; i++) + Mask.push_back(Builder.getInt32(Start + i)); + + return ConstantVector::get(Mask); +} + +/// \brief Lower an interleaved store into a vstN intrinsic. +/// +/// E.g. Lower an interleaved store (Factor = 3): +/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, +/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4 +/// +/// Into: +/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> +/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> +/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> +/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) +/// +/// Note that the new shufflevectors will be removed and we'll only generate one +/// vst3 instruction in CodeGen. +bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, + ShuffleVectorInst *SVI, + unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + + VectorType *VecTy = SVI->getType(); + assert(VecTy->getVectorNumElements() % Factor == 0 && + "Invalid interleaved store"); + + unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; + Type *EltTy = VecTy->getVectorElementType(); + VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); + + const DataLayout &DL = SI->getModule()->getDataLayout(); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; + + // Skip if we do not have NEON and skip illegal vector types and vector types + // with i64/f64 elements (vstN doesn't support i64/f64 elements). + if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) || + EltIs64Bits) + return false; + + Value *Op0 = SVI->getOperand(0); + Value *Op1 = SVI->getOperand(1); + IRBuilder<> Builder(SI); + + // StN intrinsics don't support pointer vectors as arguments. Convert pointer + // vectors to integer vectors. + if (EltTy->isPointerTy()) { + Type *IntTy = DL.getIntPtrType(EltTy); + + // Convert to the corresponding integer vector. + Type *IntVecTy = + VectorType::get(IntTy, Op0->getType()->getVectorNumElements()); + Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); + Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); + + SubVecTy = VectorType::get(IntTy, NumSubElts); + } + + static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, + Intrinsic::arm_neon_vst3, + Intrinsic::arm_neon_vst4}; + SmallVector<Value *, 6> Ops; + + Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); + Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); + + Type *Tys[] = { Int8Ptr, SubVecTy }; + Function *VstNFunc = Intrinsic::getDeclaration( + SI->getModule(), StoreInts[Factor - 2], Tys); + + // Split the shufflevector operands into sub vectors for the new vstN call. 
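+ // For the Factor = 3 example above this emits the masks <0,1,2,3>, + // <4,5,6,7> and <8,9,10,11>, one sub vector per vstN operand.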
+ for (unsigned i = 0; i < Factor; i++) + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); + + Ops.push_back(Builder.getInt32(SI->getAlignment())); + Builder.CreateCall(VstNFunc, Ops); + return true; +} + +enum HABaseType { + HA_UNKNOWN = 0, + HA_FLOAT, + HA_DOUBLE, + HA_VECT64, + HA_VECT128 +}; + +static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, + uint64_t &Members) { + if (auto *ST = dyn_cast<StructType>(Ty)) { + for (unsigned i = 0; i < ST->getNumElements(); ++i) { + uint64_t SubMembers = 0; + if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) + return false; + Members += SubMembers; + } + } else if (auto *AT = dyn_cast<ArrayType>(Ty)) { + uint64_t SubMembers = 0; + if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) + return false; + Members += SubMembers * AT->getNumElements(); + } else if (Ty->isFloatTy()) { + if (Base != HA_UNKNOWN && Base != HA_FLOAT) + return false; + Members = 1; + Base = HA_FLOAT; + } else if (Ty->isDoubleTy()) { + if (Base != HA_UNKNOWN && Base != HA_DOUBLE) + return false; + Members = 1; + Base = HA_DOUBLE; + } else if (auto *VT = dyn_cast<VectorType>(Ty)) { + Members = 1; + switch (Base) { + case HA_FLOAT: + case HA_DOUBLE: + return false; + case HA_VECT64: + return VT->getBitWidth() == 64; + case HA_VECT128: + return VT->getBitWidth() == 128; + case HA_UNKNOWN: + switch (VT->getBitWidth()) { + case 64: + Base = HA_VECT64; + return true; + case 128: + Base = HA_VECT128; + return true; + default: + return false; + } + } + } + + return (Members > 0 && Members <= 4); +} + +/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of +/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when +/// passing according to AAPCS rules. +bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { + if (getEffectiveCallingConv(CallConv, isVarArg) != + CallingConv::ARM_AAPCS_VFP) + return false; + + HABaseType Base = HA_UNKNOWN; + uint64_t Members = 0; + bool IsHA = isHomogeneousAggregate(Ty, Base, Members); + DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); + + bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); + return IsHA || IsIntArray; +} + +unsigned ARMTargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + // Platforms which do not use SjLj EH may return values in these registers + // via the personality function. + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; +} + +unsigned ARMTargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + // Platforms which do not use SjLj EH may return values in these registers + // via the personality function. + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h new file mode 100644 index 0000000..b764624 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -0,0 +1,677 @@ +//===-- ARMISelLowering.h - ARM DAG Lowering Interface ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that ARM uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H +#define LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H + +#include "MCTargetDesc/ARMBaseInfo.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/Target/TargetLowering.h" +#include <vector> + +namespace llvm { + class ARMConstantPoolValue; + class ARMSubtarget; + + namespace ARMISD { + // ARM Specific DAG Nodes + enum NodeType : unsigned { + // Start the numbering where the builtin ops and target ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + Wrapper, // Wrapper - A wrapper node for TargetConstantPool, + // TargetExternalSymbol, and TargetGlobalAddress. + WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in + // PIC mode. + WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable + + // Add pseudo op to model memcpy for struct byval. + COPY_STRUCT_BYVAL, + + CALL, // Function call. + CALL_PRED, // Function call that's predicable. + CALL_NOLINK, // Function call with branch not branch-and-link. + tCALL, // Thumb function call. + BRCOND, // Conditional branch. + BR_JT, // Jumptable branch. + BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). + RET_FLAG, // Return with a flag operand. + INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand. + + PIC_ADD, // Add with a PC operand and a PIC label. + + CMP, // ARM compare instructions. + CMN, // ARM CMN instructions. + CMPZ, // ARM compare that sets only Z flag. + CMPFP, // ARM VFP compare instruction, sets FPSCR. + CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR. + FMSTAT, // ARM fmstat instruction. + + CMOV, // ARM conditional move instructions. + + BCC_i64, + + SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. + SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out. + RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag. + + ADDC, // Add with carry + ADDE, // Add using carry + SUBC, // Sub with carry + SUBE, // Sub using carry + + VMOVRRD, // double to two gprs. + VMOVDRR, // Two gprs to double. + + EH_SJLJ_SETJMP, // SjLj exception handling setjmp. + EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch. + + TC_RETURN, // Tail call return pseudo. + + THREAD_POINTER, + + DYN_ALLOC, // Dynamic allocation on the stack. + + MEMBARRIER_MCR, // Memory barrier (MCR) + + PRELOAD, // Preload + + WIN__CHKSTK, // Windows' __chkstk call to do stack probing. + WIN__DBZCHK, // Windows' divide by zero check + + VCEQ, // Vector compare equal. + VCEQZ, // Vector compare equal to zero. + VCGE, // Vector compare greater than or equal. + VCGEZ, // Vector compare greater than or equal to zero. + VCLEZ, // Vector compare less than or equal to zero. + VCGEU, // Vector compare unsigned greater than or equal. + VCGT, // Vector compare greater than. + VCGTZ, // Vector compare greater than zero. + VCLTZ, // Vector compare less than zero. + VCGTU, // Vector compare unsigned greater than. + VTST, // Vector test bits. 
+ + // Vector shift by immediate: + VSHL, // ...left + VSHRs, // ...right (signed) + VSHRu, // ...right (unsigned) + + // Vector rounding shift by immediate: + VRSHRs, // ...right (signed) + VRSHRu, // ...right (unsigned) + VRSHRN, // ...right narrow + + // Vector saturating shift by immediate: + VQSHLs, // ...left (signed) + VQSHLu, // ...left (unsigned) + VQSHLsu, // ...left (signed to unsigned) + VQSHRNs, // ...right narrow (signed) + VQSHRNu, // ...right narrow (unsigned) + VQSHRNsu, // ...right narrow (signed to unsigned) + + // Vector saturating rounding shift by immediate: + VQRSHRNs, // ...right narrow (signed) + VQRSHRNu, // ...right narrow (unsigned) + VQRSHRNsu, // ...right narrow (signed to unsigned) + + // Vector shift and insert: + VSLI, // ...left + VSRI, // ...right + + // Vector get lane (VMOV scalar to ARM core register) + // (These are used for 8- and 16-bit element types only.) + VGETLANEu, // zero-extend vector extract element + VGETLANEs, // sign-extend vector extract element + + // Vector move immediate and move negated immediate: + VMOVIMM, + VMVNIMM, + + // Vector move f32 immediate: + VMOVFPIMM, + + // Vector duplicate: + VDUP, + VDUPLANE, + + // Vector shuffles: + VEXT, // extract + VREV64, // reverse elements within 64-bit doublewords + VREV32, // reverse elements within 32-bit words + VREV16, // reverse elements within 16-bit halfwords + VZIP, // zip (interleave) + VUZP, // unzip (deinterleave) + VTRN, // transpose + VTBL1, // 1-register shuffle with mask + VTBL2, // 2-register shuffle with mask + + // Vector multiply long: + VMULLs, // ...signed + VMULLu, // ...unsigned + + UMLAL, // 64bit Unsigned Accumulate Multiply + SMLAL, // 64bit Signed Accumulate Multiply + + // Operands of the standard BUILD_VECTOR node are not legalized, which + // is fine if BUILD_VECTORs are always lowered to shuffles or other + // operations, but for ARM some BUILD_VECTORs are legal as-is and their + // operands need to be legalized. Define an ARM-specific version of + // BUILD_VECTOR for this purpose. + BUILD_VECTOR, + + // Bit-field insert + BFI, + + // Vector OR with immediate + VORRIMM, + // Vector AND with NOT of immediate + VBICIMM, + + // Vector bitwise select + VBSL, + + // Pseudo-instruction representing a memory copy using ldm/stm + // instructions. + MEMCPY, + + // Vector load N-element structure to all lanes: + VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, + VLD3DUP, + VLD4DUP, + + // NEON loads with post-increment base updates: + VLD1_UPD, + VLD2_UPD, + VLD3_UPD, + VLD4_UPD, + VLD2LN_UPD, + VLD3LN_UPD, + VLD4LN_UPD, + VLD2DUP_UPD, + VLD3DUP_UPD, + VLD4DUP_UPD, + + // NEON stores with post-increment base updates: + VST1_UPD, + VST2_UPD, + VST3_UPD, + VST4_UPD, + VST2LN_UPD, + VST3LN_UPD, + VST4LN_UPD + }; + } + + /// Define some predicates that are used for node matching. + namespace ARM { + bool isBitFieldInvertedMask(unsigned v); + } + + //===--------------------------------------------------------------------===// + // ARMTargetLowering - ARM Implementation of the TargetLowering interface + + class ARMTargetLowering : public TargetLowering { + public: + explicit ARMTargetLowering(const TargetMachine &TM, + const ARMSubtarget &STI); + + unsigned getJumpTableEncoding() const override; + bool useSoftFloat() const override; + + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + + /// ReplaceNodeResults - Replace the results of node with an illegal result + /// type with new values built out of custom code. 
+ /// + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const override; + + const char *getTargetNodeName(unsigned Opcode) const override; + + bool isSelectSupported(SelectSupportKind Kind) const override { + // ARM does not support scalar condition selects on vectors. + return (Kind != ScalarCondVectorVal); + } + + /// getSetCCResultType - Return the value type to use for ISD::SETCC. + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const override; + + void AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const override; + + SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const; + SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + + bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override; + + /// allowsMisalignedMemoryAccesses - Returns true if the target allows + /// unaligned memory accesses of the specified type. Returns whether it + /// is "fast" by reference in the second argument. + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, + unsigned Align, + bool *Fast) const override; + + EVT getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const override; + + using TargetLowering::isZExtFree; + bool isZExtFree(SDValue Val, EVT VT2) const override; + + bool isVectorLoadExtDesirable(SDValue ExtVal) const override; + + bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; + + + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, + Type *Ty, unsigned AS) const override; + bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const; + + /// isLegalICmpImmediate - Return true if the specified immediate is legal + /// icmp immediate, that is the target has icmp instructions which can + /// compare a register against the immediate without having to materialize + /// the immediate into a register. + bool isLegalICmpImmediate(int64_t Imm) const override; + + /// isLegalAddImmediate - Return true if the specified immediate is legal + /// add immediate, that is the target has add instructions which can + /// add a register and the immediate without having to materialize + /// the immediate into a register. + bool isLegalAddImmediate(int64_t Imm) const override; + + /// getPreIndexedAddressParts - returns true by value, base pointer and + /// offset pointer and addressing mode by reference if the node's address + /// can be legally represented as pre-indexed load / store address. + bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + + /// getPostIndexedAddressParts - returns true by value, base pointer and + /// offset pointer and addressing mode by reference if this node can be + /// combined with a load / store to form a post-indexed load / store. 
+ bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, + SDValue &Offset, ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + + void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const override; + + + bool ExpandInlineAsm(CallInst *CI) const override; + + ConstraintType getConstraintType(StringRef Constraint) const override; + + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const override; + + std::pair<unsigned, const TargetRegisterClass *> + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, MVT VT) const override; + + /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops + /// vector. If it is invalid, don't add anything to Ops. If hasMemory is + /// true it means one of the asm constraint of the inline asm instruction + /// being processed is 'm'. + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const override; + + unsigned + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { + if (ConstraintCode == "Q") + return InlineAsm::Constraint_Q; + else if (ConstraintCode == "o") + return InlineAsm::Constraint_o; + else if (ConstraintCode.size() == 2) { + if (ConstraintCode[0] == 'U') { + switch(ConstraintCode[1]) { + default: + break; + case 'm': + return InlineAsm::Constraint_Um; + case 'n': + return InlineAsm::Constraint_Un; + case 'q': + return InlineAsm::Constraint_Uq; + case 's': + return InlineAsm::Constraint_Us; + case 't': + return InlineAsm::Constraint_Ut; + case 'v': + return InlineAsm::Constraint_Uv; + case 'y': + return InlineAsm::Constraint_Uy; + } + } + } + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); + } + + const ARMSubtarget* getSubtarget() const { + return Subtarget; + } + + /// getRegClassFor - Return the register class that should be used for the + /// specified value type. + const TargetRegisterClass *getRegClassFor(MVT VT) const override; + + /// Returns true if a cast between SrcAS and DestAS is a noop. + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + // Addrspacecasts are always noops. + return true; + } + + bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, + unsigned &PrefAlign) const override; + + /// createFastISel - This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const override; + + Sched::Preference getSchedulingPreference(SDNode *N) const override; + + bool + isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + + /// isFPImmLegal - Returns true if the target can instruction select the + /// specified FP immediate natively. If false, the legalizer will + /// materialize the FP immediate as a load from a constant pool. 
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + + bool getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const override; + + /// \brief Returns true if it is beneficial to convert a load of a constant + /// to just the constant itself. + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; + + /// \brief Returns true if an argument of type Ty needs to be passed in a + /// contiguous block of registers in calling convention CallConv. + bool functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; + + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + + Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const; + Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const override; + Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, AtomicOrdering Ord) const override; + + void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; + + Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, + bool IsStore, bool IsLoad) const override; + Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, + bool IsStore, bool IsLoad) const override; + + unsigned getMaxSupportedInterleaveFactor() const override { return 4; } + + bool lowerInterleavedLoad(LoadInst *LI, + ArrayRef<ShuffleVectorInst *> Shuffles, + ArrayRef<unsigned> Indices, + unsigned Factor) const override; + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + unsigned Factor) const override; + + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; + + bool useLoadStackGuardNode() const override; + + bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, + unsigned &Cost) const override; + + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + + protected: + std::pair<const TargetRegisterClass *, uint8_t> + findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const override; + + private: + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. + const ARMSubtarget *Subtarget; + + const TargetRegisterInfo *RegInfo; + + const InstrItineraryData *Itins; + + /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created. 
+ /// + unsigned ARMPCLabelIndex; + + void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT); + void addDRTypeForNEON(MVT VT); + void addQRTypeForNEON(MVT VT); + std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const; + + typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector; + void PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, + SDValue Chain, SDValue &Arg, + RegsToPassVector &RegsToPass, + CCValAssign &VA, CCValAssign &NextVA, + SDValue &StackPtr, + SmallVectorImpl<SDValue> &MemOpChains, + ISD::ArgFlagsTy Flags) const; + SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, + SDValue &Root, SelectionDAG &DAG, + SDLoc dl) const; + + CallingConv::ID getEffectiveCallingConv(CallingConv::ID CC, + bool isVarArg) const; + CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, + bool isVarArg) const; + SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, + SDLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const; + SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, + SelectionDAG &DAG) const; + SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, + SelectionDAG &DAG, + TLSModel::Model model) const; + SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const; + SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; + void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, + SmallVectorImpl<SDValue> &Results) const; + SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, + SDValue &Chain) const; + SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue 
LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; + + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster + /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be + /// expanded to FMAs when this method returns true, otherwise fmuladd is + /// expanded to fmul + fadd. + /// + /// ARM supports both fused and unfused multiply-add operations; we already + /// lower a pair of fmul and fadd to the latter so it's not clear that there + /// would be a gain or that the gain would be worthwhile enough to risk + /// correctness bugs. + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; } + + SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals, + bool isThisReturn, SDValue ThisVal) const; + + SDValue + LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + + int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, + SDLoc dl, SDValue &Chain, + const Value *OrigArg, + unsigned InRegsParamRecordIdx, + int ArgOffset, + unsigned ArgSize) const; + + void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, + SDLoc dl, SDValue &Chain, + unsigned ArgOffset, + unsigned TotalArgRegsSaveSize, + bool ForceMutable = false) const; + + SDValue + LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + + /// HandleByVal - Target-specific cleanup for ByVal support. + void HandleByVal(CCState *, unsigned &, unsigned) const override; + + /// IsEligibleForTailCallOptimization - Check whether the call is eligible + /// for tail call optimization. Targets which want to do tail call + /// optimization should implement this function. 
+ bool IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const; + + bool CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; + + SDValue + LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc dl, SelectionDAG &DAG) const override; + + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; + + bool mayBeEmittedAsTailCall(CallInst *CI) const override; + + SDValue getCMOV(SDLoc dl, EVT VT, SDValue FalseVal, SDValue TrueVal, + SDValue ARMcc, SDValue CCR, SDValue Cmp, + SelectionDAG &DAG) const; + SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDValue &ARMcc, SelectionDAG &DAG, SDLoc dl) const; + SDValue getVFPCmp(SDValue LHS, SDValue RHS, + SelectionDAG &DAG, SDLoc dl) const; + SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const; + + SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const; + + void SetupEntryBlockForSjLj(MachineInstr *MI, + MachineBasicBlock *MBB, + MachineBasicBlock *DispatchBB, int FI) const; + + void EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const; + + bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitStructByval(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + MachineBasicBlock *EmitLowered__chkstk(MachineInstr *MI, + MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitLowered__dbzchk(MachineInstr *MI, + MachineBasicBlock *MBB) const; + }; + + enum NEONModImmType { + VMOVModImm, + VMVNModImm, + OtherModImm + }; + + namespace ARM { + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); + } +} + +#endif // ARMISELLOWERING_H diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td new file mode 100644 index 0000000..e79608d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -0,0 +1,2372 @@ +//===-- ARMInstrFormats.td - ARM Instruction Formats -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// ARM Instruction Format Definitions. +// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. 
+class Format<bits<6> val> {
+  bits<6> Value = val;
+}
+
+def Pseudo : Format<0>;
+def MulFrm : Format<1>;
+def BrFrm : Format<2>;
+def BrMiscFrm : Format<3>;
+
+def DPFrm : Format<4>;
+def DPSoRegRegFrm : Format<5>;
+
+def LdFrm : Format<6>;
+def StFrm : Format<7>;
+def LdMiscFrm : Format<8>;
+def StMiscFrm : Format<9>;
+def LdStMulFrm : Format<10>;
+
+def LdStExFrm : Format<11>;
+
+def ArithMiscFrm : Format<12>;
+def SatFrm : Format<13>;
+def ExtFrm : Format<14>;
+
+def VFPUnaryFrm : Format<15>;
+def VFPBinaryFrm : Format<16>;
+def VFPConv1Frm : Format<17>;
+def VFPConv2Frm : Format<18>;
+def VFPConv3Frm : Format<19>;
+def VFPConv4Frm : Format<20>;
+def VFPConv5Frm : Format<21>;
+def VFPLdStFrm : Format<22>;
+def VFPLdStMulFrm : Format<23>;
+def VFPMiscFrm : Format<24>;
+
+def ThumbFrm : Format<25>;
+def MiscFrm : Format<26>;
+
+def NGetLnFrm : Format<27>;
+def NSetLnFrm : Format<28>;
+def NDupFrm : Format<29>;
+def NLdStFrm : Format<30>;
+def N1RegModImmFrm: Format<31>;
+def N2RegFrm : Format<32>;
+def NVCVTFrm : Format<33>;
+def NVDupLnFrm : Format<34>;
+def N2RegVShLFrm : Format<35>;
+def N2RegVShRFrm : Format<36>;
+def N3RegFrm : Format<37>;
+def N3RegVShFrm : Format<38>;
+def NVExtFrm : Format<39>;
+def NVMulSLFrm : Format<40>;
+def NVTBLFrm : Format<41>;
+def DPSoRegImmFrm : Format<42>;
+
+// Misc flags.
+
+// UnaryDP - Indicates this is a unary data processing instruction, i.e.
+// it doesn't have an Rn operand.
+class UnaryDP { bit isUnaryDataProc = 1; }
+
+// Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+// a 16-bit Thumb instruction if certain conditions are met.
+class Xform16Bit { bit canXformTo16Bit = 1; }
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction flags. These need to match ARMBaseInstrInfo.h.
+//
+
+// FIXME: Once the JIT is MC-ized, these can go away.
+// Addressing mode.
+class AddrMode<bits<5> val> {
+  bits<5> Value = val;
+}
+def AddrModeNone : AddrMode<0>;
+def AddrMode1 : AddrMode<1>;
+def AddrMode2 : AddrMode<2>;
+def AddrMode3 : AddrMode<3>;
+def AddrMode4 : AddrMode<4>;
+def AddrMode5 : AddrMode<5>;
+def AddrMode6 : AddrMode<6>;
+def AddrModeT1_1 : AddrMode<7>;
+def AddrModeT1_2 : AddrMode<8>;
+def AddrModeT1_4 : AddrMode<9>;
+def AddrModeT1_s : AddrMode<10>;
+def AddrModeT2_i12 : AddrMode<11>;
+def AddrModeT2_i8 : AddrMode<12>;
+def AddrModeT2_so : AddrMode<13>;
+def AddrModeT2_pc : AddrMode<14>;
+def AddrModeT2_i8s4 : AddrMode<15>;
+def AddrMode_i12 : AddrMode<16>;
+
+// Load / store index mode.
+class IndexMode<bits<2> val> {
+  bits<2> Value = val;
+}
+def IndexModeNone : IndexMode<0>;
+def IndexModePre : IndexMode<1>;
+def IndexModePost : IndexMode<2>;
+def IndexModeUpd : IndexMode<3>;
+
+// Instruction execution domain.
+class Domain<bits<3> val> {
+  bits<3> Value = val;
+}
+def GenericDomain : Domain<0>;
+def VFPDomain : Domain<1>; // Instructions in VFP domain only
+def NeonDomain : Domain<2>; // Instructions in Neon domain only
+def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains
+def VFPNeonA8Domain : Domain<5>; // Instructions in VFP & Neon under A8
+
+//===----------------------------------------------------------------------===//
+// ARM special operands.
+//
+
+// ARM imod and iflag operands, used only by the CPS instruction.
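+// (Illustrative: in assembly these surface as, e.g., "cpsie if" or
+// "cpsid aif, #17"; the enable/disable effect is the imod field, and the
+// a/i/f letters are the iflags rendered by the printers below.)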
+def imod_op : Operand<i32> { + let PrintMethod = "printCPSIMod"; +} + +def ProcIFlagsOperand : AsmOperandClass { + let Name = "ProcIFlags"; + let ParserMethod = "parseProcIFlagsOperand"; +} +def iflags_op : Operand<i32> { + let PrintMethod = "printCPSIFlag"; + let ParserMatchClass = ProcIFlagsOperand; +} + +// ARM Predicate operand. Default to 14 = always (AL). Second part is CC +// register whose default is 0 (no register). +def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; } +def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm), + (ops (i32 14), (i32 zero_reg))> { + let PrintMethod = "printPredicateOperand"; + let ParserMatchClass = CondCodeOperand; + let DecoderMethod = "DecodePredicateOperand"; +} + +// Selectable predicate operand for CMOV instructions. We can't use a normal +// predicate because the default values interfere with instruction selection. In +// all other respects it is identical though: pseudo-instruction expansion +// relies on the MachineOperands being compatible. +def cmovpred : Operand<i32>, PredicateOp, + ComplexPattern<i32, 2, "SelectCMOVPred"> { + let MIOperandInfo = (ops i32imm, i32imm); + let PrintMethod = "printPredicateOperand"; +} + +// Conditional code result for instructions whose 's' bit is set, e.g. subs. +def CCOutOperand : AsmOperandClass { let Name = "CCOut"; } +def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> { + let EncoderMethod = "getCCOutOpValue"; + let PrintMethod = "printSBitModifierOperand"; + let ParserMatchClass = CCOutOperand; + let DecoderMethod = "DecodeCCOutOperand"; +} + +// Same as cc_out except it defaults to setting CPSR. +def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> { + let EncoderMethod = "getCCOutOpValue"; + let PrintMethod = "printSBitModifierOperand"; + let ParserMatchClass = CCOutOperand; + let DecoderMethod = "DecodeCCOutOperand"; +} + +// ARM special operands for disassembly only. +// +def SetEndAsmOperand : ImmAsmOperand { + let Name = "SetEndImm"; + let ParserMethod = "parseSetEndImm"; +} +def setend_op : Operand<i32> { + let PrintMethod = "printSetendOperand"; + let ParserMatchClass = SetEndAsmOperand; +} + +def MSRMaskOperand : AsmOperandClass { + let Name = "MSRMask"; + let ParserMethod = "parseMSRMaskOperand"; +} +def msr_mask : Operand<i32> { + let PrintMethod = "printMSRMaskOperand"; + let DecoderMethod = "DecodeMSRMask"; + let ParserMatchClass = MSRMaskOperand; +} + +def BankedRegOperand : AsmOperandClass { + let Name = "BankedReg"; + let ParserMethod = "parseBankedRegOperand"; +} +def banked_reg : Operand<i32> { + let PrintMethod = "printBankedRegOperand"; + let DecoderMethod = "DecodeBankedReg"; + let ParserMatchClass = BankedRegOperand; +} + +// Shift Right Immediate - A shift right immediate is encoded differently from +// other shift immediates. 
The imm6 field is encoded like so: +// +// Offset Encoding +// 8 imm6<5:3> = '001', 8 - <imm> is encoded in imm6<2:0> +// 16 imm6<5:4> = '01', 16 - <imm> is encoded in imm6<3:0> +// 32 imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0> +// 64 64 - <imm> is encoded in imm6<5:0> +def shr_imm8_asm_operand : ImmAsmOperand { let Name = "ShrImm8"; } +def shr_imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 8; }]> { + let EncoderMethod = "getShiftRight8Imm"; + let DecoderMethod = "DecodeShiftRight8Imm"; + let ParserMatchClass = shr_imm8_asm_operand; +} +def shr_imm16_asm_operand : ImmAsmOperand { let Name = "ShrImm16"; } +def shr_imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 16; }]> { + let EncoderMethod = "getShiftRight16Imm"; + let DecoderMethod = "DecodeShiftRight16Imm"; + let ParserMatchClass = shr_imm16_asm_operand; +} +def shr_imm32_asm_operand : ImmAsmOperand { let Name = "ShrImm32"; } +def shr_imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> { + let EncoderMethod = "getShiftRight32Imm"; + let DecoderMethod = "DecodeShiftRight32Imm"; + let ParserMatchClass = shr_imm32_asm_operand; +} +def shr_imm64_asm_operand : ImmAsmOperand { let Name = "ShrImm64"; } +def shr_imm64 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 64; }]> { + let EncoderMethod = "getShiftRight64Imm"; + let DecoderMethod = "DecodeShiftRight64Imm"; + let ParserMatchClass = shr_imm64_asm_operand; +} + +//===----------------------------------------------------------------------===// +// ARM Assembler alias templates. +// +class ARMInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[IsARM]>; +class tInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[IsThumb]>; +class t2InstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[IsThumb2]>; +class VFP2InstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2]>; +class VFP2DPInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2,HasDPVFP]>; +class VFP3InstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[HasVFP3]>; +class NEONInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[HasNEON]>; + + +class VFP2MnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>, + Requires<[HasVFP2]>; +class NEONMnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>, + Requires<[HasNEON]>; + +//===----------------------------------------------------------------------===// +// ARM Instruction templates. +// + + +class InstTemplate<AddrMode am, int sz, IndexMode im, + Format f, Domain d, string cstr, InstrItinClass itin> + : Instruction { + let Namespace = "ARM"; + + AddrMode AM = am; + int Size = sz; + IndexMode IM = im; + bits<2> IndexModeBits = IM.Value; + Format F = f; + bits<6> Form = F.Value; + Domain D = d; + bit isUnaryDataProc = 0; + bit canXformTo16Bit = 0; + // The instruction is a 16-bit flag setting Thumb instruction. Used + // by the parser to determine whether to require the 'S' suffix on the + // mnemonic (when not in an IT block) or preclude it (when in an IT block). + bit thumbArithFlagSetting = 0; + + // If this is a pseudo instruction, mark it isCodeGenOnly. + let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo"); + + // The layout of TSFlags should be kept in sync with ARMBaseInfo.h. 
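+  // (Hedged sketch of the consuming C++ side; the real shift/mask constants
+  // live in ARMBaseInfo.h, but extraction amounts to
+  //   unsigned AddrMode = TSFlags & 0x1f;         // bits {4-0} below
+  //   unsigned Form     = (TSFlags >> 7) & 0x3f;  // bits {12-7} below
+  // so these positions cannot change independently of that header.)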
+ let TSFlags{4-0} = AM.Value; + let TSFlags{6-5} = IndexModeBits; + let TSFlags{12-7} = Form; + let TSFlags{13} = isUnaryDataProc; + let TSFlags{14} = canXformTo16Bit; + let TSFlags{17-15} = D.Value; + let TSFlags{18} = thumbArithFlagSetting; + + let Constraints = cstr; + let Itinerary = itin; +} + +class Encoding { + field bits<32> Inst; + // Mask of bits that cause an encoding to be UNPREDICTABLE. + // If a bit is set, then if the corresponding bit in the + // target encoding differs from its value in the "Inst" field, + // the instruction is UNPREDICTABLE (SoftFail in abstract parlance). + field bits<32> Unpredictable = 0; + // SoftFail is the generic name for this field, but we alias it so + // as to make it more obvious what it means in ARM-land. + field bits<32> SoftFail = Unpredictable; +} + +class InstARM<AddrMode am, int sz, IndexMode im, + Format f, Domain d, string cstr, InstrItinClass itin> + : InstTemplate<am, sz, im, f, d, cstr, itin>, Encoding { + let DecoderNamespace = "ARM"; +} + +// This Encoding-less class is used by Thumb1 to specify the encoding bits later +// on by adding flavors to specific instructions. +class InstThumb<AddrMode am, int sz, IndexMode im, + Format f, Domain d, string cstr, InstrItinClass itin> + : InstTemplate<am, sz, im, f, d, cstr, itin> { + let DecoderNamespace = "Thumb"; +} + +// Pseudo-instructions for alternate assembly syntax (never used by codegen). +// These are aliases that require C++ handling to convert to the target +// instruction, while InstAliases can be handled directly by tblgen. +class AsmPseudoInst<string asm, dag iops, dag oops = (outs)> + : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain, + "", NoItinerary> { + let OutOperandList = oops; + let InOperandList = iops; + let Pattern = []; + let isCodeGenOnly = 0; // So we get asm matcher for it. + let AsmString = asm; + let isPseudo = 1; +} + +class ARMAsmPseudo<string asm, dag iops, dag oops = (outs)> + : AsmPseudoInst<asm, iops, oops>, Requires<[IsARM]>; +class tAsmPseudo<string asm, dag iops, dag oops = (outs)> + : AsmPseudoInst<asm, iops, oops>, Requires<[IsThumb]>; +class t2AsmPseudo<string asm, dag iops, dag oops = (outs)> + : AsmPseudoInst<asm, iops, oops>, Requires<[IsThumb2]>; +class VFP2AsmPseudo<string asm, dag iops, dag oops = (outs)> + : AsmPseudoInst<asm, iops, oops>, Requires<[HasVFP2]>; +class NEONAsmPseudo<string asm, dag iops, dag oops = (outs)> + : AsmPseudoInst<asm, iops, oops>, Requires<[HasNEON]>; + +// Pseudo instructions for the code generator. +class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern> + : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, + GenericDomain, "", itin> { + let OutOperandList = oops; + let InOperandList = iops; + let Pattern = pattern; + let isCodeGenOnly = 1; + let isPseudo = 1; +} + +// PseudoInst that's ARM-mode only. +class ARMPseudoInst<dag oops, dag iops, int sz, InstrItinClass itin, + list<dag> pattern> + : PseudoInst<oops, iops, itin, pattern> { + let Size = sz; + list<Predicate> Predicates = [IsARM]; +} + +// PseudoInst that's Thumb-mode only. +class tPseudoInst<dag oops, dag iops, int sz, InstrItinClass itin, + list<dag> pattern> + : PseudoInst<oops, iops, itin, pattern> { + let Size = sz; + list<Predicate> Predicates = [IsThumb]; +} + +// PseudoInst that's Thumb2-mode only. 
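+// (Illustrative only; the name and operand list here are made up:
+//   def t2ExamplePseudo : t2PseudoInst<(outs GPR:$dst), (ins GPR:$src),
+//                                      4, NoItinerary, []>;
+// concrete defs of this shape live in ARMInstrThumb2.td.)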
+class t2PseudoInst<dag oops, dag iops, int sz, InstrItinClass itin, + list<dag> pattern> + : PseudoInst<oops, iops, itin, pattern> { + let Size = sz; + list<Predicate> Predicates = [IsThumb2]; +} + +class ARMPseudoExpand<dag oops, dag iops, int sz, + InstrItinClass itin, list<dag> pattern, + dag Result> + : ARMPseudoInst<oops, iops, sz, itin, pattern>, + PseudoInstExpansion<Result>; + +class tPseudoExpand<dag oops, dag iops, int sz, + InstrItinClass itin, list<dag> pattern, + dag Result> + : tPseudoInst<oops, iops, sz, itin, pattern>, + PseudoInstExpansion<Result>; + +class t2PseudoExpand<dag oops, dag iops, int sz, + InstrItinClass itin, list<dag> pattern, + dag Result> + : t2PseudoInst<oops, iops, sz, itin, pattern>, + PseudoInstExpansion<Result>; + +// Almost all ARM instructions are predicable. +class I<dag oops, dag iops, AddrMode am, int sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + bits<4> p; + let Inst{31-28} = p; + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsARM]; +} + +// A few are not predicable +class InoP<dag oops, dag iops, AddrMode am, int sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = !strconcat(opc, asm); + let Pattern = pattern; + let isPredicable = 0; + list<Predicate> Predicates = [IsARM]; +} + +// Same as I except it can optionally modify CPSR. Note it's modeled as an input +// operand since by default it's a zero register. It will become an implicit def +// once it's "flipped". 
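+// (Illustrative: with both optional operands bound, the AsmString
+// !strconcat(opc, "${s}${p}", asm) below produces UAL spellings such as
+// "addseq r0, r0, r1": the 's' comes from cc_out, the condition from pred.)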
+class sI<dag oops, dag iops, AddrMode am, int sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + bits<4> p; // Predicate operand + bits<1> s; // condition-code set flag ('1' if the insn should set the flags) + let Inst{31-28} = p; + let Inst{20} = s; + + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p, cc_out:$s)); + let AsmString = !strconcat(opc, "${s}${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsARM]; +} + +// Special cases +class XI<dag oops, dag iops, AddrMode am, int sz, + IndexMode im, Format f, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsARM]; +} + +class AI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin, + opc, asm, "", pattern>; +class AsI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : sI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin, + opc, asm, "", pattern>; +class AXI<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin, + asm, "", pattern>; +class AXIM<dag oops, dag iops, AddrMode am, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, am, 4, IndexModeNone, f, itin, + asm, "", pattern>; +class AInoP<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : InoP<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin, + opc, asm, "", pattern>; + +// Ctrl flow instructions +class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, BrFrm, itin, + opc, asm, "", pattern> { + let Inst{27-24} = opcod; +} +class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrModeNone, 4, IndexModeNone, BrFrm, itin, + asm, "", pattern> { + let Inst{27-24} = opcod; +} + +// BR_JT instructions +class JTI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrModeNone, 0, IndexModeNone, BrMiscFrm, itin, + asm, "", pattern>; + +class AIldr_ex_or_acq<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin, + opc, asm, "", pattern> { + bits<4> Rt; + bits<4> addr; + let Inst{27-23} = 0b00011; + let Inst{22-21} = opcod; + let Inst{20} = 1; + let Inst{19-16} = addr; + let Inst{15-12} = Rt; + let Inst{11-10} = 0b11; + let Inst{9-8} = opcod2; + let Inst{7-0} = 0b10011111; +} +class AIstr_ex_or_rel<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin, + opc, asm, "", pattern> { + bits<4> Rt; + bits<4> addr; + let Inst{27-23} = 0b00011; + let Inst{22-21} = opcod; + let Inst{20} = 0; + let Inst{19-16} = addr; + let Inst{11-10} = 0b11; + let Inst{9-8} = opcod2; + let Inst{7-4} = 0b1001; + let Inst{3-0} = Rt; +} +// Atomic load/store instructions +class 
AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AIldr_ex_or_acq<opcod, 0b11, oops, iops, itin, opc, asm, pattern>; + +class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AIstr_ex_or_rel<opcod, 0b11, oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + let Inst{15-12} = Rd; +} + +// Exclusive load/store instructions + +class AIldaex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AIldr_ex_or_acq<opcod, 0b10, oops, iops, itin, opc, asm, pattern>, + Requires<[IsARM, HasV8]>; + +class AIstlex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AIstr_ex_or_rel<opcod, 0b10, oops, iops, itin, opc, asm, pattern>, + Requires<[IsARM, HasV8]> { + bits<4> Rd; + let Inst{15-12} = Rd; +} + +class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern> + : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, $addr", pattern> { + bits<4> Rt; + bits<4> Rt2; + bits<4> addr; + let Inst{27-23} = 0b00010; + let Inst{22} = b; + let Inst{21-20} = 0b00; + let Inst{19-16} = addr; + let Inst{15-12} = Rt; + let Inst{11-4} = 0b00001001; + let Inst{3-0} = Rt2; + + let Unpredictable{11-8} = 0b1111; + let DecoderMethod = "DecodeSwap"; +} +// Acquire/Release load/store instructions +class AIldracq<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AIldr_ex_or_acq<opcod, 0b00, oops, iops, itin, opc, asm, pattern>, + Requires<[IsARM, HasV8]>; + +class AIstrrel<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AIstr_ex_or_rel<opcod, 0b00, oops, iops, itin, opc, asm, pattern>, + Requires<[IsARM, HasV8]> { + let Inst{15-12} = 0b1111; +} + +// addrmode1 instructions +class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode1, 4, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{24-21} = opcod; + let Inst{27-26} = 0b00; +} +class AsI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : sI<oops, iops, AddrMode1, 4, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{24-21} = opcod; + let Inst{27-26} = 0b00; +} +class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode1, 4, IndexModeNone, f, itin, + asm, "", pattern> { + let Inst{24-21} = opcod; + let Inst{27-26} = 0b00; +} + +// loads + +// LDR/LDRB/STR/STRB/... 
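+// (Hedged example of how ARMInstrInfo.td instantiates the class below; an
+// immediate-offset LDR is roughly
+//   def LDRi12 : AI2ldst<0b010, 1, 0, (outs GPR:$Rt),
+//                        (ins addrmode_imm12:$addr), AddrMode_i12, LdFrm,
+//                        IIC_iLoad_r, "ldr", "\t$Rt, $addr", []>;
+// i.e. op = 0b010, isLd = 1, isByte = 0.)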
+class AI2ldst<bits<3> op, bit isLd, bit isByte, dag oops, dag iops, AddrMode am, + Format f, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : I<oops, iops, am, 4, IndexModeNone, f, itin, opc, asm, + "", pattern> { + let Inst{27-25} = op; + let Inst{24} = 1; // 24 == P + // 23 == U + let Inst{22} = isByte; + let Inst{21} = 0; // 21 == W + let Inst{20} = isLd; +} +// Indexed load/stores +class AI2ldstidx<bit isLd, bit isByte, bit isPre, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, 4, im, f, itin, + opc, asm, cstr, pattern> { + bits<4> Rt; + let Inst{27-26} = 0b01; + let Inst{24} = isPre; // P bit + let Inst{22} = isByte; // B bit + let Inst{21} = isPre; // W bit + let Inst{20} = isLd; // L bit + let Inst{15-12} = Rt; +} +class AI2stridx_reg<bit isByte, bit isPre, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr, + pattern> { + // AM2 store w/ two operands: (GPR, am2offset) + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> Rn; + let Inst{25} = 1; + let Inst{23} = offset{12}; + let Inst{19-16} = Rn; + let Inst{11-5} = offset{11-5}; + let Inst{4} = 0; + let Inst{3-0} = offset{3-0}; +} + +class AI2stridx_imm<bit isByte, bit isPre, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr, + pattern> { + // AM2 store w/ two operands: (GPR, am2offset) + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> Rn; + let Inst{25} = 0; + let Inst{23} = offset{12}; + let Inst{19-16} = Rn; + let Inst{11-0} = offset{11-0}; +} + + +// FIXME: Merge with the above class when addrmode2 gets used for STR, STRB +// but for now use this class for STRT and STRBT. 
+class AI2stridxT<bit isByte, bit isPre, dag oops, dag iops,
+                 IndexMode im, Format f, InstrItinClass itin, string opc,
+                 string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {17-14}  Rn
+  // {13}     1 == Rm, 0 == imm12
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<18> addr;
+  let Inst{25} = addr{13};
+  let Inst{23} = addr{12};
+  let Inst{19-16} = addr{17-14};
+  let Inst{11-0} = addr{11-0};
+}
+
+// addrmode3 instructions
+class AI3ld<bits<4> op, bit op20, dag oops, dag iops, Format f,
+            InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  bits<14> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24} = 1;            // P bit
+  let Inst{23} = addr{8};      // U bit
+  let Inst{22} = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{21} = 0;            // W bit
+  let Inst{20} = op20;         // L bit
+  let Inst{19-16} = addr{12-9}; // Rn
+  let Inst{15-12} = Rt;         // Rt
+  let Inst{11-8} = addr{7-4};   // imm7_4/zero
+  let Inst{7-4} = op;
+  let Inst{3-0} = addr{3-0};    // imm3_0/Rm
+
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+class AI3ldstidx<bits<4> op, bit op20, bit isPre, dag oops, dag iops,
+                 IndexMode im, Format f, InstrItinClass itin, string opc,
+                 string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, im, f, itin,
+      opc, asm, cstr, pattern> {
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24} = isPre;        // P bit
+  let Inst{21} = isPre;        // W bit
+  let Inst{20} = op20;         // L bit
+  let Inst{15-12} = Rt;        // Rt
+  let Inst{7-4} = op;
+}
+
+// FIXME: Merge with the above class when addrmode3 gets used for LDRH, LDRSB,
+// but for now use this class for LDRSBT, LDRHT, LDRSHT.
+class AI3ldstidxT<bits<4> op, bit isLoad, dag oops, dag iops,
+                  IndexMode im, Format f, InstrItinClass itin, string opc,
+                  string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, im, f, itin, opc, asm, cstr, pattern> {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  bits<4> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24} = 0;            // P bit
+  let Inst{21} = 1;
+  let Inst{20} = isLoad;       // L bit
+  let Inst{19-16} = addr;      // Rn
+  let Inst{15-12} = Rt;        // Rt
+  let Inst{7-4} = op;
+}
+
+// stores
+class AI3str<bits<4> op, dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  bits<14> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24} = 1;            // P bit
+  let Inst{23} = addr{8};      // U bit
+  let Inst{22} = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{21} = 0;            // W bit
+  let Inst{20} = 0;            // L bit
+  let Inst{19-16} = addr{12-9}; // Rn
+  let Inst{15-12} = Rt;         // Rt
+  let Inst{11-8} = addr{7-4};   // imm7_4/zero
+  let Inst{7-4} = op;
+  let Inst{3-0} = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+// addrmode4 instructions
+class AXI4<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin,
+           string asm, string cstr, list<dag> pattern>
+  : XI<oops, iops, AddrMode4, 4, im, f, itin, asm, cstr, pattern> {
+  bits<4> p;
+  bits<16> regs;
+  bits<4> Rn;
+  let Inst{31-28} = p;
+  let Inst{27-25} = 0b100;
+  let Inst{22} = 0;            // S bit
+  let Inst{19-16} = Rn;
+  let Inst{15-0} = regs;
+}
+
+// Unsigned multiply, multiply-accumulate instructions.
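+// (For orientation: opcod below fills Inst{27-21}, so in the ARM ARM
+// encodings plain MUL is 0b0000000 and UMULL is 0b0000100; the concrete
+// defs live in ARMInstrInfo.td.)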
+class AMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{7-4} = 0b1001;
+  let Inst{20} = 0; // S bit
+  let Inst{27-21} = opcod;
+}
+class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+       opc, asm, "", pattern> {
+  let Inst{7-4} = 0b1001;
+  let Inst{27-21} = opcod;
+}
+
+// Most significant word multiply
+class AMul2I<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops,
+             InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{7-4} = opc7_4;
+  let Inst{20} = 1;
+  let Inst{27-21} = opcod;
+  let Inst{19-16} = Rd;
+  let Inst{11-8} = Rm;
+  let Inst{3-0} = Rn;
+}
+// MSW multiply w/ Ra operand
+class AMul2Ia<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMul2I<opcod, opc7_4, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+
+// SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<y>
+class AMulxyIbase<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+                  InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{4} = 0;
+  let Inst{7} = 1;
+  let Inst{20} = 0;
+  let Inst{27-21} = opcod;
+  let Inst{6-5} = bit6_5;
+  let Inst{11-8} = Rm;
+  let Inst{3-0} = Rn;
+}
+class AMulxyI<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  let Inst{19-16} = Rd;
+}
+
+// AMulxyI with Ra operand
+class AMulxyIa<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyI<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+// SMLAL*
+class AMulxyI64<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+                InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+}
+
+// Extend instructions.
+class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ExtFrm, itin,
+      opc, asm, "", pattern> {
+  // All AExtI instructions have Rd and Rm register operands.
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{15-12} = Rd;
+  let Inst{3-0} = Rm;
+  let Inst{7-4} = 0b0111;
+  let Inst{9-8} = 0b00;
+  let Inst{27-20} = opcod;
+
+  let Unpredictable{9-8} = 0b11;
+}
+
+// Misc Arithmetic instructions.
+class AMiscA1I<bits<8> opcod, bits<4> opc7_4, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{27-20} = opcod;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-4} = opc7_4;
+  let Inst{3-0} = Rm;
+}
+
+// Division instructions.
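+// (Hedged example: ARMInstrInfo.td builds the ARM-mode divides from this
+// template, roughly
+//   def SDIV : ADivA1I<0b001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+//                      IIC_iDIV, "sdiv", "\t$Rd, $Rn, $Rm", []>;
+// with UDIV taking opcod 0b011.)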
+class ADivA1I<bits<3> opcod, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin, + opc, asm, "", pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{27-23} = 0b01110; + let Inst{22-20} = opcod; + let Inst{19-16} = Rd; + let Inst{15-12} = 0b1111; + let Inst{11-8} = Rm; + let Inst{7-4} = 0b0001; + let Inst{3-0} = Rn; +} + +// PKH instructions +def PKHLSLAsmOperand : ImmAsmOperand { + let Name = "PKHLSLImm"; + let ParserMethod = "parsePKHLSLImm"; +} +def pkh_lsl_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 32; }]>{ + let PrintMethod = "printPKHLSLShiftImm"; + let ParserMatchClass = PKHLSLAsmOperand; +} +def PKHASRAsmOperand : AsmOperandClass { + let Name = "PKHASRImm"; + let ParserMethod = "parsePKHASRImm"; +} +def pkh_asr_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]>{ + let PrintMethod = "printPKHASRShiftImm"; + let ParserMatchClass = PKHASRAsmOperand; +} + +class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin, + opc, asm, "", pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + bits<5> sh; + let Inst{27-20} = opcod; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-7} = sh; + let Inst{6} = tb; + let Inst{5-4} = 0b01; + let Inst{3-0} = Rm; +} + +//===----------------------------------------------------------------------===// + +// ARMPat - Same as Pat<>, but requires that the compiler be in ARM mode. +class ARMPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM]; +} +class ARMV5TPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM, HasV5T]; +} +class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM, HasV5TE]; +} +// ARMV5MOPat - Same as ARMV5TEPat with UseMulOps. +class ARMV5MOPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM, HasV5TE, UseMulOps]; +} +class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM, HasV6]; +} +class VFPPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [HasVFP2]; +} +class VFPNoNEONPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [HasVFP2, DontUseNEONForFP]; +} +//===----------------------------------------------------------------------===// +// Thumb Instruction Format Definitions. +// + +class ThumbI<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, string asm, string cstr, list<dag> pattern> + : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb]; +} + +// TI - Thumb instruction. 
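+// (For orientation: TI below is a plain 2-byte encoding; the 4-byte TIx2
+// further down is what, e.g., the Thumb BL/BLX pair uses, with
+// opcod1 = 0b11110 filling Inst{31-27}.)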
+class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, 2, itin, asm, "", pattern>; + +// Two-address instructions +class TIt<dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, 2, itin, asm, "$lhs = $dst", + pattern>; + +// tBL, tBX 32-bit instructions +class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3, + dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>, + Encoding { + let Inst{31-27} = opcod1; + let Inst{15-14} = opcod2; + let Inst{12} = opcod3; +} + +// BR_JT instructions +class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>; + +// Thumb1 only +class Thumb1I<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, string asm, string cstr, list<dag> pattern> + : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; +} + +class T1I<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, 2, itin, asm, "", pattern>; +class T1Ix2<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>; + +// Two-address instructions +class T1It<dag oops, dag iops, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, 2, itin, + asm, cstr, pattern>; + +// Thumb1 instruction that can either be predicated or set CPSR. +class Thumb1sI<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = !con(oops, (outs s_cc_out:$s)); + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${s}${p}", asm); + let Pattern = pattern; + let thumbArithFlagSetting = 1; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; + let DecoderNamespace = "ThumbSBit"; +} + +class T1sI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1sI<oops, iops, AddrModeNone, 2, itin, opc, asm, "", pattern>; + +// Two-address instructions +class T1sIt<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1sI<oops, iops, AddrModeNone, 2, itin, opc, asm, + "$Rn = $Rdn", pattern>; + +// Thumb1 instruction that can be predicated. 
+class Thumb1pI<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; +} + +class T1pI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeNone, 2, itin, opc, asm, "", pattern>; + +// Two-address instructions +class T1pIt<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeNone, 2, itin, opc, asm, + "$Rn = $Rdn", pattern>; + +class T1pIs<dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeT1_s, 2, itin, opc, asm, "", pattern>; + +class Encoding16 : Encoding { + let Inst{31-16} = 0x0000; +} + +// A6.2 16-bit Thumb instruction encoding +class T1Encoding<bits<6> opcode> : Encoding16 { + let Inst{15-10} = opcode; +} + +// A6.2.1 Shift (immediate), add, subtract, move, and compare encoding. +class T1General<bits<5> opcode> : Encoding16 { + let Inst{15-14} = 0b00; + let Inst{13-9} = opcode; +} + +// A6.2.2 Data-processing encoding. +class T1DataProcessing<bits<4> opcode> : Encoding16 { + let Inst{15-10} = 0b010000; + let Inst{9-6} = opcode; +} + +// A6.2.3 Special data instructions and branch and exchange encoding. +class T1Special<bits<4> opcode> : Encoding16 { + let Inst{15-10} = 0b010001; + let Inst{9-6} = opcode; +} + +// A6.2.4 Load/store single data item encoding. +class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 { + let Inst{15-12} = opA; + let Inst{11-9} = opB; +} +class T1LdStSP<bits<3> opB> : T1LoadStore<0b1001, opB>; // SP relative + +class T1BranchCond<bits<4> opcode> : Encoding16 { + let Inst{15-12} = opcode; +} + +// Helper classes to encode Thumb1 loads and stores. For immediates, the +// following bits are used for "opA" (see A6.2.4): +// +// 0b0110 => Immediate, 4 bytes +// 0b1000 => Immediate, 2 bytes +// 0b0111 => Immediate, 1 byte +class T1pILdStEncode<bits<3> opcode, dag oops, dag iops, AddrMode am, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : Thumb1pI<oops, iops, am, 2, itin, opc, asm, "", pattern>, + T1LoadStore<0b0101, opcode> { + bits<3> Rt; + bits<8> addr; + let Inst{8-6} = addr{5-3}; // Rm + let Inst{5-3} = addr{2-0}; // Rn + let Inst{2-0} = Rt; +} +class T1pILdStEncodeImm<bits<4> opA, bit opB, dag oops, dag iops, AddrMode am, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : Thumb1pI<oops, iops, am, 2, itin, opc, asm, "", pattern>, + T1LoadStore<opA, {opB,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-6} = addr{7-3}; // imm5 + let Inst{5-3} = addr{2-0}; // Rn + let Inst{2-0} = Rt; +} + +// A6.2.5 Miscellaneous 16-bit instructions encoding. +class T1Misc<bits<7> opcode> : Encoding16 { + let Inst{15-12} = 0b1011; + let Inst{11-5} = opcode; +} + +// Thumb2I - Thumb2 instruction. Almost all Thumb2 instructions are predicable. 
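+// (Illustrative: most 32-bit Thumb2 encodings carry no condition field of
+// their own; predication comes from a preceding IT instruction, e.g.
+//   it    eq
+//   addeq r0, r0, r1
+// which is why the predicate is an operand here rather than encoded bits.)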
+class Thumb2I<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; + let DecoderNamespace = "Thumb2"; +} + +// Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as an +// input operand since by default it's a zero register. It will become an +// implicit def once it's "flipped". +// +// FIXME: This uses unified syntax so {s} comes before {p}. We should make it +// more consistent. +class Thumb2sI<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + bits<1> s; // condition-code set flag ('1' if the insn should set the flags) + let Inst{20} = s; + + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p, cc_out:$s)); + let AsmString = !strconcat(opc, "${s}${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; + let DecoderNamespace = "Thumb2"; +} + +// Special cases +class Thumb2XI<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; + let DecoderNamespace = "Thumb2"; +} + +class ThumbXI<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; + let DecoderNamespace = "Thumb"; +} + +class T2I<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>; +class T2Ii12<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i12, 4, itin, opc, asm, "",pattern>; +class T2Ii8<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8, 4, itin, opc, asm, "", pattern>; +class T2Iso<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_so, 4, itin, opc, asm, "", pattern>; +class T2Ipc<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_pc, 4, itin, opc, asm, "", pattern>; +class T2Ii8s4<bit P, bit W, bit isLoad, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr, + pattern> { + bits<4> Rt; + bits<4> Rt2; + bits<13> addr; + let Inst{31-25} = 0b1110100; + let Inst{24} = P; + let Inst{23} = addr{8}; + let Inst{22} = 1; + let Inst{21} = W; + let Inst{20} = isLoad; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = Rt{3-0}; + let Inst{11-8} = Rt2{3-0}; + let Inst{7-0} = addr{7-0}; +} +class T2Ii8s4post<bit P, bit W, bit isLoad, dag oops, dag 
iops, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr, + pattern> { + bits<4> Rt; + bits<4> Rt2; + bits<4> addr; + bits<9> imm; + let Inst{31-25} = 0b1110100; + let Inst{24} = P; + let Inst{23} = imm{8}; + let Inst{22} = 1; + let Inst{21} = W; + let Inst{20} = isLoad; + let Inst{19-16} = addr; + let Inst{15-12} = Rt{3-0}; + let Inst{11-8} = Rt2{3-0}; + let Inst{7-0} = imm{7-0}; +} + +class T2sI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2sI<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>; + +class T2XI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>; +class T2JTI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb2XI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>; + +// Move to/from coprocessor instructions +class T2Cop<bits<4> opc, dag oops, dag iops, string opcstr, string asm, + list<dag> pattern> + : T2I <oops, iops, NoItinerary, opcstr, asm, pattern>, Requires<[IsThumb2]> { + let Inst{31-28} = opc; +} + +// Two-address instructions +class T2XIt<dag oops, dag iops, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, cstr, pattern>; + +// T2Ipreldst - Thumb2 pre-indexed load / store instructions. +class T2Ipreldst<bit signed, bits<2> opcod, bit load, bit pre, + dag oops, dag iops, + AddrMode am, IndexMode im, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; + let DecoderNamespace = "Thumb2"; + + bits<4> Rt; + bits<13> addr; + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = opcod; + let Inst{20} = load; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = Rt{3-0}; + let Inst{11} = 1; + // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed + let Inst{10} = pre; // The P bit. + let Inst{9} = addr{8}; // Sign bit + let Inst{8} = 1; // The W bit. + let Inst{7-0} = addr{7-0}; + + let DecoderMethod = "DecodeT2LdStPre"; +} + +// T2Ipostldst - Thumb2 post-indexed load / store instructions. +class T2Ipostldst<bit signed, bits<2> opcod, bit load, bit pre, + dag oops, dag iops, + AddrMode am, IndexMode im, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; + let DecoderNamespace = "Thumb2"; + + bits<4> Rt; + bits<4> Rn; + bits<9> offset; + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = opcod; + let Inst{20} = load; + let Inst{19-16} = Rn; + let Inst{15-12} = Rt{3-0}; + let Inst{11} = 1; + // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed + let Inst{10} = pre; // The P bit. + let Inst{9} = offset{8}; // Sign bit + let Inst{8} = 1; // The W bit. 
+ let Inst{7-0} = offset{7-0}; + + let DecoderMethod = "DecodeT2LdStPre"; +} + +// Tv5Pat - Same as Pat<>, but requires V5T Thumb mode. +class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb, IsThumb1Only, HasV5T]; +} + +// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode. +class T1Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; +} + +// T2v6Pat - Same as Pat<>, but requires V6T2 Thumb2 mode. +class T2v6Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb2, HasV6T2]; +} + +// T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode. +class T2Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb2]; +} + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM VFP Instruction templates. +// + +// Almost all VFP instructions are predicable. +class VFPI<dag oops, dag iops, AddrMode am, int sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, im, f, VFPDomain, cstr, itin> { + bits<4> p; + let Inst{31-28} = p; + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", asm); + let Pattern = pattern; + let PostEncoderMethod = "VFPThumb2PostEncoder"; + let DecoderNamespace = "VFP"; + list<Predicate> Predicates = [HasVFP2]; +} + +// Special cases +class VFPXI<dag oops, dag iops, AddrMode am, int sz, + IndexMode im, Format f, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, im, f, VFPDomain, cstr, itin> { + bits<4> p; + let Inst{31-28} = p; + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + let PostEncoderMethod = "VFPThumb2PostEncoder"; + let DecoderNamespace = "VFP"; + list<Predicate> Predicates = [HasVFP2]; +} + +class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let PostEncoderMethod = "VFPThumb2PostEncoder"; +} + +// ARM VFP addrmode5 loads and stores +class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, + InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPI<oops, iops, AddrMode5, 4, IndexModeNone, + VFPLdStFrm, itin, opc, asm, "", pattern> { + // Instruction operands. + bits<5> Dd; + bits<13> addr; + + // Encode instruction operands. + let Inst{23} = addr{8}; // U (add = (U == '1')) + let Inst{22} = Dd{4}; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Dd{3-0}; + let Inst{7-0} = addr{7-0}; // imm8 + + let Inst{27-24} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision + + // Loads & stores operate on both NEON and VFP pipelines. + let D = VFPNeonDomain; +} + +class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, + InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPI<oops, iops, AddrMode5, 4, IndexModeNone, + VFPLdStFrm, itin, opc, asm, "", pattern> { + // Instruction operands. + bits<5> Sd; + bits<13> addr; + + // Encode instruction operands. 
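+  // (Hedged note: addr{7-0} holds the byte offset scaled down by four, so
+  // e.g. "vldr d0, [r1, #8]" encodes imm8 = 2 with U = 1.)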
+ let Inst{23} = addr{8}; // U (add = (U == '1')) + let Inst{22} = Sd{0}; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Sd{4-1}; + let Inst{7-0} = addr{7-0}; // imm8 + + let Inst{27-24} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision + + // Loads & stores operate on both NEON and VFP pipelines. + let D = VFPNeonDomain; +} + +// VFP Load / store multiple pseudo instructions. +class PseudoVFPLdStM<dag oops, dag iops, InstrItinClass itin, string cstr, + list<dag> pattern> + : InstARM<AddrMode4, 4, IndexModeNone, Pseudo, VFPNeonDomain, + cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let Pattern = pattern; + list<Predicate> Predicates = [HasVFP2]; +} + +// Load / store multiple + +// Unknown precision +class AXXI4<dag oops, dag iops, IndexMode im, + string asm, string cstr, list<dag> pattern> + : VFPXI<oops, iops, AddrMode4, 4, im, + VFPLdStFrm, NoItinerary, asm, cstr, pattern> { + // Instruction operands. + bits<4> Rn; + bits<13> regs; + + // Encode instruction operands. + let Inst{19-16} = Rn; + let Inst{22} = 0; + let Inst{15-12} = regs{11-8}; + let Inst{7-1} = regs{7-1}; + + let Inst{27-25} = 0b110; + let Inst{11-8} = 0b1011; + let Inst{0} = 1; +} + +// Double precision +class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : VFPXI<oops, iops, AddrMode4, 4, im, + VFPLdStMulFrm, itin, asm, cstr, pattern> { + // Instruction operands. + bits<4> Rn; + bits<13> regs; + + // Encode instruction operands. + let Inst{19-16} = Rn; + let Inst{22} = regs{12}; + let Inst{15-12} = regs{11-8}; + let Inst{7-1} = regs{7-1}; + + let Inst{27-25} = 0b110; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision + let Inst{0} = 0; +} + +// Single Precision +class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : VFPXI<oops, iops, AddrMode4, 4, im, + VFPLdStMulFrm, itin, asm, cstr, pattern> { + // Instruction operands. + bits<4> Rn; + bits<13> regs; + + // Encode instruction operands. + let Inst{19-16} = Rn; + let Inst{22} = regs{8}; + let Inst{15-12} = regs{12-9}; + let Inst{7-0} = regs{7-0}; + + let Inst{27-25} = 0b110; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision +} + +// Double precision, unary +class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, + string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> { + // Instruction operands. + bits<5> Dd; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision + let Inst{7-6} = opcod4; + let Inst{4} = opcod5; + + let Predicates = [HasVFP2, HasDPVFP]; +} + +// Double precision, unary, not-predicated +class ADuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, VFPUnaryFrm, itin, asm, "", pattern> { + // Instruction operands. + bits<5> Dd; + bits<5> Dm; + + let Inst{31-28} = 0b1111; + + // Encode instruction operands. 
+ let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision + let Inst{7-6} = opcod4; + let Inst{4} = opcod5; +} + +// Double precision, binary +class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { + // Instruction operands. + bits<5> Dd; + bits<5> Dn; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{19-16} = Dn{3-0}; + let Inst{7} = Dn{4}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision + let Inst{6} = op6; + let Inst{4} = op4; + + let Predicates = [HasVFP2, HasDPVFP]; +} + +// FP, binary, not predicated +class ADbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops, + InstrItinClass itin, string asm, list<dag> pattern> + : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, VFPBinaryFrm, itin, + asm, "", pattern> +{ + // Instruction operands. + bits<5> Dd; + bits<5> Dn; + bits<5> Dm; + + let Inst{31-28} = 0b1111; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{19-16} = Dn{3-0}; + let Inst{7} = Dn{4}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // double precision + let Inst{6} = opcod3; + let Inst{4} = 0; + + let Predicates = [HasVFP2, HasDPVFP]; +} + +// Single precision, unary, predicated +class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, + string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision + let Inst{7-6} = opcod4; + let Inst{4} = opcod5; +} + +// Single precision, unary, non-predicated +class ASuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, + VFPUnaryFrm, itin, asm, "", pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + let Inst{31-28} = 0b1111; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision + let Inst{7-6} = opcod4; + let Inst{4} = opcod5; +} + +// Single precision unary, if no NEON. Same as ASuI except not available if +// NEON is enabled. 
+class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, + string asm, list<dag> pattern> + : ASuI<opcod1, opcod2, opcod3, opcod4, opcod5, oops, iops, itin, opc, asm, + pattern> { + list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; +} + +// Single precision, binary +class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sn; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision + let Inst{6} = op6; + let Inst{4} = op4; +} + +// Single precision, binary, not predicated +class ASbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops, + InstrItinClass itin, string asm, list<dag> pattern> + : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, + VFPBinaryFrm, itin, asm, "", pattern> +{ + // Instruction operands. + bits<5> Sd; + bits<5> Sn; + bits<5> Sm; + + let Inst{31-28} = 0b1111; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision + let Inst{6} = opcod3; + let Inst{4} = 0; +} + +// Single precision binary, if no NEON. Same as ASbI except not available if +// NEON is enabled. +class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> { + list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; + + // Instruction operands. + bits<5> Sd; + bits<5> Sn; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; +} + +// VFP conversion instructions +class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, + dag oops, dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : VFPAI<oops, iops, VFPConv1Frm, itin, opc, asm, pattern> { + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-8} = opcod4; + let Inst{6} = 1; + let Inst{4} = 0; +} + +// VFP conversion between floating-point and fixed-point +class AVConv1XI<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, + dag oops, dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1I<op1, op2, op3, op4, oops, iops, itin, opc, asm, pattern> { + bits<5> fbits; + // size (fixed-point number): sx == 0 ? 
16 : 32 + let Inst{7} = op5; // sx + let Inst{5} = fbits{0}; + let Inst{3-0} = fbits{4-1}; +} + +// VFP conversion instructions, if no NEON +class AVConv1In<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; +} + +class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f, + InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPAI<oops, iops, f, itin, opc, asm, pattern> { + let Inst{27-20} = opcod1; + let Inst{11-8} = opcod2; + let Inst{4} = 1; +} + +class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>; + +class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>; + +class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, itin, opc, asm, pattern>; + +class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, itin, opc, asm, pattern>; + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM NEON Instruction templates. +// + +class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, + InstrItinClass itin, string opc, string dt, string asm, string cstr, + list<dag> pattern> + : InstARM<am, 4, im, f, NeonDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; + let DecoderNamespace = "NEON"; +} + +// Same as NeonI except it does not have a "data type" specifier. 
+class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, 4, im, f, NeonDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", "\t", asm); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; + let DecoderNamespace = "NEON"; +} + +// Same as NeonI except it is not predicated +class NeonInp<dag oops, dag iops, AddrMode am, IndexMode im, Format f, + InstrItinClass itin, string opc, string dt, string asm, string cstr, + list<dag> pattern> + : InstARM<am, 4, im, f, NeonDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = !strconcat(opc, ".", dt, "\t", asm); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; + let DecoderNamespace = "NEON"; + + let Inst{31-28} = 0b1111; +} + +class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NeonI<oops, iops, AddrMode6, IndexModeNone, NLdStFrm, itin, opc, dt, asm, + cstr, pattern> { + let Inst{31-24} = 0b11110100; + let Inst{23} = op23; + let Inst{21-20} = op21_20; + let Inst{11-8} = op11_8; + let Inst{7-4} = op7_4; + + let PostEncoderMethod = "NEONThumb2LoadStorePostEncoder"; + let DecoderNamespace = "NEONLoadStore"; + + bits<5> Vd; + bits<6> Rn; + bits<4> Rm; + + let Inst{22} = Vd{4}; + let Inst{15-12} = Vd{3-0}; + let Inst{19-16} = Rn{3-0}; + let Inst{3-0} = Rm{3-0}; +} + +class NLdStLn<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NLdSt<op23, op21_20, op11_8, op7_4, oops, iops, itin, opc, + dt, asm, cstr, pattern> { + bits<3> lane; +} + +class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr> + : InstARM<AddrMode6, 4, IndexModeNone, Pseudo, NeonDomain, cstr, + itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + list<Predicate> Predicates = [HasNEON]; +} + +class PseudoNeonI<dag oops, dag iops, InstrItinClass itin, string cstr, + list<dag> pattern> + : InstARM<AddrModeNone, 4, IndexModeNone, Pseudo, NeonDomain, cstr, + itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; +} + +class NDataI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr, + pattern> { + let Inst{31-25} = 0b1111001; + let PostEncoderMethod = "NEONThumb2DataIPostEncoder"; + let DecoderNamespace = "NEONData"; +} + +class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NeonXI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, asm, + cstr, pattern> { + let Inst{31-25} = 0b1111001; + let PostEncoderMethod = "NEONThumb2DataIPostEncoder"; + let DecoderNamespace = "NEONData"; +} + +// NEON "one register and a modified immediate" format. 
+class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, + bit op5, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, + list<dag> pattern> + : NDataI<oops, iops, N1RegModImmFrm, itin, opc, dt, asm, cstr, pattern> { + let Inst{23} = op23; + let Inst{21-19} = op21_19; + let Inst{11-8} = op11_8; + let Inst{7} = op7; + let Inst{6} = op6; + let Inst{5} = op5; + let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<13> SIMM; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{24} = SIMM{7}; + let Inst{18-16} = SIMM{6-4}; + let Inst{3-0} = SIMM{3-0}; + let DecoderMethod = "DecodeNEONModImmInstruction"; +} + +// NEON 2 vector register format. +class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, N2RegFrm, itin, opc, dt, asm, cstr, pattern> { + let Inst{24-23} = op24_23; + let Inst{21-20} = op21_20; + let Inst{19-18} = op19_18; + let Inst{17-16} = op17_16; + let Inst{11-7} = op11_7; + let Inst{6} = op6; + let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; +} + +// Same as N2V but not predicated. +class N2Vnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6, + dag oops, dag iops, InstrItinClass itin, string OpcodeStr, + string Dt, list<dag> pattern> + : NeonInp<oops, iops, AddrModeNone, IndexModeNone, N2RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vm", "", pattern> { + bits<5> Vd; + bits<5> Vm; + + // Encode instruction operands + let Inst{22} = Vd{4}; + let Inst{15-12} = Vd{3-0}; + let Inst{5} = Vm{4}; + let Inst{3-0} = Vm{3-0}; + + // Encode constant bits + let Inst{27-23} = 0b00111; + let Inst{21-20} = 0b11; + let Inst{19-18} = op19_18; + let Inst{17-16} = op17_16; + let Inst{11} = 0; + let Inst{10-8} = op10_8; + let Inst{7} = op7; + let Inst{6} = op6; + let Inst{4} = 0; + + let DecoderNamespace = "NEON"; +} + +// Same as N2V except it doesn't have a datatype suffix. +class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NDataXI<oops, iops, N2RegFrm, itin, opc, asm, cstr, pattern> { + let Inst{24-23} = op24_23; + let Inst{21-20} = op21_20; + let Inst{19-18} = op19_18; + let Inst{17-16} = op17_16; + let Inst{11-7} = op11_7; + let Inst{6} = op6; + let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; +} + +// NEON 2 vector register with immediate. +class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + let Inst{24} = op24; + let Inst{23} = op23; + let Inst{11-8} = op11_8; + let Inst{7} = op7; + let Inst{6} = op6; + let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vm; + bits<6> SIMM; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; + let Inst{21-16} = SIMM{5-0}; +} + +// NEON 3 vector register format. 
+ +class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, + list<dag> pattern> + : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + let Inst{24} = op24; + let Inst{23} = op23; + let Inst{21-20} = op21_20; + let Inst{11-8} = op11_8; + let Inst{6} = op6; + let Inst{4} = op4; +} + +class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, + oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; +} + +class N3Vnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, dag oops, dag iops,Format f, InstrItinClass itin, + string OpcodeStr, string Dt, list<dag> pattern> + : NeonInp<oops, iops, AddrModeNone, IndexModeNone, f, itin, OpcodeStr, + Dt, "$Vd, $Vn, $Vm", "", pattern> { + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + + // Encode instruction operands + let Inst{22} = Vd{4}; + let Inst{15-12} = Vd{3-0}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{5} = Vm{4}; + let Inst{3-0} = Vm{3-0}; + + // Encode constant bits + let Inst{27-23} = op27_23; + let Inst{21-20} = op21_20; + let Inst{11-8} = op11_8; + let Inst{6} = op6; + let Inst{4} = op4; +} + +class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, + list<dag> pattern> + : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, + oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + bit lane; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = lane; +} + +class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, + list<dag> pattern> + : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, + oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + bits<2> lane; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{2-0} = Vm{2-0}; + let Inst{5} = lane{1}; + let Inst{3} = lane{0}; +} + +// Same as N3V except it doesn't have a data type suffix. +class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NDataXI<oops, iops, f, itin, opc, asm, cstr, pattern> { + let Inst{24} = op24; + let Inst{23} = op23; + let Inst{21-20} = op21_20; + let Inst{11-8} = op11_8; + let Inst{6} = op6; + let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; +} + +// NEON VMOVs between scalar and core registers. 
+class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, list<dag> pattern> + : InstARM<AddrModeNone, 4, IndexModeNone, f, NeonDomain, + "", itin> { + let Inst{27-20} = opcod1; + let Inst{11-8} = opcod2; + let Inst{6-5} = opcod3; + let Inst{4} = 1; + // A8.6.303, A8.6.328, A8.6.329 + let Inst{3-0} = 0b0000; + + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; + + let PostEncoderMethod = "NEONThumb2DupPostEncoder"; + let DecoderNamespace = "NEONDup"; + + bits<5> V; + bits<4> R; + bits<4> p; + bits<4> lane; + + let Inst{31-28} = p{3-0}; + let Inst{7} = V{4}; + let Inst{19-16} = V{3-0}; + let Inst{15-12} = R{3-0}; +} +class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NGetLnFrm, itin, + opc, dt, asm, pattern>; +class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NSetLnFrm, itin, + opc, dt, asm, pattern>; +class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NDupFrm, itin, + opc, dt, asm, pattern>; + +// Vector Duplicate Lane (from scalar to all elements) +class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops, + InstrItinClass itin, string opc, string dt, string asm, + list<dag> pattern> + : NDataI<oops, iops, NVDupLnFrm, itin, opc, dt, asm, "", pattern> { + let Inst{24-23} = 0b11; + let Inst{21-20} = 0b11; + let Inst{19-16} = op19_16; + let Inst{11-7} = 0b11000; + let Inst{6} = op6; + let Inst{4} = 0; + + bits<5> Vd; + bits<5> Vm; + + let Inst{22} = Vd{4}; + let Inst{15-12} = Vd{3-0}; + let Inst{5} = Vm{4}; + let Inst{3-0} = Vm{3-0}; +} + +// NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON +// for single-precision FP. +class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [HasNEON,UseNEONForFP]; +} + +// VFP/NEON Instruction aliases for type suffices. +class VFPDataTypeInstAlias<string opc, string dt, string asm, dag Result> : + InstAlias<!strconcat(opc, dt, "\t", asm), Result>, Requires<[HasVFP2]>; + +multiclass VFPDTAnyInstAlias<string opc, string asm, dag Result> { + def : VFPDataTypeInstAlias<opc, ".8", asm, Result>; + def : VFPDataTypeInstAlias<opc, ".16", asm, Result>; + def : VFPDataTypeInstAlias<opc, ".32", asm, Result>; + def : VFPDataTypeInstAlias<opc, ".64", asm, Result>; +} + +multiclass NEONDTAnyInstAlias<string opc, string asm, dag Result> { + let Predicates = [HasNEON] in { + def : VFPDataTypeInstAlias<opc, ".8", asm, Result>; + def : VFPDataTypeInstAlias<opc, ".16", asm, Result>; + def : VFPDataTypeInstAlias<opc, ".32", asm, Result>; + def : VFPDataTypeInstAlias<opc, ".64", asm, Result>; +} +} + +// The same alias classes using AsmPseudo instead, for the more complex +// stuff in NEON that InstAlias can't quite handle. 
+// Note that we can't use anonymous defm references here like we can +// above, as we care about the ultimate instruction enum names generated, unlike +// for instalias defs. +class NEONDataTypeAsmPseudoInst<string opc, string dt, string asm, dag iops> : + AsmPseudoInst<!strconcat(opc, dt, "\t", asm), iops>, Requires<[HasNEON]>; + +// Data type suffix token aliases. Implements Table A7-3 in the ARM ARM. +def : TokenAlias<".s8", ".i8">; +def : TokenAlias<".u8", ".i8">; +def : TokenAlias<".s16", ".i16">; +def : TokenAlias<".u16", ".i16">; +def : TokenAlias<".s32", ".i32">; +def : TokenAlias<".u32", ".i32">; +def : TokenAlias<".s64", ".i64">; +def : TokenAlias<".u64", ".i64">; + +def : TokenAlias<".i8", ".8">; +def : TokenAlias<".i16", ".16">; +def : TokenAlias<".i32", ".32">; +def : TokenAlias<".i64", ".64">; + +def : TokenAlias<".p8", ".8">; +def : TokenAlias<".p16", ".16">; + +def : TokenAlias<".f32", ".32">; +def : TokenAlias<".f64", ".64">; +def : TokenAlias<".f", ".f32">; +def : TokenAlias<".d", ".f64">; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp new file mode 100644 index 0000000..cf973d6 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -0,0 +1,134 @@ +//===-- ARMInstrInfo.cpp - ARM Instruction Information --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARMInstrInfo.h" +#include "ARM.h" +#include "ARMConstantPoolValue.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMTargetMachine.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCInst.h" +using namespace llvm; + +ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI), RI() {} + +/// getNoopForMachoTarget - Return the noop instruction to use for a noop. 
+void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + if (hasNOP()) { + NopInst.setOpcode(ARM::HINT); + NopInst.addOperand(MCOperand::createImm(0)); + NopInst.addOperand(MCOperand::createImm(ARMCC::AL)); + NopInst.addOperand(MCOperand::createReg(0)); + } else { + NopInst.setOpcode(ARM::MOVr); + NopInst.addOperand(MCOperand::createReg(ARM::R0)); + NopInst.addOperand(MCOperand::createReg(ARM::R0)); + NopInst.addOperand(MCOperand::createImm(ARMCC::AL)); + NopInst.addOperand(MCOperand::createReg(0)); + NopInst.addOperand(MCOperand::createReg(0)); + } +} + +unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { + switch (Opc) { + default: + break; + case ARM::LDR_PRE_IMM: + case ARM::LDR_PRE_REG: + case ARM::LDR_POST_IMM: + case ARM::LDR_POST_REG: + return ARM::LDRi12; + case ARM::LDRH_PRE: + case ARM::LDRH_POST: + return ARM::LDRH; + case ARM::LDRB_PRE_IMM: + case ARM::LDRB_PRE_REG: + case ARM::LDRB_POST_IMM: + case ARM::LDRB_POST_REG: + return ARM::LDRBi12; + case ARM::LDRSH_PRE: + case ARM::LDRSH_POST: + return ARM::LDRSH; + case ARM::LDRSB_PRE: + case ARM::LDRSB_POST: + return ARM::LDRSB; + case ARM::STR_PRE_IMM: + case ARM::STR_PRE_REG: + case ARM::STR_POST_IMM: + case ARM::STR_POST_REG: + return ARM::STRi12; + case ARM::STRH_PRE: + case ARM::STRH_POST: + return ARM::STRH; + case ARM::STRB_PRE_IMM: + case ARM::STRB_PRE_REG: + case ARM::STRB_POST_IMM: + case ARM::STRB_POST_REG: + return ARM::STRBi12; + } + + return 0; +} + +void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI, + Reloc::Model RM) const { + MachineFunction &MF = *MI->getParent()->getParent(); + const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>(); + + if (!Subtarget.useMovt(MF)) { + if (RM == Reloc::PIC_) + expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12, RM); + else + expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12, RM); + return; + } + + if (RM != Reloc::PIC_) { + expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12, RM); + return; + } + + const GlobalValue *GV = + cast<GlobalValue>((*MI->memoperands_begin())->getValue()); + + if (!Subtarget.GVIsIndirectSymbol(GV, RM)) { + expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12, RM); + return; + } + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned Reg = MI->getOperand(0).getReg(); + MachineInstrBuilder MIB; + + MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg) + .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY); + unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; + MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4); + MIB.addMemOperand(MMO); + MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg); + MIB.addReg(Reg, RegState::Kill).addImm(0); + MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + AddDefaultPred(MIB); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h new file mode 100644 index 0000000..90f34ea --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -0,0 +1,48 @@ +//===-- ARMInstrInfo.h - ARM Instruction Information ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMINSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMINSTRINFO_H
+
+#include "ARMBaseInstrInfo.h"
+#include "ARMRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class ARMInstrInfo : public ARMBaseInstrInfo {
+  ARMRegisterInfo RI;
+public:
+  explicit ARMInstrInfo(const ARMSubtarget &STI);
+
+  /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is no such opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const override;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const ARMRegisterInfo &getRegisterInfo() const override { return RI; }
+
+private:
+  void expandLoadStackGuard(MachineBasicBlock::iterator MI,
+                            Reloc::Model RM) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
new file mode 100644
index 0000000..b9de83b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -0,0 +1,5756 @@
+//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM specific DAG Nodes.
+//
+
+// Type profiles.
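+//
+// As a reading aid (a sketch, not part of the original file):
+// SDTypeProfile<NumResults, NumOperands, Constraints> describes a DAG node's
+// signature, with operand indices continuing after the results. SDT_ARMCMov
+// below, for instance, is SDTypeProfile<1, 3, ...>: one result and three
+// operands, where the constraints tie the result type to the two value
+// operands and pin the condition-code operand (index 3) to i32.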
+def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; +def SDT_ARMStructByVal : SDTypeProfile<0, 4, + [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; + +def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>; + +def SDT_ARMcall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; + +def SDT_ARMCMov : SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisVT<3, i32>]>; + +def SDT_ARMBrcond : SDTypeProfile<0, 2, + [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>; + +def SDT_ARMBrJT : SDTypeProfile<0, 2, + [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; + +def SDT_ARMBr2JT : SDTypeProfile<0, 3, + [SDTCisPtrTy<0>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; + +def SDT_ARMBCC_i64 : SDTypeProfile<0, 6, + [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, SDTCisVT<2, i32>, + SDTCisVT<3, i32>, SDTCisVT<4, i32>, + SDTCisVT<5, OtherVT>]>; + +def SDT_ARMAnd : SDTypeProfile<1, 2, + [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; + +def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; + +def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, + SDTCisPtrTy<1>, SDTCisVT<2, i32>]>; + +def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>; +def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>, + SDTCisInt<2>]>; +def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>; +def SDT_ARMEH_SJLJ_SetupDispatch: SDTypeProfile<0, 0, []>; + +def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, + SDTCisInt<1>]>; + +def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + +def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; + +def SDT_WIN__DBZCHK : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; + +def SDT_ARMMEMCPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>, + SDTCisVT<4, i32>]>; + +def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, CPSR = op LHS, RHS, CPSR +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; + +def SDT_ARM64bitmlal : SDTypeProfile<2,4, [ SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>, + SDTCisVT<4, i32>, SDTCisVT<5, i32> ] >; +def ARMUmlal : SDNode<"ARMISD::UMLAL", SDT_ARM64bitmlal>; +def ARMSmlal : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>; + +// Node definitions. 
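+//
+// As a reading aid (not part of the original file): each def below binds a
+// node name to an ARMISD opcode and one of the profiles above; the optional
+// trailing list adds properties such as SDNPHasChain (the node is ordered on
+// the chain) or SDNPOutGlue (it produces a glue result, e.g. CPSR for
+// ARMcmp).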
+def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; +def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>; +def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntUnaryOp>; + +def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart, + [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; +def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd, + [SDNPHasChain, SDNPSideEffect, + SDNPOptInGlue, SDNPOutGlue]>; +def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" , + SDT_ARMStructByVal, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad]>; + +def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def ARMcall_pred : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + +def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, + [SDNPInGlue]>; + +def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; + +def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT, + [SDNPHasChain]>; +def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT, + [SDNPHasChain]>; + +def ARMBcci64 : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64, + [SDNPHasChain]>; + +def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp, + [SDNPOutGlue]>; + +def ARMcmn : SDNode<"ARMISD::CMN", SDT_ARMCmp, + [SDNPOutGlue]>; + +def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp, + [SDNPOutGlue, SDNPCommutative]>; + +def ARMpic_add : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>; + +def ARMsrl_flag : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>; +def ARMsra_flag : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>; +def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>; + +def ARMaddc : SDNode<"ARMISD::ADDC", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def ARMsubc : SDNode<"ARMISD::SUBC", SDTBinaryArithWithFlags>; +def ARMadde : SDNode<"ARMISD::ADDE", SDTBinaryArithWithFlagsInOut>; +def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>; + +def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>; +def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", + SDT_ARMEH_SJLJ_Setjmp, + [SDNPHasChain, SDNPSideEffect]>; +def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP", + SDT_ARMEH_SJLJ_Longjmp, + [SDNPHasChain, SDNPSideEffect]>; +def ARMeh_sjlj_setup_dispatch: SDNode<"ARMISD::EH_SJLJ_SETUP_DISPATCH", + SDT_ARMEH_SJLJ_SetupDispatch, + [SDNPHasChain, SDNPSideEffect]>; + +def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER, + [SDNPHasChain, SDNPSideEffect]>; +def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>; + +def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; + +def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad]>; + +//===----------------------------------------------------------------------===// +// ARM Instruction Predicate Definitions. 
+// +def HasV4T : Predicate<"Subtarget->hasV4TOps()">, + AssemblerPredicate<"HasV4TOps", "armv4t">; +def NoV4T : Predicate<"!Subtarget->hasV4TOps()">; +def HasV5T : Predicate<"Subtarget->hasV5TOps()">, + AssemblerPredicate<"HasV5TOps", "armv5t">; +def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">, + AssemblerPredicate<"HasV5TEOps", "armv5te">; +def HasV6 : Predicate<"Subtarget->hasV6Ops()">, + AssemblerPredicate<"HasV6Ops", "armv6">; +def NoV6 : Predicate<"!Subtarget->hasV6Ops()">; +def HasV6M : Predicate<"Subtarget->hasV6MOps()">, + AssemblerPredicate<"HasV6MOps", + "armv6m or armv6t2">; +def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">, + AssemblerPredicate<"HasV6T2Ops", "armv6t2">; +def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; +def HasV6K : Predicate<"Subtarget->hasV6KOps()">, + AssemblerPredicate<"HasV6KOps", "armv6k">; +def NoV6K : Predicate<"!Subtarget->hasV6KOps()">; +def HasV7 : Predicate<"Subtarget->hasV7Ops()">, + AssemblerPredicate<"HasV7Ops", "armv7">; +def HasV8 : Predicate<"Subtarget->hasV8Ops()">, + AssemblerPredicate<"HasV8Ops", "armv8">; +def PreV8 : Predicate<"!Subtarget->hasV8Ops()">, + AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">; +def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, + AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; +def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, + AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; +def NoVFP : Predicate<"!Subtarget->hasVFP2()">; +def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, + AssemblerPredicate<"FeatureVFP2", "VFP2">; +def HasVFP3 : Predicate<"Subtarget->hasVFP3()">, + AssemblerPredicate<"FeatureVFP3", "VFP3">; +def HasVFP4 : Predicate<"Subtarget->hasVFP4()">, + AssemblerPredicate<"FeatureVFP4", "VFP4">; +def HasDPVFP : Predicate<"!Subtarget->isFPOnlySP()">, + AssemblerPredicate<"!FeatureVFPOnlySP", + "double precision VFP">; +def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, + AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">; +def HasNEON : Predicate<"Subtarget->hasNEON()">, + AssemblerPredicate<"FeatureNEON", "NEON">; +def HasCrypto : Predicate<"Subtarget->hasCrypto()">, + AssemblerPredicate<"FeatureCrypto", "crypto">; +def HasCRC : Predicate<"Subtarget->hasCRC()">, + AssemblerPredicate<"FeatureCRC", "crc">; +def HasFP16 : Predicate<"Subtarget->hasFP16()">, + AssemblerPredicate<"FeatureFP16","half-float conversions">; +def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, + AssemblerPredicate<"FeatureFullFP16","full half-float">; +def HasDivide : Predicate<"Subtarget->hasDivide()">, + AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">; +def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, + AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">; +def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">, + AssemblerPredicate<"FeatureT2XtPk", + "pack/extract">; +def HasDSP : Predicate<"Subtarget->hasDSP()">, + AssemblerPredicate<"FeatureDSP", "dsp">; +def HasDB : Predicate<"Subtarget->hasDataBarrier()">, + AssemblerPredicate<"FeatureDB", + "data-barriers">; +def HasMP : Predicate<"Subtarget->hasMPExtension()">, + AssemblerPredicate<"FeatureMP", + "mp-extensions">; +def HasVirtualization: Predicate<"false">, + AssemblerPredicate<"FeatureVirtualization", + "virtualization-extensions">; +def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">, + AssemblerPredicate<"FeatureTrustZone", + "TrustZone">; +def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; +def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; +def DontUseNEONForFP : 
Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">; +def IsThumb : Predicate<"Subtarget->isThumb()">, + AssemblerPredicate<"ModeThumb", "thumb">; +def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">; +def IsThumb2 : Predicate<"Subtarget->isThumb2()">, + AssemblerPredicate<"ModeThumb,FeatureThumb2", + "thumb2">; +def IsMClass : Predicate<"Subtarget->isMClass()">, + AssemblerPredicate<"FeatureMClass", "armv*m">; +def IsNotMClass : Predicate<"!Subtarget->isMClass()">, + AssemblerPredicate<"!FeatureMClass", + "!armv*m">; +def IsARM : Predicate<"!Subtarget->isThumb()">, + AssemblerPredicate<"!ModeThumb", "arm-mode">; +def IsMachO : Predicate<"Subtarget->isTargetMachO()">; +def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">; +def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; +def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">, + AssemblerPredicate<"FeatureNaClTrap", "NaCl">; +def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">; + +// FIXME: Eventually this will be just "hasV6T2Ops". +def UseMovt : Predicate<"Subtarget->useMovt(*MF)">; +def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">; +def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">; +def UseMulOps : Predicate<"Subtarget->useMulOps()">; + +// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available. +// But only select them if more precision in FP computation is allowed. +// Do not use them for Darwin platforms. +def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" + " FPOpFusion::Fast && " + " Subtarget->hasVFP4()) && " + "!Subtarget->isTargetDarwin()">; +def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion ==" + " FPOpFusion::Fast &&" + " Subtarget->hasVFP4()) || " + "Subtarget->isTargetDarwin()">; + +// VGETLNi32 is microcoded on Swift - prefer VMOV. +def HasFastVGETLNi32 : Predicate<"!Subtarget->isSwift()">; +def HasSlowVGETLNi32 : Predicate<"Subtarget->isSwift()">; + +// VDUP.32 is microcoded on Swift - prefer VMOV. +def HasFastVDUP32 : Predicate<"!Subtarget->isSwift()">; +def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">; + +// Cortex-A9 prefers VMOVSR to VMOVDRR even when using NEON for scalar FP, as +// this allows more effective execution domain optimization. See +// setExecutionDomain(). +def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">; +def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">; + +def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">; +def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">; + +//===----------------------------------------------------------------------===// +// ARM Flag Definitions. + +class RegConstraint<string C> { + string Constraints = C; +} + +//===----------------------------------------------------------------------===// +// ARM specific transformation functions and pattern fragments. +// + +// imm_neg_XFORM - Return the negation of an i32 immediate value. +def imm_neg_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(-(int)N->getZExtValue(), SDLoc(N), MVT::i32); +}]>; + +// imm_not_XFORM - Return the complement of a i32 immediate value. +def imm_not_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(~(int)N->getZExtValue(), SDLoc(N), MVT::i32); +}]>; + +/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31]. 
+def imm16_31 : ImmLeaf<i32, [{ + return (int32_t)Imm >= 16 && (int32_t)Imm < 32; +}]>; + +// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits. +def sext_16_node : PatLeaf<(i32 GPR:$a), [{ + return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17; +}]>; + +/// Split a 32-bit immediate into two 16 bit parts. +def hi16 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, SDLoc(N), + MVT::i32); +}]>; + +def lo16AllZero : PatLeaf<(i32 imm), [{ + // Returns true if all low 16-bits are 0. + return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0; +}], hi16>; + +class BinOpWithFlagFrag<dag res> : + PatFrag<(ops node:$LHS, node:$RHS, node:$FLAG), res>; +class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; +class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>; + +// An 'and' node with a single use. +def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; + +// An 'xor' node with a single use. +def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; + +// An 'fmul' node with a single use. +def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{ + return N->hasOneUse(); +}]>; + +// An 'fadd' node which checks for single non-hazardous use. +def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{ + return hasNoVMLxHazardUse(N); +}]>; + +// An 'fsub' node which checks for single non-hazardous use. +def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{ + return hasNoVMLxHazardUse(N); +}]>; + +//===----------------------------------------------------------------------===// +// Operand Definitions. +// + +// Immediate operands with a shared generic asm render method. +class ImmAsmOperand : AsmOperandClass { let RenderMethod = "addImmOperands"; } + +// Operands that are part of a memory addressing mode. +class MemOperand : Operand<i32> { let OperandType = "OPERAND_MEMORY"; } + +// Branch target. +// FIXME: rename brtarget to t2_brtarget +def brtarget : Operand<OtherVT> { + let EncoderMethod = "getBranchTargetOpValue"; + let OperandType = "OPERAND_PCREL"; + let DecoderMethod = "DecodeT2BROperand"; +} + +// FIXME: get rid of this one? +def uncondbrtarget : Operand<OtherVT> { + let EncoderMethod = "getUnconditionalBranchTargetOpValue"; + let OperandType = "OPERAND_PCREL"; +} + +// Branch target for ARM. Handles conditional/unconditional +def br_target : Operand<OtherVT> { + let EncoderMethod = "getARMBranchTargetOpValue"; + let OperandType = "OPERAND_PCREL"; +} + +// Call target. +// FIXME: rename bltarget to t2_bl_target? +def bltarget : Operand<i32> { + // Encoded the same as branch targets. + let EncoderMethod = "getBranchTargetOpValue"; + let OperandType = "OPERAND_PCREL"; +} + +// Call target for ARM. Handles conditional/unconditional +// FIXME: rename bl_target to t2_bltarget? +def bl_target : Operand<i32> { + let EncoderMethod = "getARMBLTargetOpValue"; + let OperandType = "OPERAND_PCREL"; +} + +def blx_target : Operand<i32> { + let EncoderMethod = "getARMBLXTargetOpValue"; + let OperandType = "OPERAND_PCREL"; +} + +// A list of registers separated by comma. Used by load/store multiple. 
+def RegListAsmOperand : AsmOperandClass { let Name = "RegList"; } +def reglist : Operand<i32> { + let EncoderMethod = "getRegisterListOpValue"; + let ParserMatchClass = RegListAsmOperand; + let PrintMethod = "printRegisterList"; + let DecoderMethod = "DecodeRegListOperand"; +} + +def GPRPairOp : RegisterOperand<GPRPair, "printGPRPairOperand">; + +def DPRRegListAsmOperand : AsmOperandClass { let Name = "DPRRegList"; } +def dpr_reglist : Operand<i32> { + let EncoderMethod = "getRegisterListOpValue"; + let ParserMatchClass = DPRRegListAsmOperand; + let PrintMethod = "printRegisterList"; + let DecoderMethod = "DecodeDPRRegListOperand"; +} + +def SPRRegListAsmOperand : AsmOperandClass { let Name = "SPRRegList"; } +def spr_reglist : Operand<i32> { + let EncoderMethod = "getRegisterListOpValue"; + let ParserMatchClass = SPRRegListAsmOperand; + let PrintMethod = "printRegisterList"; + let DecoderMethod = "DecodeSPRRegListOperand"; +} + +// An operand for the CONSTPOOL_ENTRY pseudo-instruction. +def cpinst_operand : Operand<i32> { + let PrintMethod = "printCPInstOperand"; +} + +// Local PC labels. +def pclabel : Operand<i32> { + let PrintMethod = "printPCLabel"; +} + +// ADR instruction labels. +def AdrLabelAsmOperand : AsmOperandClass { let Name = "AdrLabel"; } +def adrlabel : Operand<i32> { + let EncoderMethod = "getAdrLabelOpValue"; + let ParserMatchClass = AdrLabelAsmOperand; + let PrintMethod = "printAdrLabelOperand<0>"; +} + +def neon_vcvt_imm32 : Operand<i32> { + let EncoderMethod = "getNEONVcvtImm32OpValue"; + let DecoderMethod = "DecodeVCVTImmOperand"; +} + +// rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24. +def rot_imm_XFORM: SDNodeXForm<imm, [{ + switch (N->getZExtValue()){ + default: llvm_unreachable(nullptr); + case 0: return CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + case 8: return CurDAG->getTargetConstant(1, SDLoc(N), MVT::i32); + case 16: return CurDAG->getTargetConstant(2, SDLoc(N), MVT::i32); + case 24: return CurDAG->getTargetConstant(3, SDLoc(N), MVT::i32); + } +}]>; +def RotImmAsmOperand : AsmOperandClass { + let Name = "RotImm"; + let ParserMethod = "parseRotImm"; +} +def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{ + int32_t v = N->getZExtValue(); + return v == 8 || v == 16 || v == 24; }], + rot_imm_XFORM> { + let PrintMethod = "printRotImmOperand"; + let ParserMatchClass = RotImmAsmOperand; +} + +// shift_imm: An integer that encodes a shift amount and the type of shift +// (asr or lsl). The 6-bit immediate encodes as: +// {5} 0 ==> lsl +// 1 asr +// {4-0} imm5 shift amount. +// asr #32 encoded as imm5 == 0. +def ShifterImmAsmOperand : AsmOperandClass { + let Name = "ShifterImm"; + let ParserMethod = "parseShifterImm"; +} +def shift_imm : Operand<i32> { + let PrintMethod = "printShiftImmOperand"; + let ParserMatchClass = ShifterImmAsmOperand; +} + +// shifter_operand operands: so_reg_reg, so_reg_imm, and mod_imm. 
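+
+// As an illustration (a sketch, not part of the original file): these are
+// the "flexible second operand" forms. In
+//   add r0, r1, r2, lsl #3
+// the final operand is a single so_reg_imm: a base register plus an
+// immediate that packs the shift kind and amount, while
+//   add r0, r1, r2, lsl r3
+// uses so_reg_reg, carrying two registers plus the shift kind.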
+def ShiftedRegAsmOperand : AsmOperandClass { let Name = "RegShiftedReg"; }
+def so_reg_reg : Operand<i32>,    // reg reg imm
+                 ComplexPattern<i32, 3, "SelectRegShifterOperand",
+                                [shl, srl, sra, rotr]> {
+  let EncoderMethod = "getSORegRegOpValue";
+  let PrintMethod = "printSORegRegOperand";
+  let DecoderMethod = "DecodeSORegRegOperand";
+  let ParserMatchClass = ShiftedRegAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, GPRnopc, i32imm);
+}
+
+def ShiftedImmAsmOperand : AsmOperandClass { let Name = "RegShiftedImm"; }
+def so_reg_imm : Operand<i32>,    // reg imm
+                 ComplexPattern<i32, 2, "SelectImmShifterOperand",
+                                [shl, srl, sra, rotr]> {
+  let EncoderMethod = "getSORegImmOpValue";
+  let PrintMethod = "printSORegImmOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// FIXME: Does this need to be distinct from so_reg?
+def shift_so_reg_reg : Operand<i32>,    // reg reg imm
+                       ComplexPattern<i32, 3, "SelectShiftRegShifterOperand",
+                                      [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getSORegRegOpValue";
+  let PrintMethod = "printSORegRegOperand";
+  let DecoderMethod = "DecodeSORegRegOperand";
+  let ParserMatchClass = ShiftedRegAsmOperand;
+  let MIOperandInfo = (ops GPR, GPR, i32imm);
+}
+
+// FIXME: Does this need to be distinct from so_reg?
+def shift_so_reg_imm : Operand<i32>,    // reg reg imm
+                       ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
+                                      [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getSORegImmOpValue";
+  let PrintMethod = "printSORegImmOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// mod_imm: match a 32-bit immediate operand, which can be encoded into
+// a 12-bit immediate; an 8-bit integer and a 4-bit rotator (See ARMARM
+// - "Modified Immediate Constants"). Within the MC layer we keep this
+// immediate in its encoded form.
+def ModImmAsmOperand: AsmOperandClass {
+  let Name = "ModImm";
+  let ParserMethod = "parseModImm";
+}
+def mod_imm : Operand<i32>, ImmLeaf<i32, [{
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  }]> {
+  let EncoderMethod = "getModImmOpValue";
+  let PrintMethod = "printModImmOperand";
+  let ParserMatchClass = ModImmAsmOperand;
+}
+
+// Note: the patterns mod_imm_not and mod_imm_neg do not require an encoder
+// method and such, as they are only used on aliases (Pat<> and InstAlias<>).
+// The actual parsing, encoding, decoding are handled by the destination
+// instructions, which use mod_imm.
+
+def ModImmNotAsmOperand : AsmOperandClass { let Name = "ModImmNot"; }
+def mod_imm_not : Operand<i32>, PatLeaf<(imm), [{
+    return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1;
+  }], imm_not_XFORM> {
+  let ParserMatchClass = ModImmNotAsmOperand;
+}
+
+def ModImmNegAsmOperand : AsmOperandClass { let Name = "ModImmNeg"; }
+def mod_imm_neg : Operand<i32>, PatLeaf<(imm), [{
+    unsigned Value = -(unsigned)N->getZExtValue();
+    return Value && ARM_AM::getSOImmVal(Value) != -1;
+  }], imm_neg_XFORM> {
+  let ParserMatchClass = ModImmNegAsmOperand;
+}
+
+/// arm_i32imm - True for +V6T2, or when isSOImmTwoPartVal()
+def arm_i32imm : PatLeaf<(imm), [{
+  if (Subtarget->useMovt(*MF))
+    return true;
+  return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
+}]>;
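+
+// As an illustration (a sketch, not part of the original file): a modified
+// immediate is an 8-bit value rotated right by an even amount, which is the
+// property ARM_AM::getSOImmVal checks above. A hypothetical C++ helper
+// showing the idea (name and packing are for exposition only):
+//
+//   // Return rot4 << 8 | imm8 if V is encodable, otherwise -1, where the
+//   // rotate-right amount is 2 * rot4.
+//   int getModImmSketch(uint32_t V) {
+//     for (unsigned R = 0; R < 32; R += 2) {
+//       uint32_t Imm8 = R ? ((V << R) | (V >> (32 - R))) : V; // undo ROR #R
+//       if (Imm8 < 256)
+//         return (int)((R / 2) << 8 | Imm8);
+//     }
+//     return -1;
+//   }
+//
+// For example, 0x0000ab00 is 0xab rotated right by 24 and is accepted, while
+// 0x00abcdef fails every rotation; such constants are instead matched by
+// arm_i32imm above (movw/movt, or a two-part modified-immediate expansion).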
+
+/// imm0_1 predicate - Immediate in the range [0,1].
+def Imm0_1AsmOperand: ImmAsmOperand { let Name = "Imm0_1"; }
+def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }
+
+/// imm0_3 predicate - Immediate in the range [0,3].
+def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; }
+def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }
+
+/// imm0_7 predicate - Immediate in the range [0,7].
+def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; }
+def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 8;
+}]> {
+  let ParserMatchClass = Imm0_7AsmOperand;
+}
+
+/// imm8 predicate - Immediate is exactly 8.
+def Imm8AsmOperand: ImmAsmOperand { let Name = "Imm8"; }
+def imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 8; }]> {
+  let ParserMatchClass = Imm8AsmOperand;
+}
+
+/// imm16 predicate - Immediate is exactly 16.
+def Imm16AsmOperand: ImmAsmOperand { let Name = "Imm16"; }
+def imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 16; }]> {
+  let ParserMatchClass = Imm16AsmOperand;
+}
+
+/// imm32 predicate - Immediate is exactly 32.
+def Imm32AsmOperand: ImmAsmOperand { let Name = "Imm32"; }
+def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
+  let ParserMatchClass = Imm32AsmOperand;
+}
+
+def imm8_or_16 : ImmLeaf<i32, [{ return Imm == 8 || Imm == 16;}]>;
+
+/// imm1_7 predicate - Immediate in the range [1,7].
+def Imm1_7AsmOperand: ImmAsmOperand { let Name = "Imm1_7"; }
+def imm1_7 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 8; }]> {
+  let ParserMatchClass = Imm1_7AsmOperand;
+}
+
+/// imm1_15 predicate - Immediate in the range [1,15].
+def Imm1_15AsmOperand: ImmAsmOperand { let Name = "Imm1_15"; }
+def imm1_15 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 16; }]> {
+  let ParserMatchClass = Imm1_15AsmOperand;
+}
+
+/// imm1_31 predicate - Immediate in the range [1,31].
+def Imm1_31AsmOperand: ImmAsmOperand { let Name = "Imm1_31"; }
+def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
+  let ParserMatchClass = Imm1_31AsmOperand;
+}
+
+/// imm0_15 predicate - Immediate in the range [0,15].
+def Imm0_15AsmOperand: ImmAsmOperand {
+  let Name = "Imm0_15";
+  let DiagnosticType = "ImmRange0_15";
+}
+def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 16;
+}]> {
+  let ParserMatchClass = Imm0_15AsmOperand;
+}
+
+/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
+def Imm0_31AsmOperand: ImmAsmOperand { let Name = "Imm0_31"; }
+def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
+}]> {
+  let ParserMatchClass = Imm0_31AsmOperand;
+}
+
+/// imm0_32 predicate - True if the 32-bit immediate is in the range [0,32].
+def Imm0_32AsmOperand: ImmAsmOperand { let Name = "Imm0_32"; }
+def imm0_32 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 33;
+}]> {
+  let ParserMatchClass = Imm0_32AsmOperand;
+}
+
+/// imm0_63 predicate - True if the 32-bit immediate is in the range [0,63].
+def Imm0_63AsmOperand: ImmAsmOperand { let Name = "Imm0_63"; }
+def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 64;
+}]> {
+  let ParserMatchClass = Imm0_63AsmOperand;
+}
+
+/// imm0_239 predicate - Immediate in the range [0,239].
+def Imm0_239AsmOperand : ImmAsmOperand {
+  let Name = "Imm0_239";
+  let DiagnosticType = "ImmRange0_239";
+}
+def imm0_239 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 240; }]> {
+  let ParserMatchClass = Imm0_239AsmOperand;
+}
+
+/// imm0_255 predicate - Immediate in the range [0,255].
+def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
+  let ParserMatchClass = Imm0_255AsmOperand;
+}
+
+/// imm0_65535 - An immediate is in the range [0,65535].
+def Imm0_65535AsmOperand: ImmAsmOperand { let Name = "Imm0_65535"; }
+def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 65536;
+}]> {
+  let ParserMatchClass = Imm0_65535AsmOperand;
+}
+
+// imm0_65535_neg - An immediate whose negative value is in the range
+// [0,65535].
+def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
+  return -Imm >= 0 && -Imm < 65536;
+}]>;
+
+// imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference
+// a relocatable expression.
+//
+// FIXME: This really needs a Thumb version separate from the ARM version.
+// While the range is the same, and can thus use the same match class,
+// the encoding is different so it should have a different encoder method.
+def Imm0_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm0_65535Expr"; }
+def imm0_65535_expr : Operand<i32> {
+  let EncoderMethod = "getHiLo16ImmOpValue";
+  let ParserMatchClass = Imm0_65535ExprAsmOperand;
+}
+
+def Imm256_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm256_65535Expr"; }
+def imm256_65535_expr : Operand<i32> {
+  let ParserMatchClass = Imm256_65535ExprAsmOperand;
+}
+
+/// imm24b - True if the 32-bit immediate is encodable in 24 bits.
+def Imm24bitAsmOperand: ImmAsmOperand { let Name = "Imm24bit"; }
+def imm24b : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm <= 0xffffff;
+}]> {
+  let ParserMatchClass = Imm24bitAsmOperand;
+}
+
+
+/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width
+/// bitfield, e.g., 0xf000ffff
+def BitfieldAsmOperand : AsmOperandClass {
+  let Name = "Bitfield";
+  let ParserMethod = "parseBitfield";
+}
+
+def bf_inv_mask_imm : Operand<i32>,
+                      PatLeaf<(imm), [{
+  return ARM::isBitFieldInvertedMask(N->getZExtValue());
+}] > {
+  let EncoderMethod = "getBitfieldInvertedMaskOpValue";
+  let PrintMethod = "printBitfieldInvMaskImmOperand";
+  let DecoderMethod = "DecodeBitfieldMaskOperand";
+  let ParserMatchClass = BitfieldAsmOperand;
+}
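+
+// As an illustration (a sketch, not part of the original file): an inverted
+// bitfield mask is all ones except for one contiguous run of zeros (the
+// field that BFC/BFI clears). A hypothetical C++ check in the spirit of
+// ARM::isBitFieldInvertedMask (the real helper may impose extra limits):
+//
+//   bool isInvMaskSketch(uint32_t M) {
+//     uint32_t Z = ~M;              // bits of the field being cleared
+//     if (Z == 0) return false;     // the field must be at least 1 bit wide
+//     uint32_t Lo = Z & -Z;         // lowest set bit of the run
+//     return (Z & (Z + Lo)) == 0;   // the carry clears a contiguous run
+//   }
+//
+// ~0xf000ffff is 0x0fff0000, one run spanning bits [16,27], so the mask in
+// the comment above is accepted; 0xf000fffe would be rejected because its
+// zeros form two separate runs.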
+
+def imm1_32_XFORM: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
+                                   MVT::i32);
+}]>;
+def Imm1_32AsmOperand: AsmOperandClass { let Name = "Imm1_32"; }
+def imm1_32 : Operand<i32>, PatLeaf<(imm), [{
+    uint64_t Imm = N->getZExtValue();
+    return Imm > 0 && Imm <= 32;
+  }],
+    imm1_32_XFORM> {
+  let PrintMethod = "printImmPlusOneOperand";
+  let ParserMatchClass = Imm1_32AsmOperand;
+}
+
+def imm1_16_XFORM: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
+                                   MVT::i32);
+}]>;
+def Imm1_16AsmOperand: AsmOperandClass { let Name = "Imm1_16"; }
+def imm1_16 : Operand<i32>, PatLeaf<(imm), [{ return Imm > 0 && Imm <= 16; }],
+    imm1_16_XFORM> {
+  let PrintMethod = "printImmPlusOneOperand";
+  let ParserMatchClass = Imm1_16AsmOperand;
+}
+
+// Define ARM specific addressing modes.
+// addrmode_imm12 := reg +/- imm12
+//
+def MemImm12OffsetAsmOperand : AsmOperandClass { let Name = "MemImm12Offset"; }
+class AddrMode_Imm12 : MemOperand,
+                       ComplexPattern<i32, 2, "SelectAddrModeImm12", []> {
+  // 12-bit immediate operand. Note that instructions using this encode
+  // #0 and #-0 differently. We flag #-0 as the magic value INT32_MIN. All
+  // other immediate values are as normal.
+
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let DecoderMethod = "DecodeAddrModeImm12Operand";
+  let ParserMatchClass = MemImm12OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+def addrmode_imm12 : AddrMode_Imm12 {
+  let PrintMethod = "printAddrModeImm12Operand<false>";
+}
+
+def addrmode_imm12_pre : AddrMode_Imm12 {
+  let PrintMethod = "printAddrModeImm12Operand<true>";
+}
+
+// ldst_so_reg := reg +/- reg shop imm
+//
+def MemRegOffsetAsmOperand : AsmOperandClass { let Name = "MemRegOffset"; }
+def ldst_so_reg : MemOperand,
+                  ComplexPattern<i32, 3, "SelectLdStSOReg", []> {
+  let EncoderMethod = "getLdStSORegOpValue";
+  // FIXME: Simplify the printer
+  let PrintMethod = "printAddrMode2Operand";
+  let DecoderMethod = "DecodeSORegMemOperand";
+  let ParserMatchClass = MemRegOffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, GPRnopc:$offsreg, i32imm:$shift);
+}
+
+// postidx_imm8 := +/- [0,255]
+//
+// 9 bit value:
+//  {8}    1 if imm8 is non-negative, 0 otherwise.
+//  {7-0}  [0,255] imm8 value.
+def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; }
+def postidx_imm8 : MemOperand {
+  let PrintMethod = "printPostIdxImm8Operand";
+  let ParserMatchClass = PostIdxImm8AsmOperand;
+  let MIOperandInfo = (ops i32imm);
+}
+
+// postidx_imm8s4 := +/- [0,1020]
+//
+// 9 bit value:
+//  {8}    1 if imm8 is non-negative, 0 otherwise.
+//  {7-0}  [0,255] imm8 value, scaled by 4.
+def PostIdxImm8s4AsmOperand : AsmOperandClass { let Name = "PostIdxImm8s4"; }
+def postidx_imm8s4 : MemOperand {
+  let PrintMethod = "printPostIdxImm8s4Operand";
+  let ParserMatchClass = PostIdxImm8s4AsmOperand;
+  let MIOperandInfo = (ops i32imm);
+}
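+
+// As an illustration (a sketch, not part of the original file): both
+// post-index immediates above are sign-plus-magnitude, not two's complement.
+// A hypothetical C++ packing helper for the 9-bit value (exposition only):
+//
+//   unsigned encodePostIdxImm8Sketch(int Offset) { // requires |Offset| <= 255
+//     unsigned IsAdd = Offset >= 0;                // {8}: add vs. subtract
+//     return (IsAdd << 8) | (unsigned)std::abs(Offset); // {7-0}: magnitude
+//   }
+//
+// So #4 packs as 0x104 and #-4 as 0x004. As with addrmode_imm12 above, the
+// hardware distinguishes #0 from #-0, a case a plain int cannot express and
+// which the MC layer handles separately.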
+
+// postidx_reg := +/- reg
+//
+def PostIdxRegAsmOperand : AsmOperandClass {
+  let Name = "PostIdxReg";
+  let ParserMethod = "parsePostIdxReg";
+}
+def postidx_reg : MemOperand {
+  let EncoderMethod = "getPostIdxRegOpValue";
+  let DecoderMethod = "DecodePostIdxReg";
+  let PrintMethod = "printPostIdxRegOperand";
+  let ParserMatchClass = PostIdxRegAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, i32imm);
+}
+
+
+// addrmode2 := reg +/- imm12
+//           := reg +/- reg shop imm
+//
+// FIXME: addrmode2 should be refactored the rest of the way to always
+// use explicit imm vs. reg versions above (addrmode_imm12 and ldst_so_reg).
+def AddrMode2AsmOperand : AsmOperandClass { let Name = "AddrMode2"; }
+def addrmode2 : MemOperand,
+                ComplexPattern<i32, 3, "SelectAddrMode2", []> {
+  let EncoderMethod = "getAddrMode2OpValue";
+  let PrintMethod = "printAddrMode2Operand";
+  let ParserMatchClass = AddrMode2AsmOperand;
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+def PostIdxRegShiftedAsmOperand : AsmOperandClass {
+  let Name = "PostIdxRegShifted";
+  let ParserMethod = "parsePostIdxReg";
+}
+def am2offset_reg : MemOperand,
+                    ComplexPattern<i32, 2, "SelectAddrMode2OffsetReg",
+                                   [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode2OffsetOpValue";
+  let PrintMethod = "printAddrMode2OffsetOperand";
+  // When using this for assembly, it's always as a post-index offset.
+  let ParserMatchClass = PostIdxRegShiftedAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, i32imm);
+}
+
+// FIXME: am2offset_imm should only need the immediate, not the GPR. Having
+// the GPR is purely vestigial at this point.
+def AM2OffsetImmAsmOperand : AsmOperandClass { let Name = "AM2OffsetImm"; } +def am2offset_imm : MemOperand, + ComplexPattern<i32, 2, "SelectAddrMode2OffsetImm", + [], [SDNPWantRoot]> { + let EncoderMethod = "getAddrMode2OffsetOpValue"; + let PrintMethod = "printAddrMode2OffsetOperand"; + let ParserMatchClass = AM2OffsetImmAsmOperand; + let MIOperandInfo = (ops GPRnopc, i32imm); +} + + +// addrmode3 := reg +/- reg +// addrmode3 := reg +/- imm8 +// +// FIXME: split into imm vs. reg versions. +def AddrMode3AsmOperand : AsmOperandClass { let Name = "AddrMode3"; } +class AddrMode3 : MemOperand, + ComplexPattern<i32, 3, "SelectAddrMode3", []> { + let EncoderMethod = "getAddrMode3OpValue"; + let ParserMatchClass = AddrMode3AsmOperand; + let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); +} + +def addrmode3 : AddrMode3 +{ + let PrintMethod = "printAddrMode3Operand<false>"; +} + +def addrmode3_pre : AddrMode3 +{ + let PrintMethod = "printAddrMode3Operand<true>"; +} + +// FIXME: split into imm vs. reg versions. +// FIXME: parser method to handle +/- register. +def AM3OffsetAsmOperand : AsmOperandClass { + let Name = "AM3Offset"; + let ParserMethod = "parseAM3Offset"; +} +def am3offset : MemOperand, + ComplexPattern<i32, 2, "SelectAddrMode3Offset", + [], [SDNPWantRoot]> { + let EncoderMethod = "getAddrMode3OffsetOpValue"; + let PrintMethod = "printAddrMode3OffsetOperand"; + let ParserMatchClass = AM3OffsetAsmOperand; + let MIOperandInfo = (ops GPR, i32imm); +} + +// ldstm_mode := {ia, ib, da, db} +// +def ldstm_mode : OptionalDefOperand<OtherVT, (ops i32), (ops (i32 1))> { + let EncoderMethod = "getLdStmModeOpValue"; + let PrintMethod = "printLdStmModeOperand"; +} + +// addrmode5 := reg +/- imm8*4 +// +def AddrMode5AsmOperand : AsmOperandClass { let Name = "AddrMode5"; } +class AddrMode5 : MemOperand, + ComplexPattern<i32, 2, "SelectAddrMode5", []> { + let EncoderMethod = "getAddrMode5OpValue"; + let DecoderMethod = "DecodeAddrMode5Operand"; + let ParserMatchClass = AddrMode5AsmOperand; + let MIOperandInfo = (ops GPR:$base, i32imm); +} + +def addrmode5 : AddrMode5 { + let PrintMethod = "printAddrMode5Operand<false>"; +} + +def addrmode5_pre : AddrMode5 { + let PrintMethod = "printAddrMode5Operand<true>"; +} + +// addrmode6 := reg with optional alignment +// +def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; } +def addrmode6 : MemOperand, + ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{ + let PrintMethod = "printAddrMode6Operand"; + let MIOperandInfo = (ops GPR:$addr, i32imm:$align); + let EncoderMethod = "getAddrMode6AddressOpValue"; + let DecoderMethod = "DecodeAddrMode6Operand"; + let ParserMatchClass = AddrMode6AsmOperand; +} + +def am6offset : MemOperand, + ComplexPattern<i32, 1, "SelectAddrMode6Offset", + [], [SDNPWantRoot]> { + let PrintMethod = "printAddrMode6OffsetOperand"; + let MIOperandInfo = (ops GPR); + let EncoderMethod = "getAddrMode6OffsetOpValue"; + let DecoderMethod = "DecodeGPRRegisterClass"; +} + +// Special version of addrmode6 to handle alignment encoding for VST1/VLD1 +// (single element from one lane) for size 32. +def addrmode6oneL32 : MemOperand, + ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{ + let PrintMethod = "printAddrMode6Operand"; + let MIOperandInfo = (ops GPR:$addr, i32imm); + let EncoderMethod = "getAddrMode6OneLane32AddressOpValue"; +} + +// Base class for addrmode6 with specific alignment restrictions. 
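+// NEON addresses are written "[Rn]" or "[Rn:align]"; for example (purely
+// illustrative), "vld1.64 {d0, d1}, [r0:128]" loads two D-registers from a
+// 128-bit-aligned address. The classes below restrict which alignments each
+// instruction form accepts.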
+class AddrMode6Align : MemOperand,
+                       ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm:$align);
+  let EncoderMethod = "getAddrMode6AddressOpValue";
+  let DecoderMethod = "DecodeAddrMode6Operand";
+}
+
+// Special version of addrmode6 to handle the no-alignment encoding for
+// VLD/VST instructions and check that no alignment is specified.
+def AddrMode6AlignNoneAsmOperand : AsmOperandClass {
+  let Name = "AlignedMemoryNone";
+  let DiagnosticType = "AlignedMemoryRequiresNone";
+}
+def addrmode6alignNone : AddrMode6Align {
+  // The alignment specifier can only be omitted.
+  let ParserMatchClass = AddrMode6AlignNoneAsmOperand;
+}
+
+// Special version of addrmode6 to handle the 16-bit alignment encoding for
+// VLD/VST instructions and check the alignment value.
+def AddrMode6Align16AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory16";
+  let DiagnosticType = "AlignedMemoryRequires16";
+}
+def addrmode6align16 : AddrMode6Align {
+  // The alignment specifier can only be 16 or omitted.
+  let ParserMatchClass = AddrMode6Align16AsmOperand;
+}
+
+// Special version of addrmode6 to handle the 32-bit alignment encoding for
+// VLD/VST instructions and check the alignment value.
+def AddrMode6Align32AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory32";
+  let DiagnosticType = "AlignedMemoryRequires32";
+}
+def addrmode6align32 : AddrMode6Align {
+  // The alignment specifier can only be 32 or omitted.
+  let ParserMatchClass = AddrMode6Align32AsmOperand;
+}
+
+// Special version of addrmode6 to handle the 64-bit alignment encoding for
+// VLD/VST instructions and check the alignment value.
+def AddrMode6Align64AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory64";
+  let DiagnosticType = "AlignedMemoryRequires64";
+}
+def addrmode6align64 : AddrMode6Align {
+  // The alignment specifier can only be 64 or omitted.
+  let ParserMatchClass = AddrMode6Align64AsmOperand;
+}
+
+// Special version of addrmode6 to handle the 64-bit or 128-bit alignment
+// encoding for VLD/VST instructions and check the alignment value.
+def AddrMode6Align64or128AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory64or128";
+  let DiagnosticType = "AlignedMemoryRequires64or128";
+}
+def addrmode6align64or128 : AddrMode6Align {
+  // The alignment specifier can only be 64, 128 or omitted.
+  let ParserMatchClass = AddrMode6Align64or128AsmOperand;
+}
+
+// Special version of addrmode6 to handle the 64-bit, 128-bit or 256-bit
+// alignment encoding for VLD/VST instructions and check the alignment value.
+def AddrMode6Align64or128or256AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory64or128or256";
+  let DiagnosticType = "AlignedMemoryRequires64or128or256";
+}
+def addrmode6align64or128or256 : AddrMode6Align {
+  // The alignment specifier can only be 64, 128, 256 or omitted.
+  let ParserMatchClass = AddrMode6Align64or128or256AsmOperand;
+}
+
+// Special version of addrmode6 to handle alignment encoding for VLD-dup
+// instructions, specifically VLD4-dup.
+def addrmode6dup : MemOperand,
+                   ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6DupAddressOpValue";
+  // FIXME: This is close, but not quite right. The alignment specifier is
+  // different.
+  let ParserMatchClass = AddrMode6AsmOperand;
+}
+
+// Base class for addrmode6dup with specific alignment restrictions.
+class AddrMode6DupAlign : MemOperand,
+                          ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6DupAddressOpValue";
+}
+
+// Special version of addrmode6 to handle the no-alignment encoding for
+// VLD-dup instructions and check that no alignment is specified.
+def AddrMode6dupAlignNoneAsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemoryNone";
+  let DiagnosticType = "DupAlignedMemoryRequiresNone";
+}
+def addrmode6dupalignNone : AddrMode6DupAlign {
+  // The alignment specifier can only be omitted.
+  let ParserMatchClass = AddrMode6dupAlignNoneAsmOperand;
+}
+
+// Special version of addrmode6 to handle the 16-bit alignment encoding for
+// VLD-dup instructions and check the alignment value.
+def AddrMode6dupAlign16AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory16";
+  let DiagnosticType = "DupAlignedMemoryRequires16";
+}
+def addrmode6dupalign16 : AddrMode6DupAlign {
+  // The alignment specifier can only be 16 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign16AsmOperand;
+}
+
+// Special version of addrmode6 to handle the 32-bit alignment encoding for
+// VLD-dup instructions and check the alignment value.
+def AddrMode6dupAlign32AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory32";
+  let DiagnosticType = "DupAlignedMemoryRequires32";
+}
+def addrmode6dupalign32 : AddrMode6DupAlign {
+  // The alignment specifier can only be 32 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign32AsmOperand;
+}
+
+// Special version of addrmode6 to handle the 64-bit alignment encoding for
+// VLD instructions and check the alignment value.
+def AddrMode6dupAlign64AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory64";
+  let DiagnosticType = "DupAlignedMemoryRequires64";
+}
+def addrmode6dupalign64 : AddrMode6DupAlign {
+  // The alignment specifier can only be 64 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign64AsmOperand;
+}
+
+// Special version of addrmode6 to handle the 64-bit or 128-bit alignment
+// encoding for VLD instructions and check the alignment value.
+def AddrMode6dupAlign64or128AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory64or128";
+  let DiagnosticType = "DupAlignedMemoryRequires64or128";
+}
+def addrmode6dupalign64or128 : AddrMode6DupAlign {
+  // The alignment specifier can only be 64, 128 or omitted.
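+  // Illustrative examples: "[r0:64]" and "[r0:128]" are accepted here; any
+  // other alignment is rejected with the DupAlignedMemoryRequires64or128
+  // diagnostic.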
+ let ParserMatchClass = AddrMode6dupAlign64or128AsmOperand; +} + +// addrmodepc := pc + reg +// +def addrmodepc : MemOperand, + ComplexPattern<i32, 2, "SelectAddrModePC", []> { + let PrintMethod = "printAddrModePCOperand"; + let MIOperandInfo = (ops GPR, i32imm); +} + +// addr_offset_none := reg +// +def MemNoOffsetAsmOperand : AsmOperandClass { let Name = "MemNoOffset"; } +def addr_offset_none : MemOperand, + ComplexPattern<i32, 1, "SelectAddrOffsetNone", []> { + let PrintMethod = "printAddrMode7Operand"; + let DecoderMethod = "DecodeAddrMode7Operand"; + let ParserMatchClass = MemNoOffsetAsmOperand; + let MIOperandInfo = (ops GPR:$base); +} + +def nohash_imm : Operand<i32> { + let PrintMethod = "printNoHashImmediate"; +} + +def CoprocNumAsmOperand : AsmOperandClass { + let Name = "CoprocNum"; + let ParserMethod = "parseCoprocNumOperand"; +} +def p_imm : Operand<i32> { + let PrintMethod = "printPImmediate"; + let ParserMatchClass = CoprocNumAsmOperand; + let DecoderMethod = "DecodeCoprocessor"; +} + +def CoprocRegAsmOperand : AsmOperandClass { + let Name = "CoprocReg"; + let ParserMethod = "parseCoprocRegOperand"; +} +def c_imm : Operand<i32> { + let PrintMethod = "printCImmediate"; + let ParserMatchClass = CoprocRegAsmOperand; +} +def CoprocOptionAsmOperand : AsmOperandClass { + let Name = "CoprocOption"; + let ParserMethod = "parseCoprocOptionOperand"; +} +def coproc_option_imm : Operand<i32> { + let PrintMethod = "printCoprocOptionImm"; + let ParserMatchClass = CoprocOptionAsmOperand; +} + +//===----------------------------------------------------------------------===// + +include "ARMInstrFormats.td" + +//===----------------------------------------------------------------------===// +// Multiclass helpers... +// + +/// AsI1_bin_irs - Defines a set of (op r, {mod_imm|r|so_reg}) patterns for a +/// binop that produces a value. +let TwoOperandAliasConstraint = "$Rn = $Rd" in +multiclass AsI1_bin_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0> { + // The register-immediate version is re-materializable. This is useful + // in particular for taking the address of a local. 
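+  // (Example, for illustration only: the address of a stack object is
+  // materialized as "add rD, sp, #offset"; because the ri form is
+  // re-materializable, the register allocator can recompute it at a use
+  // instead of spilling it.)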
+  let isReMaterializable = 1 in {
+    def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm), DPFrm,
+                  iii, opc, "\t$Rd, $Rn, $imm",
+                  [(set GPR:$Rd, (opnode GPR:$Rn, mod_imm:$imm))]>,
+             Sched<[WriteALU, ReadALU]> {
+      bits<4> Rd;
+      bits<4> Rn;
+      bits<12> imm;
+      let Inst{25} = 1;
+      let Inst{19-16} = Rn;
+      let Inst{15-12} = Rd;
+      let Inst{11-0} = imm;
+    }
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+                iir, opc, "\t$Rd, $Rn, $Rm",
+                [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{25} = 0;
+    let isCommutable = Commutable;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-4} = 0b00000000;
+    let Inst{3-0} = Rm;
+  }
+
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+                 (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+                 iis, opc, "\t$Rd, $Rn, $shift",
+                 [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_imm:$shift))]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+
+  def rsr : AsI1<opcod, (outs GPR:$Rd),
+                 (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+                 iis, opc, "\t$Rd, $Rn, $shift",
+                 [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_reg:$shift))]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+}
+
+/// AsI1_rbin_irs - Same as AsI1_bin_irs except the order of the operands is
+/// reversed. The 'rr' form is only defined for the disassembler; for codegen
+/// it is equivalent to the AsI1_bin_irs counterpart.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
+                         InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                         PatFrag opnode, bit Commutable = 0> {
+  // The register-immediate version is re-materializable. This is useful
+  // in particular for taking the address of a local.
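+  // (Illustration: this reversed form backs instructions like RSB, where
+  // "rsb r0, r1, #0" computes 0 - r1; the DAG pattern operands are swapped
+  // relative to the assembly operand order.)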
+ let isReMaterializable = 1 in { + def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm), DPFrm, + iii, opc, "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, (opnode mod_imm:$imm, GPR:$Rn))]>, + Sched<[WriteALU, ReadALU]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-0} = imm; + } + } + def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, + iir, opc, "\t$Rd, $Rn, $Rm", + [/* pattern left blank */]>, + Sched<[WriteALU, ReadALU, ReadALU]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + } + + def rsi : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, + iis, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (opnode so_reg_imm:$shift, GPR:$Rn))]>, + Sched<[WriteALUsi, ReadALU]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; + } + + def rsr : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, + iis, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (opnode so_reg_reg:$shift, GPR:$Rn))]>, + Sched<[WriteALUsr, ReadALUsr]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; + } +} + +/// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default. +/// +/// These opcodes will be converted to the real non-S opcodes by +/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand. +let hasPostISelHook = 1, Defs = [CPSR] in { +multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir, + InstrItinClass iis, PatFrag opnode, + bit Commutable = 0> { + def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p), + 4, iii, + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, mod_imm:$imm))]>, + Sched<[WriteALU, ReadALU]>; + + def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, pred:$p), + 4, iir, + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm))]>, + Sched<[WriteALU, ReadALU, ReadALU]> { + let isCommutable = Commutable; + } + def rsi : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$Rn, so_reg_imm:$shift, pred:$p), + 4, iis, + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, + so_reg_imm:$shift))]>, + Sched<[WriteALUsi, ReadALU]>; + + def rsr : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$Rn, so_reg_reg:$shift, pred:$p), + 4, iis, + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, + so_reg_reg:$shift))]>, + Sched<[WriteALUSsr, ReadALUsr]>; +} +} + +/// AsI1_rbin_s_is - Same as AsI1_bin_s_irs, except selection DAG +/// operands are reversed. 
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir,
+                          InstrItinClass iis, PatFrag opnode,
+                          bit Commutable = 0> {
+  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
+                         4, iii,
+                         [(set GPR:$Rd, CPSR, (opnode mod_imm:$imm, GPR:$Rn))]>,
+           Sched<[WriteALU, ReadALU]>;
+
+  def rsi : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_imm:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift,
+                                                GPR:$Rn))]>,
+            Sched<[WriteALUsi, ReadALU]>;
+
+  def rsr : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_reg:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift,
+                                                GPR:$Rn))]>,
+            Sched<[WriteALUSsr, ReadALUsr]>;
+}
+}
+
+/// AI1_cmp_irs - Defines a set of (op r, {mod_imm|r|so_reg}) cmp / test
+/// patterns. Similar to AsI1_bin_irs except that the instruction does not
+/// produce an explicit result; it only implicitly sets CPSR.
+let isCompare = 1, Defs = [CPSR] in {
+multiclass AI1_cmp_irs<bits<4> opcod, string opc,
+                       InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                       PatFrag opnode, bit Commutable = 0,
+                       string rrDecoderMethod = ""> {
+  def ri : AI1<opcod, (outs), (ins GPR:$Rn, mod_imm:$imm), DPFrm, iii,
+               opc, "\t$Rn, $imm",
+               [(opnode GPR:$Rn, mod_imm:$imm)]>,
+           Sched<[WriteCMP, ReadALU]> {
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-0} = imm;
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+  def rr : AI1<opcod, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir,
+               opc, "\t$Rn, $Rm",
+               [(opnode GPR:$Rn, GPR:$Rm)]>,
+           Sched<[WriteCMP, ReadALU, ReadALU]> {
+    bits<4> Rn;
+    bits<4> Rm;
+    let isCommutable = Commutable;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-4} = 0b00000000;
+    let Inst{3-0} = Rm;
+    let DecoderMethod = rrDecoderMethod;
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+  def rsi : AI1<opcod, (outs),
+                (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis,
+                opc, "\t$Rn, $shift",
+                [(opnode GPR:$Rn, so_reg_imm:$shift)]>,
+            Sched<[WriteCMPsi, ReadALU]> {
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+  def rsr : AI1<opcod, (outs),
+                (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis,
+                opc, "\t$Rn, $shift",
+                [(opnode GPRnopc:$Rn, so_reg_reg:$shift)]>,
+            Sched<[WriteCMPsr, ReadALU]> {
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+
+}
+}
+
+/// AI_ext_rrot - A unary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+/// FIXME: Remove the 'r' variant. Its rot_imm is zero.
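+// (Example, illustrative only: "sxtb r0, r1, ror #16" sign-extends byte 2 of
+// r1; the rot_imm operand encodes the rotation amount divided by 8 in
+// Inst{11-10}.)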
+class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode> + : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot), + IIC_iEXTr, opc, "\t$Rd, $Rm$rot", + [(set GPRnopc:$Rd, (opnode (rotr GPRnopc:$Rm, rot_imm:$rot)))]>, + Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> { + bits<4> Rd; + bits<4> Rm; + bits<2> rot; + let Inst{19-16} = 0b1111; + let Inst{15-12} = Rd; + let Inst{11-10} = rot; + let Inst{3-0} = Rm; +} + +class AI_ext_rrot_np<bits<8> opcod, string opc> + : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot), + IIC_iEXTr, opc, "\t$Rd, $Rm$rot", []>, + Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> { + bits<2> rot; + let Inst{19-16} = 0b1111; + let Inst{11-10} = rot; + } + +/// AI_exta_rrot - A binary operation with two forms: one whose operand is a +/// register and one whose operand is a register rotated by 8/16/24. +class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode> + : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot), + IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", + [(set GPRnopc:$Rd, (opnode GPR:$Rn, + (rotr GPRnopc:$Rm, rot_imm:$rot)))]>, + Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> { + bits<4> Rd; + bits<4> Rm; + bits<4> Rn; + bits<2> rot; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-10} = rot; + let Inst{9-4} = 0b000111; + let Inst{3-0} = Rm; +} + +class AI_exta_rrot_np<bits<8> opcod, string opc> + : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot), + IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", []>, + Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> { + bits<4> Rn; + bits<2> rot; + let Inst{19-16} = Rn; + let Inst{11-10} = rot; +} + +/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube. +let TwoOperandAliasConstraint = "$Rn = $Rd" in +multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> { + let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in { + def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm), + DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, mod_imm:$imm, CPSR))]>, + Requires<[IsARM]>, + Sched<[WriteALU, ReadALU]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + let Inst{11-0} = imm; + } + def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm, CPSR))]>, + Requires<[IsARM]>, + Sched<[WriteALU, ReadALU, ReadALU]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let isCommutable = Commutable; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + } + def rsi : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_imm:$shift), + DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift, CPSR))]>, + Requires<[IsARM]>, + Sched<[WriteALUsi, ReadALU]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; + } + def rsr : AsI1<opcod, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, so_reg_reg:$shift), + DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", + [(set GPRnopc:$Rd, CPSR, + (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>, + Requires<[IsARM]>, + Sched<[WriteALUsr, ReadALUsr]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let 
Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; + } + } +} + +/// AI1_rsc_irs - Define instructions and patterns for rsc +let TwoOperandAliasConstraint = "$Rn = $Rd" in +multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode> { + let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in { + def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm), + DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, CPSR, (opnode mod_imm:$imm, GPR:$Rn, CPSR))]>, + Requires<[IsARM]>, + Sched<[WriteALU, ReadALU]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + let Inst{11-0} = imm; + } + def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm", + [/* pattern left blank */]>, + Sched<[WriteALU, ReadALU, ReadALU]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + } + def rsi : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift), + DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn, CPSR))]>, + Requires<[IsARM]>, + Sched<[WriteALUsi, ReadALU]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; + } + def rsr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift), + DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn, CPSR))]>, + Requires<[IsARM]>, + Sched<[WriteALUsr, ReadALUsr]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; + } + } +} + +let canFoldAsLoad = 1, isReMaterializable = 1 in { +multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii, + InstrItinClass iir, PatFrag opnode> { + // Note: We use the complex addrmode_imm12 rather than just an input + // GPR and a constrained immediate so that we can use this to match + // frame index references and avoid matching constant pool references. 
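+  // (For illustration: a load from a stack slot selects the i12 form with
+  // the FrameIndex as $base; the final immediate offset is filled in during
+  // frame index elimination.)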
+ def i12: AI2ldst<0b010, 1, isByte, (outs GPR:$Rt), (ins addrmode_imm12:$addr), + AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr", + [(set GPR:$Rt, (opnode addrmode_imm12:$addr))]> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 + } + def rs : AI2ldst<0b011, 1, isByte, (outs GPR:$Rt), (ins ldst_so_reg:$shift), + AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift", + [(set GPR:$Rt, (opnode ldst_so_reg:$shift))]> { + bits<4> Rt; + bits<17> shift; + let shift{4} = 0; // Inst{4} = 0 + let Inst{23} = shift{12}; // U (add = ('U' == 1)) + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = shift{11-0}; + } +} +} + +let canFoldAsLoad = 1, isReMaterializable = 1 in { +multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii, + InstrItinClass iir, PatFrag opnode> { + // Note: We use the complex addrmode_imm12 rather than just an input + // GPR and a constrained immediate so that we can use this to match + // frame index references and avoid matching constant pool references. + def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt), + (ins addrmode_imm12:$addr), + AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr", + [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 + } + def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt), + (ins ldst_so_reg:$shift), + AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift", + [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> { + bits<4> Rt; + bits<17> shift; + let shift{4} = 0; // Inst{4} = 0 + let Inst{23} = shift{12}; // U (add = ('U' == 1)) + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = shift{11-0}; + } +} +} + + +multiclass AI_str1<bit isByte, string opc, InstrItinClass iii, + InstrItinClass iir, PatFrag opnode> { + // Note: We use the complex addrmode_imm12 rather than just an input + // GPR and a constrained immediate so that we can use this to match + // frame index references and avoid matching constant pool references. + def i12 : AI2ldst<0b010, 0, isByte, (outs), + (ins GPR:$Rt, addrmode_imm12:$addr), + AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr", + [(opnode GPR:$Rt, addrmode_imm12:$addr)]> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 + } + def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPR:$Rt, ldst_so_reg:$shift), + AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift", + [(opnode GPR:$Rt, ldst_so_reg:$shift)]> { + bits<4> Rt; + bits<17> shift; + let shift{4} = 0; // Inst{4} = 0 + let Inst{23} = shift{12}; // U (add = ('U' == 1)) + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = shift{11-0}; + } +} + +multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii, + InstrItinClass iir, PatFrag opnode> { + // Note: We use the complex addrmode_imm12 rather than just an input + // GPR and a constrained immediate so that we can use this to match + // frame index references and avoid matching constant pool references. 
+ def i12 : AI2ldst<0b010, 0, isByte, (outs), + (ins GPRnopc:$Rt, addrmode_imm12:$addr), + AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr", + [(opnode GPRnopc:$Rt, addrmode_imm12:$addr)]> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 + } + def rs : AI2ldst<0b011, 0, isByte, (outs), + (ins GPRnopc:$Rt, ldst_so_reg:$shift), + AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift", + [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> { + bits<4> Rt; + bits<17> shift; + let shift{4} = 0; // Inst{4} = 0 + let Inst{23} = shift{12}; // U (add = ('U' == 1)) + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = shift{11-0}; + } +} + + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in +/// the function. The first operand is the ID# for this instruction, the second +/// is the index into the MachineConstantPool that this is, the third is the +/// size in bytes of this constant pool entry. +let hasSideEffects = 0, isNotDuplicable = 1 in +def CONSTPOOL_ENTRY : +PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, + i32imm:$size), NoItinerary, []>; + +/// A jumptable consisting of direct 32-bit addresses of the destination basic +/// blocks (either absolute, or relative to the start of the jump-table in PIC +/// mode). Used mostly in ARM and Thumb-1 modes. +def JUMPTABLE_ADDRS : +PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, + i32imm:$size), NoItinerary, []>; + +/// A jumptable consisting of 32-bit jump instructions. Used for Thumb-2 tables +/// that cannot be optimised to use TBB or TBH. +def JUMPTABLE_INSTS : +PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, + i32imm:$size), NoItinerary, []>; + +/// A jumptable consisting of 8-bit unsigned integers representing offsets from +/// a TBB instruction. +def JUMPTABLE_TBB : +PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, + i32imm:$size), NoItinerary, []>; + +/// A jumptable consisting of 16-bit unsigned integers representing offsets from +/// a TBH instruction. +def JUMPTABLE_TBH : +PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, + i32imm:$size), NoItinerary, []>; + + +// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE +// from removing one half of the matched pairs. That breaks PEI, which assumes +// these will always be in pairs, and asserts if it finds otherwise. Better way? 
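+// (Illustration: a call is bracketed as "ADJCALLSTACKDOWN <bytes>; BL ...;
+// ADJCALLSTACKUP <bytes>, 0"; prologue/epilogue insertion later rewrites
+// both pseudos into actual SP adjustments, or folds them away.)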
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def ADJCALLSTACKUP :
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
+           [(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
+
+def ADJCALLSTACKDOWN :
+PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
+           [(ARMcallseq_start timm:$amt)]>;
+}
+
+def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
+              "hint", "\t$imm", [(int_arm_hint imm0_239:$imm)]>,
+           Requires<[IsARM, HasV6]> {
+  bits<8> imm;
+  let Inst{27-8} = 0b00110010000011110000;
+  let Inst{7-0} = imm;
+}
+
+def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
+
+def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
+             "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{27-20} = 0b01101000;
+  let Inst{7-4} = 0b1011;
+  let Inst{11-8} = 0b1111;
+  let Unpredictable{11-8} = 0b1111;
+}
+
+// The 16-bit operand $val can be used by a debugger to store more information
+// about the breakpoint.
+def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+                 "bkpt", "\t$val", []>, Requires<[IsARM]> {
+  bits<16> val;
+  let Inst{3-0} = val{3-0};
+  let Inst{19-8} = val{15-4};
+  let Inst{27-20} = 0b00010010;
+  let Inst{31-28} = 0xe; // AL
+  let Inst{7-4} = 0b0111;
+}
+// default immediate for breakpoint mnemonic
+def : InstAlias<"bkpt", (BKPT 0)>, Requires<[IsARM]>;
+
+def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+                "hlt", "\t$val", []>, Requires<[IsARM, HasV8]> {
+  bits<16> val;
+  let Inst{3-0} = val{3-0};
+  let Inst{19-8} = val{15-4};
+  let Inst{27-20} = 0b00010000;
+  let Inst{31-28} = 0xe; // AL
+  let Inst{7-4} = 0b0111;
+}
+
+// Change Processor State
+// FIXME: We should use InstAlias to handle the optional operands.
+class CPS<dag iops, string asm_ops>
+  : AXI<(outs), iops, MiscFrm, NoItinerary, !strconcat("cps", asm_ops),
+        []>, Requires<[IsARM]> {
+  bits<2> imod;
+  bits<3> iflags;
+  bits<5> mode;
+  bit M;
+
+  let Inst{31-28} = 0b1111;
+  let Inst{27-20} = 0b00010000;
+  let Inst{19-18} = imod;
+  let Inst{17}    = M; // Enabled if mode is set.
+  let Inst{16-9}  = 0b00000000;
+  let Inst{8-6}   = iflags;
+  let Inst{5}     = 0;
+  let Inst{4-0}   = mode;
+}
+
+let DecoderMethod = "DecodeCPSInstruction" in {
+let M = 1 in
+  def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, imm0_31:$mode),
+                  "$imod\t$iflags, $mode">;
+let mode = 0, M = 0 in
+  def CPS2p : CPS<(ins imod_op:$imod, iflags_op:$iflags), "$imod\t$iflags">;
+
+let imod = 0, iflags = 0, M = 1 in
+  def CPS1p : CPS<(ins imm0_31:$mode), "\t$mode">;
+}
+
+// Preload notifies the memory system of possible future data/instruction
+// accesses.
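+// (Example, illustrative only: "pld [r0, #32]" selects the i12 form below
+// with U=1 and imm12=32, while "pld [r0, r1, lsl #2]" selects the register
+// form.)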
+multiclass APreLoad<bits<1> read, bits<1> data, string opc> { + + def i12 : AXIM<(outs), (ins addrmode_imm12:$addr), AddrMode_i12, MiscFrm, + IIC_Preload, !strconcat(opc, "\t$addr"), + [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]>, + Sched<[WritePreLd]> { + bits<4> Rt; + bits<17> addr; + let Inst{31-26} = 0b111101; + let Inst{25} = 0; // 0 for immediate form + let Inst{24} = data; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{22} = read; + let Inst{21-20} = 0b01; + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = 0b1111; + let Inst{11-0} = addr{11-0}; // imm12 + } + + def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload, + !strconcat(opc, "\t$shift"), + [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]>, + Sched<[WritePreLd]> { + bits<17> shift; + let Inst{31-26} = 0b111101; + let Inst{25} = 1; // 1 for register form + let Inst{24} = data; + let Inst{23} = shift{12}; // U (add = ('U' == 1)) + let Inst{22} = read; + let Inst{21-20} = 0b01; + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = 0b1111; + let Inst{11-0} = shift{11-0}; + let Inst{4} = 0; + } +} + +defm PLD : APreLoad<1, 1, "pld">, Requires<[IsARM]>; +defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>; +defm PLI : APreLoad<1, 0, "pli">, Requires<[IsARM,HasV7]>; + +def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary, + "setend\t$end", []>, Requires<[IsARM]>, Deprecated<HasV8Ops> { + bits<1> end; + let Inst{31-10} = 0b1111000100000001000000; + let Inst{9} = end; + let Inst{8-0} = 0; +} + +def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt", + [(int_arm_dbg imm0_15:$opt)]>, Requires<[IsARM, HasV7]> { + bits<4> opt; + let Inst{27-4} = 0b001100100000111100001111; + let Inst{3-0} = opt; +} + +// A8.8.247 UDF - Undefined (Encoding A1) +def UDF : AInoP<(outs), (ins imm0_65535:$imm16), MiscFrm, NoItinerary, + "udf", "\t$imm16", [(int_arm_undefined imm0_65535:$imm16)]> { + bits<16> imm16; + let Inst{31-28} = 0b1110; // AL + let Inst{27-25} = 0b011; + let Inst{24-20} = 0b11111; + let Inst{19-8} = imm16{15-4}; + let Inst{7-4} = 0b1111; + let Inst{3-0} = imm16{3-0}; +} + +/* + * A5.4 Permanently UNDEFINED instructions. + * + * For most targets use UDF #65006, for which the OS will generate SIGTRAP. + * Other UDF encodings generate SIGILL. + * + * NaCl's OS instead chooses an ARM UDF encoding that's also a UDF in Thumb. + * Encoding A1: + * 1110 0111 1111 iiii iiii iiii 1111 iiii + * Encoding T1: + * 1101 1110 iiii iiii + * It uses the following encoding: + * 1110 0111 1111 1110 1101 1110 1111 0000 + * - In ARM: UDF #60896; + * - In Thumb: UDF #254 followed by a branch-to-self. + */ +let isBarrier = 1, isTerminator = 1 in +def TRAPNaCl : AXI<(outs), (ins), MiscFrm, NoItinerary, + "trap", [(trap)]>, + Requires<[IsARM,UseNaClTrap]> { + let Inst = 0xe7fedef0; +} +let isBarrier = 1, isTerminator = 1 in +def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, + "trap", [(trap)]>, + Requires<[IsARM,DontUseNaClTrap]> { + let Inst = 0xe7ffdefe; +} + +// Address computation and loads and stores in PIC mode. 
+let isNotDuplicable = 1 in { +def PICADD : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p), + 4, IIC_iALUr, + [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>, + Sched<[WriteALU, ReadALU]>; + +let AddedComplexity = 10 in { +def PICLDR : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), + 4, IIC_iLoad_r, + [(set GPR:$dst, (load addrmodepc:$addr))]>; + +def PICLDRH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), + 4, IIC_iLoad_bh_r, + [(set GPR:$Rt, (zextloadi16 addrmodepc:$addr))]>; + +def PICLDRB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), + 4, IIC_iLoad_bh_r, + [(set GPR:$Rt, (zextloadi8 addrmodepc:$addr))]>; + +def PICLDRSH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), + 4, IIC_iLoad_bh_r, + [(set GPR:$Rt, (sextloadi16 addrmodepc:$addr))]>; + +def PICLDRSB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), + 4, IIC_iLoad_bh_r, + [(set GPR:$Rt, (sextloadi8 addrmodepc:$addr))]>; +} +let AddedComplexity = 10 in { +def PICSTR : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), + 4, IIC_iStore_r, [(store GPR:$src, addrmodepc:$addr)]>; + +def PICSTRH : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), + 4, IIC_iStore_bh_r, [(truncstorei16 GPR:$src, + addrmodepc:$addr)]>; + +def PICSTRB : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), + 4, IIC_iStore_bh_r, [(truncstorei8 GPR:$src, addrmodepc:$addr)]>; +} +} // isNotDuplicable = 1 + + +// LEApcrel - Load a pc-relative address into a register without offending the +// assembler. +let hasSideEffects = 0, isReMaterializable = 1 in +// The 'adr' mnemonic encodes differently if the label is before or after +// the instruction. The {24-21} opcode bits are set by the fixup, as we don't +// know until then which form of the instruction will be used. +def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label), + MiscFrm, IIC_iALUi, "adr", "\t$Rd, $label", []>, + Sched<[WriteALU, ReadALU]> { + bits<4> Rd; + bits<14> label; + let Inst{27-25} = 0b001; + let Inst{24} = 0; + let Inst{23-22} = label{13-12}; + let Inst{21} = 0; + let Inst{20} = 0; + let Inst{19-16} = 0b1111; + let Inst{15-12} = Rd; + let Inst{11-0} = label{11-0}; +} + +let hasSideEffects = 1 in { +def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p), + 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>; + +def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd), + (ins i32imm:$label, pred:$p), + 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>; +} + +//===----------------------------------------------------------------------===// +// Control Flow Instructions. +// + +let isReturn = 1, isTerminator = 1, isBarrier = 1 in { + // ARMV4T and above + def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br, + "bx", "\tlr", [(ARMretflag)]>, + Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> { + let Inst{27-0} = 0b0001001011111111111100011110; + } + + // ARMV4 only + def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br, + "mov", "\tpc, lr", [(ARMretflag)]>, + Requires<[IsARM, NoV4T]>, Sched<[WriteBr]> { + let Inst{27-0} = 0b0001101000001111000000001110; + } + + // Exception return: N.b. doesn't set CPSR as far as we're concerned (it sets + // the user-space one). 
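+  // (Illustration: "subs pc, lr, #4" is the classic return from an IRQ
+  // handler; it also restores CPSR from the current mode's SPSR.)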
+ def SUBS_PC_LR : ARMPseudoInst<(outs), (ins i32imm:$offset, pred:$p), + 4, IIC_Br, + [(ARMintretflag imm:$offset)]>; +} + +// Indirect branches +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + // ARMV4T and above + def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst", + [(brind GPR:$dst)]>, + Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> { + bits<4> dst; + let Inst{31-4} = 0b1110000100101111111111110001; + let Inst{3-0} = dst; + } + + def BX_pred : AI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, + "bx", "\t$dst", [/* pattern left blank */]>, + Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> { + bits<4> dst; + let Inst{27-4} = 0b000100101111111111110001; + let Inst{3-0} = dst; + } +} + +// SP is marked as a use to prevent stack-pointer assignments that appear +// immediately before calls from potentially appearing dead. +let isCall = 1, + // FIXME: Do we really need a non-predicated version? If so, it should + // at least be a pseudo instruction expanding to the predicated version + // at MC lowering time. + Defs = [LR], Uses = [SP] in { + def BL : ABXI<0b1011, (outs), (ins bl_target:$func), + IIC_Br, "bl\t$func", + [(ARMcall tglobaladdr:$func)]>, + Requires<[IsARM]>, Sched<[WriteBrL]> { + let Inst{31-28} = 0b1110; + bits<24> func; + let Inst{23-0} = func; + let DecoderMethod = "DecodeBranchImmInstruction"; + } + + def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func), + IIC_Br, "bl", "\t$func", + [(ARMcall_pred tglobaladdr:$func)]>, + Requires<[IsARM]>, Sched<[WriteBrL]> { + bits<24> func; + let Inst{23-0} = func; + let DecoderMethod = "DecodeBranchImmInstruction"; + } + + // ARMv5T and above + def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm, + IIC_Br, "blx\t$func", + [(ARMcall GPR:$func)]>, + Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> { + bits<4> func; + let Inst{31-4} = 0b1110000100101111111111110011; + let Inst{3-0} = func; + } + + def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm, + IIC_Br, "blx", "\t$func", + [(ARMcall_pred GPR:$func)]>, + Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> { + bits<4> func; + let Inst{27-4} = 0b000100101111111111110011; + let Inst{3-0} = func; + } + + // ARMv4T + // Note: Restrict $func to the tGPR regclass to prevent it being in LR. + def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func), + 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, HasV4T]>, Sched<[WriteBr]>; + + // ARMv4 + def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func), + 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>; + + // mov lr, pc; b if callee is marked noreturn to avoid confusing the + // return stack predictor. + def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func), + 8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, + Requires<[IsARM]>, Sched<[WriteBr]>; +} + +let isBranch = 1, isTerminator = 1 in { + // FIXME: should be able to write a pattern for ARMBrcond, but can't use + // a two-value operand where a dag node expects two operands. :( + def Bcc : ABI<0b1010, (outs), (ins br_target:$target), + IIC_Br, "b", "\t$target", + [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>, + Sched<[WriteBr]> { + bits<24> target; + let Inst{23-0} = target; + let DecoderMethod = "DecodeBranchImmInstruction"; + } + + let isBarrier = 1 in { + // B is "predicable" since it's just a Bcc with an 'always' condition. + let isPredicable = 1 in + // FIXME: We shouldn't need this pseudo at all. Just using Bcc directly + // should be sufficient. + // FIXME: Is B really a Barrier? 
That doesn't seem right. + def B : ARMPseudoExpand<(outs), (ins br_target:$target), 4, IIC_Br, + [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>, + Sched<[WriteBr]>; + + let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in { + def BR_JTr : ARMPseudoInst<(outs), + (ins GPR:$target, i32imm:$jt), + 0, IIC_Br, + [(ARMbrjt GPR:$target, tjumptable:$jt)]>, + Sched<[WriteBr]>; + // FIXME: This shouldn't use the generic "addrmode2," but rather be split + // into i12 and rs suffixed versions. + def BR_JTm : ARMPseudoInst<(outs), + (ins addrmode2:$target, i32imm:$jt), + 0, IIC_Br, + [(ARMbrjt (i32 (load addrmode2:$target)), + tjumptable:$jt)]>, Sched<[WriteBrTbl]>; + def BR_JTadd : ARMPseudoInst<(outs), + (ins GPR:$target, GPR:$idx, i32imm:$jt), + 0, IIC_Br, + [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt)]>, + Sched<[WriteBrTbl]>; + } // isNotDuplicable = 1, isIndirectBranch = 1 + } // isBarrier = 1 + +} + +// BLX (immediate) +def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary, + "blx\t$target", []>, + Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> { + let Inst{31-25} = 0b1111101; + bits<25> target; + let Inst{23-0} = target{24-1}; + let Inst{24} = target{0}; + let isCall = 1; +} + +// Branch and Exchange Jazelle +def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", + [/* pattern left blank */]>, Sched<[WriteBr]> { + bits<4> func; + let Inst{23-20} = 0b0010; + let Inst{19-8} = 0xfff; + let Inst{7-4} = 0b0010; + let Inst{3-0} = func; + let isBranch = 1; +} + +// Tail calls. + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { + def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>, + Sched<[WriteBr]>; + + def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>, + Sched<[WriteBr]>; + + def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst), + 4, IIC_Br, [], + (Bcc br_target:$dst, (ops 14, zero_reg))>, + Requires<[IsARM]>, Sched<[WriteBr]>; + + def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst), + 4, IIC_Br, [], + (BX GPR:$dst)>, Sched<[WriteBr]>, + Requires<[IsARM]>; +} + +// Secure Monitor Call is a system instruction. 
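+// (Illustration: "smc #0" traps to the secure monitor; the 4-bit $opt field
+// is ignored by the processor and is available to the monitor software.)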
+def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
+              []>, Requires<[IsARM, HasTrustZone]> {
+  bits<4> opt;
+  let Inst{23-4} = 0b01100000000000000111;
+  let Inst{3-0} = opt;
+}
+def : MnemonicAlias<"smi", "smc">;
+
+// Supervisor Call (Software Interrupt)
+let isCall = 1, Uses = [SP] in {
+def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []>,
+          Sched<[WriteBr]> {
+  bits<24> svc;
+  let Inst{23-0} = svc;
+}
+}
+
+// Store Return State
+class SRSI<bit wb, string asm>
+  : XI<(outs), (ins imm0_31:$mode), AddrModeNone, 4, IndexModeNone, BrFrm,
+       NoItinerary, asm, "", []> {
+  bits<5> mode;
+  let Inst{31-28} = 0b1111;
+  let Inst{27-25} = 0b100;
+  let Inst{22} = 1;
+  let Inst{21} = wb;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b1101; // SP
+  let Inst{15-5} = 0b00000101000;
+  let Inst{4-0} = mode;
+}
+
+def SRSDA : SRSI<0, "srsda\tsp, $mode"> {
+  let Inst{24-23} = 0;
+}
+def SRSDA_UPD : SRSI<1, "srsda\tsp!, $mode"> {
+  let Inst{24-23} = 0;
+}
+def SRSDB : SRSI<0, "srsdb\tsp, $mode"> {
+  let Inst{24-23} = 0b10;
+}
+def SRSDB_UPD : SRSI<1, "srsdb\tsp!, $mode"> {
+  let Inst{24-23} = 0b10;
+}
+def SRSIA : SRSI<0, "srsia\tsp, $mode"> {
+  let Inst{24-23} = 0b01;
+}
+def SRSIA_UPD : SRSI<1, "srsia\tsp!, $mode"> {
+  let Inst{24-23} = 0b01;
+}
+def SRSIB : SRSI<0, "srsib\tsp, $mode"> {
+  let Inst{24-23} = 0b11;
+}
+def SRSIB_UPD : SRSI<1, "srsib\tsp!, $mode"> {
+  let Inst{24-23} = 0b11;
+}
+
+def : ARMInstAlias<"srsda $mode", (SRSDA imm0_31:$mode)>;
+def : ARMInstAlias<"srsda $mode!", (SRSDA_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsdb $mode", (SRSDB imm0_31:$mode)>;
+def : ARMInstAlias<"srsdb $mode!", (SRSDB_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsia $mode", (SRSIA imm0_31:$mode)>;
+def : ARMInstAlias<"srsia $mode!", (SRSIA_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsib $mode", (SRSIB imm0_31:$mode)>;
+def : ARMInstAlias<"srsib $mode!", (SRSIB_UPD imm0_31:$mode)>;
+
+// Return From Exception
+class RFEI<bit wb, string asm>
+  : XI<(outs), (ins GPR:$Rn), AddrModeNone, 4, IndexModeNone, BrFrm,
+       NoItinerary, asm, "", []> {
+  bits<4> Rn;
+  let Inst{31-28} = 0b1111;
+  let Inst{27-25} = 0b100;
+  let Inst{22} = 0;
+  let Inst{21} = wb;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-0} = 0xa00;
+}
+
+def RFEDA : RFEI<0, "rfeda\t$Rn"> {
+  let Inst{24-23} = 0;
+}
+def RFEDA_UPD : RFEI<1, "rfeda\t$Rn!"> {
+  let Inst{24-23} = 0;
+}
+def RFEDB : RFEI<0, "rfedb\t$Rn"> {
+  let Inst{24-23} = 0b10;
+}
+def RFEDB_UPD : RFEI<1, "rfedb\t$Rn!"> {
+  let Inst{24-23} = 0b10;
+}
+def RFEIA : RFEI<0, "rfeia\t$Rn"> {
+  let Inst{24-23} = 0b01;
+}
+def RFEIA_UPD : RFEI<1, "rfeia\t$Rn!"> {
+  let Inst{24-23} = 0b01;
+}
+def RFEIB : RFEI<0, "rfeib\t$Rn"> {
+  let Inst{24-23} = 0b11;
+}
+def RFEIB_UPD : RFEI<1, "rfeib\t$Rn!"> {
+  let Inst{24-23} = 0b11;
+}
+
+// Hypervisor Call is a system instruction.
+let isCall = 1 in {
+def HVC : AInoP< (outs), (ins imm0_65535:$imm), BrFrm, NoItinerary,
+                "hvc", "\t$imm", []>,
+          Requires<[IsARM, HasVirtualization]> {
+  bits<16> imm;
+
+  // Even though HVC isn't predicable, its encoding includes a condition
+  // field. The instruction is undefined if the condition field is 0xf;
+  // otherwise it is unpredictable if the condition isn't AL (0xe).
+  let Inst{31-28} = 0b1110;
+  let Unpredictable{31-28} = 0b1111;
+  let Inst{27-24} = 0b0001;
+  let Inst{23-20} = 0b0100;
+  let Inst{19-8} = imm{15-4};
+  let Inst{7-4} = 0b0111;
+  let Inst{3-0} = imm{3-0};
+}
+}
+
+// Return from exception in Hypervisor mode.
+let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in +def ERET : ABI<0b0001, (outs), (ins), NoItinerary, "eret", "", []>, + Requires<[IsARM, HasVirtualization]> { + let Inst{23-0} = 0b011000000000000001101110; +} + +//===----------------------------------------------------------------------===// +// Load / Store Instructions. +// + +// Load + + +defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si, + UnOpFrag<(load node:$Src)>>; +defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si, + UnOpFrag<(zextloadi8 node:$Src)>>; +defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si, + BinOpFrag<(store node:$LHS, node:$RHS)>>; +defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si, + BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; + +// Special LDR for loads from non-pc-relative constpools. +let canFoldAsLoad = 1, mayLoad = 1, hasSideEffects = 0, + isReMaterializable = 1, isCodeGenOnly = 1 in +def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr), + AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr", + []> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = 0b1111; + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 +} + +// Loads with zero extension +def LDRH : AI3ld<0b1011, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoad_bh_r, "ldrh", "\t$Rt, $addr", + [(set GPR:$Rt, (zextloadi16 addrmode3:$addr))]>; + +// Loads with sign extension +def LDRSH : AI3ld<0b1111, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoad_bh_r, "ldrsh", "\t$Rt, $addr", + [(set GPR:$Rt, (sextloadi16 addrmode3:$addr))]>; + +def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr", + [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>; + +let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { + // Load doubleword + def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode3:$addr), + LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $Rt2, $addr", []>, + Requires<[IsARM, HasV5TE]>; +} + +def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "lda", "\t$Rt, $addr", []>; +def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldab", "\t$Rt, $addr", []>; +def LDAH : AIldracq<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldah", "\t$Rt, $addr", []>; + +// Indexed loads +multiclass AI2_ldridx<bit isByte, string opc, + InstrItinClass iii, InstrItinClass iir> { + def _PRE_IMM : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addrmode_imm12_pre:$addr), IndexModePre, LdFrm, iii, + opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { + bits<17> addr; + let Inst{25} = 0; + let Inst{23} = addr{12}; + let Inst{19-16} = addr{16-13}; + let Inst{11-0} = addr{11-0}; + let DecoderMethod = "DecodeLDRPreImm"; + } + + def _PRE_REG : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins ldst_so_reg:$addr), IndexModePre, LdFrm, iir, + opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { + bits<17> addr; + let Inst{25} = 1; + let Inst{23} = addr{12}; + let Inst{19-16} = addr{16-13}; + let Inst{11-0} = addr{11-0}; + let Inst{4} = 0; + let DecoderMethod = "DecodeLDRPreReg"; + } + + def _POST_REG : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am2offset_reg:$offset), + IndexModePost, LdFrm, iir, + opc, "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // 
{11-0}  imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let Inst{25} = 1;
+    let Inst{23} = offset{12};
+    let Inst{19-16} = addr;
+    let Inst{11-0} = offset{11-0};
+    let Inst{4} = 0;
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  }
+
+  def _POST_IMM : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                      (ins addr_offset_none:$addr, am2offset_imm:$offset),
+                      IndexModePost, LdFrm, iii,
+                      opc, "\t$Rt, $addr, $offset",
+                      "$addr.base = $Rn_wb", []> {
+    // {12}    isAdd
+    // {11-0}  imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let Inst{25} = 0;
+    let Inst{23} = offset{12};
+    let Inst{19-16} = addr;
+    let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  }
+
+}
+
+let mayLoad = 1, hasSideEffects = 0 in {
+// FIXME: for LDR_PRE_REG etc. the itinerary should be either IIC_iLoad_ru or
+// IIC_iLoad_siu depending on whether the offset register is shifted.
+defm LDR  : AI2_ldridx<0, "ldr", IIC_iLoad_iu, IIC_iLoad_ru>;
+defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_iu, IIC_iLoad_bh_ru>;
+}
+
+multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> {
+  def _PRE : AI3ldstidx<op, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                        (ins addrmode3_pre:$addr), IndexModePre,
+                        LdMiscFrm, itin,
+                        opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<14> addr;
+    let Inst{23}    = addr{8};      // U bit
+    let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+    let Inst{19-16} = addr{12-9};   // Rn
+    let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+    let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+    let DecoderMethod = "DecodeAddrMode3Instruction";
+  }
+  def _POST : AI3ldstidx<op, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                         (ins addr_offset_none:$addr, am3offset:$offset),
+                         IndexModePost, LdMiscFrm, itin,
+                         opc, "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
+                         []> {
+    bits<10> offset;
+    bits<4> addr;
+    let Inst{23}    = offset{8};    // U bit
+    let Inst{22}    = offset{9};    // 1 == imm8, 0 == Rm
+    let Inst{19-16} = addr;
+    let Inst{11-8}  = offset{7-4};  // imm7_4/zero
+    let Inst{3-0}   = offset{3-0};  // imm3_0/Rm
+    let DecoderMethod = "DecodeAddrMode3Instruction";
+  }
+}
+
+let mayLoad = 1, hasSideEffects = 0 in {
+defm LDRH  : AI3_ldridx<0b1011, "ldrh", IIC_iLoad_bh_ru>;
+defm LDRSH : AI3_ldridx<0b1111, "ldrsh", IIC_iLoad_bh_ru>;
+defm LDRSB : AI3_ldridx<0b1101, "ldrsb", IIC_iLoad_bh_ru>;
+let hasExtraDefRegAllocReq = 1 in {
+def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+                          (ins addrmode3_pre:$addr), IndexModePre,
+                          LdMiscFrm, IIC_iLoad_d_ru,
+                          "ldrd", "\t$Rt, $Rt2, $addr!",
+                          "$addr.base = $Rn_wb", []> {
+  bits<14> addr;
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+def LDRD_POST: AI3ldstidx<0b1101, 0, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+                          (ins addr_offset_none:$addr, am3offset:$offset),
+                          IndexModePost, LdMiscFrm, IIC_iLoad_d_ru,
+                          "ldrd", "\t$Rt, $Rt2, $addr, $offset",
+                          "$addr.base = $Rn_wb", []> {
+  bits<10> offset;
+  bits<4> addr;
+  let Inst{23}    = offset{8};    // U bit
+  let Inst{22}    = offset{9};    // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr;
+  let Inst{11-8}  = offset{7-4};  // imm7_4/zero
+  let Inst{3-0}   = offset{3-0};  // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+} // hasExtraDefRegAllocReq = 1
+} // mayLoad = 1, hasSideEffects = 0
+
+// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT.
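+// (Illustration: these unprivileged loads always use post-indexed writeback;
+// "ldrt r0, [r1], #4" performs the access with user-mode permissions and
+// then adds 4 to r1.)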
+let mayLoad = 1, hasSideEffects = 0 in { +def LDRT_POST_REG : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am2offset_reg:$offset), + IndexModePost, LdFrm, IIC_iLoad_ru, + "ldrt", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 1; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-5} = offset{11-5}; + let Inst{4} = 0; + let Inst{3-0} = offset{3-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +def LDRT_POST_IMM + : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am2offset_imm:$offset), + IndexModePost, LdFrm, IIC_iLoad_ru, + "ldrt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 0; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-0} = offset{11-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +def LDRBT_POST_REG : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am2offset_reg:$offset), + IndexModePost, LdFrm, IIC_iLoad_bh_ru, + "ldrbt", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 1; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-5} = offset{11-5}; + let Inst{4} = 0; + let Inst{3-0} = offset{3-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +def LDRBT_POST_IMM + : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am2offset_imm:$offset), + IndexModePost, LdFrm, IIC_iLoad_bh_ru, + "ldrbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 0; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-0} = offset{11-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +multiclass AI3ldrT<bits<4> op, string opc> { + def i : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb), + (ins addr_offset_none:$addr, postidx_imm8:$offset), + IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc, + "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> { + bits<9> offset; + let Inst{23} = offset{8}; + let Inst{22} = 1; + let Inst{11-8} = offset{7-4}; + let Inst{3-0} = offset{3-0}; + } + def r : AI3ldstidxT<op, 1, (outs GPRnopc:$Rt, GPRnopc:$base_wb), + (ins addr_offset_none:$addr, postidx_reg:$Rm), + IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc, + "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> { + bits<5> Rm; + let Inst{23} = Rm{4}; + let Inst{22} = 0; + let Inst{11-8} = 0; + let Unpredictable{11-8} = 0b1111; + let Inst{3-0} = Rm{3-0}; + let DecoderMethod = "DecodeLDR"; + } +} + +defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">; +defm LDRHT : AI3ldrT<0b1011, "ldrht">; +defm LDRSHT : AI3ldrT<0b1111, "ldrsht">; +} + +def LDRT_POST + : ARMAsmPseudo<"ldrt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q), + (outs GPR:$Rt)>; + +def LDRBT_POST + : ARMAsmPseudo<"ldrbt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q), + (outs GPR:$Rt)>; + +// Store + +// Stores with truncate +def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm, + IIC_iStore_bh_r, "strh", "\t$Rt, $addr", + [(truncstorei16 GPR:$Rt, addrmode3:$addr)]>; + +// Store doubleword 
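+// In ARM mode STRD, like LDRD above, transfers an even/odd register pair:
+// Rt must be even and Rt2 is Rt + 1 (illustratively, "strd r2, r3,
+// [r0, #8]"). The hasExtraSrcRegAllocReq flag below reflects that pairing
+// constraint for the register allocator.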
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
+  def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$Rt2, addrmode3:$addr),
+                    StMiscFrm, IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", []>,
+             Requires<[IsARM, HasV5TE]> {
+    let Inst{21} = 0;
+  }
+}
+
+// Indexed stores
+multiclass AI2_stridx<bit isByte, string opc,
+                      InstrItinClass iii, InstrItinClass iir> {
+  def _PRE_IMM : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+                            (ins GPR:$Rt, addrmode_imm12_pre:$addr), IndexModePre,
+                            StFrm, iii,
+                            opc, "\t$Rt, $addr!",
+                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 0;
+    let Inst{23} = addr{12};       // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{11-0} = addr{11-0};   // imm12
+    let DecoderMethod = "DecodeSTRPreImm";
+  }
+
+  def _PRE_REG : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+                            (ins GPR:$Rt, ldst_so_reg:$addr),
+                            IndexModePre, StFrm, iir,
+                            opc, "\t$Rt, $addr!",
+                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 1;
+    let Inst{23} = addr{12};       // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{11-0} = addr{11-0};
+    let Inst{4} = 0;               // Inst{4} = 0
+    let DecoderMethod = "DecodeSTRPreReg";
+  }
+  def _POST_REG : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+                (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                IndexModePost, StFrm, iir,
+                opc, "\t$Rt, $addr, $offset",
+                "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    // {12}     isAdd
+    // {11-0}   imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let Inst{25} = 1;
+    let Inst{23} = offset{12};
+    let Inst{19-16} = addr;
+    let Inst{11-0} = offset{11-0};
+    let Inst{4} = 0;
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  }
+
+  def _POST_IMM : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+                (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+                IndexModePost, StFrm, iii,
+                opc, "\t$Rt, $addr, $offset",
+                "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    // {12}     isAdd
+    // {11-0}   imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let Inst{25} = 0;
+    let Inst{23} = offset{12};
+    let Inst{19-16} = addr;
+    let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  }
+}
+
+let mayStore = 1, hasSideEffects = 0 in {
+// FIXME: for STR_PRE_REG etc. the itinerary should be either IIC_iStore_ru or
+// IIC_iStore_siu depending on whether the offset register is shifted.
+defm STR  : AI2_stridx<0, "str", IIC_iStore_iu, IIC_iStore_ru>;
+defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_iu, IIC_iStore_bh_ru>;
+}
+
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
+                         am2offset_reg:$offset),
+             (STR_POST_REG GPR:$Rt, addr_offset_none:$addr,
+                           am2offset_reg:$offset)>;
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
+                         am2offset_imm:$offset),
+             (STR_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+                           am2offset_imm:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+                             am2offset_reg:$offset),
+             (STRB_POST_REG GPR:$Rt, addr_offset_none:$addr,
+                            am2offset_reg:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+                             am2offset_imm:$offset),
+             (STRB_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+                            am2offset_imm:$offset)>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not a single
+// complex operand like the one used to represent the instructions
+// themselves. The pseudos map between the two.
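+// For example, a pre-indexed word store reaches ISel as
+//   (pre_store GPR:$Rt, GPR:$Rn, am2offset_imm:$offset)
+// with base and offset as two separate operands, whereas STR_PRE_IMM above
+// takes the single complex operand addrmode_imm12_pre:$addr. STRi_preidx
+// below carries the pattern, and the custom inserter is then expected to
+// rewrite it into the real STR_PRE_IMM. (Walk-through added for clarity,
+// based on the definitions that follow.)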
+let usesCustomInserter = 1, + Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in { +def STRi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPR:$Rn_wb, + (pre_store GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>; +def STRr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPR:$Rn_wb, + (pre_store GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>; +def STRBi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPR:$Rn_wb, + (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>; +def STRBr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPR:$Rn_wb, + (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>; +def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am3offset:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPR:$Rn_wb, + (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>; +} + + + +def STRH_PRE : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addrmode3_pre:$addr), IndexModePre, + StMiscFrm, IIC_iStore_bh_ru, + "strh", "\t$Rt, $addr!", + "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { + bits<14> addr; + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr{12-9}; // Rn + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{3-0} = addr{3-0}; // imm3_0/Rm + let DecoderMethod = "DecodeAddrMode3Instruction"; +} + +def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addr_offset_none:$addr, am3offset:$offset), + IndexModePost, StMiscFrm, IIC_iStore_bh_ru, + "strh", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", + [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt, + addr_offset_none:$addr, + am3offset:$offset))]> { + bits<10> offset; + bits<4> addr; + let Inst{23} = offset{8}; // U bit + let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr; + let Inst{11-8} = offset{7-4}; // imm7_4/zero + let Inst{3-0} = offset{3-0}; // imm3_0/Rm + let DecoderMethod = "DecodeAddrMode3Instruction"; +} + +let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in { +def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rt2, addrmode3_pre:$addr), + IndexModePre, StMiscFrm, IIC_iStore_d_ru, + "strd", "\t$Rt, $Rt2, $addr!", + "$addr.base = $Rn_wb", []> { + bits<14> addr; + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr{12-9}; // Rn + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{3-0} = addr{3-0}; // imm3_0/Rm + let DecoderMethod = "DecodeAddrMode3Instruction"; +} + +def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr, + am3offset:$offset), + IndexModePost, StMiscFrm, IIC_iStore_d_ru, + "strd", "\t$Rt, $Rt2, $addr, $offset", + "$addr.base = $Rn_wb", []> { + bits<10> offset; + bits<4> addr; + let Inst{23} = offset{8}; // U bit + let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr; + let Inst{11-8} = offset{7-4}; // imm7_4/zero + let Inst{3-0} = offset{3-0}; // imm3_0/Rm + let DecoderMethod = "DecodeAddrMode3Instruction"; +} +} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 + +// STRT, STRBT, and STRHT + +def STRBT_POST_REG : AI2ldstidx<0, 1, 0, (outs 
GPR:$Rn_wb), + (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset), + IndexModePost, StFrm, IIC_iStore_bh_ru, + "strbt", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 1; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-5} = offset{11-5}; + let Inst{4} = 0; + let Inst{3-0} = offset{3-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +def STRBT_POST_IMM + : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset), + IndexModePost, StFrm, IIC_iStore_bh_ru, + "strbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 0; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-0} = offset{11-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +def STRBT_POST + : ARMAsmPseudo<"strbt${q} $Rt, $addr", + (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>; + +let mayStore = 1, hasSideEffects = 0 in { +def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset), + IndexModePost, StFrm, IIC_iStore_ru, + "strt", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 1; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-5} = offset{11-5}; + let Inst{4} = 0; + let Inst{3-0} = offset{3-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +def STRT_POST_IMM + : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset), + IndexModePost, StFrm, IIC_iStore_ru, + "strt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 0; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-0} = offset{11-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} +} + +def STRT_POST + : ARMAsmPseudo<"strt${q} $Rt, $addr", + (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>; + +multiclass AI3strT<bits<4> op, string opc> { + def i : AI3ldstidxT<op, 0, (outs GPR:$base_wb), + (ins GPR:$Rt, addr_offset_none:$addr, postidx_imm8:$offset), + IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc, + "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> { + bits<9> offset; + let Inst{23} = offset{8}; + let Inst{22} = 1; + let Inst{11-8} = offset{7-4}; + let Inst{3-0} = offset{3-0}; + } + def r : AI3ldstidxT<op, 0, (outs GPR:$base_wb), + (ins GPR:$Rt, addr_offset_none:$addr, postidx_reg:$Rm), + IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc, + "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> { + bits<5> Rm; + let Inst{23} = Rm{4}; + let Inst{22} = 0; + let Inst{11-8} = 0; + let Inst{3-0} = Rm{3-0}; + } +} + + +defm STRHT : AI3strT<0b1011, "strht">; + +def STL : AIstrrel<0b00, (outs), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "stl", "\t$Rt, $addr", []>; +def STLB : AIstrrel<0b10, (outs), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "stlb", "\t$Rt, $addr", []>; +def STLH : AIstrrel<0b11, (outs), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "stlh", "\t$Rt, $addr", []>; + +//===----------------------------------------------------------------------===// +// Load / 
store multiple Instructions.
+//
+
+multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f,
+                         InstrItinClass itin, InstrItinClass itin_upd> {
+  // IA is the default, so no need for an explicit suffix on the
+  // mnemonic here. The form without the suffix is the canonical spelling.
+  def IA :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def IA_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+  def DA :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "da${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b00;       // Decrement After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DA_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "da${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b00;       // Decrement After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+  def DB :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "db${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DB_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "db${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+  def IB :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "ib${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b11;       // Increment Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def IB_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "ib${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b11;       // Increment Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+}
+
+let hasSideEffects = 0 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm LDM : arm_ldst_mult<"ldm", "", 1, 0, LdStMulFrm, IIC_iLoad_m,
+                         IIC_iLoad_mu>, ComplexDeprecationPredicate<"ARMLoad">;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm STM : arm_ldst_mult<"stm", "", 0, 0, LdStMulFrm, IIC_iStore_m,
+                         IIC_iStore_mu>,
+           ComplexDeprecationPredicate<"ARMStore">;
+
+} // hasSideEffects
+
+// FIXME: remove when we have a way of marking a MI with these properties.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
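+// LDMIA_RET below models the common pop-and-return epilogue, e.g.
+//   ldmia sp!, {r4-r7, pc}
+// as an LDMIA_UPD whose register list includes PC, carrying the
+// return/terminator/barrier flags that, per the FIXMEs above, cannot yet be
+// attached to a plain MI. (The example is illustrative, not taken from the
+// original source.)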
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, + hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in +def LDMIA_RET : ARMPseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, + reglist:$regs, variable_ops), + 4, IIC_iLoad_mBr, [], + (LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>, + RegConstraint<"$Rn = $wb">; + +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +defm sysLDM : arm_ldst_mult<"ldm", " ^", 1, 1, LdStMulFrm, IIC_iLoad_m, + IIC_iLoad_mu>; + +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m, + IIC_iStore_mu>; + + + +//===----------------------------------------------------------------------===// +// Move Instructions. +// + +let hasSideEffects = 0 in +def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr, + "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> { + bits<4> Rd; + bits<4> Rm; + + let Inst{19-16} = 0b0000; + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; +} + +// A version for the smaller set of tail call registers. +let hasSideEffects = 0 in +def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm, + IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> { + bits<4> Rd; + bits<4> Rm; + + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; +} + +def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src), + DPSoRegRegFrm, IIC_iMOVsr, + "mov", "\t$Rd, $src", + [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP, + Sched<[WriteALU]> { + bits<4> Rd; + bits<12> src; + let Inst{15-12} = Rd; + let Inst{19-16} = 0b0000; + let Inst{11-8} = src{11-8}; + let Inst{7} = 0; + let Inst{6-5} = src{6-5}; + let Inst{4} = 1; + let Inst{3-0} = src{3-0}; + let Inst{25} = 0; +} + +def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src), + DPSoRegImmFrm, IIC_iMOVsr, + "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg_imm:$src)]>, + UnaryDP, Sched<[WriteALU]> { + bits<4> Rd; + bits<12> src; + let Inst{15-12} = Rd; + let Inst{19-16} = 0b0000; + let Inst{11-5} = src{11-5}; + let Inst{4} = 0; + let Inst{3-0} = src{3-0}; + let Inst{25} = 0; +} + +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in +def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm, IIC_iMOVi, + "mov", "\t$Rd, $imm", [(set GPR:$Rd, mod_imm:$imm)]>, UnaryDP, + Sched<[WriteALU]> { + bits<4> Rd; + bits<12> imm; + let Inst{25} = 1; + let Inst{15-12} = Rd; + let Inst{19-16} = 0b0000; + let Inst{11-0} = imm; +} + +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in +def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm), + DPFrm, IIC_iMOVi, + "movw", "\t$Rd, $imm", + [(set GPR:$Rd, imm0_65535:$imm)]>, + Requires<[IsARM, HasV6T2]>, UnaryDP, Sched<[WriteALU]> { + bits<4> Rd; + bits<16> imm; + let Inst{15-12} = Rd; + let Inst{11-0} = imm{11-0}; + let Inst{19-16} = imm{15-12}; + let Inst{20} = 0; + let Inst{25} = 1; + let DecoderMethod = "DecodeArmMOVTWInstruction"; +} + +def : InstAlias<"mov${p} $Rd, $imm", + (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p)>, + Requires<[IsARM]>; + +def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), + (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>, + Sched<[WriteALU]>; + +let Constraints = "$src = $Rd" in { +def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd), + (ins GPR:$src, imm0_65535_expr:$imm), + DPFrm, IIC_iMOVi, + "movt", "\t$Rd, $imm", + [(set GPRnopc:$Rd, + (or (and GPR:$src, 0xffff), + 
lo16AllZero:$imm))]>, UnaryDP, + Requires<[IsARM, HasV6T2]>, Sched<[WriteALU]> { + bits<4> Rd; + bits<16> imm; + let Inst{15-12} = Rd; + let Inst{11-0} = imm{11-0}; + let Inst{19-16} = imm{15-12}; + let Inst{20} = 0; + let Inst{25} = 1; + let DecoderMethod = "DecodeArmMOVTWInstruction"; +} + +def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), + (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>, + Sched<[WriteALU]>; + +} // Constraints + +def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>, + Requires<[IsARM, HasV6T2]>; + +let Uses = [CPSR] in +def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi, + [(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP, + Requires<[IsARM]>, Sched<[WriteALU]>; + +// These aren't really mov instructions, but we have to define them this way +// due to flag operands. + +let Defs = [CPSR] in { +def MOVsrl_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, + [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP, + Sched<[WriteALU]>, Requires<[IsARM]>; +def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, + [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP, + Sched<[WriteALU]>, Requires<[IsARM]>; +} + +//===----------------------------------------------------------------------===// +// Extend Instructions. +// + +// Sign extenders + +def SXTB : AI_ext_rrot<0b01101010, + "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>; +def SXTH : AI_ext_rrot<0b01101011, + "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>; + +def SXTAB : AI_exta_rrot<0b01101010, + "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; +def SXTAH : AI_exta_rrot<0b01101011, + "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>; + +def SXTB16 : AI_ext_rrot_np<0b01101000, "sxtb16">; + +def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">; + +// Zero extenders + +let AddedComplexity = 16 in { +def UXTB : AI_ext_rrot<0b01101110, + "uxtb" , UnOpFrag<(and node:$Src, 0x000000FF)>>; +def UXTH : AI_ext_rrot<0b01101111, + "uxth" , UnOpFrag<(and node:$Src, 0x0000FFFF)>>; +def UXTB16 : AI_ext_rrot<0b01101100, + "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; + +// FIXME: This pattern incorrectly assumes the shl operator is a rotate. +// The transformation should probably be done as a combiner action +// instead so we can include a check for masking back in the upper +// eight bits of the source into the lower eight bits of the result. +//def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF), +// (UXTB16r_rot GPR:$Src, 3)>; +def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF), + (UXTB16 GPR:$Src, 1)>; + +def UXTAB : AI_exta_rrot<0b01101110, "uxtab", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; +def UXTAH : AI_exta_rrot<0b01101111, "uxtah", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>; +} + +// This isn't safe in general, the add is two 16-bit units, not a 32-bit add. 
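+// Illustrative example (added for clarity): with Rn = 0x0000FFFF and
+// Rm = 0x00000001, uxtab16 adds 1 into the low halfword and truncates,
+// giving 0x00000000, whereas a 32-bit add of Rn and (Rm & 0x00FF00FF)
+// would carry into the high halfword and give 0x00010000.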
+def UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">; + + +def SBFX : I<(outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width), + AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, + "sbfx", "\t$Rd, $Rn, $lsb, $width", "", []>, + Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<4> Rn; + bits<5> lsb; + bits<5> width; + let Inst{27-21} = 0b0111101; + let Inst{6-4} = 0b101; + let Inst{20-16} = width; + let Inst{15-12} = Rd; + let Inst{11-7} = lsb; + let Inst{3-0} = Rn; +} + +def UBFX : I<(outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width), + AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, + "ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>, + Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<4> Rn; + bits<5> lsb; + bits<5> width; + let Inst{27-21} = 0b0111111; + let Inst{6-4} = 0b101; + let Inst{20-16} = width; + let Inst{15-12} = Rd; + let Inst{11-7} = lsb; + let Inst{3-0} = Rn; +} + +//===----------------------------------------------------------------------===// +// Arithmetic Instructions. +// + +defm ADD : AsI1_bin_irs<0b0100, "add", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, + BinOpFrag<(add node:$LHS, node:$RHS)>, 1>; +defm SUB : AsI1_bin_irs<0b0010, "sub", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, + BinOpFrag<(sub node:$LHS, node:$RHS)>>; + +// ADD and SUB with 's' bit set. +// +// Currently, ADDS/SUBS are pseudo opcodes that exist only in the +// selection DAG. They are "lowered" to real ADD/SUB opcodes by +// AdjustInstrPostInstrSelection where we determine whether or not to +// set the "s" bit based on CPSR liveness. +// +// FIXME: Eliminate ADDS/SUBS pseudo opcodes after adding tablegen +// support for an optional CPSR definition that corresponds to the DAG +// node's second value. We can then eliminate the implicit def of CPSR. +defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, + BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>; +defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, + BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>; + +defm ADC : AI1_adde_sube_irs<0b0101, "adc", + BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>; +defm SBC : AI1_adde_sube_irs<0b0110, "sbc", + BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>; + +defm RSB : AsI1_rbin_irs<0b0011, "rsb", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, + BinOpFrag<(sub node:$LHS, node:$RHS)>>; + +// FIXME: Eliminate them if we can write def : Pat patterns which defines +// CPSR and the implicit def of CPSR is not needed. +defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUr, IIC_iALUsr, + BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>; + +defm RSC : AI1_rsc_irs<0b0111, "rsc", + BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>; + +// (sub X, imm) gets canonicalized to (add X, -imm). Match this form. +// The assume-no-carry-in form uses the negation of the input since add/sub +// assume opposite meanings of the carry flag (i.e., carry == !borrow). +// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory +// details. +def : ARMPat<(add GPR:$src, mod_imm_neg:$imm), + (SUBri GPR:$src, mod_imm_neg:$imm)>; +def : ARMPat<(ARMaddc GPR:$src, mod_imm_neg:$imm), + (SUBSri GPR:$src, mod_imm_neg:$imm)>; + +def : ARMPat<(add GPR:$src, imm0_65535_neg:$imm), + (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>, + Requires<[IsARM, HasV6T2]>; +def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm), + (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>, + Requires<[IsARM, HasV6T2]>; + +// The with-carry-in form matches bitwise not instead of the negation. 
+// Effectively, the inverse interpretation of the carry flag already accounts +// for part of the negation. +def : ARMPat<(ARMadde GPR:$src, mod_imm_not:$imm, CPSR), + (SBCri GPR:$src, mod_imm_not:$imm)>; +def : ARMPat<(ARMadde GPR:$src, imm0_65535_neg:$imm, CPSR), + (SBCrr GPR:$src, (MOVi16 (imm_not_XFORM imm:$imm)))>, + Requires<[IsARM, HasV6T2]>; + +// Note: These are implemented in C++ code, because they have to generate +// ADD/SUBrs instructions, which use a complex pattern that a xform function +// cannot produce. +// (mul X, 2^n+1) -> (add (X << n), X) +// (mul X, 2^n-1) -> (rsb X, (X << n)) + +// ARM Arithmetic Instruction +// GPR:$dst = GPR:$a op GPR:$b +class AAI<bits<8> op27_20, bits<8> op11_4, string opc, + list<dag> pattern = [], + dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm), + string asm = "\t$Rd, $Rn, $Rm"> + : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern>, + Sched<[WriteALU, ReadALU, ReadALU]> { + bits<4> Rn; + bits<4> Rd; + bits<4> Rm; + let Inst{27-20} = op27_20; + let Inst{11-4} = op11_4; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{3-0} = Rm; + + let Unpredictable{11-8} = 0b1111; +} + +// Saturating add/subtract + +let DecoderMethod = "DecodeQADDInstruction" in +def QADD : AAI<0b00010000, 0b00000101, "qadd", + [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))], + (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">; + +def QSUB : AAI<0b00010010, 0b00000101, "qsub", + [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))], + (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">; +def QDADD : AAI<0b00010100, 0b00000101, "qdadd", [], + (ins GPRnopc:$Rm, GPRnopc:$Rn), + "\t$Rd, $Rm, $Rn">; +def QDSUB : AAI<0b00010110, 0b00000101, "qdsub", [], + (ins GPRnopc:$Rm, GPRnopc:$Rn), + "\t$Rd, $Rm, $Rn">; + +def QADD16 : AAI<0b01100010, 0b11110001, "qadd16">; +def QADD8 : AAI<0b01100010, 0b11111001, "qadd8">; +def QASX : AAI<0b01100010, 0b11110011, "qasx">; +def QSAX : AAI<0b01100010, 0b11110101, "qsax">; +def QSUB16 : AAI<0b01100010, 0b11110111, "qsub16">; +def QSUB8 : AAI<0b01100010, 0b11111111, "qsub8">; +def UQADD16 : AAI<0b01100110, 0b11110001, "uqadd16">; +def UQADD8 : AAI<0b01100110, 0b11111001, "uqadd8">; +def UQASX : AAI<0b01100110, 0b11110011, "uqasx">; +def UQSAX : AAI<0b01100110, 0b11110101, "uqsax">; +def UQSUB16 : AAI<0b01100110, 0b11110111, "uqsub16">; +def UQSUB8 : AAI<0b01100110, 0b11111111, "uqsub8">; + +// Signed/Unsigned add/subtract + +def SASX : AAI<0b01100001, 0b11110011, "sasx">; +def SADD16 : AAI<0b01100001, 0b11110001, "sadd16">; +def SADD8 : AAI<0b01100001, 0b11111001, "sadd8">; +def SSAX : AAI<0b01100001, 0b11110101, "ssax">; +def SSUB16 : AAI<0b01100001, 0b11110111, "ssub16">; +def SSUB8 : AAI<0b01100001, 0b11111111, "ssub8">; +def UASX : AAI<0b01100101, 0b11110011, "uasx">; +def UADD16 : AAI<0b01100101, 0b11110001, "uadd16">; +def UADD8 : AAI<0b01100101, 0b11111001, "uadd8">; +def USAX : AAI<0b01100101, 0b11110101, "usax">; +def USUB16 : AAI<0b01100101, 0b11110111, "usub16">; +def USUB8 : AAI<0b01100101, 0b11111111, "usub8">; + +// Signed/Unsigned halving add/subtract + +def SHASX : AAI<0b01100011, 0b11110011, "shasx">; +def SHADD16 : AAI<0b01100011, 0b11110001, "shadd16">; +def SHADD8 : AAI<0b01100011, 0b11111001, "shadd8">; +def SHSAX : AAI<0b01100011, 0b11110101, "shsax">; +def SHSUB16 : AAI<0b01100011, 0b11110111, "shsub16">; +def SHSUB8 : AAI<0b01100011, 0b11111111, "shsub8">; +def UHASX : AAI<0b01100111, 0b11110011, "uhasx">; +def UHADD16 : AAI<0b01100111, 0b11110001, "uhadd16">; +def UHADD8 : 
AAI<0b01100111, 0b11111001, "uhadd8">; +def UHSAX : AAI<0b01100111, 0b11110101, "uhsax">; +def UHSUB16 : AAI<0b01100111, 0b11110111, "uhsub16">; +def UHSUB8 : AAI<0b01100111, 0b11111111, "uhsub8">; + +// Unsigned Sum of Absolute Differences [and Accumulate]. + +def USAD8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + MulFrm /* for convenience */, NoItinerary, "usad8", + "\t$Rd, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{27-20} = 0b01111000; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b0001; + let Inst{19-16} = Rd; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} +def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + MulFrm /* for convenience */, NoItinerary, "usada8", + "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]>{ + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + bits<4> Ra; + let Inst{27-20} = 0b01111000; + let Inst{7-4} = 0b0001; + let Inst{19-16} = Rd; + let Inst{15-12} = Ra; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} + +// Signed/Unsigned saturate + +def SSAT : AI<(outs GPRnopc:$Rd), + (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), + SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> { + bits<4> Rd; + bits<5> sat_imm; + bits<4> Rn; + bits<8> sh; + let Inst{27-21} = 0b0110101; + let Inst{5-4} = 0b01; + let Inst{20-16} = sat_imm; + let Inst{15-12} = Rd; + let Inst{11-7} = sh{4-0}; + let Inst{6} = sh{5}; + let Inst{3-0} = Rn; +} + +def SSAT16 : AI<(outs GPRnopc:$Rd), + (ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm, + NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []> { + bits<4> Rd; + bits<4> sat_imm; + bits<4> Rn; + let Inst{27-20} = 0b01101010; + let Inst{11-4} = 0b11110011; + let Inst{15-12} = Rd; + let Inst{19-16} = sat_imm; + let Inst{3-0} = Rn; +} + +def USAT : AI<(outs GPRnopc:$Rd), + (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), + SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> { + bits<4> Rd; + bits<5> sat_imm; + bits<4> Rn; + bits<8> sh; + let Inst{27-21} = 0b0110111; + let Inst{5-4} = 0b01; + let Inst{15-12} = Rd; + let Inst{11-7} = sh{4-0}; + let Inst{6} = sh{5}; + let Inst{20-16} = sat_imm; + let Inst{3-0} = Rn; +} + +def USAT16 : AI<(outs GPRnopc:$Rd), + (ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm, + NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []> { + bits<4> Rd; + bits<4> sat_imm; + bits<4> Rn; + let Inst{27-20} = 0b01101110; + let Inst{11-4} = 0b11110011; + let Inst{15-12} = Rd; + let Inst{19-16} = sat_imm; + let Inst{3-0} = Rn; +} + +def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos), + (SSAT imm1_32:$pos, GPRnopc:$a, 0)>; +def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos), + (USAT imm0_31:$pos, GPRnopc:$a, 0)>; + +//===----------------------------------------------------------------------===// +// Bitwise Instructions. +// + +defm AND : AsI1_bin_irs<0b0000, "and", + IIC_iBITi, IIC_iBITr, IIC_iBITsr, + BinOpFrag<(and node:$LHS, node:$RHS)>, 1>; +defm ORR : AsI1_bin_irs<0b1100, "orr", + IIC_iBITi, IIC_iBITr, IIC_iBITsr, + BinOpFrag<(or node:$LHS, node:$RHS)>, 1>; +defm EOR : AsI1_bin_irs<0b0001, "eor", + IIC_iBITi, IIC_iBITr, IIC_iBITsr, + BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>; +defm BIC : AsI1_bin_irs<0b1110, "bic", + IIC_iBITi, IIC_iBITr, IIC_iBITsr, + BinOpFrag<(and node:$LHS, (not node:$RHS))>>; + +// FIXME: bf_inv_mask_imm should be two operands, the lsb and the msb, just +// like in the actual instruction encoding. 
The complexity of mapping the mask +// to the lsb/msb pair should be handled by ISel, not encapsulated in the +// instruction description. +def BFC : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm), + AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, + "bfc", "\t$Rd, $imm", "$src = $Rd", + [(set GPR:$Rd, (and GPR:$src, bf_inv_mask_imm:$imm))]>, + Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<10> imm; + let Inst{27-21} = 0b0111110; + let Inst{6-0} = 0b0011111; + let Inst{15-12} = Rd; + let Inst{11-7} = imm{4-0}; // lsb + let Inst{20-16} = imm{9-5}; // msb +} + +// A8.6.18 BFI - Bitfield insert (Encoding A1) +def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm), + AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, + "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd", + [(set GPRnopc:$Rd, (ARMbfi GPRnopc:$src, GPR:$Rn, + bf_inv_mask_imm:$imm))]>, + Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<4> Rn; + bits<10> imm; + let Inst{27-21} = 0b0111110; + let Inst{6-4} = 0b001; // Rn: Inst{3-0} != 15 + let Inst{15-12} = Rd; + let Inst{11-7} = imm{4-0}; // lsb + let Inst{20-16} = imm{9-5}; // width + let Inst{3-0} = Rn; +} + +def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr, + "mvn", "\t$Rd, $Rm", + [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP, Sched<[WriteALU]> { + bits<4> Rd; + bits<4> Rm; + let Inst{25} = 0; + let Inst{19-16} = 0b0000; + let Inst{11-4} = 0b00000000; + let Inst{15-12} = Rd; + let Inst{3-0} = Rm; +} +def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift), + DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift", + [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP, + Sched<[WriteALU]> { + bits<4> Rd; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = 0b0000; + let Inst{15-12} = Rd; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; +} +def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift), + DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift", + [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP, + Sched<[WriteALU]> { + bits<4> Rd; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = 0b0000; + let Inst{15-12} = Rd; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; +} +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in +def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm, + IIC_iMVNi, "mvn", "\t$Rd, $imm", + [(set GPR:$Rd, mod_imm_not:$imm)]>,UnaryDP, Sched<[WriteALU]> { + bits<4> Rd; + bits<12> imm; + let Inst{25} = 1; + let Inst{19-16} = 0b0000; + let Inst{15-12} = Rd; + let Inst{11-0} = imm; +} + +def : ARMPat<(and GPR:$src, mod_imm_not:$imm), + (BICri GPR:$src, mod_imm_not:$imm)>; + +//===----------------------------------------------------------------------===// +// Multiply Instructions. 
+// +class AsMul1I32<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = Rd; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} +class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} +class AsMla1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} + +// FIXME: The v5 pseudos are only necessary for the additional Constraint +// property. Remove them when it's possible to add those properties +// on an individual MachineInstr, not just an instruction description. +let isCommutable = 1, TwoOperandAliasConstraint = "$Rn = $Rd" in { +def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm), + IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", + [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>, + Requires<[IsARM, HasV6]> { + let Inst{15-12} = 0b0000; + let Unpredictable{15-12} = 0b1111; +} + +let Constraints = "@earlyclobber $Rd" in +def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm, + pred:$p, cc_out:$s), + 4, IIC_iMUL32, + [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))], + (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6, UseMulOps]>; +} + +def MLA : AsMul1I32<0b0000001, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra), + IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra", + [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))]>, + Requires<[IsARM, HasV6, UseMulOps]> { + bits<4> Ra; + let Inst{15-12} = Ra; +} + +let Constraints = "@earlyclobber $Rd" in +def MLAv5: ARMPseudoExpand<(outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra, + pred:$p, cc_out:$s), 4, IIC_iMAC32, + [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))], + (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; + +def MLS : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>, + Requires<[IsARM, HasV6T2, UseMulOps]> { + bits<4> Rd; + bits<4> Rm; + bits<4> Rn; + bits<4> Ra; + let Inst{19-16} = Rd; + let Inst{15-12} = Ra; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} + +// Extra precision multiplies with low / high results +let hasSideEffects = 0 in { +let isCommutable = 1 in { +def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64, + "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]>; + +def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64, + "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]>; + +let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { +def SMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + 4, IIC_iMUL64, [], + 
(SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; + +def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + 4, IIC_iMUL64, [], + (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +} +} + +// Multiply + accumulate +def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64, + "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>; +def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64, + "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>; + +def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, + "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} + +let Constraints = + "@earlyclobber $RdLo,@earlyclobber $RdHi,$RLo = $RdLo,$RHi = $RdHi" in { +def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s), + 4, IIC_iMAC64, [], + (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, + pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s), + 4, IIC_iMAC64, [], + (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, + pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +} + +} // hasSideEffects + +// Most significant word multiply +def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>, + Requires<[IsARM, HasV6]> { + let Inst{15-12} = 0b1111; +} + +def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]> { + let Inst{15-12} = 0b1111; +} + +def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, + Requires<[IsARM, HasV6, UseMulOps]>; + +def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsARM, HasV6]>; + +def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsARM, HasV6, UseMulOps]>; + +def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsARM, HasV6]>; + +multiclass AI_smul<string opc, PatFrag opnode> { + def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16), + (sext_inreg GPR:$Rm, i16)))]>, + Requires<[IsARM, HasV5TE]>; + + def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16), + (sra 
GPR:$Rm, (i32 16))))]>, + Requires<[IsARM, HasV5TE]>; + + def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)), + (sext_inreg GPR:$Rm, i16)))]>, + Requires<[IsARM, HasV5TE]>; + + def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)), + (sra GPR:$Rm, (i32 16))))]>, + Requires<[IsARM, HasV5TE]>; + + def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm", + []>, + Requires<[IsARM, HasV5TE]>; + + def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm", + []>, + Requires<[IsARM, HasV5TE]>; +} + + +multiclass AI_smla<string opc, PatFrag opnode> { + let DecoderMethod = "DecodeSMLAInstruction" in { + def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPRnopc:$Rd, (add GPR:$Ra, + (opnode (sext_inreg GPRnopc:$Rn, i16), + (sext_inreg GPRnopc:$Rm, i16))))]>, + Requires<[IsARM, HasV5TE, UseMulOps]>; + + def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPRnopc:$Rd, + (add GPR:$Ra, (opnode (sext_inreg GPRnopc:$Rn, i16), + (sra GPRnopc:$Rm, (i32 16)))))]>, + Requires<[IsARM, HasV5TE, UseMulOps]>; + + def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPRnopc:$Rd, + (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)), + (sext_inreg GPRnopc:$Rm, i16))))]>, + Requires<[IsARM, HasV5TE, UseMulOps]>; + + def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPRnopc:$Rd, + (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)), + (sra GPRnopc:$Rm, (i32 16)))))]>, + Requires<[IsARM, HasV5TE, UseMulOps]>; + + def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra", + []>, + Requires<[IsARM, HasV5TE, UseMulOps]>; + + def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra", + []>, + Requires<[IsARM, HasV5TE, UseMulOps]>; + } +} + +defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>; +defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; + +// Halfword multiply accumulate long: SMLAL<x><y>. 
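+// Each variant multiplies the selected signed halfwords of Rn and Rm
+// (B = bottom 16 bits, T = top 16 bits) and accumulates the 32-bit product
+// into the 64-bit value in RdHi:RdLo. Illustrative example (not from the
+// original source):
+//   smlalbt r0, r1, r2, r3   @ r1:r0 += sext(r2[15:0]) * sext(r3[31:16])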
+def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm), + IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV5TE]>; + +def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm), + IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV5TE]>; + +def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm), + IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV5TE]>; + +def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm), + IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV5TE]>; + +// Helper class for AI_smld. +class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> { + bits<4> Rn; + bits<4> Rm; + let Inst{27-23} = 0b01110; + let Inst{22} = long; + let Inst{21-20} = 0b00; + let Inst{11-8} = Rm; + let Inst{7} = 0; + let Inst{6} = sub; + let Inst{5} = swap; + let Inst{4} = 1; + let Inst{3-0} = Rn; +} +class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> { + bits<4> Rd; + let Inst{15-12} = 0b1111; + let Inst{19-16} = Rd; +} +class AMulDualIa<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> { + bits<4> Ra; + bits<4> Rd; + let Inst{19-16} = Rd; + let Inst{15-12} = Ra; +} +class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> { + bits<4> RdLo; + bits<4> RdHi; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; +} + +multiclass AI_smld<bit sub, string opc> { + + def D : AMulDualIa<0, sub, 0, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">; + + def DX: AMulDualIa<0, sub, 1, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">; + + def LD: AMulDualI64<1, sub, 0, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary, + !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">; + + def LDX : AMulDualI64<1, sub, 1, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary, + !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">; + +} + +defm SMLA : AI_smld<0, "smla">; +defm SMLS : AI_smld<1, "smls">; + +multiclass AI_sdml<bit sub, string opc> { + + def D:AMulDualI<0, sub, 0, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), + NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">; + def DX:AMulDualI<0, sub, 1, (outs GPRnopc:$Rd),(ins GPRnopc:$Rn, GPRnopc:$Rm), + NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">; +} + +defm SMUA : AI_sdml<0, "smua">; +defm SMUS : AI_sdml<1, "smus">; + +//===----------------------------------------------------------------------===// +// Division Instructions (ARMv7-A with virtualization extension) +// +def SDIV : ADivA1I<0b001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV, + "sdiv", "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (sdiv GPR:$Rn, GPR:$Rm))]>, + Requires<[IsARM, 
HasDivideInARM]>;
+
+def UDIV : ADivA1I<0b011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+                   "udiv", "\t$Rd, $Rn, $Rm",
+                   [(set GPR:$Rd, (udiv GPR:$Rn, GPR:$Rm))]>,
+           Requires<[IsARM, HasDivideInARM]>;
+
+//===----------------------------------------------------------------------===//
+// Misc. Arithmetic Instructions.
+//
+
+def CLZ  : AMiscA1I<0b00010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm),
+                    IIC_iUNAr, "clz", "\t$Rd, $Rm",
+                    [(set GPR:$Rd, (ctlz GPR:$Rm))]>, Requires<[IsARM, HasV5T]>,
+           Sched<[WriteALU]>;
+
+def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
+                    IIC_iUNAr, "rbit", "\t$Rd, $Rm",
+                    [(set GPR:$Rd, (bitreverse GPR:$Rm))]>,
+           Requires<[IsARM, HasV6T2]>,
+           Sched<[WriteALU]>;
+
+def REV  : AMiscA1I<0b01101011, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
+                    IIC_iUNAr, "rev", "\t$Rd, $Rm",
+                    [(set GPR:$Rd, (bswap GPR:$Rm))]>, Requires<[IsARM, HasV6]>,
+           Sched<[WriteALU]>;
+
+let AddedComplexity = 5 in
+def REV16 : AMiscA1I<0b01101011, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
+                     IIC_iUNAr, "rev16", "\t$Rd, $Rm",
+                     [(set GPR:$Rd, (rotr (bswap GPR:$Rm), (i32 16)))]>,
+            Requires<[IsARM, HasV6]>,
+            Sched<[WriteALU]>;
+
+def : ARMV6Pat<(srl (bswap (extloadi16 addrmode3:$addr)), (i32 16)),
+               (REV16 (LDRH addrmode3:$addr))>;
+def : ARMV6Pat<(truncstorei16 (srl (bswap GPR:$Rn), (i32 16)), addrmode3:$addr),
+               (STRH (REV16 GPR:$Rn), addrmode3:$addr)>;
+
+let AddedComplexity = 5 in
+def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
+                     IIC_iUNAr, "revsh", "\t$Rd, $Rm",
+                     [(set GPR:$Rd, (sra (bswap GPR:$Rm), (i32 16)))]>,
+            Requires<[IsARM, HasV6]>,
+            Sched<[WriteALU]>;
+
+def : ARMV6Pat<(or (sra (shl GPR:$Rm, (i32 24)), (i32 16)),
+                   (and (srl GPR:$Rm, (i32 8)), 0xFF)),
+               (REVSH GPR:$Rm)>;
+
+def PKHBT : APKHI<0b01101000, 0, (outs GPRnopc:$Rd),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_lsl_amt:$sh),
+                  IIC_iALUsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
+                  [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF),
+                                         (and (shl GPRnopc:$Rm, pkh_lsl_amt:$sh),
+                                              0xFFFF0000)))]>,
+            Requires<[IsARM, HasV6]>,
+            Sched<[WriteALUsi, ReadALU]>;
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (and GPRnopc:$Rm, 0xFFFF0000)),
+               (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (shl GPRnopc:$Rm, imm16_31:$sh)),
+               (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, imm16_31:$sh)>;
+
+// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
+// will match the pattern below.
+def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_asr_amt:$sh),
+                  IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
+                  [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF0000),
+                                         (and (sra GPRnopc:$Rm, pkh_asr_amt:$sh),
+                                              0xFFFF)))]>,
+            Requires<[IsARM, HasV6]>,
+            Sched<[WriteALUsi, ReadALU]>;
+
+// Alternate cases for PKHTB where identities eliminate some nodes. Note that
+// a shift amount of 0 is *not legal* here; it is PKHBT instead.
+// We also cannot replace an srl (17..31) by the arithmetic shift we would use
+// in pkhtb src1, src2, asr (17..31).
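+// Concretely (reasoning added for clarity): for a shift in 17..31, asr
+// fills the top of the result's low halfword with copies of src2's sign
+// bit, while srl fills it with zeros, so the two disagree whenever src2 is
+// negative. That leaves srl #16 (first pattern below) and srl #1-15 masked
+// back to 16 bits (third pattern) as the only srl forms that are safe to
+// select as pkhtb.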
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000), + (srl GPRnopc:$src2, imm16:$sh)), + (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16:$sh)>; +def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000), + (sra GPRnopc:$src2, imm16_31:$sh)), + (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16_31:$sh)>; +def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000), + (and (srl GPRnopc:$src2, imm1_15:$sh), 0xFFFF)), + (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>; + +//===----------------------------------------------------------------------===// +// CRC Instructions +// +// Polynomials: +// + CRC32{B,H,W} 0x04C11DB7 +// + CRC32C{B,H,W} 0x1EDC6F41 +// + +class AI_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin> + : AInoP<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), MiscFrm, NoItinerary, + !strconcat("crc32", suffix), "\t$Rd, $Rn, $Rm", + [(set GPRnopc:$Rd, (builtin GPRnopc:$Rn, GPRnopc:$Rm))]>, + Requires<[IsARM, HasV8, HasCRC]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + + let Inst{31-28} = 0b1110; + let Inst{27-23} = 0b00010; + let Inst{22-21} = sz; + let Inst{20} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-10} = 0b00; + let Inst{9} = C; + let Inst{8} = 0; + let Inst{7-4} = 0b0100; + let Inst{3-0} = Rm; + + let Unpredictable{11-8} = 0b1101; +} + +def CRC32B : AI_crc32<0, 0b00, "b", int_arm_crc32b>; +def CRC32CB : AI_crc32<1, 0b00, "cb", int_arm_crc32cb>; +def CRC32H : AI_crc32<0, 0b01, "h", int_arm_crc32h>; +def CRC32CH : AI_crc32<1, 0b01, "ch", int_arm_crc32ch>; +def CRC32W : AI_crc32<0, 0b10, "w", int_arm_crc32w>; +def CRC32CW : AI_crc32<1, 0b10, "cw", int_arm_crc32cw>; + +//===----------------------------------------------------------------------===// +// ARMv8.1a Privilege Access Never extension +// +// SETPAN #imm1 + +def SETPAN : AInoP<(outs), (ins imm0_1:$imm), MiscFrm, NoItinerary, "setpan", + "\t$imm", []>, Requires<[IsARM, HasV8, HasV8_1a]> { + bits<1> imm; + + let Inst{31-28} = 0b1111; + let Inst{27-20} = 0b00010001; + let Inst{19-16} = 0b0000; + let Inst{15-10} = 0b000000; + let Inst{9} = imm; + let Inst{8} = 0b0; + let Inst{7-4} = 0b0000; + let Inst{3-0} = 0b0000; + + let Unpredictable{19-16} = 0b1111; + let Unpredictable{15-10} = 0b111111; + let Unpredictable{8} = 0b1; + let Unpredictable{3-0} = 0b1111; +} + +//===----------------------------------------------------------------------===// +// Comparison Instructions... +// + +defm CMP : AI1_cmp_irs<0b1010, "cmp", + IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, + BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; + +// ARMcmpZ can re-use the above instruction definitions. 
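+// (ARMcmpZ is the compare node used when only the Z flag of the result is
+// consumed, i.e. a plain equality test; anything that computes the same Z
+// can implement it, so the patterns below simply reuse the CMP encodings.
+// That weaker requirement is also what lets CMN stand in for a compare
+// against a negated register operand further down.)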
+def : ARMPat<(ARMcmpZ GPR:$src, mod_imm:$imm), + (CMPri GPR:$src, mod_imm:$imm)>; +def : ARMPat<(ARMcmpZ GPR:$src, GPR:$rhs), + (CMPrr GPR:$src, GPR:$rhs)>; +def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs), + (CMPrsi GPR:$src, so_reg_imm:$rhs)>; +def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs), + (CMPrsr GPR:$src, so_reg_reg:$rhs)>; + +// CMN register-integer +let isCompare = 1, Defs = [CPSR] in { +def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, mod_imm:$imm), DPFrm, IIC_iCMPi, + "cmn", "\t$Rn, $imm", + [(ARMcmn GPR:$Rn, mod_imm:$imm)]>, + Sched<[WriteCMP, ReadALU]> { + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-0} = imm; + + let Unpredictable{15-12} = 0b1111; +} + +// CMN register-register/shift +def CMNzrr : AI1<0b1011, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iCMPr, + "cmn", "\t$Rn, $Rm", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPR:$Rn, GPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> { + bits<4> Rn; + bits<4> Rm; + let isCommutable = 1; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-4} = 0b00000000; + let Inst{3-0} = Rm; + + let Unpredictable{15-12} = 0b1111; +} + +def CMNzrsi : AI1<0b1011, (outs), + (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iCMPsr, + "cmn", "\t$Rn, $shift", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPR:$Rn, so_reg_imm:$shift)]>, + Sched<[WriteCMPsi, ReadALU]> { + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; + + let Unpredictable{15-12} = 0b1111; +} + +def CMNzrsr : AI1<0b1011, (outs), + (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iCMPsr, + "cmn", "\t$Rn, $shift", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPRnopc:$Rn, so_reg_reg:$shift)]>, + Sched<[WriteCMPsr, ReadALU]> { + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; + + let Unpredictable{15-12} = 0b1111; +} + +} + +def : ARMPat<(ARMcmp GPR:$src, mod_imm_neg:$imm), + (CMNri GPR:$src, mod_imm_neg:$imm)>; + +def : ARMPat<(ARMcmpZ GPR:$src, mod_imm_neg:$imm), + (CMNri GPR:$src, mod_imm_neg:$imm)>; + +// Note that TST/TEQ don't set all the same flags that CMP does! +defm TST : AI1_cmp_irs<0b1000, "tst", + IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr, + BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>, 1, + "DecodeTSTInstruction">; +defm TEQ : AI1_cmp_irs<0b1001, "teq", + IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr, + BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>; + +// Pseudo i64 compares for some floating point compares. 
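+// BCCi64 branches on a comparison of the i64 value held in the GPR pair
+// $lhs1/$lhs2 against the pair $rhs1/$rhs2 (BCCZi64 is the compare-with-zero
+// form); the custom inserter expands it after ISel into a pair of 32-bit
+// compares feeding the conditional branch. (Sketched from the operand lists
+// below; the exact expansion lives in the C++ custom-inserter code.)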
+let usesCustomInserter = 1, isBranch = 1, isTerminator = 1, + Defs = [CPSR] in { +def BCCi64 : PseudoInst<(outs), + (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst), + IIC_Br, + [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>, + Sched<[WriteBr]>; + +def BCCZi64 : PseudoInst<(outs), + (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), IIC_Br, + [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>, + Sched<[WriteBr]>; +} // usesCustomInserter + + +// Conditional moves +let hasSideEffects = 0 in { + +let isCommutable = 1, isSelect = 1 in +def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, GPR:$Rm, cmovpred:$p), + 4, IIC_iCMOVr, + [(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; + +def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, so_reg_imm:$shift, cmovpred:$p), + 4, IIC_iCMOVsr, + [(set GPR:$Rd, + (ARMcmov GPR:$false, so_reg_imm:$shift, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; +def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, so_reg_reg:$shift, cmovpred:$p), + 4, IIC_iCMOVsr, + [(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; + + +let isMoveImm = 1 in +def MOVCCi16 + : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, imm0_65535_expr:$imm, cmovpred:$p), + 4, IIC_iMOVi, + [(set GPR:$Rd, (ARMcmov GPR:$false, imm0_65535:$imm, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>, + Sched<[WriteALU]>; + +let isMoveImm = 1 in +def MOVCCi : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, mod_imm:$imm, cmovpred:$p), + 4, IIC_iCMOVi, + [(set GPR:$Rd, (ARMcmov GPR:$false, mod_imm:$imm, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; + +// Two instruction predicate mov immediate. 
+let isMoveImm = 1 in
+def MOVCCi32imm
+    : ARMPseudoInst<(outs GPR:$Rd),
+                    (ins GPR:$false, i32imm:$src, cmovpred:$p),
+                    8, IIC_iCMOVix2,
+                    [(set GPR:$Rd, (ARMcmov GPR:$false, imm:$src,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
+
+let isMoveImm = 1 in
+def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, mod_imm:$imm, cmovpred:$p),
+                           4, IIC_iCMOVi,
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, mod_imm_not:$imm,
+                                                   cmovpred:$p))]>,
+             RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+} // hasSideEffects
+
+
+//===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+def MemBarrierOptOperand : AsmOperandClass {
+  let Name = "MemBarrierOpt";
+  let ParserMethod = "parseMemBarrierOptOperand";
+}
+def memb_opt : Operand<i32> {
+  let PrintMethod = "printMemBOption";
+  let ParserMatchClass = MemBarrierOptOperand;
+  let DecoderMethod = "DecodeMemBarrierOption";
+}
+
+def InstSyncBarrierOptOperand : AsmOperandClass {
+  let Name = "InstSyncBarrierOpt";
+  let ParserMethod = "parseInstSyncBarrierOptOperand";
+}
+def instsyncb_opt : Operand<i32> {
+  let PrintMethod = "printInstSyncBOption";
+  let ParserMatchClass = InstSyncBarrierOptOperand;
+  let DecoderMethod = "DecodeInstSyncBarrierOption";
+}
+
+// Memory barriers protect the atomic sequences
+let hasSideEffects = 1 in {
+def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+                "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
+          Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff05;
+  let Inst{3-0} = opt;
+}
+
+def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+                "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
+          Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff04;
+  let Inst{3-0} = opt;
+}
+
+// ISB has only the full-system option
+def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
+                "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>,
+          Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff06;
+  let Inst{3-0} = opt;
+}
+}
+
+let usesCustomInserter = 1, Defs = [CPSR] in {
+
+// Pseudo instruction that combines movs + predicated rsbmi
+// to implement integer ABS
+  def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
+}
+
+let usesCustomInserter = 1 in {
+  def COPY_STRUCT_BYVAL_I32 : PseudoInst<
+    (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment),
+    NoItinerary,
+    [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
+}
+
+let hasPostISelHook = 1, Constraints = "$newdst = $dst, $newsrc = $src" in {
+  // %newsrc, %newdst = MEMCPY %dst, %src, N, ...N scratch regs...
+  // Copies N registers worth of memory from address %src to address %dst
+  // and returns the incremented addresses. N scratch registers will
+  // be attached for the copy to use.
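+  // As a C-level sketch of the net effect (illustrative only; the actual
+  // expansion is chosen after isel and uses LDM/STM over the scratch regs):
+  //   uint32_t *d = dst, *s = src;
+  //   for (unsigned i = 0; i < nreg; ++i) *d++ = *s++;
+  //   newdst = d; newsrc = s;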
+ def MEMCPY : PseudoInst< + (outs GPR:$newdst, GPR:$newsrc), + (ins GPR:$dst, GPR:$src, i32imm:$nreg, variable_ops), + NoItinerary, + [(set GPR:$newdst, GPR:$newsrc, + (ARMmemcopy GPR:$dst, GPR:$src, imm:$nreg))]>; +} + +def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldrex_2 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldrex_4 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; + +def strex_1 : PatFrag<(ops node:$val, node:$ptr), + (int_arm_strex node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def strex_2 : PatFrag<(ops node:$val, node:$ptr), + (int_arm_strex node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def strex_4 : PatFrag<(ops node:$val, node:$ptr), + (int_arm_strex node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; + +def ldaex_1 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldaex_2 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldaex_4 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; + +def stlex_1 : PatFrag<(ops node:$val, node:$ptr), + (int_arm_stlex node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def stlex_2 : PatFrag<(ops node:$val, node:$ptr), + (int_arm_stlex node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def stlex_4 : PatFrag<(ops node:$val, node:$ptr), + (int_arm_stlex node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; + +let mayLoad = 1 in { +def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldrexb", "\t$Rt, $addr", + [(set GPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>; +def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldrexh", "\t$Rt, $addr", + [(set GPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>; +def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldrex", "\t$Rt, $addr", + [(set GPR:$Rt, (ldrex_4 addr_offset_none:$addr))]>; +let hasExtraDefRegAllocReq = 1 in +def LDREXD : AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr), + NoItinerary, "ldrexd", "\t$Rt, $addr", []> { + let DecoderMethod = "DecodeDoubleRegLoad"; +} + +def LDAEXB : AIldaex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldaexb", "\t$Rt, $addr", + [(set GPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>; +def LDAEXH : AIldaex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldaexh", "\t$Rt, $addr", + [(set GPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>; +def LDAEX : AIldaex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldaex", "\t$Rt, $addr", + [(set GPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>; +let hasExtraDefRegAllocReq = 1 in +def LDAEXD : AIldaex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr), + NoItinerary, "ldaexd", "\t$Rt, $addr", []> { + let DecoderMethod = "DecodeDoubleRegLoad"; +} +} + +let mayStore = 1, 
Constraints = "@earlyclobber $Rd" in { +def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "strexb", "\t$Rd, $Rt, $addr", + [(set GPR:$Rd, (strex_1 GPR:$Rt, + addr_offset_none:$addr))]>; +def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "strexh", "\t$Rd, $Rt, $addr", + [(set GPR:$Rd, (strex_2 GPR:$Rt, + addr_offset_none:$addr))]>; +def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "strex", "\t$Rd, $Rt, $addr", + [(set GPR:$Rd, (strex_4 GPR:$Rt, + addr_offset_none:$addr))]>; +let hasExtraSrcRegAllocReq = 1 in +def STREXD : AIstrex<0b01, (outs GPR:$Rd), + (ins GPRPairOp:$Rt, addr_offset_none:$addr), + NoItinerary, "strexd", "\t$Rd, $Rt, $addr", []> { + let DecoderMethod = "DecodeDoubleRegStore"; +} +def STLEXB: AIstlex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "stlexb", "\t$Rd, $Rt, $addr", + [(set GPR:$Rd, + (stlex_1 GPR:$Rt, addr_offset_none:$addr))]>; +def STLEXH: AIstlex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "stlexh", "\t$Rd, $Rt, $addr", + [(set GPR:$Rd, + (stlex_2 GPR:$Rt, addr_offset_none:$addr))]>; +def STLEX : AIstlex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "stlex", "\t$Rd, $Rt, $addr", + [(set GPR:$Rd, + (stlex_4 GPR:$Rt, addr_offset_none:$addr))]>; +let hasExtraSrcRegAllocReq = 1 in +def STLEXD : AIstlex<0b01, (outs GPR:$Rd), + (ins GPRPairOp:$Rt, addr_offset_none:$addr), + NoItinerary, "stlexd", "\t$Rd, $Rt, $addr", []> { + let DecoderMethod = "DecodeDoubleRegStore"; +} +} + +def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", + [(int_arm_clrex)]>, + Requires<[IsARM, HasV6K]> { + let Inst{31-0} = 0b11110101011111111111000000011111; +} + +def : ARMPat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr), + (STREXB GPR:$Rt, addr_offset_none:$addr)>; +def : ARMPat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr), + (STREXH GPR:$Rt, addr_offset_none:$addr)>; + +def : ARMPat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr), + (STLEXB GPR:$Rt, addr_offset_none:$addr)>; +def : ARMPat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr), + (STLEXH GPR:$Rt, addr_offset_none:$addr)>; + +class acquiring_load<PatFrag base> + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); + return isAtLeastAcquire(Ordering); +}]>; + +def atomic_load_acquire_8 : acquiring_load<atomic_load_8>; +def atomic_load_acquire_16 : acquiring_load<atomic_load_16>; +def atomic_load_acquire_32 : acquiring_load<atomic_load_32>; + +class releasing_store<PatFrag base> + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); + return isAtLeastRelease(Ordering); +}]>; + +def atomic_store_release_8 : releasing_store<atomic_store_8>; +def atomic_store_release_16 : releasing_store<atomic_store_16>; +def atomic_store_release_32 : releasing_store<atomic_store_32>; + +let AddedComplexity = 8 in { + def : ARMPat<(atomic_load_acquire_8 addr_offset_none:$addr), (LDAB addr_offset_none:$addr)>; + def : ARMPat<(atomic_load_acquire_16 addr_offset_none:$addr), (LDAH addr_offset_none:$addr)>; + def : ARMPat<(atomic_load_acquire_32 addr_offset_none:$addr), (LDA addr_offset_none:$addr)>; + def : ARMPat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (STLB GPR:$val, addr_offset_none:$addr)>; + def : 
ARMPat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (STLH GPR:$val, addr_offset_none:$addr)>; + def : ARMPat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (STL GPR:$val, addr_offset_none:$addr)>; +} + +// SWP/SWPB are deprecated in V6/V7. +let mayLoad = 1, mayStore = 1 in { +def SWP : AIswp<0, (outs GPRnopc:$Rt), + (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>, + Requires<[PreV8]>; +def SWPB: AIswp<1, (outs GPRnopc:$Rt), + (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>, + Requires<[PreV8]>; +} + +//===----------------------------------------------------------------------===// +// Coprocessor Instructions. +// + +def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, + c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), + NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", + [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, + imm:$CRm, imm:$opc2)]>, + Requires<[PreV8]> { + bits<4> opc1; + bits<4> CRn; + bits<4> CRd; + bits<4> cop; + bits<3> opc2; + bits<4> CRm; + + let Inst{3-0} = CRm; + let Inst{4} = 0; + let Inst{7-5} = opc2; + let Inst{11-8} = cop; + let Inst{15-12} = CRd; + let Inst{19-16} = CRn; + let Inst{23-20} = opc1; +} + +def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, + c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), + NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", + [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, + imm:$CRm, imm:$opc2)]>, + Requires<[PreV8]> { + let Inst{31-28} = 0b1111; + bits<4> opc1; + bits<4> CRn; + bits<4> CRd; + bits<4> cop; + bits<3> opc2; + bits<4> CRm; + + let Inst{3-0} = CRm; + let Inst{4} = 0; + let Inst{7-5} = opc2; + let Inst{11-8} = cop; + let Inst{15-12} = CRd; + let Inst{19-16} = CRn; + let Inst{23-20} = opc1; +} + +class ACI<dag oops, dag iops, string opc, string asm, + IndexMode im = IndexModeNone> + : I<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary, + opc, asm, "", []> { + let Inst{27-25} = 0b110; +} +class ACInoP<dag oops, dag iops, string opc, string asm, + IndexMode im = IndexModeNone> + : InoP<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary, + opc, asm, "", []> { + let Inst{31-28} = 0b1111; + let Inst{27-25} = 0b110; +} +multiclass LdStCop<bit load, bit Dbit, string asm> { + def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), + asm, "\t$cop, $CRd, $addr"> { + bits<13> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 1; // P = 1 + let Inst{23} = addr{8}; + let Inst{22} = Dbit; + let Inst{21} = 0; // W = 0 + let Inst{20} = load; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = addr{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr), + asm, "\t$cop, $CRd, $addr!", IndexModePre> { + bits<13> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 1; // P = 1 + let Inst{23} = addr{8}; + let Inst{22} = Dbit; + let Inst{21} = 1; // W = 1 + let Inst{20} = load; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = addr{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _POST: ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, + postidx_imm8s4:$offset), + asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> { + bits<9> offset; + bits<4> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 0; // P = 0 + let Inst{23} = offset{8}; + let Inst{22} = Dbit; + let Inst{21} = 1; // W = 1 + let Inst{20} = load; + let 
Inst{19-16} = addr; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = offset{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _OPTION : ACI<(outs), + (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, + coproc_option_imm:$option), + asm, "\t$cop, $CRd, $addr, $option"> { + bits<8> option; + bits<4> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 0; // P = 0 + let Inst{23} = 1; // U = 1 + let Inst{22} = Dbit; + let Inst{21} = 0; // W = 0 + let Inst{20} = load; + let Inst{19-16} = addr; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = option; + let DecoderMethod = "DecodeCopMemInstruction"; + } +} +multiclass LdSt2Cop<bit load, bit Dbit, string asm> { + def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), + asm, "\t$cop, $CRd, $addr"> { + bits<13> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 1; // P = 1 + let Inst{23} = addr{8}; + let Inst{22} = Dbit; + let Inst{21} = 0; // W = 0 + let Inst{20} = load; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = addr{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr), + asm, "\t$cop, $CRd, $addr!", IndexModePre> { + bits<13> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 1; // P = 1 + let Inst{23} = addr{8}; + let Inst{22} = Dbit; + let Inst{21} = 1; // W = 1 + let Inst{20} = load; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = addr{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _POST: ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, + postidx_imm8s4:$offset), + asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> { + bits<9> offset; + bits<4> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 0; // P = 0 + let Inst{23} = offset{8}; + let Inst{22} = Dbit; + let Inst{21} = 1; // W = 1 + let Inst{20} = load; + let Inst{19-16} = addr; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = offset{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _OPTION : ACInoP<(outs), + (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, + coproc_option_imm:$option), + asm, "\t$cop, $CRd, $addr, $option"> { + bits<8> option; + bits<4> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 0; // P = 0 + let Inst{23} = 1; // U = 1 + let Inst{22} = Dbit; + let Inst{21} = 0; // W = 0 + let Inst{20} = load; + let Inst{19-16} = addr; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = option; + let DecoderMethod = "DecodeCopMemInstruction"; + } +} + +defm LDC : LdStCop <1, 0, "ldc">; +defm LDCL : LdStCop <1, 1, "ldcl">; +defm STC : LdStCop <0, 0, "stc">; +defm STCL : LdStCop <0, 1, "stcl">; +defm LDC2 : LdSt2Cop<1, 0, "ldc2">, Requires<[PreV8]>; +defm LDC2L : LdSt2Cop<1, 1, "ldc2l">, Requires<[PreV8]>; +defm STC2 : LdSt2Cop<0, 0, "stc2">, Requires<[PreV8]>; +defm STC2L : LdSt2Cop<0, 1, "stc2l">, Requires<[PreV8]>; + +//===----------------------------------------------------------------------===// +// Move between coprocessor and ARM core register. 
+// + +class MovRCopro<string opc, bit direction, dag oops, dag iops, + list<dag> pattern> + : ABI<0b1110, oops, iops, NoItinerary, opc, + "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", pattern> { + let Inst{20} = direction; + let Inst{4} = 1; + + bits<4> Rt; + bits<4> cop; + bits<3> opc1; + bits<3> opc2; + bits<4> CRm; + bits<4> CRn; + + let Inst{15-12} = Rt; + let Inst{11-8} = cop; + let Inst{23-21} = opc1; + let Inst{7-5} = opc2; + let Inst{3-0} = CRm; + let Inst{19-16} = CRn; +} + +def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, + (outs), + (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, imm0_7:$opc2), + [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, + imm:$CRm, imm:$opc2)]>, + ComplexDeprecationPredicate<"MCR">; +def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", + (MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, 0, pred:$p)>; +def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */, + (outs GPRwithAPSR:$Rt), + (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, + imm0_7:$opc2), []>; +def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm", + (MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + c_imm:$CRm, 0, pred:$p)>; + +def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), + (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; + +class MovRCopro2<string opc, bit direction, dag oops, dag iops, + list<dag> pattern> + : ABXI<0b1110, oops, iops, NoItinerary, + !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> { + let Inst{31-24} = 0b11111110; + let Inst{20} = direction; + let Inst{4} = 1; + + bits<4> Rt; + bits<4> cop; + bits<3> opc1; + bits<3> opc2; + bits<4> CRm; + bits<4> CRn; + + let Inst{15-12} = Rt; + let Inst{11-8} = cop; + let Inst{23-21} = opc1; + let Inst{7-5} = opc2; + let Inst{3-0} = CRm; + let Inst{19-16} = CRn; +} + +def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */, + (outs), + (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, imm0_7:$opc2), + [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, + imm:$CRm, imm:$opc2)]>, + Requires<[PreV8]>; +def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm", + (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, 0)>; +def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */, + (outs GPRwithAPSR:$Rt), + (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, + imm0_7:$opc2), []>, + Requires<[PreV8]>; +def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm", + (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + c_imm:$CRm, 0)>; + +def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, + imm:$CRm, imm:$opc2), + (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; + +class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag> + pattern = []> + : ABI<0b1100, oops, iops, NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", + pattern> { + + let Inst{23-21} = 0b010; + let Inst{20} = direction; + + bits<4> Rt; + bits<4> Rt2; + bits<4> cop; + bits<4> opc1; + bits<4> CRm; + + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + let Inst{11-8} = cop; + let Inst{7-4} = opc1; + let Inst{3-0} = CRm; +} + +def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */, + (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, c_imm:$CRm), + [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt, + GPRnopc:$Rt2, imm:$CRm)]>; +def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor 
to ARM core register */,
+                      (outs GPRnopc:$Rt, GPRnopc:$Rt2),
+                      (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>;
+
+class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
+  : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+         GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), NoItinerary,
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>,
+    Requires<[PreV8]> {
+  let Inst{31-28} = 0b1111;
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8} = cop;
+  let Inst{7-4} = opc1;
+  let Inst{3-0} = CRm;
+
+  let DecoderMethod = "DecodeMRRC2";
+}
+
+def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
+                        [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt,
+                                        GPRnopc:$Rt2, imm:$CRm)]>;
+def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>;
+
+//===----------------------------------------------------------------------===//
+// Move between special register and ARM core register
+//
+
+// Move to ARM core register from Special Register
+def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
+              "mrs", "\t$Rd, apsr", []> {
+  bits<4> Rd;
+  let Inst{23-16} = 0b00001111;
+  let Unpredictable{19-17} = 0b111;
+
+  let Inst{15-12} = Rd;
+
+  let Inst{11-0} = 0b000000000000;
+  let Unpredictable{11-0} = 0b110100001111;
+}
+
+def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>,
+      Requires<[IsARM]>;
+
+// The MRSsys instruction is the MRS instruction from the ARM ARM,
+// section B9.3.9, with the R bit set to 1.
+def MRSsys : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
+                 "mrs", "\t$Rd, spsr", []> {
+  bits<4> Rd;
+  let Inst{23-16} = 0b01001111;
+  let Unpredictable{19-16} = 0b1111;
+
+  let Inst{15-12} = Rd;
+
+  let Inst{11-0} = 0b000000000000;
+  let Unpredictable{11-0} = 0b110100001111;
+}
+
+// However, the MRS (banked register) system instruction (ARMv7VE) *does* have
+// a separate encoding (distinguished by bit 5).
+def MRSbanked : ABI<0b0001, (outs GPRnopc:$Rd), (ins banked_reg:$banked),
+                    NoItinerary, "mrs", "\t$Rd, $banked", []>,
+                Requires<[IsARM, HasVirtualization]> {
+  bits<6> banked;
+  bits<4> Rd;
+
+  let Inst{23} = 0;
+  let Inst{22} = banked{5}; // R bit
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = banked{3-0};
+  let Inst{15-12} = Rd;
+  let Inst{11-9} = 0b001;
+  let Inst{8} = banked{4};
+  let Inst{7-0} = 0b00000000;
+}
+
+// Move from ARM core register to Special Register
+//
+// No need to have both system and application versions of MSR (immediate) or
+// MSR (register); the encodings are the same and the assembly parser has no
+// way to distinguish between them. The mask operand contains the special
+// register (R bit) in bit 4, and bits 3-0 contain the mask with the fields to
+// be accessed in the special register.
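+// For example, "msr APSR_nzcvq, r0" writes only the flag field: the mask
+// operand carries R = 0 in bit 4 and mask{3-0} = 0b1000 (the "f" field).
+// This is purely an illustration of the operand layout, not an extra
+// encoding.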
+def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
+              "msr", "\t$mask, $Rn", []> {
+  bits<5> mask;
+  bits<4> Rn;
+
+  let Inst{23} = 0;
+  let Inst{22} = mask{4}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = mask{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-4} = 0b00000000;
+  let Inst{3-0} = Rn;
+}
+
+def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, mod_imm:$imm), NoItinerary,
+               "msr", "\t$mask, $imm", []> {
+  bits<5> mask;
+  bits<12> imm;
+
+  let Inst{23} = 0;
+  let Inst{22} = mask{4}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = mask{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-0} = imm;
+}
+
+// However, the MSR (banked register) system instruction (ARMv7VE) *does* have
+// a separate encoding (distinguished by bit 5).
+def MSRbanked : ABI<0b0001, (outs), (ins banked_reg:$banked, GPRnopc:$Rn),
+                    NoItinerary, "msr", "\t$banked, $Rn", []>,
+                Requires<[IsARM, HasVirtualization]> {
+  bits<6> banked;
+  bits<4> Rn;
+
+  let Inst{23} = 0;
+  let Inst{22} = banked{5}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = banked{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-9} = 0b001;
+  let Inst{8} = banked{4};
+  let Inst{7-4} = 0b0000;
+  let Inst{3-0} = Rn;
+}
+
+// Dynamic stack allocation yields a _chkstk for Windows targets. These calls
+// are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in the correct sequence.
+// The main point of having a separate instruction is the extra unmodelled
+// effects (compared to ordinary calls), such as the stack pointer change.
+
+def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone,
+                         [SDNPHasChain, SDNPSideEffect]>;
+let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in
+  def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>;
+
+def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK,
+                         [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+let usesCustomInserter = 1, Defs = [CPSR] in
+  def WIN__DBZCHK : PseudoInst<(outs), (ins GPR:$divisor), NoItinerary,
+                               [(win__dbzchk GPR:$divisor)]>;
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+// This is a pseudo inst so that we can get the encoding right,
+// complete with fixup for the aeabi_read_tp function.
+// TPsoft is valid for ARM mode only; for Thumb mode, a corresponding tTPsoft
+// pattern is defined in "ARMInstrThumb.td".
+let isCall = 1,
+  Defs = [R0, R12, LR, CPSR], Uses = [SP] in {
+  def TPsoft : ARMPseudoInst<(outs), (ins), 4, IIC_Br,
+               [(set R0, ARMthread_pointer)]>, Sched<[WriteBr]>;
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everything out of registers
+//   except for our own input by listing the relevant registers in Defs.
+//   By doing so, we also cause the prologue/epilogue code to actively
+//   preserve all of the callee-saved registers, which is exactly what we
+//   want.
+//   A constant value is passed in $val, and we use the location as a scratch.
+//
+// These are pseudo-instructions and are lowered to individual MC-insts, so
+// no encoding information is necessary.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR,
+    Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7,  Q8,  Q9,  Q10, Q11, Q12, Q13,
+    Q14, Q15 ],
+  hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
+                               NoItinerary,
+                         [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
+                           Requires<[IsARM, HasVFP2]>;
+}
+
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR ],
+  hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
+                               NoItinerary,
+                         [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
+                                Requires<[IsARM, NoVFP]>;
+}
+
+// FIXME: Non-IOS version(s)
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1,
+    Defs = [ R7, LR, SP ] in {
+def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
+                             NoItinerary,
+                         [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+                                Requires<[IsARM]>;
+}
+
+let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in
+def Int_eh_sjlj_setup_dispatch : PseudoInst<(outs), (ins), NoItinerary,
+                                            [(ARMeh_sjlj_setup_dispatch)]>;
+
+// eh.sjlj.dispatchsetup pseudo-instruction.
+// This pseudo is used for both ARM and Thumb. Any differences are handled when
+// the pseudo is expanded (which happens before any passes that need the
+// instruction size).
+let isBarrier = 1 in
+def Int_eh_sjlj_dispatchsetup : PseudoInst<(outs), (ins), NoItinerary, []>;
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// ARMv4 indirect branch using (MOVr PC, dst)
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
+  def MOVPCRX : ARMPseudoExpand<(outs), (ins GPR:$dst),
+                    4, IIC_Br, [(brind GPR:$dst)],
+                    (MOVr PC, GPR:$dst, (ops 14, zero_reg), zero_reg)>,
+                Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
+
+// Large immediate handling.
+
+// 32-bit immediate using two-piece mod_imms or movw + movt.
+// This is a single pseudo instruction; the benefit is that it can be remat'd
+// as a single unit instead of having to handle reg inputs.
+// FIXME: Remove this when we can do generalized remat.
+let isReMaterializable = 1, isMoveImm = 1 in
+def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
+                           [(set GPR:$dst, (arm_i32imm:$src))]>,
+                           Requires<[IsARM]>;
+
+def LDRLIT_ga_abs : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iLoad_i,
+                               [(set GPR:$dst, (ARMWrapper tglobaladdr:$src))]>,
+                    Requires<[IsARM, DontUseMovt]>;
+
+// Pseudo instruction that combines movw + movt + add pc (if PIC).
+// It also makes it possible to rematerialize the instructions.
+// FIXME: Remove this when we can do generalized remat and when machine licm
+// can properly handle the instructions.
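+// A sketch of the eventual PIC expansion (illustrative; the exact operands
+// and fixups are produced when the pseudo is expanded):
+//   movw rd, :lower16:(sym - (LPC + 8))
+//   movt rd, :upper16:(sym - (LPC + 8))
+// LPC:
+//   add  rd, pc, rd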
+let isReMaterializable = 1 in { +def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2addpc, + [(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>, + Requires<[IsARM, UseMovt]>; + +def LDRLIT_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iLoadiALU, + [(set GPR:$dst, + (ARMWrapperPIC tglobaladdr:$addr))]>, + Requires<[IsARM, DontUseMovt]>; + +let AddedComplexity = 10 in +def LDRLIT_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + NoItinerary, + [(set GPR:$dst, + (load (ARMWrapperPIC tglobaladdr:$addr)))]>, + Requires<[IsARM, DontUseMovt]>; + +let AddedComplexity = 10 in +def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2ld, + [(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>, + Requires<[IsARM, UseMovt]>; +} // isReMaterializable + +// ConstantPool, GlobalAddress, and JumpTable +def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; +def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, + Requires<[IsARM, UseMovt]>; +def : ARMPat<(ARMWrapperJT tjumptable:$dst), + (LEApcrelJT tjumptable:$dst)>; + +// TODO: add,sub,and, 3-instr forms? + +// Tail calls. These patterns also apply to Thumb mode. +def : Pat<(ARMtcret tcGPR:$dst), (TCRETURNri tcGPR:$dst)>; +def : Pat<(ARMtcret (i32 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>; +def : Pat<(ARMtcret (i32 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>; + +// Direct calls +def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>; +def : ARMPat<(ARMcall_nolink texternalsym:$func), + (BMOVPCB_CALL texternalsym:$func)>; + +// zextload i1 -> zextload i8 +def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(zextloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; + +// extload -> zextload +def : ARMPat<(extloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(extloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; +def : ARMPat<(extloadi8 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(extloadi8 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; + +def : ARMPat<(extloadi16 addrmode3:$addr), (LDRH addrmode3:$addr)>; + +def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>; +def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>; + +// smul* and smla* +def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16))), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra GPR:$b, (i32 16))), + (SMULBT GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))), + (SMULBT GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16))), + (SMULTB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b), + (SMULTB GPR:$a, GPR:$b)>; + +def : ARMV5MOPat<(add GPR:$acc, + (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16)))), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5MOPat<(add GPR:$acc, + (mul sext_16_node:$a, sext_16_node:$b)), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5MOPat<(add GPR:$acc, + (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra GPR:$b, (i32 16)))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5MOPat<(add GPR:$acc, + (mul sext_16_node:$a, (sra 
GPR:$b, (i32 16)))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5MOPat<(add GPR:$acc, + (mul (sra GPR:$a, (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16)))), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5MOPat<(add GPR:$acc, + (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; + + +// Pre-v7 uses MCR for synchronization barriers. +def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>, + Requires<[IsARM, HasV6]>; + +// SXT/UXT with no rotate +let AddedComplexity = 16 in { +def : ARMV6Pat<(and GPR:$Src, 0x000000FF), (UXTB GPR:$Src, 0)>; +def : ARMV6Pat<(and GPR:$Src, 0x0000FFFF), (UXTH GPR:$Src, 0)>; +def : ARMV6Pat<(and GPR:$Src, 0x00FF00FF), (UXTB16 GPR:$Src, 0)>; +def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0x00FF)), + (UXTAB GPR:$Rn, GPR:$Rm, 0)>; +def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0xFFFF)), + (UXTAH GPR:$Rn, GPR:$Rm, 0)>; +} + +def : ARMV6Pat<(sext_inreg GPR:$Src, i8), (SXTB GPR:$Src, 0)>; +def : ARMV6Pat<(sext_inreg GPR:$Src, i16), (SXTH GPR:$Src, 0)>; + +def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i8)), + (SXTAB GPR:$Rn, GPRnopc:$Rm, 0)>; +def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i16)), + (SXTAH GPR:$Rn, GPRnopc:$Rm, 0)>; + +// Atomic load/store patterns +def : ARMPat<(atomic_load_8 ldst_so_reg:$src), + (LDRBrs ldst_so_reg:$src)>; +def : ARMPat<(atomic_load_8 addrmode_imm12:$src), + (LDRBi12 addrmode_imm12:$src)>; +def : ARMPat<(atomic_load_16 addrmode3:$src), + (LDRH addrmode3:$src)>; +def : ARMPat<(atomic_load_32 ldst_so_reg:$src), + (LDRrs ldst_so_reg:$src)>; +def : ARMPat<(atomic_load_32 addrmode_imm12:$src), + (LDRi12 addrmode_imm12:$src)>; +def : ARMPat<(atomic_store_8 ldst_so_reg:$ptr, GPR:$val), + (STRBrs GPR:$val, ldst_so_reg:$ptr)>; +def : ARMPat<(atomic_store_8 addrmode_imm12:$ptr, GPR:$val), + (STRBi12 GPR:$val, addrmode_imm12:$ptr)>; +def : ARMPat<(atomic_store_16 addrmode3:$ptr, GPR:$val), + (STRH GPR:$val, addrmode3:$ptr)>; +def : ARMPat<(atomic_store_32 ldst_so_reg:$ptr, GPR:$val), + (STRrs GPR:$val, ldst_so_reg:$ptr)>; +def : ARMPat<(atomic_store_32 addrmode_imm12:$ptr, GPR:$val), + (STRi12 GPR:$val, addrmode_imm12:$ptr)>; + + +//===----------------------------------------------------------------------===// +// Thumb Support +// + +include "ARMInstrThumb.td" + +//===----------------------------------------------------------------------===// +// Thumb2 Support +// + +include "ARMInstrThumb2.td" + +//===----------------------------------------------------------------------===// +// Floating Point Support +// + +include "ARMInstrVFP.td" + +//===----------------------------------------------------------------------===// +// Advanced SIMD (NEON) Support +// + +include "ARMInstrNEON.td" + +//===----------------------------------------------------------------------===// +// Assembler aliases +// + +// Memory barriers +def : InstAlias<"dmb", (DMB 0xf)>, Requires<[IsARM, HasDB]>; +def : InstAlias<"dsb", (DSB 0xf)>, Requires<[IsARM, HasDB]>; +def : InstAlias<"isb", (ISB 0xf)>, Requires<[IsARM, HasDB]>; + +// System instructions +def : MnemonicAlias<"swi", "svc">; + +// Load / Store Multiple +def : MnemonicAlias<"ldmfd", "ldm">; +def : MnemonicAlias<"ldmia", "ldm">; +def : MnemonicAlias<"ldmea", "ldmdb">; +def : MnemonicAlias<"stmfd", "stmdb">; +def : MnemonicAlias<"stmia", "stm">; +def : MnemonicAlias<"stmea", "stm">; + +// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the +// shift amount is zero (i.e., unspecified). 
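+// For reference, with a zero shift PKHBT behaves like the following C
+// (a sketch of the architectural semantics):
+//   rd = (rn & 0x0000FFFF) | (rm & 0xFFFF0000);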
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm", + (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>, + Requires<[IsARM, HasV6]>; +def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm", + (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>, + Requires<[IsARM, HasV6]>; + +// PUSH/POP aliases for STM/LDM +def : ARMInstAlias<"push${p} $regs", (STMDB_UPD SP, pred:$p, reglist:$regs)>; +def : ARMInstAlias<"pop${p} $regs", (LDMIA_UPD SP, pred:$p, reglist:$regs)>; + +// SSAT/USAT optional shift operand. +def : ARMInstAlias<"ssat${p} $Rd, $sat_imm, $Rn", + (SSAT GPRnopc:$Rd, imm1_32:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>; +def : ARMInstAlias<"usat${p} $Rd, $sat_imm, $Rn", + (USAT GPRnopc:$Rd, imm0_31:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>; + + +// Extend instruction optional rotate operand. +def : ARMInstAlias<"sxtab${p} $Rd, $Rn, $Rm", + (SXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"sxtah${p} $Rd, $Rn, $Rm", + (SXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"sxtab16${p} $Rd, $Rn, $Rm", + (SXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"sxtb${p} $Rd, $Rm", + (SXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"sxtb16${p} $Rd, $Rm", + (SXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"sxth${p} $Rd, $Rm", + (SXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>; + +def : ARMInstAlias<"uxtab${p} $Rd, $Rn, $Rm", + (UXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"uxtah${p} $Rd, $Rn, $Rm", + (UXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"uxtab16${p} $Rd, $Rn, $Rm", + (UXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"uxtb${p} $Rd, $Rm", + (UXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"uxtb16${p} $Rd, $Rm", + (UXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"uxth${p} $Rd, $Rm", + (UXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>; + + +// RFE aliases +def : MnemonicAlias<"rfefa", "rfeda">; +def : MnemonicAlias<"rfeea", "rfedb">; +def : MnemonicAlias<"rfefd", "rfeia">; +def : MnemonicAlias<"rfeed", "rfeib">; +def : MnemonicAlias<"rfe", "rfeia">; + +// SRS aliases +def : MnemonicAlias<"srsfa", "srsib">; +def : MnemonicAlias<"srsea", "srsia">; +def : MnemonicAlias<"srsfd", "srsdb">; +def : MnemonicAlias<"srsed", "srsda">; +def : MnemonicAlias<"srs", "srsia">; + +// QSAX == QSUBADDX +def : MnemonicAlias<"qsubaddx", "qsax">; +// SASX == SADDSUBX +def : MnemonicAlias<"saddsubx", "sasx">; +// SHASX == SHADDSUBX +def : MnemonicAlias<"shaddsubx", "shasx">; +// SHSAX == SHSUBADDX +def : MnemonicAlias<"shsubaddx", "shsax">; +// SSAX == SSUBADDX +def : MnemonicAlias<"ssubaddx", "ssax">; +// UASX == UADDSUBX +def : MnemonicAlias<"uaddsubx", "uasx">; +// UHASX == UHADDSUBX +def : MnemonicAlias<"uhaddsubx", "uhasx">; +// UHSAX == UHSUBADDX +def : MnemonicAlias<"uhsubaddx", "uhsax">; +// UQASX == UQADDSUBX +def : MnemonicAlias<"uqaddsubx", "uqasx">; +// UQSAX == UQSUBADDX +def : MnemonicAlias<"uqsubaddx", "uqsax">; +// USAX == USUBADDX +def : MnemonicAlias<"usubaddx", "usax">; + +// "mov Rd, mod_imm_not" can be handled via "mvn" in assembly, just like +// for isel. 
+def : ARMInstAlias<"mov${s}${p} $Rd, $imm", + (MVNi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>; +def : ARMInstAlias<"mvn${s}${p} $Rd, $imm", + (MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>; +// Same for AND <--> BIC +def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm", + (ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, + pred:$p, cc_out:$s)>; +def : ARMInstAlias<"bic${s}${p} $Rdn, $imm", + (ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, + pred:$p, cc_out:$s)>; +def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm", + (BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, + pred:$p, cc_out:$s)>; +def : ARMInstAlias<"and${s}${p} $Rdn, $imm", + (BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, + pred:$p, cc_out:$s)>; + +// Likewise, "add Rd, mod_imm_neg" -> sub +def : ARMInstAlias<"add${s}${p} $Rd, $Rn, $imm", + (SUBri GPR:$Rd, GPR:$Rn, mod_imm_neg:$imm, pred:$p, cc_out:$s)>; +def : ARMInstAlias<"add${s}${p} $Rd, $imm", + (SUBri GPR:$Rd, GPR:$Rd, mod_imm_neg:$imm, pred:$p, cc_out:$s)>; +// Same for CMP <--> CMN via mod_imm_neg +def : ARMInstAlias<"cmp${p} $Rd, $imm", + (CMNri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>; +def : ARMInstAlias<"cmn${p} $Rd, $imm", + (CMPri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>; + +// The shifter forms of the MOV instruction are aliased to the ASR, LSL, +// LSR, ROR, and RRX instructions. +// FIXME: We need C++ parser hooks to map the alias to the MOV +// encoding. It seems we should be able to do that sort of thing +// in tblgen, but it could get ugly. +let TwoOperandAliasConstraint = "$Rm = $Rd" in { +def ASRi : ARMAsmPseudo<"asr${s}${p} $Rd, $Rm, $imm", + (ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p, + cc_out:$s)>; +def LSRi : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rm, $imm", + (ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p, + cc_out:$s)>; +def LSLi : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rm, $imm", + (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p, + cc_out:$s)>; +def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm", + (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p, + cc_out:$s)>; +} +def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm", + (ins GPR:$Rd, GPR:$Rm, pred:$p, cc_out:$s)>; +let TwoOperandAliasConstraint = "$Rn = $Rd" in { +def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm", + (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, + cc_out:$s)>; +def LSRr : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rn, $Rm", + (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, + cc_out:$s)>; +def LSLr : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rn, $Rm", + (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, + cc_out:$s)>; +def RORr : ARMAsmPseudo<"ror${s}${p} $Rd, $Rn, $Rm", + (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, + cc_out:$s)>; +} + +// "neg" is and alias for "rsb rd, rn, #0" +def : ARMInstAlias<"neg${s}${p} $Rd, $Rm", + (RSBri GPR:$Rd, GPR:$Rm, 0, pred:$p, cc_out:$s)>; + +// Pre-v6, 'mov r0, r0' was used as a NOP encoding. +def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg)>, + Requires<[IsARM, NoV6]>; + +// MUL/UMLAL/SMLAL/UMULL/SMULL are available on all arches, but +// the instruction definitions need difference constraints pre-v6. +// Use these aliases for the assembly parsing on pre-v6. 
+def : InstAlias<"mul${s}${p} $Rd, $Rn, $Rm", + (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +def : InstAlias<"mla${s}${p} $Rd, $Rn, $Rm, $Ra", + (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra, + pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +def : InstAlias<"smlal${s}${p} $RdLo, $RdHi, $Rn, $Rm", + (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +def : InstAlias<"umlal${s}${p} $RdLo, $RdHi, $Rn, $Rm", + (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +def : InstAlias<"smull${s}${p} $RdLo, $RdHi, $Rn, $Rm", + (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm", + (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; + +// 'it' blocks in ARM mode just validate the predicates. The IT itself +// is discarded. +def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>, + ComplexDeprecationPredicate<"IT">; + +let mayLoad = 1, mayStore =1, hasSideEffects = 1 in +def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn), + NoItinerary, + [(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td new file mode 100644 index 0000000..7020ffb --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -0,0 +1,8178 @@ +//===-- ARMInstrNEON.td - NEON support for ARM -------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the ARM NEON instruction set. +// +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// NEON-specific Operands. 
+//===----------------------------------------------------------------------===// +def nModImm : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; +} + +def nImmSplatI8AsmOperand : AsmOperandClass { let Name = "NEONi8splat"; } +def nImmSplatI8 : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmSplatI8AsmOperand; +} +def nImmSplatI16AsmOperand : AsmOperandClass { let Name = "NEONi16splat"; } +def nImmSplatI16 : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmSplatI16AsmOperand; +} +def nImmSplatI32AsmOperand : AsmOperandClass { let Name = "NEONi32splat"; } +def nImmSplatI32 : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmSplatI32AsmOperand; +} +def nImmSplatNotI16AsmOperand : AsmOperandClass { let Name = "NEONi16splatNot"; } +def nImmSplatNotI16 : Operand<i32> { + let ParserMatchClass = nImmSplatNotI16AsmOperand; +} +def nImmSplatNotI32AsmOperand : AsmOperandClass { let Name = "NEONi32splatNot"; } +def nImmSplatNotI32 : Operand<i32> { + let ParserMatchClass = nImmSplatNotI32AsmOperand; +} +def nImmVMOVI32AsmOperand : AsmOperandClass { let Name = "NEONi32vmov"; } +def nImmVMOVI32 : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmVMOVI32AsmOperand; +} + +def nImmVMOVI16AsmOperandByteReplicate : + AsmOperandClass { + let Name = "NEONi16vmovByteReplicate"; + let PredicateMethod = "isNEONi16ByteReplicate"; + let RenderMethod = "addNEONvmovByteReplicateOperands"; +} +def nImmVMOVI32AsmOperandByteReplicate : + AsmOperandClass { + let Name = "NEONi32vmovByteReplicate"; + let PredicateMethod = "isNEONi32ByteReplicate"; + let RenderMethod = "addNEONvmovByteReplicateOperands"; +} +def nImmVMVNI16AsmOperandByteReplicate : + AsmOperandClass { + let Name = "NEONi16invByteReplicate"; + let PredicateMethod = "isNEONi16ByteReplicate"; + let RenderMethod = "addNEONinvByteReplicateOperands"; +} +def nImmVMVNI32AsmOperandByteReplicate : + AsmOperandClass { + let Name = "NEONi32invByteReplicate"; + let PredicateMethod = "isNEONi32ByteReplicate"; + let RenderMethod = "addNEONinvByteReplicateOperands"; +} + +def nImmVMOVI16ByteReplicate : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmVMOVI16AsmOperandByteReplicate; +} +def nImmVMOVI32ByteReplicate : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmVMOVI32AsmOperandByteReplicate; +} +def nImmVMVNI16ByteReplicate : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmVMVNI16AsmOperandByteReplicate; +} +def nImmVMVNI32ByteReplicate : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmVMVNI32AsmOperandByteReplicate; +} + +def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; } +def nImmVMOVI32Neg : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmVMOVI32NegAsmOperand; +} +def nImmVMOVF32 : Operand<i32> { + let PrintMethod = "printFPImmOperand"; + let ParserMatchClass = FPImmOperand; +} +def nImmSplatI64AsmOperand : AsmOperandClass { let Name = "NEONi64splat"; } +def nImmSplatI64 : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmSplatI64AsmOperand; +} + +def VectorIndex8Operand : AsmOperandClass { let Name = "VectorIndex8"; } +def VectorIndex16Operand : AsmOperandClass { let Name = "VectorIndex16"; } +def VectorIndex32Operand : 
AsmOperandClass { let Name = "VectorIndex32"; }
+def VectorIndex8 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 8;
+}]> {
+  let ParserMatchClass = VectorIndex8Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex16 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 4;
+}]> {
+  let ParserMatchClass = VectorIndex16Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex32 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 2;
+}]> {
+  let ParserMatchClass = VectorIndex32Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+
+// Register list of one D register.
+def VecListOneDAsmOperand : AsmOperandClass {
+  let Name = "VecListOneD";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListOneD : RegisterOperand<DPR, "printVectorListOne"> {
+  let ParserMatchClass = VecListOneDAsmOperand;
+}
+// Register list of two sequential D registers.
+def VecListDPairAsmOperand : AsmOperandClass {
+  let Name = "VecListDPair";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPair : RegisterOperand<DPair, "printVectorListTwo"> {
+  let ParserMatchClass = VecListDPairAsmOperand;
+}
+// Register list of three sequential D registers.
+def VecListThreeDAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeD";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeD : RegisterOperand<DPR, "printVectorListThree"> {
+  let ParserMatchClass = VecListThreeDAsmOperand;
+}
+// Register list of four sequential D registers.
+def VecListFourDAsmOperand : AsmOperandClass {
+  let Name = "VecListFourD";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourD : RegisterOperand<DPR, "printVectorListFour"> {
+  let ParserMatchClass = VecListFourDAsmOperand;
+}
+// Register list of two D registers spaced by 2 (two sequential Q registers).
+def VecListDPairSpacedAsmOperand : AsmOperandClass {
+  let Name = "VecListDPairSpaced";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPairSpaced : RegisterOperand<DPair, "printVectorListTwoSpaced"> {
+  let ParserMatchClass = VecListDPairSpacedAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three Q registers).
+def VecListThreeQAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQ";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeQ : RegisterOperand<DPR, "printVectorListThreeSpaced"> {
+  let ParserMatchClass = VecListThreeQAsmOperand;
+}
+// Register list of four D registers spaced by 2 (four Q registers).
+def VecListFourQAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQ";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourQ : RegisterOperand<DPR, "printVectorListFourSpaced"> {
+  let ParserMatchClass = VecListFourQAsmOperand;
+}
+
+// Register list of one D register, with "all lanes" subscripting.
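+// In assembly these lists appear with "[]" subscripts, e.g. (syntax
+// illustration only):
+//   vld1.32 {d16[]}, [r0]         @ one D register, all lanes
+//   vld1.32 {d16[], d17[]}, [r0]  @ D register pair, all lanes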
+def VecListOneDAllLanesAsmOperand : AsmOperandClass { + let Name = "VecListOneDAllLanes"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; +} +def VecListOneDAllLanes : RegisterOperand<DPR, "printVectorListOneAllLanes"> { + let ParserMatchClass = VecListOneDAllLanesAsmOperand; +} +// Register list of two D registers, with "all lanes" subscripting. +def VecListDPairAllLanesAsmOperand : AsmOperandClass { + let Name = "VecListDPairAllLanes"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; +} +def VecListDPairAllLanes : RegisterOperand<DPair, + "printVectorListTwoAllLanes"> { + let ParserMatchClass = VecListDPairAllLanesAsmOperand; +} +// Register list of two D registers spaced by 2 (two sequential Q registers). +def VecListDPairSpacedAllLanesAsmOperand : AsmOperandClass { + let Name = "VecListDPairSpacedAllLanes"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; +} +def VecListDPairSpacedAllLanes : RegisterOperand<DPair, + "printVectorListTwoSpacedAllLanes"> { + let ParserMatchClass = VecListDPairSpacedAllLanesAsmOperand; +} +// Register list of three D registers, with "all lanes" subscripting. +def VecListThreeDAllLanesAsmOperand : AsmOperandClass { + let Name = "VecListThreeDAllLanes"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; +} +def VecListThreeDAllLanes : RegisterOperand<DPR, + "printVectorListThreeAllLanes"> { + let ParserMatchClass = VecListThreeDAllLanesAsmOperand; +} +// Register list of three D registers spaced by 2 (three sequential Q regs). +def VecListThreeQAllLanesAsmOperand : AsmOperandClass { + let Name = "VecListThreeQAllLanes"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; +} +def VecListThreeQAllLanes : RegisterOperand<DPR, + "printVectorListThreeSpacedAllLanes"> { + let ParserMatchClass = VecListThreeQAllLanesAsmOperand; +} +// Register list of four D registers, with "all lanes" subscripting. +def VecListFourDAllLanesAsmOperand : AsmOperandClass { + let Name = "VecListFourDAllLanes"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; +} +def VecListFourDAllLanes : RegisterOperand<DPR, "printVectorListFourAllLanes"> { + let ParserMatchClass = VecListFourDAllLanesAsmOperand; +} +// Register list of four D registers spaced by 2 (four sequential Q regs). +def VecListFourQAllLanesAsmOperand : AsmOperandClass { + let Name = "VecListFourQAllLanes"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListOperands"; +} +def VecListFourQAllLanes : RegisterOperand<DPR, + "printVectorListFourSpacedAllLanes"> { + let ParserMatchClass = VecListFourQAllLanesAsmOperand; +} + + +// Register list of one D register, with byte lane subscripting. +def VecListOneDByteIndexAsmOperand : AsmOperandClass { + let Name = "VecListOneDByteIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListOneDByteIndexed : Operand<i32> { + let ParserMatchClass = VecListOneDByteIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with half-word lane subscripting. 
+def VecListOneDHWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListOneDHWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListOneDHWordIndexed : Operand<i32> { + let ParserMatchClass = VecListOneDHWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with word lane subscripting. +def VecListOneDWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListOneDWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListOneDWordIndexed : Operand<i32> { + let ParserMatchClass = VecListOneDWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} + +// Register list of two D registers with byte lane subscripting. +def VecListTwoDByteIndexAsmOperand : AsmOperandClass { + let Name = "VecListTwoDByteIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListTwoDByteIndexed : Operand<i32> { + let ParserMatchClass = VecListTwoDByteIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with half-word lane subscripting. +def VecListTwoDHWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListTwoDHWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListTwoDHWordIndexed : Operand<i32> { + let ParserMatchClass = VecListTwoDHWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with word lane subscripting. +def VecListTwoDWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListTwoDWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListTwoDWordIndexed : Operand<i32> { + let ParserMatchClass = VecListTwoDWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// Register list of two Q registers with half-word lane subscripting. +def VecListTwoQHWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListTwoQHWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListTwoQHWordIndexed : Operand<i32> { + let ParserMatchClass = VecListTwoQHWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with word lane subscripting. +def VecListTwoQWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListTwoQWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListTwoQWordIndexed : Operand<i32> { + let ParserMatchClass = VecListTwoQWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} + + +// Register list of three D registers with byte lane subscripting. +def VecListThreeDByteIndexAsmOperand : AsmOperandClass { + let Name = "VecListThreeDByteIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListThreeDByteIndexed : Operand<i32> { + let ParserMatchClass = VecListThreeDByteIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with half-word lane subscripting. 
+def VecListThreeDHWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListThreeDHWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListThreeDHWordIndexed : Operand<i32> { + let ParserMatchClass = VecListThreeDHWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with word lane subscripting. +def VecListThreeDWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListThreeDWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListThreeDWordIndexed : Operand<i32> { + let ParserMatchClass = VecListThreeDWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// Register list of three Q registers with half-word lane subscripting. +def VecListThreeQHWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListThreeQHWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListThreeQHWordIndexed : Operand<i32> { + let ParserMatchClass = VecListThreeQHWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with word lane subscripting. +def VecListThreeQWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListThreeQWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListThreeQWordIndexed : Operand<i32> { + let ParserMatchClass = VecListThreeQWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} + +// Register list of four D registers with byte lane subscripting. +def VecListFourDByteIndexAsmOperand : AsmOperandClass { + let Name = "VecListFourDByteIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListFourDByteIndexed : Operand<i32> { + let ParserMatchClass = VecListFourDByteIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with half-word lane subscripting. +def VecListFourDHWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListFourDHWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListFourDHWordIndexed : Operand<i32> { + let ParserMatchClass = VecListFourDHWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with word lane subscripting. +def VecListFourDWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListFourDWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListFourDWordIndexed : Operand<i32> { + let ParserMatchClass = VecListFourDWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// Register list of four Q registers with half-word lane subscripting. +def VecListFourQHWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListFourQHWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListFourQHWordIndexed : Operand<i32> { + let ParserMatchClass = VecListFourQHWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} +// ...with word lane subscripting. 
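+// (For the double-spaced Q forms the list names every other D register, e.g.
+//  "{d0[1], d2[1], d4[1], d6[1]}" in a "vld4.32" with word lane subscripting.)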
+def VecListFourQWordIndexAsmOperand : AsmOperandClass { + let Name = "VecListFourQWordIndexed"; + let ParserMethod = "parseVectorList"; + let RenderMethod = "addVecListIndexedOperands"; +} +def VecListFourQWordIndexed : Operand<i32> { + let ParserMatchClass = VecListFourQWordIndexAsmOperand; + let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); +} + +def dword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 8; +}]>; +def dword_alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 8; +}]>; +def word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() == 4; +}]>; +def word_alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() == 4; +}]>; +def hword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() == 2; +}]>; +def hword_alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() == 2; +}]>; +def byte_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() == 1; +}]>; +def byte_alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() == 1; +}]>; +def non_word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() < 4; +}]>; +def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() < 4; +}]>; + +//===----------------------------------------------------------------------===// +// NEON-specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDTARMVCMP : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>; +def SDTARMVCMPZ : SDTypeProfile<1, 1, []>; + +def NEONvceq : SDNode<"ARMISD::VCEQ", SDTARMVCMP>; +def NEONvceqz : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>; +def NEONvcge : SDNode<"ARMISD::VCGE", SDTARMVCMP>; +def NEONvcgez : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>; +def NEONvclez : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>; +def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>; +def NEONvcgt : SDNode<"ARMISD::VCGT", SDTARMVCMP>; +def NEONvcgtz : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>; +def NEONvcltz : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>; +def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>; +def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>; + +// Types for vector shift by immediates. The "SHX" version is for long and +// narrow operations where the source and destination vectors have different +// types. The "SHINS" version is for shift and insert operations. 
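+// Reading the profiles concretely (illustrative DAG fragments only, not
+// patterns defined in this file):
+//   SDTARMVSH:    result and first operand share a type; the shift amount
+//                 is an i32:   (NEONvshrs (v4i32 QPR:$src), (i32 3)) : v4i32
+//   SDTARMVSHX:   source and result types differ, as in the narrowing
+//                 shifts:      (NEONvshrn (v8i16 QPR:$src), (i32 8)) : v8i8
+//   SDTARMVSHINS: shift-and-insert reads the destination vector as well:
+//                 (NEONvsli (v2i32 DPR:$d), (v2i32 DPR:$m), (i32 4)) : v2i32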
+def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def SDTARMVSHX : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; +def SDTARMVSHINS : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; + +def NEONvshl : SDNode<"ARMISD::VSHL", SDTARMVSH>; +def NEONvshrs : SDNode<"ARMISD::VSHRs", SDTARMVSH>; +def NEONvshru : SDNode<"ARMISD::VSHRu", SDTARMVSH>; +def NEONvshrn : SDNode<"ARMISD::VSHRN", SDTARMVSHX>; + +def NEONvrshrs : SDNode<"ARMISD::VRSHRs", SDTARMVSH>; +def NEONvrshru : SDNode<"ARMISD::VRSHRu", SDTARMVSH>; +def NEONvrshrn : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>; + +def NEONvqshls : SDNode<"ARMISD::VQSHLs", SDTARMVSH>; +def NEONvqshlu : SDNode<"ARMISD::VQSHLu", SDTARMVSH>; +def NEONvqshlsu : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>; +def NEONvqshrns : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>; +def NEONvqshrnu : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>; +def NEONvqshrnsu : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>; + +def NEONvqrshrns : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>; +def NEONvqrshrnu : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>; +def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>; + +def NEONvsli : SDNode<"ARMISD::VSLI", SDTARMVSHINS>; +def NEONvsri : SDNode<"ARMISD::VSRI", SDTARMVSHINS>; + +def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; +def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; +def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; + +def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; +def NEONvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>; +def NEONvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>; +def NEONvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>; + +def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def NEONvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>; +def NEONvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>; + +def NEONvbsl : SDNode<"ARMISD::VBSL", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>>; + +def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; + +// VDUPLANE can produce a quad-register result from a double-register source, +// so the result is not constrained to match the source. 
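+// e.g. with the profile below both of these are well formed:
+//   (v2f32 (NEONvduplane (v2f32 DPR:$Vm), (i32 1)))   // D -> D
+//   (v4f32 (NEONvduplane (v2f32 DPR:$Vm), (i32 1)))   // D -> Q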
+def NEONvduplane : SDNode<"ARMISD::VDUPLANE", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisVT<2, i32>]>>; + +def SDTARMVEXT : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; +def NEONvext : SDNode<"ARMISD::VEXT", SDTARMVEXT>; + +def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; +def NEONvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>; +def NEONvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>; +def NEONvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>; + +def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>; +def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>; +def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>; +def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>; + +def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisSameAs<1, 2>]>; +def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>; +def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>; + +def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{ + ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); + unsigned EltBits = 0; + uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); + return (EltBits == 32 && EltVal == 0); +}]>; + +def NEONimmAllOnesV: PatLeaf<(NEONvmovImm (i32 timm)), [{ + ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); + unsigned EltBits = 0; + uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); + return (EltBits == 8 && EltVal == 0xff); +}]>; + +//===----------------------------------------------------------------------===// +// NEON load / store instructions +//===----------------------------------------------------------------------===// + +// Use VLDM to load a Q register as a D register pair. +// This is a pseudo instruction that is expanded to VLDMD after reg alloc. +def VLDMQIA + : PseudoVFPLdStM<(outs DPair:$dst), (ins GPR:$Rn), + IIC_fpLoad_m, "", + [(set DPair:$dst, (v2f64 (load GPR:$Rn)))]>; + +// Use VSTM to store a Q register as a D register pair. +// This is a pseudo instruction that is expanded to VSTMD after reg alloc. +def VSTMQIA + : PseudoVFPLdStM<(outs), (ins DPair:$src, GPR:$Rn), + IIC_fpStore_m, "", + [(store (v2f64 DPair:$src), GPR:$Rn)]>; + +// Classes for VLD* pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. 
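+// The VLDMQIA/VSTMQIA pseudos above are expanded the same way: a Q register
+// overlays a pair of D registers (Q0 = D0:D1), so once register allocation
+// has picked concrete registers the access can use the ordinary VLDM/VSTM
+// encodings. Illustrative expansion (register numbers chosen only for the
+// example):
+//   before:  %q0 = VLDMQIA %r0         ; pseudo, loads a D-register pair
+//   after:   vldmia r0, {d0, d1}       ; fills q0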
+class VLDQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst), (ins addrmode6:$addr), itin, "">; +class VLDQWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), itin, + "$addr.addr = $wb">; +class VLDQWBfixedPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr), itin, + "$addr.addr = $wb">; +class VLDQWBregisterPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, rGPR:$offset), itin, + "$addr.addr = $wb">; + +class VLDQQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr), itin, "">; +class VLDQQWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), itin, + "$addr.addr = $wb">; +class VLDQQWBfixedPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), + (ins addrmode6:$addr), itin, + "$addr.addr = $wb">; +class VLDQQWBregisterPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, rGPR:$offset), itin, + "$addr.addr = $wb">; + + +class VLDQQQQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin, + "$src = $dst">; +class VLDQQQQWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin, + "$addr.addr = $wb, $src = $dst">; + +let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { + +// VLD1 : Vector Load (multiple single elements) +class VLD1D<bits<4> op7_4, string Dt, Operand AddrMode> + : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd), + (ins AddrMode:$Rn), IIC_VLD1, + "vld1", Dt, "$Vd, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST1Instruction"; +} +class VLD1Q<bits<4> op7_4, string Dt, Operand AddrMode> + : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd), + (ins AddrMode:$Rn), IIC_VLD1x2, + "vld1", Dt, "$Vd, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; +} + +def VLD1d8 : VLD1D<{0,0,0,?}, "8", addrmode6align64>; +def VLD1d16 : VLD1D<{0,1,0,?}, "16", addrmode6align64>; +def VLD1d32 : VLD1D<{1,0,0,?}, "32", addrmode6align64>; +def VLD1d64 : VLD1D<{1,1,0,?}, "64", addrmode6align64>; + +def VLD1q8 : VLD1Q<{0,0,?,?}, "8", addrmode6align64or128>; +def VLD1q16 : VLD1Q<{0,1,?,?}, "16", addrmode6align64or128>; +def VLD1q32 : VLD1Q<{1,0,?,?}, "32", addrmode6align64or128>; +def VLD1q64 : VLD1Q<{1,1,?,?}, "64", addrmode6align64or128>; + +// ...with address register writeback: +multiclass VLD1DWB<bits<4> op7_4, string Dt, Operand AddrMode> { + def _fixed : NLdSt<0,0b10, 0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb), + (ins AddrMode:$Rn), IIC_VLD1u, + "vld1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
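+    // (Rm == 0b1101, i.e. r13, is the encoding's reserved marker for the
+    // fixed post-increment-by-access-size form; the _register variant
+    // below supplies a real Rm instead.)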
+ let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } + def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1u, + "vld1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } +} +multiclass VLD1QWB<bits<4> op7_4, string Dt, Operand AddrMode> { + def _fixed : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb), + (ins AddrMode:$Rn), IIC_VLD1x2u, + "vld1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } + def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, + "vld1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } +} + +defm VLD1d8wb : VLD1DWB<{0,0,0,?}, "8", addrmode6align64>; +defm VLD1d16wb : VLD1DWB<{0,1,0,?}, "16", addrmode6align64>; +defm VLD1d32wb : VLD1DWB<{1,0,0,?}, "32", addrmode6align64>; +defm VLD1d64wb : VLD1DWB<{1,1,0,?}, "64", addrmode6align64>; +defm VLD1q8wb : VLD1QWB<{0,0,?,?}, "8", addrmode6align64or128>; +defm VLD1q16wb : VLD1QWB<{0,1,?,?}, "16", addrmode6align64or128>; +defm VLD1q32wb : VLD1QWB<{1,0,?,?}, "32", addrmode6align64or128>; +defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64", addrmode6align64or128>; + +// ...with 3 registers +class VLD1D3<bits<4> op7_4, string Dt, Operand AddrMode> + : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd), + (ins AddrMode:$Rn), IIC_VLD1x3, "vld1", Dt, + "$Vd, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST1Instruction"; +} +multiclass VLD1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> { + def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb), + (ins AddrMode:$Rn), IIC_VLD1x2u, + "vld1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
+ let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } + def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, + "vld1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } +} + +def VLD1d8T : VLD1D3<{0,0,0,?}, "8", addrmode6align64>; +def VLD1d16T : VLD1D3<{0,1,0,?}, "16", addrmode6align64>; +def VLD1d32T : VLD1D3<{1,0,0,?}, "32", addrmode6align64>; +def VLD1d64T : VLD1D3<{1,1,0,?}, "64", addrmode6align64>; + +defm VLD1d8Twb : VLD1D3WB<{0,0,0,?}, "8", addrmode6align64>; +defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>; +defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>; +defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>; + +def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>; +def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>; +def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>; + +// ...with 4 registers +class VLD1D4<bits<4> op7_4, string Dt, Operand AddrMode> + : NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd), + (ins AddrMode:$Rn), IIC_VLD1x4, "vld1", Dt, + "$Vd, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; +} +multiclass VLD1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> { + def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb), + (ins AddrMode:$Rn), IIC_VLD1x2u, + "vld1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } + def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, + "vld1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } +} + +def VLD1d8Q : VLD1D4<{0,0,?,?}, "8", addrmode6align64or128or256>; +def VLD1d16Q : VLD1D4<{0,1,?,?}, "16", addrmode6align64or128or256>; +def VLD1d32Q : VLD1D4<{1,0,?,?}, "32", addrmode6align64or128or256>; +def VLD1d64Q : VLD1D4<{1,1,?,?}, "64", addrmode6align64or128or256>; + +defm VLD1d8Qwb : VLD1D4WB<{0,0,?,?}, "8", addrmode6align64or128or256>; +defm VLD1d16Qwb : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>; +defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; +defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; + +def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>; +def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>; +def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>; + +// VLD2 : Vector Load (multiple 2-element structures) +class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, + InstrItinClass itin, Operand AddrMode> + : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd), + (ins AddrMode:$Rn), itin, + "vld2", Dt, "$Vd, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST2Instruction"; +} + +def VLD2d8 : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2, + addrmode6align64or128>; +def VLD2d16 : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2, + addrmode6align64or128>; +def VLD2d32 : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2, + addrmode6align64or128>; + +def VLD2q8 : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2, + addrmode6align64or128or256>; +def VLD2q16 : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, 
IIC_VLD2x2, + addrmode6align64or128or256>; +def VLD2q32 : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2, + addrmode6align64or128or256>; + +def VLD2q8Pseudo : VLDQQPseudo<IIC_VLD2x2>; +def VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>; +def VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>; + +// ...with address register writeback: +multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt, + RegisterOperand VdTy, InstrItinClass itin, Operand AddrMode> { + def _fixed : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb), + (ins AddrMode:$Rn), itin, + "vld2", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST2Instruction"; + } + def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm), itin, + "vld2", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST2Instruction"; + } +} + +defm VLD2d8wb : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u, + addrmode6align64or128>; +defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u, + addrmode6align64or128>; +defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u, + addrmode6align64or128>; + +defm VLD2q8wb : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u, + addrmode6align64or128or256>; +defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u, + addrmode6align64or128or256>; +defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u, + addrmode6align64or128or256>; + +def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>; +def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>; +def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>; +def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>; +def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>; +def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>; + +// ...with double-spaced registers +def VLD2b8 : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2, + addrmode6align64or128>; +def VLD2b16 : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2, + addrmode6align64or128>; +def VLD2b32 : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2, + addrmode6align64or128>; +defm VLD2b8wb : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u, + addrmode6align64or128>; +defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u, + addrmode6align64or128>; +defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u, + addrmode6align64or128>; + +// VLD3 : Vector Load (multiple 3-element structures) +class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$Rn), IIC_VLD3, + "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST3Instruction"; +} + +def VLD3d8 : VLD3D<0b0100, {0,0,0,?}, "8">; +def VLD3d16 : VLD3D<0b0100, {0,1,0,?}, "16">; +def VLD3d32 : VLD3D<0b0100, {1,0,0,?}, "32">; + +def VLD3d8Pseudo : VLDQQPseudo<IIC_VLD3>; +def VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>; +def VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>; + +// ...with address register writeback: +class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u, + 
"vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST3Instruction"; +} + +def VLD3d8_UPD : VLD3DWB<0b0100, {0,0,0,?}, "8">; +def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">; +def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">; + +def VLD3d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>; +def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>; +def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>; + +// ...with double-spaced registers: +def VLD3q8 : VLD3D<0b0101, {0,0,0,?}, "8">; +def VLD3q16 : VLD3D<0b0101, {0,1,0,?}, "16">; +def VLD3q32 : VLD3D<0b0101, {1,0,0,?}, "32">; +def VLD3q8_UPD : VLD3DWB<0b0101, {0,0,0,?}, "8">; +def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">; +def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">; + +def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; + +// ...alternate versions to be allocated odd register numbers: +def VLD3q8oddPseudo : VLDQQQQPseudo<IIC_VLD3>; +def VLD3q16oddPseudo : VLDQQQQPseudo<IIC_VLD3>; +def VLD3q32oddPseudo : VLDQQQQPseudo<IIC_VLD3>; + +def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; + +// VLD4 : Vector Load (multiple 4-element structures) +class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$Rn), IIC_VLD4, + "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST4Instruction"; +} + +def VLD4d8 : VLD4D<0b0000, {0,0,?,?}, "8">; +def VLD4d16 : VLD4D<0b0000, {0,1,?,?}, "16">; +def VLD4d32 : VLD4D<0b0000, {1,0,?,?}, "32">; + +def VLD4d8Pseudo : VLDQQPseudo<IIC_VLD4>; +def VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>; +def VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>; + +// ...with address register writeback: +class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u, + "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST4Instruction"; +} + +def VLD4d8_UPD : VLD4DWB<0b0000, {0,0,?,?}, "8">; +def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">; +def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">; + +def VLD4d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>; +def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>; +def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>; + +// ...with double-spaced registers: +def VLD4q8 : VLD4D<0b0001, {0,0,?,?}, "8">; +def VLD4q16 : VLD4D<0b0001, {0,1,?,?}, "16">; +def VLD4q32 : VLD4D<0b0001, {1,0,?,?}, "32">; +def VLD4q8_UPD : VLD4DWB<0b0001, {0,0,?,?}, "8">; +def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">; +def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">; + +def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; + +// ...alternate versions to be allocated odd register numbers: +def VLD4q8oddPseudo : VLDQQQQPseudo<IIC_VLD4>; +def VLD4q16oddPseudo : VLDQQQQPseudo<IIC_VLD4>; +def VLD4q32oddPseudo : VLDQQQQPseudo<IIC_VLD4>; + +def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def 
VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; + +} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 + +// Classes for VLD*LN pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VLDQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst), + (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane), + itin, "$src = $dst">; +class VLDQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; +class VLDQQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst), + (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane), + itin, "$src = $dst">; +class VLDQQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; +class VLDQQQQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQQQPR:$dst), + (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane), + itin, "$src = $dst">; +class VLDQQQQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; + +// VLD1LN : Vector Load (single element to one lane) +class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag LoadOp> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd), + (ins addrmode6:$Rn, DPR:$src, nohash_imm:$lane), + IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn", + "$src = $Vd", + [(set DPR:$Vd, (vector_insert (Ty DPR:$src), + (i32 (LoadOp addrmode6:$Rn)), + imm:$lane))]> { + let Rm = 0b1111; + let DecoderMethod = "DecodeVLD1LN"; +} +class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag LoadOp> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd), + (ins addrmode6oneL32:$Rn, DPR:$src, nohash_imm:$lane), + IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn", + "$src = $Vd", + [(set DPR:$Vd, (vector_insert (Ty DPR:$src), + (i32 (LoadOp addrmode6oneL32:$Rn)), + imm:$lane))]> { + let Rm = 0b1111; + let DecoderMethod = "DecodeVLD1LN"; +} +class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> { + let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src), + (i32 (LoadOp addrmode6:$addr)), + imm:$lane))]; +} + +def VLD1LNd8 : VLD1LN<0b0000, {?,?,?,0}, "8", v8i8, extloadi8> { + let Inst{7-5} = lane{2-0}; +} +def VLD1LNd16 : VLD1LN<0b0100, {?,?,0,?}, "16", v4i16, extloadi16> { + let Inst{7-6} = lane{1-0}; + let Inst{5-4} = Rn{5-4}; +} +def VLD1LNd32 : VLD1LN32<0b1000, {?,0,?,?}, "32", v2i32, load> { + let Inst{7} = lane{0}; + let Inst{5-4} = Rn{5-4}; +} + +def VLD1LNq8Pseudo : VLD1QLNPseudo<v16i8, extloadi8>; +def VLD1LNq16Pseudo : VLD1QLNPseudo<v8i16, extloadi16>; +def VLD1LNq32Pseudo : VLD1QLNPseudo<v4i32, load>; + +def : Pat<(vector_insert (v2f32 DPR:$src), + (f32 (load addrmode6:$addr)), imm:$lane), + (VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>; +def : Pat<(vector_insert (v4f32 QPR:$src), + (f32 (load addrmode6:$addr)), imm:$lane), + (VLD1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>; + +let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { + +// ...with address register writeback: +class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, 
"vld1", Dt, + "\\{$Vd[$lane]\\}, $Rn$Rm", + "$src = $Vd, $Rn.addr = $wb", []> { + let DecoderMethod = "DecodeVLD1LN"; +} + +def VLD1LNd8_UPD : VLD1LNWB<0b0000, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD1LNd16_UPD : VLD1LNWB<0b0100, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{4}; +} +def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{4}; + let Inst{4} = Rn{4}; +} + +def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; +def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; +def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; + +// VLD2LN : Vector Load (single 2-element structure to one lane) +class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2), + (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane), + IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn", + "$src1 = $Vd, $src2 = $dst2", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD2LN"; +} + +def VLD2LNd8 : VLD2LN<0b0001, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD2LNd8Pseudo : VLDQLNPseudo<IIC_VLD2ln>; +def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>; +def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>; + +// ...with double-spaced registers: +def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>; +def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>; + +// ...with address register writeback: +class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2lnu, "vld2", Dt, + "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn$Rm", + "$src1 = $Vd, $src2 = $dst2, $Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD2LN"; +} + +def VLD2LNd8_UPD : VLD2LNWB<0b0001, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; +def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; +def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; + +def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>; +def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>; + +// VLD3LN : Vector Load (single 3-element structure to one lane) +class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, + nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt, + "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn", + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> { + let Rm = 0b1111; + let DecoderMethod = "DecodeVLD3LN"; +} + +def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def 
VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD3LNd8Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; +def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; +def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; + +// ...with double-spaced registers: +def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>; +def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>; + +// ...with address register writeback: +class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), + IIC_VLD3lnu, "vld3", Dt, + "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm", + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb", + []> { + let DecoderMethod = "DecodeVLD3LN"; +} + +def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; +def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; +def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; + +def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>; +def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>; + +// VLD4LN : Vector Load (single 4-element structure to one lane) +class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, + nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt, + "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn", + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD4LN"; +} + +def VLD4LNd8 : VLD4LN<0b0011, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VLD4LNd8Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; +def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; +def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; + +// ...with double-spaced registers: +def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>; +def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>; + +// ...with address register writeback: +class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), + IIC_VLD4lnu, "vld4", Dt, +"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn$Rm", +"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $Rn.addr = $wb", + []> { + let Inst{4} = Rn{4}; + let DecoderMethod = 
"DecodeVLD4LN" ; +} + +def VLD4LNd8_UPD : VLD4LNWB<0b0011, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; +def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; +def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; + +def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>; +def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>; + +} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 + +// VLD1DUP : Vector Load (single element to all lanes) +class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp, + Operand AddrMode> + : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListOneDAllLanes:$Vd), + (ins AddrMode:$Rn), + IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "", + [(set VecListOneDAllLanes:$Vd, + (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD1DupInstruction"; +} +def VLD1DUPd8 : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8, + addrmode6dupalignNone>; +def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16, + addrmode6dupalign16>; +def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load, + addrmode6dupalign32>; + +def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), + (VLD1DUPd32 addrmode6:$addr)>; + +class VLD1QDUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp, + Operand AddrMode> + : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListDPairAllLanes:$Vd), + (ins AddrMode:$Rn), IIC_VLD1dup, + "vld1", Dt, "$Vd, $Rn", "", + [(set VecListDPairAllLanes:$Vd, + (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD1DupInstruction"; +} + +def VLD1DUPq8 : VLD1QDUP<{0,0,1,0}, "8", v16i8, extloadi8, + addrmode6dupalignNone>; +def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16", v8i16, extloadi16, + addrmode6dupalign16>; +def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32", v4i32, load, + addrmode6dupalign32>; + +def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), + (VLD1DUPq32 addrmode6:$addr)>; + +let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { +// ...with address register writeback: +multiclass VLD1DUPWB<bits<4> op7_4, string Dt, Operand AddrMode> { + def _fixed : NLdSt<1, 0b10, 0b1100, op7_4, + (outs VecListOneDAllLanes:$Vd, GPR:$wb), + (ins AddrMode:$Rn), IIC_VLD1dupu, + "vld1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD1DupInstruction"; + } + def _register : NLdSt<1, 0b10, 0b1100, op7_4, + (outs VecListOneDAllLanes:$Vd, GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1dupu, + "vld1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD1DupInstruction"; + } +} +multiclass VLD1QDUPWB<bits<4> op7_4, string Dt, Operand AddrMode> { + def _fixed : NLdSt<1, 0b10, 0b1100, op7_4, + (outs VecListDPairAllLanes:$Vd, GPR:$wb), + (ins AddrMode:$Rn), IIC_VLD1dupu, + "vld1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+  }
+  def _register : NLdSt<1, 0b10, 0b1100, op7_4,
+                        (outs VecListDPairAllLanes:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1dupu,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+  }
+}
+
+defm VLD1DUPd8wb  : VLD1DUPWB<{0,0,0,0}, "8", addrmode6dupalignNone>;
+defm VLD1DUPd16wb : VLD1DUPWB<{0,1,0,?}, "16", addrmode6dupalign16>;
+defm VLD1DUPd32wb : VLD1DUPWB<{1,0,0,?}, "32", addrmode6dupalign32>;
+
+defm VLD1DUPq8wb  : VLD1QDUPWB<{0,0,1,0}, "8", addrmode6dupalignNone>;
+defm VLD1DUPq16wb : VLD1QDUPWB<{0,1,1,?}, "16", addrmode6dupalign16>;
+defm VLD1DUPq32wb : VLD1QDUPWB<{1,0,1,?}, "32", addrmode6dupalign32>;
+
+// VLD2DUP : Vector Load (single 2-element structure to all lanes)
+class VLD2DUP<bits<4> op7_4, string Dt, RegisterOperand VdTy, Operand AddrMode>
+  : NLdSt<1, 0b10, 0b1101, op7_4, (outs VdTy:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD2dup,
+          "vld2", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD2DupInstruction";
+}
+
+def VLD2DUPd8  : VLD2DUP<{0,0,0,?}, "8",  VecListDPairAllLanes,
+                         addrmode6dupalign16>;
+def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16", VecListDPairAllLanes,
+                         addrmode6dupalign32>;
+def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32", VecListDPairAllLanes,
+                         addrmode6dupalign64>;
+
+// ...with double-spaced registers
+// HACK: VLD2DUPd8x2 must be changed at the same time as VLD2b8, or
+// "vld2.8 {d0[], d2[]}, [r4:32]" will become "vld2.8 {d0, d2}, [r4:32]".
+def VLD2DUPd8x2  : VLD2DUP<{0,0,1,?}, "8",  VecListDPairSpacedAllLanes,
+                           addrmode6dupalign16>;
+def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
+                           addrmode6dupalign32>;
+def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
+                           addrmode6dupalign64>;
+
+// ...with address register writeback:
+multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
+                     Operand AddrMode> {
+  def _fixed : NLdSt<1, 0b10, 0b1101, op7_4,
+                     (outs VdTy:$Vd, GPR:$wb),
+                     (ins AddrMode:$Rn), IIC_VLD2dupu,
+                     "vld2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+ let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD2DupInstruction"; + } + def _register : NLdSt<1, 0b10, 0b1101, op7_4, + (outs VdTy:$Vd, GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD2dupu, + "vld2", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD2DupInstruction"; + } +} + +defm VLD2DUPd8wb : VLD2DUPWB<{0,0,0,0}, "8", VecListDPairAllLanes, + addrmode6dupalign16>; +defm VLD2DUPd16wb : VLD2DUPWB<{0,1,0,?}, "16", VecListDPairAllLanes, + addrmode6dupalign32>; +defm VLD2DUPd32wb : VLD2DUPWB<{1,0,0,?}, "32", VecListDPairAllLanes, + addrmode6dupalign64>; + +defm VLD2DUPd8x2wb : VLD2DUPWB<{0,0,1,0}, "8", VecListDPairSpacedAllLanes, + addrmode6dupalign16>; +defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes, + addrmode6dupalign32>; +defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes, + addrmode6dupalign64>; + +// VLD3DUP : Vector Load (single 3-element structure to all lanes) +class VLD3DUP<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), + (ins addrmode6dup:$Rn), IIC_VLD3dup, + "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = 0; + let DecoderMethod = "DecodeVLD3DupInstruction"; +} + +def VLD3DUPd8 : VLD3DUP<{0,0,0,?}, "8">; +def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">; +def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">; + +def VLD3DUPd8Pseudo : VLDQQPseudo<IIC_VLD3dup>; +def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>; +def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>; + +// ...with double-spaced registers (not used for codegen): +def VLD3DUPq8 : VLD3DUP<{0,0,1,?}, "8">; +def VLD3DUPq16 : VLD3DUP<{0,1,1,?}, "16">; +def VLD3DUPq32 : VLD3DUP<{1,0,1,?}, "32">; + +// ...with address register writeback: +class VLD3DUPWB<bits<4> op7_4, string Dt, Operand AddrMode> + : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins AddrMode:$Rn, am6offset:$Rm), IIC_VLD3dupu, + "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = 0; + let DecoderMethod = "DecodeVLD3DupInstruction"; +} + +def VLD3DUPd8_UPD : VLD3DUPWB<{0,0,0,0}, "8", addrmode6dupalign64>; +def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16", addrmode6dupalign64>; +def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32", addrmode6dupalign64>; + +def VLD3DUPq8_UPD : VLD3DUPWB<{0,0,1,0}, "8", addrmode6dupalign64>; +def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16", addrmode6dupalign64>; +def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32", addrmode6dupalign64>; + +def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>; +def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>; +def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>; + +// VLD4DUP : Vector Load (single 4-element structure to all lanes) +class VLD4DUP<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1111, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6dup:$Rn), IIC_VLD4dup, + "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD4DupInstruction"; +} + +def VLD4DUPd8 : VLD4DUP<{0,0,0,?}, "8">; +def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">; +def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; } + +def VLD4DUPd8Pseudo : VLDQQPseudo<IIC_VLD4dup>; +def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>; +def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>; + +// ...with double-spaced registers (not used for codegen): +def VLD4DUPq8 : 
VLD4DUP<{0,0,1,?}, "8">; +def VLD4DUPq16 : VLD4DUP<{0,1,1,?}, "16">; +def VLD4DUPq32 : VLD4DUP<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; } + +// ...with address register writeback: +class VLD4DUPWB<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1111, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD4dupu, + "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD4DupInstruction"; +} + +def VLD4DUPd8_UPD : VLD4DUPWB<{0,0,0,0}, "8">; +def VLD4DUPd16_UPD : VLD4DUPWB<{0,1,0,?}, "16">; +def VLD4DUPd32_UPD : VLD4DUPWB<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; } + +def VLD4DUPq8_UPD : VLD4DUPWB<{0,0,1,0}, "8">; +def VLD4DUPq16_UPD : VLD4DUPWB<{0,1,1,?}, "16">; +def VLD4DUPq32_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; } + +def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>; +def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>; +def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>; + +} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 + +let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in { + +// Classes for VST* pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VSTQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src), itin, "">; +class VSTQWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src), itin, + "$addr.addr = $wb">; +class VSTQWBfixedPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, QPR:$src), itin, + "$addr.addr = $wb">; +class VSTQWBregisterPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, rGPR:$offset, QPR:$src), itin, + "$addr.addr = $wb">; +class VSTQQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), itin, "">; +class VSTQQWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src), itin, + "$addr.addr = $wb">; +class VSTQQWBfixedPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, QQPR:$src), itin, + "$addr.addr = $wb">; +class VSTQQWBregisterPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, rGPR:$offset, QQPR:$src), itin, + "$addr.addr = $wb">; + +class VSTQQQQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src), itin, "">; +class VSTQQQQWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin, + "$addr.addr = $wb">; + +// VST1 : Vector Store (multiple single elements) +class VST1D<bits<4> op7_4, string Dt, Operand AddrMode> + : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins AddrMode:$Rn, VecListOneD:$Vd), + IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST1Instruction"; +} +class VST1Q<bits<4> op7_4, string Dt, Operand AddrMode> + : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins AddrMode:$Rn, VecListDPair:$Vd), + IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; +} + +def VST1d8 : VST1D<{0,0,0,?}, "8", addrmode6align64>; +def VST1d16 : VST1D<{0,1,0,?}, "16", addrmode6align64>; +def VST1d32 : VST1D<{1,0,0,?}, "32", addrmode6align64>; +def VST1d64 : 
VST1D<{1,1,0,?}, "64", addrmode6align64>; + +def VST1q8 : VST1Q<{0,0,?,?}, "8", addrmode6align64or128>; +def VST1q16 : VST1Q<{0,1,?,?}, "16", addrmode6align64or128>; +def VST1q32 : VST1Q<{1,0,?,?}, "32", addrmode6align64or128>; +def VST1q64 : VST1Q<{1,1,?,?}, "64", addrmode6align64or128>; + +// ...with address register writeback: +multiclass VST1DWB<bits<4> op7_4, string Dt, Operand AddrMode> { + def _fixed : NLdSt<0,0b00, 0b0111,op7_4, (outs GPR:$wb), + (ins AddrMode:$Rn, VecListOneD:$Vd), IIC_VLD1u, + "vst1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } + def _register : NLdSt<0,0b00,0b0111,op7_4, (outs GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm, VecListOneD:$Vd), + IIC_VLD1u, + "vst1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } +} +multiclass VST1QWB<bits<4> op7_4, string Dt, Operand AddrMode> { + def _fixed : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb), + (ins AddrMode:$Rn, VecListDPair:$Vd), IIC_VLD1x2u, + "vst1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } + def _register : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm, VecListDPair:$Vd), + IIC_VLD1x2u, + "vst1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } +} + +defm VST1d8wb : VST1DWB<{0,0,0,?}, "8", addrmode6align64>; +defm VST1d16wb : VST1DWB<{0,1,0,?}, "16", addrmode6align64>; +defm VST1d32wb : VST1DWB<{1,0,0,?}, "32", addrmode6align64>; +defm VST1d64wb : VST1DWB<{1,1,0,?}, "64", addrmode6align64>; + +defm VST1q8wb : VST1QWB<{0,0,?,?}, "8", addrmode6align64or128>; +defm VST1q16wb : VST1QWB<{0,1,?,?}, "16", addrmode6align64or128>; +defm VST1q32wb : VST1QWB<{1,0,?,?}, "32", addrmode6align64or128>; +defm VST1q64wb : VST1QWB<{1,1,?,?}, "64", addrmode6align64or128>; + +// ...with 3 registers +class VST1D3<bits<4> op7_4, string Dt, Operand AddrMode> + : NLdSt<0, 0b00, 0b0110, op7_4, (outs), + (ins AddrMode:$Rn, VecListThreeD:$Vd), + IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST1Instruction"; +} +multiclass VST1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> { + def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb), + (ins AddrMode:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u, + "vst1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
+ let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } + def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm, VecListThreeD:$Vd), + IIC_VLD1x3u, + "vst1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } +} + +def VST1d8T : VST1D3<{0,0,0,?}, "8", addrmode6align64>; +def VST1d16T : VST1D3<{0,1,0,?}, "16", addrmode6align64>; +def VST1d32T : VST1D3<{1,0,0,?}, "32", addrmode6align64>; +def VST1d64T : VST1D3<{1,1,0,?}, "64", addrmode6align64>; + +defm VST1d8Twb : VST1D3WB<{0,0,0,?}, "8", addrmode6align64>; +defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>; +defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>; +defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>; + +def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>; +def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>; +def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>; + +// ...with 4 registers +class VST1D4<bits<4> op7_4, string Dt, Operand AddrMode> + : NLdSt<0, 0b00, 0b0010, op7_4, (outs), + (ins AddrMode:$Rn, VecListFourD:$Vd), + IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "", + []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; +} +multiclass VST1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> { + def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb), + (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1x4u, + "vst1", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } + def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd), + IIC_VLD1x4u, + "vst1", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST1Instruction"; + } +} + +def VST1d8Q : VST1D4<{0,0,?,?}, "8", addrmode6align64or128or256>; +def VST1d16Q : VST1D4<{0,1,?,?}, "16", addrmode6align64or128or256>; +def VST1d32Q : VST1D4<{1,0,?,?}, "32", addrmode6align64or128or256>; +def VST1d64Q : VST1D4<{1,1,?,?}, "64", addrmode6align64or128or256>; + +defm VST1d8Qwb : VST1D4WB<{0,0,?,?}, "8", addrmode6align64or128or256>; +defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>; +defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; +defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; + +def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>; +def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>; +def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>; + +// VST2 : Vector Store (multiple 2-element structures) +class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, + InstrItinClass itin, Operand AddrMode> + : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins AddrMode:$Rn, VdTy:$Vd), + itin, "vst2", Dt, "$Vd, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST2Instruction"; +} + +def VST2d8 : VST2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VST2, + addrmode6align64or128>; +def VST2d16 : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2, + addrmode6align64or128>; +def VST2d32 : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2, + addrmode6align64or128>; + +def VST2q8 : VST2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VST2x2, + addrmode6align64or128or256>; +def VST2q16 : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, 
IIC_VST2x2, + addrmode6align64or128or256>; +def VST2q32 : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2, + addrmode6align64or128or256>; + +def VST2q8Pseudo : VSTQQPseudo<IIC_VST2x2>; +def VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>; +def VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>; + +// ...with address register writeback: +multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt, + RegisterOperand VdTy, Operand AddrMode> { + def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins AddrMode:$Rn, VdTy:$Vd), IIC_VLD1u, + "vst2", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST2Instruction"; + } + def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u, + "vst2", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST2Instruction"; + } +} +multiclass VST2QWB<bits<4> op7_4, string Dt, Operand AddrMode> { + def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), + (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1u, + "vst2", Dt, "$Vd, $Rn!", + "$Rn.addr = $wb", []> { + let Rm = 0b1101; // NLdSt will assign to the right encoding bits. + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST2Instruction"; + } + def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), + (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd), + IIC_VLD1u, + "vst2", Dt, "$Vd, $Rn, $Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST2Instruction"; + } +} + +defm VST2d8wb : VST2DWB<0b1000, {0,0,?,?}, "8", VecListDPair, + addrmode6align64or128>; +defm VST2d16wb : VST2DWB<0b1000, {0,1,?,?}, "16", VecListDPair, + addrmode6align64or128>; +defm VST2d32wb : VST2DWB<0b1000, {1,0,?,?}, "32", VecListDPair, + addrmode6align64or128>; + +defm VST2q8wb : VST2QWB<{0,0,?,?}, "8", addrmode6align64or128or256>; +defm VST2q16wb : VST2QWB<{0,1,?,?}, "16", addrmode6align64or128or256>; +defm VST2q32wb : VST2QWB<{1,0,?,?}, "32", addrmode6align64or128or256>; + +def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>; +def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>; +def VST2q32PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>; +def VST2q8PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>; +def VST2q16PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>; +def VST2q32PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>; + +// ...with double-spaced registers +def VST2b8 : VST2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VST2, + addrmode6align64or128>; +def VST2b16 : VST2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VST2, + addrmode6align64or128>; +def VST2b32 : VST2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VST2, + addrmode6align64or128>; +defm VST2b8wb : VST2DWB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, + addrmode6align64or128>; +defm VST2b16wb : VST2DWB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, + addrmode6align64or128>; +defm VST2b32wb : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, + addrmode6align64or128>; + +// VST3 : Vector Store (multiple 3-element structures) +class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3, + "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST3Instruction"; +} + +def VST3d8 : VST3D<0b0100, 
{0,0,0,?}, "8">; +def VST3d16 : VST3D<0b0100, {0,1,0,?}, "16">; +def VST3d32 : VST3D<0b0100, {1,0,0,?}, "32">; + +def VST3d8Pseudo : VSTQQPseudo<IIC_VST3>; +def VST3d16Pseudo : VSTQQPseudo<IIC_VST3>; +def VST3d32Pseudo : VSTQQPseudo<IIC_VST3>; + +// ...with address register writeback: +class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3u, + "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDST3Instruction"; +} + +def VST3d8_UPD : VST3DWB<0b0100, {0,0,0,?}, "8">; +def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">; +def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">; + +def VST3d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>; +def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>; +def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>; + +// ...with double-spaced registers: +def VST3q8 : VST3D<0b0101, {0,0,0,?}, "8">; +def VST3q16 : VST3D<0b0101, {0,1,0,?}, "16">; +def VST3q32 : VST3D<0b0101, {1,0,0,?}, "32">; +def VST3q8_UPD : VST3DWB<0b0101, {0,0,0,?}, "8">; +def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">; +def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">; + +def VST3q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; + +// ...alternate versions to be allocated odd register numbers: +def VST3q8oddPseudo : VSTQQQQPseudo<IIC_VST3>; +def VST3q16oddPseudo : VSTQQQQPseudo<IIC_VST3>; +def VST3q32oddPseudo : VSTQQQQPseudo<IIC_VST3>; + +def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; + +// VST4 : Vector Store (multiple 4-element structures) +class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST4, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", + "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST4Instruction"; +} + +def VST4d8 : VST4D<0b0000, {0,0,?,?}, "8">; +def VST4d16 : VST4D<0b0000, {0,1,?,?}, "16">; +def VST4d32 : VST4D<0b0000, {1,0,?,?}, "32">; + +def VST4d8Pseudo : VSTQQPseudo<IIC_VST4>; +def VST4d16Pseudo : VSTQQPseudo<IIC_VST4>; +def VST4d32Pseudo : VSTQQPseudo<IIC_VST4>; + +// ...with address register writeback: +class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4u, + "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDST4Instruction"; +} + +def VST4d8_UPD : VST4DWB<0b0000, {0,0,?,?}, "8">; +def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">; +def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">; + +def VST4d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>; +def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>; +def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>; + +// ...with double-spaced registers: +def VST4q8 : VST4D<0b0001, {0,0,?,?}, "8">; +def VST4q16 : VST4D<0b0001, {0,1,?,?}, "16">; +def VST4q32 : VST4D<0b0001, {1,0,?,?}, "32">; +def VST4q8_UPD : VST4DWB<0b0001, {0,0,?,?}, "8">; +def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">; +def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">; + +def 
VST4q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; + +// ...alternate versions to be allocated odd register numbers: +def VST4q8oddPseudo : VSTQQQQPseudo<IIC_VST4>; +def VST4q16oddPseudo : VSTQQQQPseudo<IIC_VST4>; +def VST4q32oddPseudo : VSTQQQQPseudo<IIC_VST4>; + +def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; + +} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 + +// Classes for VST*LN pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VSTQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane), + itin, "">; +class VSTQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb">; +class VSTQQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane), + itin, "">; +class VSTQQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb">; +class VSTQQQQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane), + itin, "">; +class VSTQQQQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb">; + +// VST1LN : Vector Store (single element from one lane) +class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag StoreOp, SDNode ExtractOp, Operand AddrMode> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins AddrMode:$Rn, DPR:$Vd, nohash_imm:$lane), + IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "", + [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]> { + let Rm = 0b1111; + let DecoderMethod = "DecodeVST1LN"; +} +class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp> + : VSTQLNPseudo<IIC_VST1ln> { + let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), + addrmode6:$addr)]; +} + +def VST1LNd8 : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8, + NEONvgetlaneu, addrmode6> { + let Inst{7-5} = lane{2-0}; +} +def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16, + NEONvgetlaneu, addrmode6> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{4}; +} + +def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt, + addrmode6oneL32> { + let Inst{7} = lane{0}; + let Inst{5-4} = Rn{5-4}; +} + +def VST1LNq8Pseudo : VST1QLNPseudo<v16i8, truncstorei8, NEONvgetlaneu>; +def VST1LNq16Pseudo : VST1QLNPseudo<v8i16, truncstorei16, NEONvgetlaneu>; +def VST1LNq32Pseudo : VST1QLNPseudo<v4i32, store, extractelt>; + +def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr), + (VST1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>; +def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr), + (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>; + +// ...with address register writeback: +class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag StoreOp, SDNode ExtractOp, Operand AdrMode> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins AdrMode:$Rn, am6offset:$Rm, + DPR:$Vd, nohash_imm:$lane), IIC_VST1lnu, 
"vst1", Dt, + "\\{$Vd[$lane]\\}, $Rn$Rm", + "$Rn.addr = $wb", + [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), + AdrMode:$Rn, am6offset:$Rm))]> { + let DecoderMethod = "DecodeVST1LN"; +} +class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp> + : VSTQLNWBPseudo<IIC_VST1lnu> { + let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), + addrmode6:$addr, am6offset:$offset))]; +} + +def VST1LNd8_UPD : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8, + NEONvgetlaneu, addrmode6> { + let Inst{7-5} = lane{2-0}; +} +def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16, + NEONvgetlaneu, addrmode6> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{4}; +} +def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store, + extractelt, addrmode6oneL32> { + let Inst{7} = lane{0}; + let Inst{5-4} = Rn{5-4}; +} + +def VST1LNq8Pseudo_UPD : VST1QLNWBPseudo<v16i8, post_truncsti8, NEONvgetlaneu>; +def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16,NEONvgetlaneu>; +def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo<v4i32, post_store, extractelt>; + +let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in { + +// VST2LN : Vector Store (single 2-element structure from one lane) +class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, nohash_imm:$lane), + IIC_VST2ln, "vst2", Dt, "\\{$Vd[$lane], $src2[$lane]\\}, $Rn", + "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVST2LN"; +} + +def VST2LNd8 : VST2LN<0b0001, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VST2LNd8Pseudo : VSTQLNPseudo<IIC_VST2ln>; +def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>; +def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>; + +// ...with double-spaced registers: +def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{4}; +} +def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{4} = Rn{4}; +} + +def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>; +def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>; + +// ...with address register writeback: +class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, nohash_imm:$lane), IIC_VST2lnu, "vst2", Dt, + "\\{$Vd[$lane], $src2[$lane]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVST2LN"; +} + +def VST2LNd8_UPD : VST2LNWB<0b0001, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>; +def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>; +def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>; + +def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>; +def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>; + +// VST3LN : Vector Store (single 3-element structure from one lane) +class VST3LN<bits<4> op11_8, 
bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, + nohash_imm:$lane), IIC_VST3ln, "vst3", Dt, + "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> { + let Rm = 0b1111; + let DecoderMethod = "DecodeVST3LN"; +} + +def VST3LNd8 : VST3LN<0b0010, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VST3LNd8Pseudo : VSTQQLNPseudo<IIC_VST3ln>; +def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>; +def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>; + +// ...with double-spaced registers: +def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VST3LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>; +def VST3LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>; + +// ...with address register writeback: +class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane), + IIC_VST3lnu, "vst3", Dt, + "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let DecoderMethod = "DecodeVST3LN"; +} + +def VST3LNd8_UPD : VST3LNWB<0b0010, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; +def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; +def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; + +def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>; +def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>; + +// VST4LN : Vector Store (single 4-element structure from one lane) +class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, + nohash_imm:$lane), IIC_VST4ln, "vst4", Dt, + "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn", + "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVST4LN"; +} + +def VST4LNd8 : VST4LN<0b0011, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VST4LNd8Pseudo : VSTQQLNPseudo<IIC_VST4ln>; +def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>; +def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>; + +// ...with double-spaced registers: +def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>; +def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>; + +// ...with address register writeback: +class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, 
nohash_imm:$lane), + IIC_VST4lnu, "vst4", Dt, + "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVST4LN"; +} + +def VST4LNd8_UPD : VST4LNWB<0b0011, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; +def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; +def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; + +def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>; +def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>; + +} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 + +// Use vld1/vst1 for unaligned f64 load / store +def : Pat<(f64 (hword_alignedload addrmode6:$addr)), + (VLD1d16 addrmode6:$addr)>, Requires<[IsLE]>; +def : Pat<(hword_alignedstore (f64 DPR:$value), addrmode6:$addr), + (VST1d16 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>; +def : Pat<(f64 (byte_alignedload addrmode6:$addr)), + (VLD1d8 addrmode6:$addr)>, Requires<[IsLE]>; +def : Pat<(byte_alignedstore (f64 DPR:$value), addrmode6:$addr), + (VST1d8 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>; +def : Pat<(f64 (non_word_alignedload addrmode6:$addr)), + (VLD1d64 addrmode6:$addr)>, Requires<[IsBE]>; +def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr), + (VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>; + +// Use vld1/vst1 for Q and QQ. Also use them for unaligned v2f64 +// load / store if it's legal. +def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)), + (VLD1q64 addrmode6:$addr)>; +def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr), + (VST1q64 addrmode6:$addr, QPR:$value)>; +def : Pat<(v2f64 (word_alignedload addrmode6:$addr)), + (VLD1q32 addrmode6:$addr)>, Requires<[IsLE]>; +def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr), + (VST1q32 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>; +def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)), + (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>; +def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr), + (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>; +def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)), + (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>; +def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr), + (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>; + +//===----------------------------------------------------------------------===// +// NEON pattern fragments +//===----------------------------------------------------------------------===// + +// Extract D sub-registers of Q registers. 
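The SDNodeXForms that follow turn a constant lane index into a target constant naming a sub-register. As a rough illustration of the arithmetic they perform (the helper below is hypothetical, not part of LLVM): a lane of a 128-bit Q register maps to one of its overlaid 64-bit D sub-registers by dividing, and to a lane within that D register by taking the remainder, exactly the divide in DSubReg_*_reg and the mask in SubReg_*_lane below.

    #include <cassert>
    #include <cstdio>

    // Hypothetical helper: split a Q-register lane index into
    // (D sub-register index, lane within that D register).
    struct DLane { unsigned DSub; unsigned Lane; };

    static DLane qLaneToDLane(unsigned QLane, unsigned EltBits) {
      unsigned LanesPerD = 64 / EltBits;   // 8, 4 or 2 elements per D register
      assert(QLane < 2 * LanesPerD && "lane out of range for a Q register");
      return { QLane / LanesPerD,          // which half: dsub_0 or dsub_1
               QLane % LanesPerD };        // lane inside that D register
    }

    int main() {
      DLane L = qLaneToDLane(11, 8);       // byte lane 11 of a Q register
      std::printf("dsub_%u, lane %u\n", L.DSub, L.Lane);  // "dsub_1, lane 3"
      return 0;
    }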
+def DSubReg_i8_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, SDLoc(N), + MVT::i32); +}]>; +def DSubReg_i16_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, SDLoc(N), + MVT::i32); +}]>; +def DSubReg_i32_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, SDLoc(N), + MVT::i32); +}]>; +def DSubReg_f64_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), SDLoc(N), + MVT::i32); +}]>; + +// Extract S sub-registers of Q/D registers. +def SSubReg_f32_reg : SDNodeXForm<imm, [{ + assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), SDLoc(N), + MVT::i32); +}]>; + +// Translate lane numbers from Q registers to D subregs. +def SubReg_i8_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32); +}]>; +def SubReg_i16_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 3, SDLoc(N), MVT::i32); +}]>; +def SubReg_i32_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i32); +}]>; + +//===----------------------------------------------------------------------===// +// Instruction Classes +//===----------------------------------------------------------------------===// + +// Basic 2-register operations: double- and quad-register. +class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VUNAD, OpcodeStr, Dt,"$Vd, $Vm", "", + [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm))))]>; +class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VUNAQ, OpcodeStr, Dt,"$Vd, $Vm", "", + [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm))))]>; + +// Basic 2-register intrinsics, both double- and quad-register. +class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), + (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; +class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), + (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; + +// Same as above, but not predicated. 
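The "np" (not predicated) variants below model the ARMv8 additions, such as the VRINT/VCVT rounding variants and the crypto instructions, which never carry an ARM condition code. A user-level illustration using the ACLE intrinsics, assuming an ARMv8 target (e.g. -march=armv8-a with -mfpu=neon-fp-armv8); the function name is mine:

    #include <arm_neon.h>

    // Round each lane to the nearest integral value, ties to even; on
    // AArch32 this should lower to the unpredicated VRINTN.F32 instruction.
    float32x4_t round_nearest_even(float32x4_t v) {
      return vrndnq_f32(v);
    }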
+class N2VDIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm),
+          itin, OpcodeStr, Dt,
+          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+
+class N2VQIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm),
+          itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Similar to N2VQIntnp with some more encoding bits exposed (crypto).
+class N2VQIntXnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
+                 bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, op6, (outs QPR:$Vd), (ins QPR:$Vm),
+          itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Same as N2VQIntXnp but with Vd as a src register.
+class N2VQIntX2np<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
+                  bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+                  ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, op6,
+          (outs QPR:$Vd), (ins QPR:$src, QPR:$Vm),
+          itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vm))))]> {
+  let Constraints = "$src = $Vd";
+}
+
+// Narrow 2-register operations.
+class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyD, ValueType TyQ, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (TyD (OpNode (TyQ QPR:$Vm))))]>;
+
+// Narrow 2-register intrinsics.
+class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyD, ValueType TyQ, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vm))))]>;
+
+// Long 2-register operations (currently only used for VMOVL).
+class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyQ, ValueType TyD, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vm))))]>;
+
+// Long 2-register intrinsics.
+class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vm))))]>;
+
+// 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register.
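VTRN, VZIP and VUZP write both of their register operands, which is why the shuffle classes below produce two results and tie $src1 = $Vd and $src2 = $Vm. In intrinsic terms (illustrative sketch; the function name is mine):

    #include <arm_neon.h>

    // vzip interleaves two vectors: val[0] = {a0,b0,a1,b1,...} and
    // val[1] = {a4,b4,...,a7,b7}. vuzp is the inverse de-interleave, and
    // vtrn exchanges the odd/even element pairs between the two inputs.
    uint8x8x2_t interleave(uint8x8_t a, uint8x8_t b) {
      return vzip_u8(a, b);   // one VZIP.8, updating both D registers
    }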
+class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt> + : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$Vd, DPR:$Vm), + (ins DPR:$src1, DPR:$src2), IIC_VPERMD, + OpcodeStr, Dt, "$Vd, $Vm", + "$src1 = $Vd, $src2 = $Vm", []>; +class N2VQShuffle<bits<2> op19_18, bits<5> op11_7, + InstrItinClass itin, string OpcodeStr, string Dt> + : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$Vd, QPR:$Vm), + (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$Vd, $Vm", + "$src1 = $Vd, $src2 = $Vm", []>; + +// Basic 3-register operations: double- and quad-register. +class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = Commutable; +} +// Same as N3VD but no data type. +class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, + ValueType ResTy, ValueType OpTy, + SDNode OpNode, bit Commutable> + : N3VX<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>{ + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = Commutable; +} + +class N3VDSL<bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, SDNode ShOp> + : N3VLane32<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", + [(set (Ty DPR:$Vd), + (Ty (ShOp (Ty DPR:$Vn), + (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = 0; +} +class N3VDSL16<bits<2> op21_20, bits<4> op11_8, + string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + : N3VLane16<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), + NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane","", + [(set (Ty DPR:$Vd), + (Ty (ShOp (Ty DPR:$Vn), + (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = 0; +} + +class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> { + // All of these have a two-operand InstAlias. 
+ let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = Commutable; +} +class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, + ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> + : N3VX<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>{ + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = Commutable; +} +class N3VQSL<bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode ShOp> + : N3VLane32<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", + [(set (ResTy QPR:$Vd), + (ResTy (ShOp (ResTy QPR:$Vn), + (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + imm:$lane)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = 0; +} +class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode ShOp> + : N3VLane16<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), + NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane", "", + [(set (ResTy QPR:$Vd), + (ResTy (ShOp (ResTy QPR:$Vn), + (ResTy (NEONvduplane (OpTy DPR_8:$Vm), + imm:$lane)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = 0; +} + +// Basic 3-register intrinsics, both double- and quad-register. +class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> { + // All of these have a two-operand InstAlias. 
+ let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = Commutable; +} + +class N3VDIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, Format f, InstrItinClass itin, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator IntOp, bit Commutable> + : N3Vnp<op27_23, op21_20, op11_8, op6, op4, + (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>; + +class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp> + : N3VLane32<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", + [(set (Ty DPR:$Vd), + (Ty (IntOp (Ty DPR:$Vn), + (Ty (NEONvduplane (Ty DPR_VFP2:$Vm), + imm:$lane)))))]> { + let isCommutable = 0; +} + +class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp> + : N3VLane16<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", + [(set (Ty DPR:$Vd), + (Ty (IntOp (Ty DPR:$Vn), + (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm, DPR:$Vn), f, itin, + OpcodeStr, Dt, "$Vd, $Vm, $Vn", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (OpTy DPR:$Vn))))]> { + let TwoOperandAliasConstraint = "$Vm = $Vd"; + let isCommutable = 0; +} + +class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = Commutable; +} + +class N3VQIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, Format f, InstrItinClass itin, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator IntOp, bit Commutable> + : N3Vnp<op27_23, op21_20, op11_8, op6, op4, + (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr, Dt, + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>; + +// Same as N3VQIntnp but with Vd as a src register. 
+class N3VQInt3np<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, Format f, InstrItinClass itin, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator IntOp, bit Commutable> + : N3Vnp<op27_23, op21_20, op11_8, op6, op4, + (outs QPR:$Vd), (ins QPR:$src, QPR:$Vn, QPR:$Vm), + f, itin, OpcodeStr, Dt, + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vn), + (OpTy QPR:$Vm))))]> { + let Constraints = "$src = $Vd"; +} + +class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N3VLane32<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (ResTy QPR:$Vn), + (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N3VLane16<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (ResTy QPR:$Vn), + (ResTy (NEONvduplane (OpTy DPR_8:$Vm), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm, QPR:$Vn), f, itin, + OpcodeStr, Dt, "$Vd, $Vm, $Vn", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (OpTy QPR:$Vn))))]> { + let TwoOperandAliasConstraint = "$Vm = $Vd"; + let isCommutable = 0; +} + +// Multiply-Add/Sub operations: double- and quad-register. 
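These classes model VMLA/VMLS-style multiply-accumulate: an OpNode (add or sub) applied to the accumulator and a MulOp product, with the accumulator tied via $src1 = $Vd. Roughly, in intrinsic form (illustrative; the function name is mine):

    #include <arm_neon.h>

    // acc[i] + a[i] * b[i] per 16-bit lane; lowers to a single VMLA.I16.
    int16x4_t mul_acc(int16x4_t acc, int16x4_t a, int16x4_t b) {
      return vmla_s16(acc, a, b);
    }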
+class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, SDPatternOperator MulOp, SDPatternOperator OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, (Ty (OpNode DPR:$src1, + (Ty (MulOp DPR:$Vn, DPR:$Vm)))))]>; + +class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp> + : N3VLane32<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", + [(set (Ty DPR:$Vd), + (Ty (ShOp (Ty DPR:$src1), + (Ty (MulOp DPR:$Vn, + (Ty (NEONvduplane (Ty DPR_VFP2:$Vm), + imm:$lane)))))))]>; +class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp> + : N3VLane16<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", + [(set (Ty DPR:$Vd), + (Ty (ShOp (Ty DPR:$src1), + (Ty (MulOp DPR:$Vn, + (Ty (NEONvduplane (Ty DPR_8:$Vm), + imm:$lane)))))))]>; + +class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, + SDPatternOperator MulOp, SDPatternOperator OpNode> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (Ty (OpNode QPR:$src1, + (Ty (MulOp QPR:$Vn, QPR:$Vm)))))]>; +class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator MulOp, SDPatternOperator ShOp> + : N3VLane32<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", + [(set (ResTy QPR:$Vd), + (ResTy (ShOp (ResTy QPR:$src1), + (ResTy (MulOp QPR:$Vn, + (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + imm:$lane)))))))]>; +class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, + SDPatternOperator MulOp, SDPatternOperator ShOp> + : N3VLane16<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", + [(set (ResTy QPR:$Vd), + (ResTy (ShOp (ResTy QPR:$src1), + (ResTy (MulOp QPR:$Vn, + (ResTy (NEONvduplane (OpTy DPR_8:$Vm), + imm:$lane)))))))]>; + +// Neon Intrinsic-Op instructions (VABA): double- and quad-register. 
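VABA accumulates absolute differences: the pattern below is literally an add of the VABD intrinsic's result into $src1, i.e. acc[i] += |a[i] - b[i]| per lane. For example (illustrative; the function name is mine):

    #include <arm_neon.h>

    // Absolute difference and accumulate on 8-bit lanes; lowers to VABA.U8.
    uint8x8_t sad_step(uint8x8_t acc, uint8x8_t a, uint8x8_t b) {
      return vaba_u8(acc, a, b);
    }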
+class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, SDPatternOperator IntOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, (Ty (OpNode DPR:$src1, + (Ty (IntOp (Ty DPR:$Vn), (Ty DPR:$Vm))))))]>; +class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, SDPatternOperator IntOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (Ty (OpNode QPR:$src1, + (Ty (IntOp (Ty QPR:$Vn), (Ty QPR:$Vm))))))]>; + +// Neon 3-argument intrinsics, both double- and quad-register. +// The destination register is also used as the first source operand register. +class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$src1), + (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>; +class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src1), + (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>; + +// Long Multiply-Add/Sub operations. +class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (OpNode (TyQ QPR:$src1), + (TyQ (MulOp (TyD DPR:$Vn), + (TyD DPR:$Vm)))))]>; +class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), + (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", + [(set QPR:$Vd, + (OpNode (TyQ QPR:$src1), + (TyQ (MulOp (TyD DPR:$Vn), + (TyD (NEONvduplane (TyD DPR_VFP2:$Vm), + imm:$lane))))))]>; +class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), + (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", + [(set QPR:$Vd, + (OpNode (TyQ QPR:$src1), + (TyQ (MulOp (TyD DPR:$Vn), + (TyD (NEONvduplane (TyD DPR_8:$Vm), + imm:$lane))))))]>; + +// Long Intrinsic-Op vector operations with explicit extend (VABAL). 
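The VABAL pattern widens before accumulating: the D-sized absolute differences are zero- or sign-extended (ExtOp) and then added into a Q-sized accumulator, so 8-bit differences can be summed into 16-bit lanes without overflow. Illustratively (function name is mine):

    #include <arm_neon.h>

    // acc[i] += zext(|a[i] - b[i]|): 8-bit differences, 16-bit accumulator.
    // Lowers to VABAL.U8.
    uint16x8_t sad_acc_long(uint16x8_t acc, uint8x8_t a, uint8x8_t b) {
      return vabal_u8(acc, a, b);
    }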
+class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp, + SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (OpNode (TyQ QPR:$src1), + (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn), + (TyD DPR:$Vm)))))))]>; + +// Neon Long 3-argument intrinsic. The destination register is +// a quad-register and is also used as the first source operand register. +class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, + (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$Vn), (TyD DPR:$Vm))))]>; +class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), + (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (ResTy QPR:$src1), + (OpTy DPR:$Vn), + (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + imm:$lane)))))]>; +class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), + (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (ResTy QPR:$src1), + (OpTy DPR:$Vn), + (OpTy (NEONvduplane (OpTy DPR_8:$Vm), + imm:$lane)))))]>; + +// Narrowing 3-register intrinsics. +class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, + SDPatternOperator IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINi4D, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vn), (TyQ QPR:$Vm))))]> { + let isCommutable = Commutable; +} + +// Long 3-register operations. 
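The long forms take two D-register inputs and produce a Q-register result whose elements are twice as wide (the VADDL/VSUBL/VMULL family). For instance (illustrative; the function name is mine):

    #include <arm_neon.h>

    // 8-bit lanes added into 16-bit results, so no wrap-around: D + D -> Q.
    // Lowers to VADDL.U8.
    uint16x8_t add_long(uint8x8_t a, uint8x8_t b) {
      return vaddl_u8(a, b);
    }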
+class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), (TyD DPR:$Vm))))]> { + let isCommutable = Commutable; +} + +class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode> + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", + [(set QPR:$Vd, + (TyQ (OpNode (TyD DPR:$Vn), + (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>; +class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode> + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", + [(set QPR:$Vd, + (TyQ (OpNode (TyD DPR:$Vn), + (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>; + +// Long 3-register operations with explicitly extended operands. +class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp, + bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (OpNode (TyQ (ExtOp (TyD DPR:$Vn))), + (TyQ (ExtOp (TyD DPR:$Vm)))))]> { + let isCommutable = Commutable; +} + +// Long 3-register intrinsics with explicit extend (VABDL). +class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp, + bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn), + (TyD DPR:$Vm))))))]> { + let isCommutable = Commutable; +} + +// Long 3-register intrinsics. +class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vn), (TyD DPR:$Vm))))]> { + let isCommutable = Commutable; +} + +// Same as above, but not predicated. 
+class N3VLIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, InstrItinClass itin, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator IntOp, bit Commutable> + : N3Vnp<op27_23, op21_20, op11_8, op6, op4, + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt, + [(set QPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>; + +class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (OpTy DPR:$Vn), + (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + imm:$lane)))))]>; +class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (OpTy DPR:$Vn), + (OpTy (NEONvduplane (OpTy DPR_8:$Vm), + imm:$lane)))))]>; + +// Wide 3-register operations. +class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, + SDNode OpNode, SDNode ExtOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins QPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VSUBiD, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (OpNode (TyQ QPR:$Vn), + (TyQ (ExtOp (TyD DPR:$Vm)))))]> { + // All of these have a two-operand InstAlias. + let TwoOperandAliasConstraint = "$Vn = $Vd"; + let isCommutable = Commutable; +} + +// Pairwise long 2-register intrinsics, both double- and quad-register. +class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; +class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; + +// Pairwise long 2-register accumulate intrinsics, +// both double- and quad-register. +// The destination register is also used as the first source operand register. 
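VPADDL sums adjacent lane pairs into double-width elements; the accumulate form VPADAL, modeled below, additionally adds those pairwise sums into the destination, hence the $src1 = $Vd tie. A small illustration (function name is mine):

    #include <arm_neon.h>

    // Each 16-bit accumulator lane receives the sum of one adjacent pair of
    // 8-bit source lanes: acc[i] += v[2*i] + v[2*i+1]. Lowers to VPADAL.U8.
    uint16x4_t pairwise_acc(uint16x4_t acc, uint8x8_t v) {
      return vpadal_u8(acc, v);
    }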
+class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vm), IIC_VPALiD, + OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$Vm))))]>; +class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vm), IIC_VPALiQ, + OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$Vm))))]>; + +// Shift by immediate, +// both double- and quad-register. +let TwoOperandAliasConstraint = "$Vm = $Vd" in { +class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Format f, InstrItinClass itin, Operand ImmTy, + string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> + : N2VImm<op24, op23, op11_8, op7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), f, itin, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set DPR:$Vd, (Ty (OpNode (Ty DPR:$Vm), (i32 imm:$SIMM))))]>; +class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Format f, InstrItinClass itin, Operand ImmTy, + string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> + : N2VImm<op24, op23, op11_8, op7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), f, itin, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>; +} + +// Long shift by immediate. +class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Operand ImmTy, + SDPatternOperator OpNode> + : N2VImm<op24, op23, op11_8, op7, op6, op4, + (outs QPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), N2RegVShLFrm, + IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set QPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm), ImmTy:$SIMM)))]>; + +// Narrow shift by immediate. +class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Operand ImmTy, + SDPatternOperator OpNode> + : N2VImm<op24, op23, op11_8, op7, op6, op4, + (outs DPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, itin, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set DPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm), + (i32 ImmTy:$SIMM))))]>; + +// Shift right by immediate and accumulate, +// both double- and quad-register. 
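The shift-and-accumulate classes below match an add of a shifted value into $src1: VSRA computes acc[i] += v[i] >> n for an immediate n, a common step in fixed-point narrowing sequences. For example (illustrative; the function name is mine):

    #include <arm_neon.h>

    // acc[i] + (v[i] >> 3) per 32-bit lane; lowers to VSRA.U32 with #3.
    // The shift amount must be a compile-time constant in [1, 32].
    uint32x2_t shift_acc(uint32x2_t acc, uint32x2_t v) {
      return vsra_n_u32(acc, v, 3);
    }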
+let TwoOperandAliasConstraint = "$Vm = $Vd" in { +class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Operand ImmTy, string OpcodeStr, string Dt, + ValueType Ty, SDNode ShOp> + : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", + [(set DPR:$Vd, (Ty (add DPR:$src1, + (Ty (ShOp DPR:$Vm, (i32 imm:$SIMM))))))]>; +class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Operand ImmTy, string OpcodeStr, string Dt, + ValueType Ty, SDNode ShOp> + : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", + [(set QPR:$Vd, (Ty (add QPR:$src1, + (Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>; +} + +// Shift by immediate and insert, +// both double- and quad-register. +let TwoOperandAliasConstraint = "$Vm = $Vd" in { +class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Operand ImmTy, Format f, string OpcodeStr, string Dt, + ValueType Ty,SDNode ShOp> + : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiD, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", + [(set DPR:$Vd, (Ty (ShOp DPR:$src1, DPR:$Vm, (i32 imm:$SIMM))))]>; +class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Operand ImmTy, Format f, string OpcodeStr, string Dt, + ValueType Ty,SDNode ShOp> + : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiQ, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", + [(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>; +} + +// Convert, with fractional bits immediate, +// both double- and quad-register. +class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator IntOp> + : N2VImm<op24, op23, op11_8, op7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm, + IIC_VUNAD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>; +class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator IntOp> + : N2VImm<op24, op23, op11_8, op7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm, + IIC_VUNAQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (i32 imm:$SIMM))))]>; + +//===----------------------------------------------------------------------===// +// Multiclasses +//===----------------------------------------------------------------------===// + +// Abbreviations used in multiclass suffixes: +// Q = quarter int (8 bit) elements +// H = half int (16 bit) elements +// S = single int (32 bit) elements +// D = double int (64 bit) elements + +// Neon 2-register vector operations and intrinsics. + +// Neon 2-register comparisons. +// source operand element sizes of 8, 16 and 32 bits: +multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, string opc, string Dt, + string asm, SDNode OpNode> { + // 64-bit vector types. 
+ def v8i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "8"), asm, "", + [(set DPR:$Vd, (v8i8 (OpNode (v8i8 DPR:$Vm))))]>; + def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "16"), asm, "", + [(set DPR:$Vd, (v4i16 (OpNode (v4i16 DPR:$Vm))))]>; + def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "32"), asm, "", + [(set DPR:$Vd, (v2i32 (OpNode (v2i32 DPR:$Vm))))]>; + def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, "f32", asm, "", + [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> { + let Inst{10} = 1; // overwrite F = 1 + } + def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, "f16", asm, "", + [(set DPR:$Vd, (v4i16 (OpNode (v4f16 DPR:$Vm))))]>, + Requires<[HasNEON,HasFullFP16]> { + let Inst{10} = 1; // overwrite F = 1 + } + + // 128-bit vector types. + def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "8"), asm, "", + [(set QPR:$Vd, (v16i8 (OpNode (v16i8 QPR:$Vm))))]>; + def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "16"), asm, "", + [(set QPR:$Vd, (v8i16 (OpNode (v8i16 QPR:$Vm))))]>; + def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "32"), asm, "", + [(set QPR:$Vd, (v4i32 (OpNode (v4i32 QPR:$Vm))))]>; + def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, "f32", asm, "", + [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> { + let Inst{10} = 1; // overwrite F = 1 + } + def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, "f16", asm, "", + [(set QPR:$Vd, (v8i16 (OpNode (v8f16 QPR:$Vm))))]>, + Requires<[HasNEON,HasFullFP16]> { + let Inst{10} = 1; // overwrite F = 1 + } +} + + +// Neon 2-register vector intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, SDPatternOperator IntOp> { + // 64-bit vector types. + def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; + def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + itinD, OpcodeStr, !strconcat(Dt, "16"),v4i16,v4i16,IntOp>; + def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + itinD, OpcodeStr, !strconcat(Dt, "32"),v2i32,v2i32,IntOp>; + + // 128-bit vector types. 
+ def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8,v16i8,IntOp>; + def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + itinQ, OpcodeStr, !strconcat(Dt, "16"),v8i16,v8i16,IntOp>; + def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + itinQ, OpcodeStr, !strconcat(Dt, "32"),v4i32,v4i32,IntOp>; +} + + +// Neon Narrowing 2-register vector operations, +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + def v8i8 : N2VN<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, OpNode>; + def v4i16 : N2VN<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, OpNode>; + def v2i32 : N2VN<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, OpNode>; +} + +// Neon Narrowing 2-register vector intrinsics, +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDPatternOperator IntOp> { + def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, IntOp>; + def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, IntOp>; + def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, IntOp>; +} + + +// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL). +// source operand element sizes of 8, 16 and 32 bits: +multiclass N2VL_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4, + string OpcodeStr, string Dt, SDNode OpNode> { + def v8i16 : N2VL<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode>; + def v4i32 : N2VL<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode>; + def v2i64 : N2VL<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode>; +} + + +// Neon 3-register vector operations. + +// First with only element sizes of 8, 16 and 32 bits: +multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + SDNode OpNode, bit Commutable = 0> { + // 64-bit vector types. + def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "8"), + v8i8, v8i8, OpNode, Commutable>; + def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "16"), + v4i16, v4i16, OpNode, Commutable>; + def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, itinD32, + OpcodeStr, !strconcat(Dt, "32"), + v2i32, v2i32, OpNode, Commutable>; + + // 128-bit vector types.
+ def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), + v16i8, v16i8, OpNode, Commutable>; + def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), + v8i16, v8i16, OpNode, Commutable>; + def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), + v4i32, v4i32, OpNode, Commutable>; +} + +multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> { + def v4i16 : N3VDSL16<0b01, op11_8, OpcodeStr, "i16", v4i16, ShOp>; + def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D, OpcodeStr, "i32", v2i32, ShOp>; + def v8i16 : N3VQSL16<0b01, op11_8, OpcodeStr, "i16", v8i16, v4i16, ShOp>; + def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q, OpcodeStr, "i32", + v4i32, v2i32, ShOp>; +} + +// ....then also with element size 64 bits: +multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, + SDNode OpNode, bit Commutable = 0> + : N3V_QHS<op24, op23, op11_8, op4, itinD, itinD, itinQ, itinQ, + OpcodeStr, Dt, OpNode, Commutable> { + def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "64"), + v1i64, v1i64, OpNode, Commutable>; + def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "64"), + v2i64, v2i64, OpNode, Commutable>; +} + + +// Neon 3-register vector intrinsics. + +// First with only element sizes of 16 and 32 bits: +multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + SDPatternOperator IntOp, bit Commutable = 0> { + // 64-bit vector types. + def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16, + OpcodeStr, !strconcat(Dt, "16"), + v4i16, v4i16, IntOp, Commutable>; + def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, f, itinD32, + OpcodeStr, !strconcat(Dt, "32"), + v2i32, v2i32, IntOp, Commutable>; + + // 128-bit vector types. + def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, f, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), + v8i16, v8i16, IntOp, Commutable>; + def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, f, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), + v4i32, v4i32, IntOp, Commutable>; +} +multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + SDPatternOperator IntOp> { + // 64-bit vector types. + def v4i16 : N3VDIntSh<op24, op23, 0b01, op11_8, op4, f, itinD16, + OpcodeStr, !strconcat(Dt, "16"), + v4i16, v4i16, IntOp>; + def v2i32 : N3VDIntSh<op24, op23, 0b10, op11_8, op4, f, itinD32, + OpcodeStr, !strconcat(Dt, "32"), + v2i32, v2i32, IntOp>; + + // 128-bit vector types. 
+ def v8i16 : N3VQIntSh<op24, op23, 0b01, op11_8, op4, f, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), + v8i16, v8i16, IntOp>; + def v4i32 : N3VQIntSh<op24, op23, 0b10, op11_8, op4, f, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), + v4i32, v4i32, IntOp>; +} + +multiclass N3VIntSL_HS<bits<4> op11_8, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, SDPatternOperator IntOp> { + def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp>; + def v2i32 : N3VDIntSL<0b10, op11_8, itinD32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp>; + def v8i16 : N3VQIntSL16<0b01, op11_8, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, IntOp>; + def v4i32 : N3VQIntSL<0b10, op11_8, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, IntOp>; +} + +// ....then also with element size of 8 bits: +multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + SDPatternOperator IntOp, bit Commutable = 0> + : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, Dt, IntOp, Commutable> { + def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16, + OpcodeStr, !strconcat(Dt, "8"), + v8i8, v8i8, IntOp, Commutable>; + def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, f, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), + v16i8, v16i8, IntOp, Commutable>; +} +multiclass N3VInt_QHSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + SDPatternOperator IntOp> + : N3VInt_HSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, Dt, IntOp> { + def v8i8 : N3VDIntSh<op24, op23, 0b00, op11_8, op4, f, itinD16, + OpcodeStr, !strconcat(Dt, "8"), + v8i8, v8i8, IntOp>; + def v16i8 : N3VQIntSh<op24, op23, 0b00, op11_8, op4, f, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), + v16i8, v16i8, IntOp>; +} + + +// ....then also with element size of 64 bits: +multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + SDPatternOperator IntOp, bit Commutable = 0> + : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, Dt, IntOp, Commutable> { + def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32, + OpcodeStr, !strconcat(Dt, "64"), + v1i64, v1i64, IntOp, Commutable>; + def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, f, itinQ32, + OpcodeStr, !strconcat(Dt, "64"), + v2i64, v2i64, IntOp, Commutable>; +} +multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + SDPatternOperator IntOp> + : N3VInt_QHSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, Dt, IntOp> { + def v1i64 : N3VDIntSh<op24, op23, 0b11, op11_8, op4, f, itinD32, + OpcodeStr, !strconcat(Dt, "64"), + v1i64, v1i64, IntOp>; + def v2i64 : N3VQIntSh<op24, op23, 0b11, op11_8, op4, f, itinQ32, + OpcodeStr, !strconcat(Dt, "64"), + v2i64, v2i64, IntOp>; +} + +// Neon Narrowing 3-register vector intrinsics, +// source operand element sizes of 16, 32 and 64 
bits: +multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, + SDPatternOperator IntOp, bit Commutable = 0> { + def v8i8 : N3VNInt<op24, op23, 0b00, op11_8, op4, + OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, IntOp, Commutable>; + def v4i16 : N3VNInt<op24, op23, 0b01, op11_8, op4, + OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, IntOp, Commutable>; + def v2i32 : N3VNInt<op24, op23, 0b10, op11_8, op4, + OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, IntOp, Commutable>; +} + + +// Neon Long 3-register vector operations. + +multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + SDNode OpNode, bit Commutable = 0> { + def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, OpNode, Commutable>; + def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, OpNode, Commutable>; + def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, OpNode, Commutable>; +} + +multiclass N3VLSL_HS<bit op24, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + def v4i16 : N3VLSL16<op24, 0b01, op11_8, itin, OpcodeStr, + !strconcat(Dt, "16"), v4i32, v4i16, OpNode>; + def v2i32 : N3VLSL<op24, 0b10, op11_8, itin, OpcodeStr, + !strconcat(Dt, "32"), v2i64, v2i32, OpNode>; +} + +multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + SDNode OpNode, SDNode ExtOp, bit Commutable = 0> { + def v8i16 : N3VLExt<op24, op23, 0b00, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, OpNode, ExtOp, Commutable>; + def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, OpNode, ExtOp, Commutable>; + def v2i64 : N3VLExt<op24, op23, 0b10, op11_8, op4, itin32, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, OpNode, ExtOp, Commutable>; +} + +// Neon Long 3-register vector intrinsics. + +// First with only element sizes of 16 and 32 bits: +multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + SDPatternOperator IntOp, bit Commutable = 0> { + def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, IntOp, Commutable>; + def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin32, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, IntOp, Commutable>; +} + +multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + SDPatternOperator IntOp> { + def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; + def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; +} + +// ....then also with element size of 8 bits: +multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + SDPatternOperator IntOp, bit Commutable = 0> + : N3VLInt_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, + IntOp, Commutable> { + def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, IntOp, Commutable>; +} + +// ....with explicit extend (VABDL). 
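+// (Illustration only: VABDL is a widening absolute difference, e.g.
+// "vabdl.s8 q0, d1, d2"; in arm_neon.h terms, roughly:
+//   int16x8_t r = vabdl_s8(a, b);  // per lane: |a - b|, widened to 16 bits
+// with a/b as hypothetical operand names.)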
+multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDPatternOperator IntOp, SDNode ExtOp, bit Commutable = 0> { + def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, IntOp, ExtOp, Commutable>; + def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, IntOp, ExtOp, Commutable>; + def v2i64 : N3VLIntExt<op24, op23, 0b10, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, IntOp, ExtOp, Commutable>; +} + + +// Neon Wide 3-register vector operations, +// source operand element sizes of 8, 16 and 32 bits: +multiclass N3VW_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, + SDNode OpNode, SDNode ExtOp, bit Commutable = 0> { + def v8i16 : N3VW<op24, op23, 0b00, op11_8, op4, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, OpNode, ExtOp, Commutable>; + def v4i32 : N3VW<op24, op23, 0b01, op11_8, op4, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, OpNode, ExtOp, Commutable>; + def v2i64 : N3VW<op24, op23, 0b10, op11_8, op4, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, OpNode, ExtOp, Commutable>; +} + + +// Neon Multiply-Op vector operations, +// element sizes of 8, 16 and 32 bits: +multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N3VDMulOp<op24, op23, 0b00, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "8"), v8i8, mul, OpNode>; + def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, OpNode>; + def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, itinD32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, OpNode>; + + // 128-bit vector types. + def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), v16i8, mul, OpNode>; + def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, mul, OpNode>; + def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, mul, OpNode>; +} + +multiclass N3VMulOpSL_HS<bits<4> op11_8, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, SDPatternOperator ShOp> { + def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, ShOp>; + def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, ShOp>; + def v8i16 : N3VQMulOpSL16<0b01, op11_8, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, + mul, ShOp>; + def v4i32 : N3VQMulOpSL<0b10, op11_8, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, + mul, ShOp>; +} + +// Neon Intrinsic-Op vector operations, +// element sizes of 8, 16 and 32 bits: +multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, SDPatternOperator IntOp, + SDNode OpNode> { + // 64-bit vector types.
+ def v8i8 : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "8"), v8i8, IntOp, OpNode>; + def v4i16 : N3VDIntOp<op24, op23, 0b01, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp, OpNode>; + def v2i32 : N3VDIntOp<op24, op23, 0b10, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp, OpNode>; + + // 128-bit vector types. + def v16i8 : N3VQIntOp<op24, op23, 0b00, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "8"), v16i8, IntOp, OpNode>; + def v8i16 : N3VQIntOp<op24, op23, 0b01, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "16"), v8i16, IntOp, OpNode>; + def v4i32 : N3VQIntOp<op24, op23, 0b10, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "32"), v4i32, IntOp, OpNode>; +} + +// Neon 3-argument intrinsics, +// element sizes of 16 and 32 bits: +multiclass N3VInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, SDPatternOperator IntOp> { + // 64-bit vector types. + def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>; + def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, itinD32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>; + + // 128-bit vector types. + def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>; + def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>; +} + +// element sizes of 8, 16 and 32 bits: +multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, SDPatternOperator IntOp> + :N3VInt3_HS <op24, op23, op11_8, op4, itinD16, itinD32, + itinQ16, itinQ32, OpcodeStr, Dt, IntOp>{ + // 64-bit vector types. + def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; + // 128-bit vector types. + def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>; +} + +// Neon Long Multiply-Op vector operations, +// element sizes of 8, 16 and 32 bits: +multiclass N3VLMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, SDNode MulOp, + SDNode OpNode> { + def v8i16 : N3VLMulOp<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr, + !strconcat(Dt, "8"), v8i16, v8i8, MulOp, OpNode>; + def v4i32 : N3VLMulOp<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr, + !strconcat(Dt, "16"), v4i32, v4i16, MulOp, OpNode>; + def v2i64 : N3VLMulOp<op24, op23, 0b10, op11_8, op4, itin32, OpcodeStr, + !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>; +} + +multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr, + string Dt, SDNode MulOp, SDNode OpNode> { + def v4i16 : N3VLMulOpSL16<op24, 0b01, op11_8, IIC_VMACi16D, OpcodeStr, + !strconcat(Dt,"16"), v4i32, v4i16, MulOp, OpNode>; + def v2i32 : N3VLMulOpSL<op24, 0b10, op11_8, IIC_VMACi32D, OpcodeStr, + !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>; +} + + +// Neon Long 3-argument intrinsics. 
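+// (Illustration only: a typical instance is VQDMLAL, e.g.
+// "vqdmlal.s16 q0, d1, d2"; in arm_neon.h terms, roughly:
+//   int32x4_t r = vqdmlal_s16(acc, a, b);  // acc + sat(2 * a * b), widened
+// with acc/a/b as hypothetical operand names.)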
+ +// First with only element sizes of 16 and 32 bits: +multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, SDPatternOperator IntOp> { + def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; + def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin32, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; +} + +multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8, + string OpcodeStr, string Dt, SDPatternOperator IntOp> { + def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D, + OpcodeStr, !strconcat(Dt,"16"), v4i32, v4i16, IntOp>; + def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; +} + +// ....then also with element size of 8 bits: +multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, SDPatternOperator IntOp> + : N3VLInt3_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp> { + def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>; +} + +// ....with explicit extend (VABAL). +multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDPatternOperator IntOp, SDNode ExtOp, SDNode OpNode> { + def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, + IntOp, ExtOp, OpNode>; + def v4i32 : N3VLIntExtOp<op24, op23, 0b01, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, + IntOp, ExtOp, OpNode>; + def v2i64 : N3VLIntExtOp<op24, op23, 0b10, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, + IntOp, ExtOp, OpNode>; +} + + +// Neon Pairwise long 2-register intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, SDPatternOperator IntOp> { + // 64-bit vector types. + def v8i8 : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>; + def v4i16 : N2VDPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>; + def v2i32 : N2VDPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>; + + // 128-bit vector types. + def v16i8 : N2VQPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>; + def v8i16 : N2VQPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>; + def v4i32 : N2VQPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>; +} + + +// Neon Pairwise long 2-register accumulate intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, SDPatternOperator IntOp> { + // 64-bit vector types. 
+ def v8i8 : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>; + def v4i16 : N2VDPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>; + def v2i32 : N2VDPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>; + + // 128-bit vector types. + def v16i8 : N2VQPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>; + def v8i16 : N2VQPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>; + def v4i32 : N2VQPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>; +} + + +// Neon 2-register vector shift by immediate, +// with f of either N2RegVShLFrm or N2RegVShRFrm +// element sizes of 8, 16, 32 and 64 bits: +multiclass N2VShL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; + // imm6 = xxxxxx +} +multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + string baseOpc, SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8, + OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64, + OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>; + // imm6 = xxxxxx + + // 128-bit vector types. 
+ def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8, + OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64, + OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; + // imm6 = xxxxxx +} + +// Neon Shift-Accumulate vector operations, +// element sizes of 8, 16, 32 and 64 bits: +multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, SDNode ShOp> { + // 64-bit vector types. + def v8i8 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm8, + OpcodeStr, !strconcat(Dt, "8"), v8i8, ShOp> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, ShOp> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, ShOp> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4, shr_imm64, + OpcodeStr, !strconcat(Dt, "64"), v1i64, ShOp>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm8, + OpcodeStr, !strconcat(Dt, "8"), v16i8, ShOp> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, ShOp> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, ShOp> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4, shr_imm64, + OpcodeStr, !strconcat(Dt, "64"), v2i64, ShOp>; + // imm6 = xxxxxx +} + +// Neon Shift-Insert vector operations, +// with f of either N2RegVShLFrm or N2RegVShRFrm +// element sizes of 8, 16, 32 and 64 bits: +multiclass N2VShInsL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr> { + // 64-bit vector types. + def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsli> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsli> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsli> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsli>; + // imm6 = xxxxxx + + // 128-bit vector types. 
+ def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsli> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsli> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsli> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsli>; + // imm6 = xxxxxx +} +multiclass N2VShInsR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr> { + // 64-bit vector types. + def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm8, + N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsri> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm16, + N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsri> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm32, + N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsri> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, shr_imm64, + N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsri>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm8, + N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsri> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm16, + N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsri> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm32, + N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsri> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, shr_imm64, + N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsri>; + // imm6 = xxxxxx +} + +// Neon Shift Long operations, +// element sizes of 8, 16, 32 bits: +multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, + bit op4, string OpcodeStr, string Dt, + SDPatternOperator OpNode> { + def v8i16 : N2VLSh<op24, op23, op11_8, op7, op6, op4, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, imm1_7, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i32 : N2VLSh<op24, op23, op11_8, op7, op6, op4, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, imm1_15, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i64 : N2VLSh<op24, op23, op11_8, op7, op6, op4, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, imm1_31, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } +} + +// Neon Shift Narrow operations, +// element sizes of 16, 32, 64 bits: +multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, + bit op4, InstrItinClass itin, string OpcodeStr, string Dt, + SDPatternOperator OpNode> { + def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, + OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, shr_imm8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, + OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, shr_imm16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, + OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, shr_imm32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } +} + 
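+// (Illustration only: the narrowing-shift multiclass above covers forms such
+// as "vshrn.i16 d0, q1, #8"; in arm_neon.h terms, roughly:
+//   int8x8_t r = vshrn_n_s16(v, 8);  // per lane: truncate(v >> 8)
+// with v as a hypothetical operand name.)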
+//===----------------------------------------------------------------------===// +// Instruction Definitions. +//===----------------------------------------------------------------------===// + +// Vector Add Operations. + +// VADD : Vector Add (integer and floating-point) +defm VADD : N3V_QHSD<0, 0, 0b1000, 0, IIC_VBINiD, IIC_VBINiQ, "vadd", "i", + add, 1>; +def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32", + v2f32, v2f32, fadd, 1>; +def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32", + v4f32, v4f32, fadd, 1>; +def VADDhd : N3VD<0, 0, 0b01, 0b1101, 0, IIC_VBIND, "vadd", "f16", + v4f16, v4f16, fadd, 1>, + Requires<[HasNEON,HasFullFP16]>; +def VADDhq : N3VQ<0, 0, 0b01, 0b1101, 0, IIC_VBINQ, "vadd", "f16", + v8f16, v8f16, fadd, 1>, + Requires<[HasNEON,HasFullFP16]>; +// VADDL : Vector Add Long (Q = D + D) +defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, + "vaddl", "s", add, sext, 1>; +defm VADDLu : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, + "vaddl", "u", add, zext, 1>; +// VADDW : Vector Add Wide (Q = Q + D) +defm VADDWs : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>; +defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>; +// VHADD : Vector Halving Add +defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "s", int_arm_neon_vhadds, 1>; +defm VHADDu : N3VInt_QHS<1, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "u", int_arm_neon_vhaddu, 1>; +// VRHADD : Vector Rounding Halving Add +defm VRHADDs : N3VInt_QHS<0, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "s", int_arm_neon_vrhadds, 1>; +defm VRHADDu : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "u", int_arm_neon_vrhaddu, 1>; +// VQADD : Vector Saturating Add +defm VQADDs : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "s", int_arm_neon_vqadds, 1>; +defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "u", int_arm_neon_vqaddu, 1>; +// VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) +defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>; +// VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q) +defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i", + int_arm_neon_vraddhn, 1>; + +def : Pat<(v8i8 (trunc (NEONvshru (add (v8i16 QPR:$Vn), QPR:$Vm), 8))), + (VADDHNv8i8 QPR:$Vn, QPR:$Vm)>; +def : Pat<(v4i16 (trunc (NEONvshru (add (v4i32 QPR:$Vn), QPR:$Vm), 16))), + (VADDHNv4i16 QPR:$Vn, QPR:$Vm)>; +def : Pat<(v2i32 (trunc (NEONvshru (add (v2i64 QPR:$Vn), QPR:$Vm), 32))), + (VADDHNv2i32 QPR:$Vn, QPR:$Vm)>; + +// Vector Multiply Operations. 
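+// (Illustration only: the basic integer form defined below is e.g.
+// "vmul.i16 d0, d1, d2"; in arm_neon.h terms, roughly:
+//   int16x4_t r = vmul_s16(a, b);  // per lane: low 16 bits of a * b
+// with a/b as hypothetical operand names.)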
+ +// VMUL : Vector Multiply (integer, polynomial and floating-point) +defm VMUL : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, "vmul", "i", mul, 1>; +def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul", + "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>; +def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul", + "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>; +def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32", + v2f32, v2f32, fmul, 1>; +def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32", + v4f32, v4f32, fmul, 1>; +def VMULhd : N3VD<1, 0, 0b01, 0b1101, 1, IIC_VFMULD, "vmul", "f16", + v4f16, v4f16, fmul, 1>, + Requires<[HasNEON,HasFullFP16]>; +def VMULhq : N3VQ<1, 0, 0b01, 0b1101, 1, IIC_VFMULQ, "vmul", "f16", + v8f16, v8f16, fmul, 1>, + Requires<[HasNEON,HasFullFP16]>; +defm VMULsl : N3VSL_HS<0b1000, "vmul", mul>; +def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>; +def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32, + v2f32, fmul>; +def VMULslhd : N3VDSL16<0b01, 0b1001, "vmul", "f16", v4f16, fmul>, + Requires<[HasNEON,HasFullFP16]>; +def VMULslhq : N3VQSL16<0b01, 0b1001, "vmul", "f16", v8f16, + v4f16, fmul>, + Requires<[HasNEON,HasFullFP16]>; + +def : Pat<(v8i16 (mul (v8i16 QPR:$src1), + (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))), + (v8i16 (VMULslv8i16 (v8i16 QPR:$src1), + (v4i16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (mul (v4i32 QPR:$src1), + (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))), + (v4i32 (VMULslv4i32 (v4i32 QPR:$src1), + (v2i32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; +def : Pat<(v4f32 (fmul (v4f32 QPR:$src1), + (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))), + (v4f32 (VMULslfq (v4f32 QPR:$src1), + (v2f32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + + +def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))), + (VMULslfd DPR:$Rn, + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0), + (i32 0))>; +def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))), + (VMULslfq QPR:$Rn, + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0), + (i32 0))>; + + +// VQDMULH : Vector Saturating Doubling Multiply Returning High Half +defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, + "vqdmulh", "s", int_arm_neon_vqdmulh, 1>; +defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, + "vqdmulh", "s", int_arm_neon_vqdmulh>; +def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1), + (v8i16 (NEONvduplane (v8i16 QPR:$src2), + imm:$lane)))), + (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1), + (v4i16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1), + (v4i32 (NEONvduplane (v4i32 QPR:$src2), + imm:$lane)))), + (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1), + (v2i32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +// VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half +defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm, + IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q, + "vqrdmulh", "s", int_arm_neon_vqrdmulh, 1>; +defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D, + 
IIC_VMULi16Q, IIC_VMULi32Q, + "vqrdmulh", "s", int_arm_neon_vqrdmulh>; +def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1), + (v8i16 (NEONvduplane (v8i16 QPR:$src2), + imm:$lane)))), + (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1), + (v4i16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1), + (v4i32 (NEONvduplane (v4i32 QPR:$src2), + imm:$lane)))), + (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1), + (v2i32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D) +let PostEncoderMethod = "NEONThumb2DataIPostEncoder", + DecoderNamespace = "NEONData" in { + defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, + "vmull", "s", NEONvmulls, 1>; + defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, + "vmull", "u", NEONvmullu, 1>; + def VMULLp8 : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8", + v8i16, v8i8, int_arm_neon_vmullp, 1>; + def VMULLp64 : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary, + "vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>, + Requires<[HasV8, HasCrypto]>; +} +defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>; +defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>; + +// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D) +defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D, + "vqdmull", "s", int_arm_neon_vqdmull, 1>; +defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D, + "vqdmull", "s", int_arm_neon_vqdmull>; + +// Vector Multiply-Accumulate and Multiply-Subtract Operations. + +// VMLA : Vector Multiply Accumulate (integer and floating-point) +defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; +def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", + v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; +def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", + v4f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; +def VMLAhd : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16", + v4f16, fmul_su, fadd_mlx>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; +def VMLAhq : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16", + v8f16, fmul_su, fadd_mlx>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; +defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; +def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", + v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; +def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32", + v4f32, v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; +def VMLAslhd : N3VDMulOpSL16<0b01, 0b0001, IIC_VMACD, "vmla", "f16", + v4f16, fmul, fadd>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; +def VMLAslhq : N3VQMulOpSL16<0b01, 0b0001, IIC_VMACQ, "vmla", "f16", + v8f16, v4f16, fmul, fadd>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; + +def : Pat<(v8i16 (add (v8i16 QPR:$src1), + (mul (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + +def : Pat<(v4i32 
(add (v4i32 QPR:$src1), + (mul (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1), + (fmul_su (v4f32 QPR:$src2), + (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (VMLAslfq (v4f32 QPR:$src1), + (v4f32 QPR:$src2), + (v2f32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>, + Requires<[HasNEON, UseFPVMLx]>; + +// VMLAL : Vector Multiply Accumulate Long (Q += D * D) +defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlal", "s", NEONvmulls, add>; +defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlal", "u", NEONvmullu, add>; + +defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>; +defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>; + +let Predicates = [HasNEON, HasV8_1a] in { + // v8.1a Neon Rounding Double Multiply-Op vector operations, + // VQRDMLAH : Vector Saturating Rounding Doubling Multiply Accumulate + // Returning High Half + defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", + null_frag>; + def : Pat<(v4i16 (int_arm_neon_vqadds + (v4i16 DPR:$src1), + (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))))), + (v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; + def : Pat<(v2i32 (int_arm_neon_vqadds + (v2i32 DPR:$src1), + (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))))), + (v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; + def : Pat<(v8i16 (int_arm_neon_vqadds + (v8i16 QPR:$src1), + (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), + (v8i16 QPR:$Vm))))), + (v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; + def : Pat<(v4i32 (int_arm_neon_vqadds + (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), + (v4i32 QPR:$Vm))))), + (v4i32 (VQRDMLAHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>; + + defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", + null_frag>; + def : Pat<(v4i16 (int_arm_neon_vqadds + (v4i16 DPR:$src1), + (v4i16 (int_arm_neon_vqrdmulh + (v4i16 DPR:$Vn), + (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + imm:$lane)))))), + (v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, + imm:$lane))>; + def : Pat<(v2i32 (int_arm_neon_vqadds + (v2i32 DPR:$src1), + (v2i32 (int_arm_neon_vqrdmulh + (v2i32 DPR:$Vn), + (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + imm:$lane)))))), + (v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, + imm:$lane))>; + def : Pat<(v8i16 (int_arm_neon_vqadds + (v8i16 QPR:$src1), + (v8i16 (int_arm_neon_vqrdmulh + (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), + imm:$lane)))))), + (v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1), + (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG + QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + def : Pat<(v4i32 (int_arm_neon_vqadds + (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqrdmulh + (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), + imm:$lane)))))), + (v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1), + (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG + QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + + // VQRDMLSH : Vector Saturating Rounding Doubling Multiply Subtract + // Returning High Half + defm VQRDMLSH :
N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", + null_frag>; + def : Pat<(v4i16 (int_arm_neon_vqsubs + (v4i16 DPR:$src1), + (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))))), + (v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; + def : Pat<(v2i32 (int_arm_neon_vqsubs + (v2i32 DPR:$src1), + (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))))), + (v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; + def : Pat<(v8i16 (int_arm_neon_vqsubs + (v8i16 QPR:$src1), + (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), + (v8i16 QPR:$Vm))))), + (v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; + def : Pat<(v4i32 (int_arm_neon_vqsubs + (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), + (v4i32 QPR:$Vm))))), + (v4i32 (VQRDMLSHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>; + + defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", + null_frag>; + def : Pat<(v4i16 (int_arm_neon_vqsubs + (v4i16 DPR:$src1), + (v4i16 (int_arm_neon_vqrdmulh + (v4i16 DPR:$Vn), + (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + imm:$lane)))))), + (v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; + def : Pat<(v2i32 (int_arm_neon_vqsubs + (v2i32 DPR:$src1), + (v2i32 (int_arm_neon_vqrdmulh + (v2i32 DPR:$Vn), + (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + imm:$lane)))))), + (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, + imm:$lane))>; + def : Pat<(v8i16 (int_arm_neon_vqsubs + (v8i16 QPR:$src1), + (v8i16 (int_arm_neon_vqrdmulh + (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), + imm:$lane)))))), + (v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1), + (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG + QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + def : Pat<(v4i32 (int_arm_neon_vqsubs + (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqrdmulh + (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), + imm:$lane)))))), + (v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1), + (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG + QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; +} +// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D) +defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, + "vqdmlal", "s", null_frag>; +defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>; + +def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))))), + (VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>; +def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), + (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))))), + (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; +def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), + (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + imm:$lane)))))), + (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; +def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), + (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), + (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + imm:$lane)))))), + (VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>; + +// VMLS : Vector Multiply Subtract (integer and floating-point) +defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; +def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", + v2f32, fmul_su, 
fsub_mlx>, + Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; +def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", + v4f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; +def VMLShd : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16", + v4f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; +def VMLShq : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16", + v8f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; +defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; +def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", + v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; +def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32", + v4f32, v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; +def VMLSslhd : N3VDMulOpSL16<0b01, 0b0101, IIC_VMACD, "vmls", "f16", + v4f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; +def VMLSslhq : N3VQMulOpSL16<0b01, 0b0101, IIC_VMACQ, "vmls", "f16", + v8f16, v4f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; + +def : Pat<(v8i16 (sub (v8i16 QPR:$src1), + (mul (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + +def : Pat<(v4i32 (sub (v4i32 QPR:$src1), + (mul (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1), + (fmul_su (v4f32 QPR:$src2), + (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2), + (v2f32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>, + Requires<[HasNEON, UseFPVMLx]>; + +// VMLSL : Vector Multiply Subtract Long (Q -= D * D) +defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlsl", "s", NEONvmulls, sub>; +defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlsl", "u", NEONvmullu, sub>; + +defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>; +defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>; + +// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D) +defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, + "vqdmlsl", "s", null_frag>; +defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl", "s", null_frag>; + +def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))))), + (VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>; +def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), + (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))))), + (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; +def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), + (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + imm:$lane)))))), + (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; +def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), + (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), + (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + imm:$lane)))))), + 
(VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>; + +// Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations. +def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32", + v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON,HasVFP4,UseFusedMAC]>; + +def VFMAfq : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32", + v4f32, fmul_su, fadd_mlx>, + Requires<[HasNEON,HasVFP4,UseFusedMAC]>; +def VFMAhd : N3VDMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACD, "vfma", "f16", + v4f16, fmul, fadd>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; + +def VFMAhq : N3VQMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACQ, "vfma", "f16", + v8f16, fmul, fadd>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; + +// Fused Vector Multiply Subtract (floating-point) +def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32", + v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEON,HasVFP4,UseFusedMAC]>; +def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32", + v4f32, fmul_su, fsub_mlx>, + Requires<[HasNEON,HasVFP4,UseFusedMAC]>; +def VFMShd : N3VDMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACD, "vfms", "f16", + v4f16, fmul, fsub>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; +def VFMShq : N3VQMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACQ, "vfms", "f16", + v8f16, fmul, fsub>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; + +// Match @llvm.fma.* intrinsics +def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), + (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasVFP4]>; +def : Pat<(v4f32 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)), + (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasVFP4]>; +def : Pat<(v2f32 (fma (fneg DPR:$Vn), DPR:$Vm, DPR:$src1)), + (VFMSfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasVFP4]>; +def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)), + (VFMSfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasVFP4]>; + +// Vector Subtract Operations. 
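+// (Illustration only: the widening form defined below (VSUBL) is e.g.
+// "vsubl.s8 q0, d1, d2"; in arm_neon.h terms, roughly:
+//   int16x8_t r = vsubl_s8(a, b);  // per lane: a - b, widened to 16 bits
+// with a/b as hypothetical operand names.)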
+
+// VSUB : Vector Subtract (integer and floating-point)
+defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, IIC_VSUBiD, IIC_VSUBiQ,
+                     "vsub", "i", sub, 0>;
+def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32",
+                  v2f32, v2f32, fsub, 0>;
+def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32",
+                  v4f32, v4f32, fsub, 0>;
+def VSUBhd : N3VD<0, 0, 0b11, 0b1101, 0, IIC_VBIND, "vsub", "f16",
+                  v4f16, v4f16, fsub, 0>,
+             Requires<[HasNEON,HasFullFP16]>;
+def VSUBhq : N3VQ<0, 0, 0b11, 0b1101, 0, IIC_VBINQ, "vsub", "f16",
+                  v8f16, v8f16, fsub, 0>,
+             Requires<[HasNEON,HasFullFP16]>;
+// VSUBL : Vector Subtract Long (Q = D - D)
+defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+                          "vsubl", "s", sub, sext, 0>;
+defm VSUBLu : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+                          "vsubl", "u", sub, zext, 0>;
+// VSUBW : Vector Subtract Wide (Q = Q - D)
+defm VSUBWs : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>;
+defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>;
+// VHSUB : Vector Halving Subtract
+defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm,
+                         IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                         "vhsub", "s", int_arm_neon_vhsubs, 0>;
+defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm,
+                         IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                         "vhsub", "u", int_arm_neon_vhsubu, 0>;
+// VQSUB : Vector Saturating Subtract
+defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm,
+                          IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                          "vqsub", "s", int_arm_neon_vqsubs, 0>;
+defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm,
+                          IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                          "vqsub", "u", int_arm_neon_vqsubu, 0>;
+// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q)
+defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>;
+// VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
+defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i",
+                           int_arm_neon_vrsubhn, 0>;
+
+def : Pat<(v8i8 (trunc (NEONvshru (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+          (VSUBHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+          (VSUBHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+          (VSUBHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
+// Vector Comparisons.
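+// All of these set each result lane to all ones when the comparison holds
+// and to all zeros otherwise, which is why the f32/f16 forms below produce
+// integer result types (v2i32, v4i16, ...). Roughly, per lane:
+//   d[i] = (n[i] OP m[i]) ? ~0 : 0;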
+ +// VCEQ : Vector Compare Equal +defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vceq", "i", NEONvceq, 1>; +def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, + NEONvceq, 1>; +def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, + NEONvceq, 1>; +def VCEQhd : N3VD<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16, + NEONvceq, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VCEQhq : N3VQ<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16, + NEONvceq, 1>, + Requires<[HasNEON, HasFullFP16]>; + +let TwoOperandAliasConstraint = "$Vm = $Vd" in +defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", + "$Vd, $Vm, #0", NEONvceqz>; + +// VCGE : Vector Compare Greater Than or Equal +defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>; +defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>; +def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, + NEONvcge, 0>; +def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, + NEONvcge, 0>; +def VCGEhd : N3VD<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16, + NEONvcge, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VCGEhq : N3VQ<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16, + NEONvcge, 0>, + Requires<[HasNEON, HasFullFP16]>; + +let TwoOperandAliasConstraint = "$Vm = $Vd" in { +defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", + "$Vd, $Vm, #0", NEONvcgez>; +defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s", + "$Vd, $Vm, #0", NEONvclez>; +} + +// VCGT : Vector Compare Greater Than +defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcgt", "s", NEONvcgt, 0>; +defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcgt", "u", NEONvcgtu, 0>; +def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, + NEONvcgt, 0>; +def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, + NEONvcgt, 0>; +def VCGThd : N3VD<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16, + NEONvcgt, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VCGThq : N3VQ<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16, + NEONvcgt, 0>, + Requires<[HasNEON, HasFullFP16]>; + +let TwoOperandAliasConstraint = "$Vm = $Vd" in { +defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", + "$Vd, $Vm, #0", NEONvcgtz>; +defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", + "$Vd, $Vm, #0", NEONvcltz>; +} + +// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) +def VACGEfd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", + "f32", v2i32, v2f32, int_arm_neon_vacge, 0>; +def VACGEfq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", + "f32", v4i32, v4f32, int_arm_neon_vacge, 0>; +def VACGEhd : N3VDInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", + "f16", v4i16, v4f16, int_arm_neon_vacge, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VACGEhq : N3VQInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", + "f16", v8i16, v8f16, int_arm_neon_vacge, 0>, + Requires<[HasNEON, HasFullFP16]>; +// VACGT : Vector Absolute Compare Greater Than (aka VCAGT) +def VACGTfd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", + "f32", v2i32, v2f32, int_arm_neon_vacgt, 0>; +def 
VACGTfq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
+                     "f32", v4i32, v4f32, int_arm_neon_vacgt, 0>;
+def VACGThd : N3VDInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
+                      "f16", v4i16, v4f16, int_arm_neon_vacgt, 0>,
+              Requires<[HasNEON, HasFullFP16]>;
+def VACGThq : N3VQInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
+                      "f16", v8i16, v8f16, int_arm_neon_vacgt, 0>,
+              Requires<[HasNEON, HasFullFP16]>;
+// VTST : Vector Test Bits
+defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                    IIC_VBINi4Q, "vtst", "", NEONvtst, 1>;
+
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm",
+                   (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm",
+                   (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm",
+                   (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm",
+                   (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+}
+
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm",
+                   (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm",
+                   (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm",
+                   (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm",
+                   (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm",
+                   (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm",
+                   (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm",
+                   (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm",
+                   (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+}
+
+// Vector Bitwise Operations.
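+// The vnotd/vnotq helper fragments below express bitwise NOT as XOR with an
+// all-ones vector, roughly vnot(x) == x ^ ~0 per lane, so that the VBIC,
+// VORN and VMVN patterns in this section can all match the same DAG shape.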
+ +def vnotd : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v8i8 NEONimmAllOnesV)))>; +def vnotq : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 NEONimmAllOnesV)))>; + + +// VAND : Vector Bitwise AND +def VANDd : N3VDX<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand", + v2i32, v2i32, and, 1>; +def VANDq : N3VQX<0, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "vand", + v4i32, v4i32, and, 1>; + +// VEOR : Vector Bitwise Exclusive OR +def VEORd : N3VDX<1, 0, 0b00, 0b0001, 1, IIC_VBINiD, "veor", + v2i32, v2i32, xor, 1>; +def VEORq : N3VQX<1, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "veor", + v4i32, v4i32, xor, 1>; + +// VORR : Vector Bitwise OR +def VORRd : N3VDX<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr", + v2i32, v2i32, or, 1>; +def VORRq : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr", + v4i32, v4i32, or, 1>; + +def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1, + (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src), + IIC_VMOVImm, + "vorr", "i16", "$Vd, $SIMM", "$src = $Vd", + [(set DPR:$Vd, + (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VORRiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 0, 1, + (outs DPR:$Vd), (ins nImmSplatI32:$SIMM, DPR:$src), + IIC_VMOVImm, + "vorr", "i32", "$Vd, $SIMM", "$src = $Vd", + [(set DPR:$Vd, + (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> { + let Inst{10-9} = SIMM{10-9}; +} + +def VORRiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 0, 1, + (outs QPR:$Vd), (ins nImmSplatI16:$SIMM, QPR:$src), + IIC_VMOVImm, + "vorr", "i16", "$Vd, $SIMM", "$src = $Vd", + [(set QPR:$Vd, + (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1, + (outs QPR:$Vd), (ins nImmSplatI32:$SIMM, QPR:$src), + IIC_VMOVImm, + "vorr", "i32", "$Vd, $SIMM", "$src = $Vd", + [(set QPR:$Vd, + (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> { + let Inst{10-9} = SIMM{10-9}; +} + + +// VBIC : Vector Bitwise Bit Clear (AND NOT) +let TwoOperandAliasConstraint = "$Vn = $Vd" in { +def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD, + "vbic", "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (v2i32 (and DPR:$Vn, + (vnotd DPR:$Vm))))]>; +def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), + (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ, + "vbic", "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (v4i32 (and QPR:$Vn, + (vnotq QPR:$Vm))))]>; +} + +def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1, + (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src), + IIC_VMOVImm, + "vbic", "i16", "$Vd, $SIMM", "$src = $Vd", + [(set DPR:$Vd, + (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VBICiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 1, 1, + (outs DPR:$Vd), (ins nImmSplatI32:$SIMM, DPR:$src), + IIC_VMOVImm, + "vbic", "i32", "$Vd, $SIMM", "$src = $Vd", + [(set DPR:$Vd, + (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> { + let Inst{10-9} = SIMM{10-9}; +} + +def VBICiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 1, 1, + (outs QPR:$Vd), (ins nImmSplatI16:$SIMM, QPR:$src), + IIC_VMOVImm, + "vbic", "i16", "$Vd, $SIMM", "$src = $Vd", + [(set QPR:$Vd, + (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VBICiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 1, 1, + (outs QPR:$Vd), (ins nImmSplatI32:$SIMM, QPR:$src), + IIC_VMOVImm, + "vbic", "i32", "$Vd, $SIMM", "$src = $Vd", + [(set QPR:$Vd, + (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> { + let Inst{10-9} = SIMM{10-9}; +} + +// VORN : Vector Bitwise OR NOT +def 
VORNd : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD, + "vorn", "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (v2i32 (or DPR:$Vn, + (vnotd DPR:$Vm))))]>; +def VORNq : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$Vd), + (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ, + "vorn", "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (v4i32 (or QPR:$Vn, + (vnotq QPR:$Vm))))]>; + +// VMVN : Vector Bitwise NOT (Immediate) + +let isReMaterializable = 1 in { + +def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$Vd), + (ins nImmSplatI16:$SIMM), IIC_VMOVImm, + "vmvn", "i16", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v4i16 (NEONvmvnImm timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$Vd), + (ins nImmSplatI16:$SIMM), IIC_VMOVImm, + "vmvn", "i16", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v8i16 (NEONvmvnImm timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$Vd), + (ins nImmVMOVI32:$SIMM), IIC_VMOVImm, + "vmvn", "i32", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v2i32 (NEONvmvnImm timm:$SIMM)))]> { + let Inst{11-8} = SIMM{11-8}; +} + +def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$Vd), + (ins nImmVMOVI32:$SIMM), IIC_VMOVImm, + "vmvn", "i32", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v4i32 (NEONvmvnImm timm:$SIMM)))]> { + let Inst{11-8} = SIMM{11-8}; +} +} + +// VMVN : Vector Bitwise NOT +def VMVNd : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0, + (outs DPR:$Vd), (ins DPR:$Vm), IIC_VSUBiD, + "vmvn", "$Vd, $Vm", "", + [(set DPR:$Vd, (v2i32 (vnotd DPR:$Vm)))]>; +def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vm), IIC_VSUBiD, + "vmvn", "$Vd, $Vm", "", + [(set QPR:$Vd, (v4i32 (vnotq QPR:$Vm)))]>; +def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>; +def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>; + +// VBSL : Vector Bitwise Select +def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vn, DPR:$Vm), + N3RegFrm, IIC_VCNTiD, + "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, + (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>; +def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1), + (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))), + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1), + (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))), + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1), + (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))), + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1), + (v2f32 DPR:$Vn), (v2f32 DPR:$Vm))), + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1), + (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))), + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON]>; + +def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd), + (and DPR:$Vm, (vnotd DPR:$Vd)))), + (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON]>; + +def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd), + (and DPR:$Vm, (vnotd DPR:$Vd)))), + (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON]>; + +def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vn, QPR:$Vm), + N3RegFrm, IIC_VCNTiQ, + "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, + (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>; + +def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1), + 
(v16i8 QPR:$Vn), (v16i8 QPR:$Vm))), + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1), + (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))), + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1), + (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))), + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1), + (v4f32 QPR:$Vn), (v4f32 QPR:$Vm))), + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1), + (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))), + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON]>; + +def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd), + (and QPR:$Vm, (vnotq QPR:$Vd)))), + (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v2i64 (or (and QPR:$Vn, QPR:$Vd), + (and QPR:$Vm, (vnotq QPR:$Vd)))), + (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON]>; + +// VBIF : Vector Bitwise Insert if False +// like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", +// FIXME: This instruction's encoding MAY NOT BE correct. +def VBIFd : N3VX<1, 0, 0b11, 0b0001, 0, 1, + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), + N3RegFrm, IIC_VBINiD, + "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd", + []>; +def VBIFq : N3VX<1, 0, 0b11, 0b0001, 1, 1, + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), + N3RegFrm, IIC_VBINiQ, + "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd", + []>; + +// VBIT : Vector Bitwise Insert if True +// like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst", +// FIXME: This instruction's encoding MAY NOT BE correct. +def VBITd : N3VX<1, 0, 0b10, 0b0001, 0, 1, + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), + N3RegFrm, IIC_VBINiD, + "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd", + []>; +def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), + N3RegFrm, IIC_VBINiQ, + "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd", + []>; + +// VBIT/VBIF are not yet implemented. The TwoAddress pass will not go looking +// for equivalent operations with different register constraints; it just +// inserts copies. + +// Vector Absolute Differences. 
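+// The VABDL patterns below match an open-coded absolute difference: with a
+// and b zero-extended to the wide result type and the arithmetic shift
+// s = (a - b) >> (width - 1) (all ones when a < b, zero otherwise), the
+// matched DAG is the usual branch-free form, roughly:
+//   |a - b| == ((a - b) + s) ^ s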
+ +// VABD : Vector Absolute Difference +defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vabd", "s", int_arm_neon_vabds, 1>; +defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vabd", "u", int_arm_neon_vabdu, 1>; +def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND, + "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>; +def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ, + "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>; +def VABDhd : N3VDInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBIND, + "vabd", "f16", v4f16, v4f16, int_arm_neon_vabds, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VABDhq : N3VQInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBINQ, + "vabd", "f16", v8f16, v8f16, int_arm_neon_vabds, 1>, + Requires<[HasNEON, HasFullFP16]>; + +// VABDL : Vector Absolute Difference Long (Q = | D - D |) +defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, + "vabdl", "s", int_arm_neon_vabds, zext, 1>; +defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, + "vabdl", "u", int_arm_neon_vabdu, zext, 1>; + +def abd_shr : + PatFrag<(ops node:$in1, node:$in2, node:$shift), + (NEONvshrs (sub (zext node:$in1), + (zext node:$in2)), (i32 $shift))>; + +def : Pat<(xor (v4i32 (bitconvert (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15)))), + (v4i32 (bitconvert (v8i16 (add (sub (zext (v8i8 DPR:$opA)), + (zext (v8i8 DPR:$opB))), + (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15))))))), + (VABDLuv8i16 DPR:$opA, DPR:$opB)>; + +def : Pat<(xor (v4i32 (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)), + (v4i32 (add (sub (zext (v4i16 DPR:$opA)), + (zext (v4i16 DPR:$opB))), + (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)))), + (VABDLuv4i32 DPR:$opA, DPR:$opB)>; + +def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))), + (v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)), + (zext (v2i32 DPR:$opB))), + (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))))), + (VABDLuv2i64 DPR:$opA, DPR:$opB)>; + +// VABA : Vector Absolute Difference and Accumulate +defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, + "vaba", "s", int_arm_neon_vabds, add>; +defm VABAu : N3VIntOp_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ, + "vaba", "u", int_arm_neon_vabdu, add>; + +// VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |) +defm VABALs : N3VLIntExtOp_QHS<0,1,0b0101,0, IIC_VABAD, + "vabal", "s", int_arm_neon_vabds, zext, add>; +defm VABALu : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD, + "vabal", "u", int_arm_neon_vabdu, zext, add>; + +// Vector Maximum and Minimum. 
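+// Note the two flavours of floating-point max/min here: VMAX/VMIN
+// (fmaxnan/fminnan) propagate a NaN input into the result, while the
+// ARMv8-only VMAXNM/VMINNM (fmaxnum/fminnum) implement IEEE 754-2008
+// maxNum/minNum and return the numeric operand when exactly one input is a
+// quiet NaN.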
+ +// VMAX : Vector Maximum +defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vmax", "s", smax, 1>; +defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vmax", "u", umax, 1>; +def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmax", "f32", + v2f32, v2f32, fmaxnan, 1>; +def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmax", "f32", + v4f32, v4f32, fmaxnan, 1>; +def VMAXhd : N3VDInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmax", "f16", + v4f16, v4f16, fmaxnan, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VMAXhq : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmax", "f16", + v8f16, v8f16, fmaxnan, 1>, + Requires<[HasNEON, HasFullFP16]>; + +// VMAXNM +let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { + def VMAXNMNDf : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f32", + v2f32, v2f32, fmaxnum, 1>, + Requires<[HasV8, HasNEON]>; + def VMAXNMNQf : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f32", + v4f32, v4f32, fmaxnum, 1>, + Requires<[HasV8, HasNEON]>; + def VMAXNMNDh : N3VDIntnp<0b00110, 0b01, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f16", + v4f16, v4f16, fmaxnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def VMAXNMNQh : N3VQIntnp<0b00110, 0b01, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f16", + v8f16, v8f16, fmaxnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; +} + +// VMIN : Vector Minimum +defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vmin", "s", smin, 1>; +defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vmin", "u", umin, 1>; +def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmin", "f32", + v2f32, v2f32, fminnan, 1>; +def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmin", "f32", + v4f32, v4f32, fminnan, 1>; +def VMINhd : N3VDInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmin", "f16", + v4f16, v4f16, fminnan, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VMINhq : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmin", "f16", + v8f16, v8f16, fminnan, 1>, + Requires<[HasNEON, HasFullFP16]>; + +// VMINNM +let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { + def VMINNMNDf : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vminnm", "f32", + v2f32, v2f32, fminnum, 1>, + Requires<[HasV8, HasNEON]>; + def VMINNMNQf : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vminnm", "f32", + v4f32, v4f32, fminnum, 1>, + Requires<[HasV8, HasNEON]>; + def VMINNMNDh : N3VDIntnp<0b00110, 0b11, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vminnm", "f16", + v4f16, v4f16, fminnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def VMINNMNQh : N3VQIntnp<0b00110, 0b11, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vminnm", "f16", + v8f16, v8f16, fminnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; +} + +// Vector Pairwise Operations. 
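+// Pairwise operations combine adjacent lanes drawn from both source
+// registers; e.g. "vpadd.i32 d0, d1, d2" computes, roughly:
+//   d0[0] = d1[0] + d1[1];
+//   d0[1] = d2[0] + d2[1];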
+ +// VPADD : Vector Pairwise Add +def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, N3RegFrm, IIC_VSHLiD, + "vpadd", "i8", + v8i8, v8i8, int_arm_neon_vpadd, 0>; +def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VSHLiD, + "vpadd", "i16", + v4i16, v4i16, int_arm_neon_vpadd, 0>; +def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD, + "vpadd", "i32", + v2i32, v2i32, int_arm_neon_vpadd, 0>; +def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, + IIC_VPBIND, "vpadd", "f32", + v2f32, v2f32, int_arm_neon_vpadd, 0>; +def VPADDh : N3VDInt<1, 0, 0b01, 0b1101, 0, N3RegFrm, + IIC_VPBIND, "vpadd", "f16", + v4f16, v4f16, int_arm_neon_vpadd, 0>, + Requires<[HasNEON, HasFullFP16]>; + +// VPADDL : Vector Pairwise Add Long +defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s", + int_arm_neon_vpaddls>; +defm VPADDLu : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpaddl", "u", + int_arm_neon_vpaddlu>; + +// VPADAL : Vector Pairwise Add and Accumulate Long +defm VPADALs : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01100, 0, "vpadal", "s", + int_arm_neon_vpadals>; +defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01101, 0, "vpadal", "u", + int_arm_neon_vpadalu>; + +// VPMAX : Vector Pairwise Maximum +def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "s8", v8i8, v8i8, int_arm_neon_vpmaxs, 0>; +def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "s16", v4i16, v4i16, int_arm_neon_vpmaxs, 0>; +def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "s32", v2i32, v2i32, int_arm_neon_vpmaxs, 0>; +def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "u8", v8i8, v8i8, int_arm_neon_vpmaxu, 0>; +def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>; +def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>; +def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax", + "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>; +def VPMAXh : N3VDInt<1, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax", + "f16", v4f16, v4f16, int_arm_neon_vpmaxs, 0>, + Requires<[HasNEON, HasFullFP16]>; + +// VPMIN : Vector Pairwise Minimum +def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "s8", v8i8, v8i8, int_arm_neon_vpmins, 0>; +def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "s16", v4i16, v4i16, int_arm_neon_vpmins, 0>; +def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "s32", v2i32, v2i32, int_arm_neon_vpmins, 0>; +def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "u8", v8i8, v8i8, int_arm_neon_vpminu, 0>; +def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>; +def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>; +def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin", + "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>; +def VPMINh : N3VDInt<1, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin", + "f16", v4f16, v4f16, int_arm_neon_vpmins, 0>, + Requires<[HasNEON, HasFullFP16]>; + +// Vector Reciprocal and Reciprocal Square Root Estimate and Step. 
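+// VRECPS and VRSQRTS are the step instructions for Newton-Raphson refinement
+// of the VRECPE/VRSQRTE estimates; architecturally they compute, per lane:
+//   vrecps(a, b)  == 2.0 - a * b
+//   vrsqrts(a, b) == (3.0 - a * b) / 2.0
+// so one refinement of an estimate x of 1/a is x' = x * vrecps(a, x).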
+ +// VRECPE : Vector Reciprocal Estimate +def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, + IIC_VUNAD, "vrecpe", "u32", + v2i32, v2i32, int_arm_neon_vrecpe>; +def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, + IIC_VUNAQ, "vrecpe", "u32", + v4i32, v4i32, int_arm_neon_vrecpe>; +def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, + IIC_VUNAD, "vrecpe", "f32", + v2f32, v2f32, int_arm_neon_vrecpe>; +def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, + IIC_VUNAQ, "vrecpe", "f32", + v4f32, v4f32, int_arm_neon_vrecpe>; +def VRECPEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0, + IIC_VUNAD, "vrecpe", "f16", + v4f16, v4f16, int_arm_neon_vrecpe>, + Requires<[HasNEON, HasFullFP16]>; +def VRECPEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0, + IIC_VUNAQ, "vrecpe", "f16", + v8f16, v8f16, int_arm_neon_vrecpe>, + Requires<[HasNEON, HasFullFP16]>; + +// VRECPS : Vector Reciprocal Step +def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrecps", "f32", + v2f32, v2f32, int_arm_neon_vrecps, 1>; +def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrecps", "f32", + v4f32, v4f32, int_arm_neon_vrecps, 1>; +def VRECPShd : N3VDInt<0, 0, 0b01, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrecps", "f16", + v4f16, v4f16, int_arm_neon_vrecps, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VRECPShq : N3VQInt<0, 0, 0b01, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrecps", "f16", + v8f16, v8f16, int_arm_neon_vrecps, 1>, + Requires<[HasNEON, HasFullFP16]>; + +// VRSQRTE : Vector Reciprocal Square Root Estimate +def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, + IIC_VUNAD, "vrsqrte", "u32", + v2i32, v2i32, int_arm_neon_vrsqrte>; +def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, + IIC_VUNAQ, "vrsqrte", "u32", + v4i32, v4i32, int_arm_neon_vrsqrte>; +def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, + IIC_VUNAD, "vrsqrte", "f32", + v2f32, v2f32, int_arm_neon_vrsqrte>; +def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, + IIC_VUNAQ, "vrsqrte", "f32", + v4f32, v4f32, int_arm_neon_vrsqrte>; +def VRSQRTEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0, + IIC_VUNAD, "vrsqrte", "f16", + v4f16, v4f16, int_arm_neon_vrsqrte>, + Requires<[HasNEON, HasFullFP16]>; +def VRSQRTEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0, + IIC_VUNAQ, "vrsqrte", "f16", + v8f16, v8f16, int_arm_neon_vrsqrte>, + Requires<[HasNEON, HasFullFP16]>; + +// VRSQRTS : Vector Reciprocal Square Root Step +def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrsqrts", "f32", + v2f32, v2f32, int_arm_neon_vrsqrts, 1>; +def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrsqrts", "f32", + v4f32, v4f32, int_arm_neon_vrsqrts, 1>; +def VRSQRTShd : N3VDInt<0, 0, 0b11, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrsqrts", "f16", + v4f16, v4f16, int_arm_neon_vrsqrts, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VRSQRTShq : N3VQInt<0, 0, 0b11, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrsqrts", "f16", + v8f16, v8f16, int_arm_neon_vrsqrts, 1>, + Requires<[HasNEON, HasFullFP16]>; + +// Vector Shifts. 
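+// In the register-shift forms below (VSHL, VRSHL, VQSHL, ...) the per-lane
+// shift amount is interpreted as signed: a positive amount shifts left and a
+// negative amount shifts right, roughly
+//   d[i] = (m[i] >= 0) ? n[i] << m[i] : n[i] >> -m[i];
+// with the s/u suffix selecting arithmetic or logical shifting in the
+// negative (right-shift) case.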
+ +// VSHL : Vector Shift +defm VSHLs : N3VInt_QHSDSh<0, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "s", int_arm_neon_vshifts>; +defm VSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "u", int_arm_neon_vshiftu>; + +// VSHL : Vector Shift Left (Immediate) +defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>; + +// VSHR : Vector Shift Right (Immediate) +defm VSHRs : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", "VSHRs", + NEONvshrs>; +defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", "VSHRu", + NEONvshru>; + +// VSHLL : Vector Shift Left Long +defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", + PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (sext node:$LHS), node:$RHS)>>; +defm VSHLLu : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u", + PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (zext node:$LHS), node:$RHS)>>; + +// VSHLL : Vector Shift Left Long (with maximum shift count) +class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy, + ValueType OpTy, Operand ImmTy> + : N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, Dt, + ResTy, OpTy, ImmTy, null_frag> { + let Inst{21-16} = op21_16; + let DecoderMethod = "DecodeVSHLMaxInstruction"; +} +def VSHLLi8 : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll", "i8", + v8i16, v8i8, imm8>; +def VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll", "i16", + v4i32, v4i16, imm16>; +def VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32", + v2i64, v2i32, imm32>; + +def : Pat<(v8i16 (NEONvshl (zext (v8i8 DPR:$Rn)), (i32 8))), + (VSHLLi8 DPR:$Rn, 8)>; +def : Pat<(v4i32 (NEONvshl (zext (v4i16 DPR:$Rn)), (i32 16))), + (VSHLLi16 DPR:$Rn, 16)>; +def : Pat<(v2i64 (NEONvshl (zext (v2i32 DPR:$Rn)), (i32 32))), + (VSHLLi32 DPR:$Rn, 32)>; +def : Pat<(v8i16 (NEONvshl (sext (v8i8 DPR:$Rn)), (i32 8))), + (VSHLLi8 DPR:$Rn, 8)>; +def : Pat<(v4i32 (NEONvshl (sext (v4i16 DPR:$Rn)), (i32 16))), + (VSHLLi16 DPR:$Rn, 16)>; +def : Pat<(v2i64 (NEONvshl (sext (v2i32 DPR:$Rn)), (i32 32))), + (VSHLLi32 DPR:$Rn, 32)>; + +// VSHRN : Vector Shift Right and Narrow +defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", + PatFrag<(ops node:$Rn, node:$amt), + (trunc (NEONvshrs node:$Rn, node:$amt))>>; + +def : Pat<(v8i8 (trunc (NEONvshru (v8i16 QPR:$Vn), shr_imm8:$amt))), + (VSHRNv8i8 QPR:$Vn, shr_imm8:$amt)>; +def : Pat<(v4i16 (trunc (NEONvshru (v4i32 QPR:$Vn), shr_imm16:$amt))), + (VSHRNv4i16 QPR:$Vn, shr_imm16:$amt)>; +def : Pat<(v2i32 (trunc (NEONvshru (v2i64 QPR:$Vn), shr_imm32:$amt))), + (VSHRNv2i32 QPR:$Vn, shr_imm32:$amt)>; + +// VRSHL : Vector Rounding Shift +defm VRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "s", int_arm_neon_vrshifts>; +defm VRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "u", int_arm_neon_vrshiftu>; +// VRSHR : Vector Rounding Shift Right +defm VRSHRs : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", "VRSHRs", + NEONvrshrs>; +defm VRSHRu : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", "VRSHRu", + NEONvrshru>; + +// VRSHRN : Vector Rounding Shift Right and Narrow +defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", + NEONvrshrn>; + +// VQSHL : Vector Saturating Shift +defm VQSHLs : 
N3VInt_QHSDSh<0, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "s", int_arm_neon_vqshifts>; +defm VQSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "u", int_arm_neon_vqshiftu>; +// VQSHL : Vector Saturating Shift Left (Immediate) +defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls>; +defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu>; + +// VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) +defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu>; + +// VQSHRN : Vector Saturating Shift Right and Narrow +defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s", + NEONvqshrns>; +defm VQSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "u", + NEONvqshrnu>; + +// VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned) +defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s", + NEONvqshrnsu>; + +// VQRSHL : Vector Saturating Rounding Shift +defm VQRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "s", int_arm_neon_vqrshifts>; +defm VQRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "u", int_arm_neon_vqrshiftu>; + +// VQRSHRN : Vector Saturating Rounding Shift Right and Narrow +defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s", + NEONvqrshrns>; +defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "u", + NEONvqrshrnu>; + +// VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned) +defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun", "s", + NEONvqrshrnsu>; + +// VSRA : Vector Shift Right and Accumulate +defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrs>; +defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshru>; +// VRSRA : Vector Rounding Shift Right and Accumulate +defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>; +defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>; + +// VSLI : Vector Shift Left and Insert +defm VSLI : N2VShInsL_QHSD<1, 1, 0b0101, 1, "vsli">; + +// VSRI : Vector Shift Right and Insert +defm VSRI : N2VShInsR_QHSD<1, 1, 0b0100, 1, "vsri">; + +// Vector Absolute and Saturating Absolute. 
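+// The integer VABS patterns below match the same branch-free identity used
+// for VABDL earlier, roughly abs(x) == (x + s) ^ s with the arithmetic shift
+// s = x >> (width - 1); VQABS additionally saturates, so INT_MIN maps to
+// INT_MAX instead of overflowing back to itself.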
+ +// VABS : Vector Absolute Value +defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, + IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s", + int_arm_neon_vabs>; +def VABSfd : N2VD<0b11, 0b11, 0b10, 0b01, 0b01110, 0, + "vabs", "f32", + v2f32, v2f32, fabs>; +def VABSfq : N2VQ<0b11, 0b11, 0b10, 0b01, 0b01110, 0, + "vabs", "f32", + v4f32, v4f32, fabs>; +def VABShd : N2VD<0b11, 0b11, 0b01, 0b01, 0b01110, 0, + "vabs", "f16", + v4f16, v4f16, fabs>, + Requires<[HasNEON, HasFullFP16]>; +def VABShq : N2VQ<0b11, 0b11, 0b01, 0b01, 0b01110, 0, + "vabs", "f16", + v8f16, v8f16, fabs>, + Requires<[HasNEON, HasFullFP16]>; + +def : Pat<(xor (v2i32 (bitconvert (v8i8 (NEONvshrs DPR:$src, (i32 7))))), + (v2i32 (bitconvert (v8i8 (add DPR:$src, + (NEONvshrs DPR:$src, (i32 7))))))), + (VABSv8i8 DPR:$src)>; +def : Pat<(xor (v2i32 (bitconvert (v4i16 (NEONvshrs DPR:$src, (i32 15))))), + (v2i32 (bitconvert (v4i16 (add DPR:$src, + (NEONvshrs DPR:$src, (i32 15))))))), + (VABSv4i16 DPR:$src)>; +def : Pat<(xor (v2i32 (NEONvshrs DPR:$src, (i32 31))), + (v2i32 (add DPR:$src, (NEONvshrs DPR:$src, (i32 31))))), + (VABSv2i32 DPR:$src)>; +def : Pat<(xor (v4i32 (bitconvert (v16i8 (NEONvshrs QPR:$src, (i32 7))))), + (v4i32 (bitconvert (v16i8 (add QPR:$src, + (NEONvshrs QPR:$src, (i32 7))))))), + (VABSv16i8 QPR:$src)>; +def : Pat<(xor (v4i32 (bitconvert (v8i16 (NEONvshrs QPR:$src, (i32 15))))), + (v4i32 (bitconvert (v8i16 (add QPR:$src, + (NEONvshrs QPR:$src, (i32 15))))))), + (VABSv8i16 QPR:$src)>; +def : Pat<(xor (v4i32 (NEONvshrs QPR:$src, (i32 31))), + (v4i32 (add QPR:$src, (NEONvshrs QPR:$src, (i32 31))))), + (VABSv4i32 QPR:$src)>; + +// VQABS : Vector Saturating Absolute Value +defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, + IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs", "s", + int_arm_neon_vqabs>; + +// Vector Negate. 
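+// Integer negate is modeled as subtraction from zero (the vnegd/vnegq
+// fragments below); VQNEG is its saturating variant, clamping the one
+// overflowing input, INT_MIN, to INT_MAX.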
+ +def vnegd : PatFrag<(ops node:$in), + (sub (bitconvert (v2i32 NEONimmAllZerosV)), node:$in)>; +def vnegq : PatFrag<(ops node:$in), + (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>; + +class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm), + IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (Ty (vnegd DPR:$Vm)))]>; +class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), + IIC_VSHLiQ, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (Ty (vnegq QPR:$Vm)))]>; + +// VNEG : Vector Negate (integer) +def VNEGs8d : VNEGD<0b00, "vneg", "s8", v8i8>; +def VNEGs16d : VNEGD<0b01, "vneg", "s16", v4i16>; +def VNEGs32d : VNEGD<0b10, "vneg", "s32", v2i32>; +def VNEGs8q : VNEGQ<0b00, "vneg", "s8", v16i8>; +def VNEGs16q : VNEGQ<0b01, "vneg", "s16", v8i16>; +def VNEGs32q : VNEGQ<0b10, "vneg", "s32", v4i32>; + +// VNEG : Vector Negate (floating-point) +def VNEGfd : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0, + (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD, + "vneg", "f32", "$Vd, $Vm", "", + [(set DPR:$Vd, (v2f32 (fneg DPR:$Vm)))]>; +def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ, + "vneg", "f32", "$Vd, $Vm", "", + [(set QPR:$Vd, (v4f32 (fneg QPR:$Vm)))]>; +def VNEGhd : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 0, 0, + (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD, + "vneg", "f16", "$Vd, $Vm", "", + [(set DPR:$Vd, (v4f16 (fneg DPR:$Vm)))]>, + Requires<[HasNEON, HasFullFP16]>; +def VNEGhq : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ, + "vneg", "f16", "$Vd, $Vm", "", + [(set QPR:$Vd, (v8f16 (fneg QPR:$Vm)))]>, + Requires<[HasNEON, HasFullFP16]>; + +def : Pat<(v8i8 (vnegd DPR:$src)), (VNEGs8d DPR:$src)>; +def : Pat<(v4i16 (vnegd DPR:$src)), (VNEGs16d DPR:$src)>; +def : Pat<(v2i32 (vnegd DPR:$src)), (VNEGs32d DPR:$src)>; +def : Pat<(v16i8 (vnegq QPR:$src)), (VNEGs8q QPR:$src)>; +def : Pat<(v8i16 (vnegq QPR:$src)), (VNEGs16q QPR:$src)>; +def : Pat<(v4i32 (vnegq QPR:$src)), (VNEGs32q QPR:$src)>; + +// VQNEG : Vector Saturating Negate +defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, + IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg", "s", + int_arm_neon_vqneg>; + +// Vector Bit Counting Operations. + +// VCLS : Vector Count Leading Sign Bits +defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, + IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s", + int_arm_neon_vcls>; +// VCLZ : Vector Count Leading Zeros +defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, + IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i", + ctlz>; +// VCNT : Vector Count One Bits +def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, + IIC_VCNTiD, "vcnt", "8", + v8i8, v8i8, ctpop>; +def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, + IIC_VCNTiQ, "vcnt", "8", + v16i8, v16i8, ctpop>; + +// Vector Swap +def VSWPd : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0, + (outs DPR:$Vd, DPR:$Vm), (ins DPR:$in1, DPR:$in2), + NoItinerary, "vswp", "$Vd, $Vm", "$in1 = $Vd, $in2 = $Vm", + []>; +def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0, + (outs QPR:$Vd, QPR:$Vm), (ins QPR:$in1, QPR:$in2), + NoItinerary, "vswp", "$Vd, $Vm", "$in1 = $Vd, $in2 = $Vm", + []>; + +// Vector Move Operations. 
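+// The register-to-register "vmov" spelling has no NEON encoding of its own;
+// the aliases below assemble it as a VORR with both source operands equal,
+// e.g. "vmov d0, d1" becomes "vorr d0, d1, d1".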
+
+// VMOV : Vector Move (Register)
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+                    (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+                    (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+
+// VMOV : Vector Move (Immediate)
+
+let isReMaterializable = 1 in {
+def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd),
+                        (ins nImmSplatI8:$SIMM), IIC_VMOVImm,
+                        "vmov", "i8", "$Vd, $SIMM", "",
+                        [(set DPR:$Vd, (v8i8 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI8:$SIMM), IIC_VMOVImm,
+                         "vmov", "i8", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v16i8 (NEONvmovImm timm:$SIMM)))]>;
+
+def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmov", "i16", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v4i16 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmov", "i16", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v8i16 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmov", "i32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2i32 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmov", "i32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4i32 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI64:$SIMM), IIC_VMOVImm,
+                         "vmov", "i64", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v1i64 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI64:$SIMM), IIC_VMOVImm,
+                         "vmov", "i64", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v2i64 (NEONvmovImm timm:$SIMM)))]>;
+
+def VMOVv2f32 : N1ModImm<1, 0b000, 0b1111, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmVMOVF32:$SIMM), IIC_VMOVImm,
+                         "vmov", "f32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2f32 (NEONvmovFPImm timm:$SIMM)))]>;
+def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmVMOVF32:$SIMM), IIC_VMOVImm,
+                         "vmov", "f32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>;
+} // isReMaterializable
+
+// Add support for the byte-replication feature, so we stay GAS-compatible.
+// E.g. the instructions below:
+//   "vmov.i32 d0, 0xffffffff"
+//   "vmov.i32 d0, 0xabababab"
+//   "vmov.i16 d0, 0xabab"
+// are not valid NEON immediate encodings as written, but we can accept them
+// by re-emitting them as byte splats; for the last two, for example, we emit:
+//   "vmov.i8 d0, 0xab"
+def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
+
+// Provide the same support for VMVN instructions, so that, for example,
+//   "vmvn.i32 d0, 0xabababab"
+// is emitted as:
+//   "vmov.i8 d0, 0x54"
+def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
+
+// On some CPUs the two instructions "vmov.i32 dD, #0" and "vmov.i32 qD, #0"
+// require zero cycles to execute, so they should be used wherever possible
+// for setting a register to zero.
+
+// Even without these pseudo-insts we would probably end up with the correct
+// instruction, but we could not mark the general ones with "isAsCheapAsAMove"
+// because on some CPUs they are rather expensive.
+
+let AddedComplexity = 50, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
+  def VMOVD0 : ARMPseudoExpand<(outs DPR:$Vd), (ins), 4, IIC_VMOVImm,
+                               [(set DPR:$Vd, (v2i32 NEONimmAllZerosV))],
+                               (VMOVv2i32 DPR:$Vd, 0, (ops 14, zero_reg))>,
+               Requires<[HasZCZ]>;
+  def VMOVQ0 : ARMPseudoExpand<(outs QPR:$Vd), (ins), 4, IIC_VMOVImm,
+                               [(set QPR:$Vd, (v4i32 NEONimmAllZerosV))],
+                               (VMOVv4i32 QPR:$Vd, 0, (ops 14, zero_reg))>,
+               Requires<[HasZCZ]>;
+}
+
+// VMOV : Vector Get Lane (move scalar to ARM core register)
+
+def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?},
+                         (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
+                         IIC_VMOVSI, "vmov", "s8", "$R, $V$lane",
+                         [(set GPR:$R, (NEONvgetlanes (v8i8 DPR:$V),
+                                        imm:$lane))]> {
+  let Inst{21} = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
+                          IIC_VMOVSI, "vmov", "s16", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlanes (v4i16 DPR:$V),
+                                         imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6} = lane{0};
+}
+def VGETLNu8 : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?},
+                         (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
+                         IIC_VMOVSI, "vmov", "u8", "$R, $V$lane",
+                         [(set GPR:$R, (NEONvgetlaneu (v8i8 DPR:$V),
+                                        imm:$lane))]> {
+  let Inst{21} = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
+                          IIC_VMOVSI, "vmov", "u16", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlaneu (v4i16 DPR:$V),
+                                         imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6} = lane{0};
+}
+def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00,
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex32:$lane),
+                          IIC_VMOVSI, "vmov", "32", "$R, $V$lane",
+                          [(set GPR:$R, (extractelt (v2i32 DPR:$V),
+                                         imm:$lane))]>,
+                Requires<[HasVFP2, HasFastVGETLNi32]> {
+  let Inst{21} = lane{0};
+}
+// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
+def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
+          (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+                           (DSubReg_i8_reg imm:$lane))),
+                    (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane),
+          (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i16_reg imm:$lane))),
+                     (SubReg_i16_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane),
+          (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+                           (DSubReg_i8_reg imm:$lane))),
+                    (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
+          (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i16_reg imm:$lane))),
+                     (SubReg_i16_lane imm:$lane))>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+          (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i32_reg imm:$lane))),
+                     (SubReg_i32_lane imm:$lane))>,
+      Requires<[HasNEON, HasFastVGETLNi32]>;
+def : Pat<(extractelt (v2i32 DPR:$src), imm:$lane),
+          (COPY_TO_REGCLASS
+            (i32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+      Requires<[HasNEON, HasSlowVGETLNi32]>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+          (COPY_TO_REGCLASS
+            (i32 (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+      Requires<[HasNEON, HasSlowVGETLNi32]>;
+def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)),
+                          (SSubReg_f32_reg imm:$src2))>;
+def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4f32 QPR:$src1),QPR_VFP2)),
+                          (SSubReg_f32_reg imm:$src2))>;
+//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2),
+//          (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+
+
+// VMOV : Vector Set Lane (move ARM core register to scalar)
+
+let Constraints = "$src1 = $V" in {
+def VSETLNi8 : NVSetLane<{1,1,1,0,0,1,?,0}, 0b1011, {?,?}, (outs DPR:$V),
+                         (ins DPR:$src1, GPR:$R, VectorIndex8:$lane),
+                         IIC_VMOVISL, "vmov", "8", "$V$lane, $R",
+                         [(set DPR:$V, (vector_insert (v8i8 DPR:$src1),
+                                        GPR:$R, imm:$lane))]> {
+  let Inst{21} = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VSETLNi16 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, {?,1}, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, VectorIndex16:$lane),
+                          IIC_VMOVISL, "vmov", "16", "$V$lane, $R",
+                          [(set DPR:$V, (vector_insert (v4i16 DPR:$src1),
+                                         GPR:$R, imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6} = lane{0};
+}
+def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, VectorIndex32:$lane),
+                          IIC_VMOVISL, "vmov", "32", "$V$lane, $R",
+                          [(set DPR:$V, (insertelt (v2i32 DPR:$src1),
+                                         GPR:$R, imm:$lane))]>,
+                Requires<[HasVFP2]> {
+  let Inst{21} = lane{0};
+  // This instruction is equivalent to
+  //   $V = INSERT_SUBREG $src1, $R, translateImmToSubIdx($imm)
+  let isInsertSubreg = 1;
+}
+}
+def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
+          (v16i8 (INSERT_SUBREG QPR:$src1,
+                  (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
+                                         (DSubReg_i8_reg imm:$lane))),
+                                  GPR:$src2, (SubReg_i8_lane imm:$lane))),
+                  (DSubReg_i8_reg imm:$lane)))>;
+def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
+          (v8i16 (INSERT_SUBREG QPR:$src1,
+                  (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
+                                            (DSubReg_i16_reg imm:$lane))),
+                                    GPR:$src2, (SubReg_i16_lane imm:$lane))),
+                  (DSubReg_i16_reg imm:$lane)))>;
+def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
+          (v4i32 (INSERT_SUBREG QPR:$src1,
+                  (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
+                                            (DSubReg_i32_reg imm:$lane))),
+                                    GPR:$src2, (SubReg_i32_lane imm:$lane))),
+                  (DSubReg_i32_reg imm:$lane)))>;
+
+def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)),
+          (INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)),
+                         SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
+          (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)),
+                         SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+
+//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+//          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+def :
Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)), + (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>; + +def : Pat<(v2f32 (scalar_to_vector SPR:$src)), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; +def : Pat<(v2f64 (scalar_to_vector (f64 DPR:$src))), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, dsub_0)>; +def : Pat<(v4f32 (scalar_to_vector SPR:$src)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; + +def : Pat<(v8i8 (scalar_to_vector GPR:$src)), + (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0))>; +def : Pat<(v4i16 (scalar_to_vector GPR:$src)), + (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0))>; +def : Pat<(v2i32 (scalar_to_vector GPR:$src)), + (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0))>; + +def : Pat<(v16i8 (scalar_to_vector GPR:$src)), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0)), + dsub_0)>; +def : Pat<(v8i16 (scalar_to_vector GPR:$src)), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0)), + dsub_0)>; +def : Pat<(v4i32 (scalar_to_vector GPR:$src)), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)), + dsub_0)>; + +// VDUP : Vector Duplicate (from ARM core register to all elements) + +class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty> + : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$V), (ins GPR:$R), + IIC_VMOVIS, "vdup", Dt, "$V, $R", + [(set DPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>; +class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty> + : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$V), (ins GPR:$R), + IIC_VMOVIS, "vdup", Dt, "$V, $R", + [(set QPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>; + +def VDUP8d : VDUPD<0b11101100, 0b00, "8", v8i8>; +def VDUP16d : VDUPD<0b11101000, 0b01, "16", v4i16>; +def VDUP32d : VDUPD<0b11101000, 0b00, "32", v2i32>, + Requires<[HasNEON, HasFastVDUP32]>; +def VDUP8q : VDUPQ<0b11101110, 0b00, "8", v16i8>; +def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>; +def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>; + +// NEONvdup patterns for uarchs with fast VDUP.32. +def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>, + Requires<[HasNEON,HasFastVDUP32]>; +def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>; + +// NEONvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead. +def : Pat<(v2i32 (NEONvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>, + Requires<[HasNEON,HasSlowVDUP32]>; +def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>, + Requires<[HasNEON,HasSlowVDUP32]>; + +// VDUP : Vector Duplicate Lane (from scalar to all elements) + +class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt, + ValueType Ty, Operand IdxTy> + : NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, IdxTy:$lane), + IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm$lane", + [(set DPR:$Vd, (Ty (NEONvduplane (Ty DPR:$Vm), imm:$lane)))]>; + +class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Operand IdxTy> + : NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, IdxTy:$lane), + IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm$lane", + [(set QPR:$Vd, (ResTy (NEONvduplane (OpTy DPR:$Vm), + VectorIndex32:$lane)))]>; + +// Inst{19-16} is partially specified depending on the element size. 
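+// Concretely, in the defs below: for ".8" the bottom bit of Inst{19-16} is 1
+// and the lane index sits in Inst{19-17}; for ".16" the bottom bits are 0b10
+// with the lane in Inst{19-18}; for ".32" they are 0b100 with the lane in
+// Inst{19} alone.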
+ +def VDUPLN8d : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8, VectorIndex8> { + bits<3> lane; + let Inst{19-17} = lane{2-0}; +} +def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16, VectorIndex16> { + bits<2> lane; + let Inst{19-18} = lane{1-0}; +} +def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32, VectorIndex32> { + bits<1> lane; + let Inst{19} = lane{0}; +} +def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8, VectorIndex8> { + bits<3> lane; + let Inst{19-17} = lane{2-0}; +} +def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16, VectorIndex16> { + bits<2> lane; + let Inst{19-18} = lane{1-0}; +} +def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32, VectorIndex32> { + bits<1> lane; + let Inst{19} = lane{0}; +} + +def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), + (VDUPLN32d DPR:$Vm, imm:$lane)>; + +def : Pat<(v4f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), + (VDUPLN32q DPR:$Vm, imm:$lane)>; + +def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)), + (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i8_reg imm:$lane))), + (SubReg_i8_lane imm:$lane)))>; +def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)), + (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)), + (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; +def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)), + (v4f32 (VDUPLN32q (v2f32 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +def : Pat<(v2f32 (NEONvdup (f32 SPR:$src))), + (v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$src, ssub_0), (i32 0)))>; +def : Pat<(v4f32 (NEONvdup (f32 SPR:$src))), + (v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$src, ssub_0), (i32 0)))>; + +// VMOVN : Vector Narrowing Move +defm VMOVN : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN, + "vmovn", "i", trunc>; +// VQMOVN : Vector Saturating Narrowing Move +defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD, + "vqmovn", "s", int_arm_neon_vqmovns>; +defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD, + "vqmovn", "u", int_arm_neon_vqmovnu>; +defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD, + "vqmovun", "s", int_arm_neon_vqmovnsu>; +// VMOVL : Vector Lengthening Move +defm VMOVLs : N2VL_QHS<0b01,0b10100,0,1, "vmovl", "s", sext>; +defm VMOVLu : N2VL_QHS<0b11,0b10100,0,1, "vmovl", "u", zext>; +def : Pat<(v8i16 (anyext (v8i8 DPR:$Vm))), (VMOVLuv8i16 DPR:$Vm)>; +def : Pat<(v4i32 (anyext (v4i16 DPR:$Vm))), (VMOVLuv4i32 DPR:$Vm)>; +def : Pat<(v2i64 (anyext (v2i32 DPR:$Vm))), (VMOVLuv2i64 DPR:$Vm)>; + +// Vector Conversions. 
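+// The plain VCVT forms below use the default float-to-integer behaviour of
+// rounding toward zero (fp_to_sint/fp_to_uint), while the ARMv8 VCVT{A,N,P,M}
+// variants each select an explicit rounding mode: to nearest with ties away
+// from zero, to nearest even, toward +infinity and toward -infinity,
+// respectively.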
+ +// VCVT : Vector Convert Between Floating-Point and Integers +def VCVTf2sd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32", + v2i32, v2f32, fp_to_sint>; +def VCVTf2ud : N2VD<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32", + v2i32, v2f32, fp_to_uint>; +def VCVTs2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", + v2f32, v2i32, sint_to_fp>; +def VCVTu2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", + v2f32, v2i32, uint_to_fp>; + +def VCVTf2sq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32", + v4i32, v4f32, fp_to_sint>; +def VCVTf2uq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32", + v4i32, v4f32, fp_to_uint>; +def VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", + v4f32, v4i32, sint_to_fp>; +def VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", + v4f32, v4i32, uint_to_fp>; + +def VCVTh2sd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16", + v4i16, v4f16, fp_to_sint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTh2ud : N2VD<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16", + v4i16, v4f16, fp_to_uint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTs2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16", + v4f16, v4i16, sint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTu2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16", + v4f16, v4i16, uint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; + +def VCVTh2sq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16", + v8i16, v8f16, fp_to_sint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTh2uq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16", + v8i16, v8f16, fp_to_uint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTs2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16", + v8f16, v8i16, sint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTu2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16", + v8f16, v8i16, uint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; + +// VCVT{A, N, P, M} +multiclass VCVT_FPI<string op, bits<3> op10_8, SDPatternOperator IntS, + SDPatternOperator IntU> { + let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { + def SDf : N2VDIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + "s32.f32", v2i32, v2f32, IntS>, Requires<[HasV8, HasNEON]>; + def SQf : N2VQIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + "s32.f32", v4i32, v4f32, IntS>, Requires<[HasV8, HasNEON]>; + def UDf : N2VDIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + "u32.f32", v2i32, v2f32, IntU>, Requires<[HasV8, HasNEON]>; + def UQf : N2VQIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + "u32.f32", v4i32, v4f32, IntU>, Requires<[HasV8, HasNEON]>; + def SDh : N2VDIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + "s16.f16", v4i16, v4f16, IntS>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def SQh : N2VQIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + "s16.f16", v8i16, v8f16, IntS>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def UDh : N2VDIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + "u16.f16", v4i16, v4f16, IntU>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def UQh : N2VQIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + "u16.f16", v8i16, v8f16, IntU>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + } +} + +defm VCVTAN : VCVT_FPI<"a", 0b000, int_arm_neon_vcvtas, int_arm_neon_vcvtau>; +defm 
VCVTNN : VCVT_FPI<"n", 0b001, int_arm_neon_vcvtns, int_arm_neon_vcvtnu>; +defm VCVTPN : VCVT_FPI<"p", 0b010, int_arm_neon_vcvtps, int_arm_neon_vcvtpu>; +defm VCVTMN : VCVT_FPI<"m", 0b011, int_arm_neon_vcvtms, int_arm_neon_vcvtmu>; + +// VCVT : Vector Convert Between Floating-Point and Fixed-Point. +let DecoderMethod = "DecodeVCVTD" in { +def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32", + v2i32, v2f32, int_arm_neon_vcvtfp2fxs>; +def VCVTf2xud : N2VCvtD<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32", + v2i32, v2f32, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", + v2f32, v2i32, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", + v2f32, v2i32, int_arm_neon_vcvtfxu2fp>; +let Predicates = [HasNEON, HasFullFP16] in { +def VCVTh2xsd : N2VCvtD<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16", + v4i16, v4f16, int_arm_neon_vcvtfp2fxs>; +def VCVTh2xud : N2VCvtD<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16", + v4i16, v4f16, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2hd : N2VCvtD<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16", + v4f16, v4i16, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2hd : N2VCvtD<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16", + v4f16, v4i16, int_arm_neon_vcvtfxu2fp>; +} // Predicates = [HasNEON, HasFullFP16] +} + +let DecoderMethod = "DecodeVCVTQ" in { +def VCVTf2xsq : N2VCvtQ<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32", + v4i32, v4f32, int_arm_neon_vcvtfp2fxs>; +def VCVTf2xuq : N2VCvtQ<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32", + v4i32, v4f32, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", + v4f32, v4i32, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", + v4f32, v4i32, int_arm_neon_vcvtfxu2fp>; +let Predicates = [HasNEON, HasFullFP16] in { +def VCVTh2xsq : N2VCvtQ<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16", + v8i16, v8f16, int_arm_neon_vcvtfp2fxs>; +def VCVTh2xuq : N2VCvtQ<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16", + v8i16, v8f16, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2hq : N2VCvtQ<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16", + v8f16, v8i16, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2hq : N2VCvtQ<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16", + v8f16, v8i16, int_arm_neon_vcvtfxu2fp>; +} // Predicates = [HasNEON, HasFullFP16] +} + +def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0", + (VCVTf2sd DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.u32.f32 $Dd, $Dm, #0", + (VCVTf2ud DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f32.s32 $Dd, $Dm, #0", + (VCVTs2fd DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f32.u32 $Dd, $Dm, #0", + (VCVTu2fd DPR:$Dd, DPR:$Dm, pred:$p)>; + +def : NEONInstAlias<"vcvt${p}.s32.f32 $Qd, $Qm, #0", + (VCVTf2sq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.u32.f32 $Qd, $Qm, #0", + (VCVTf2uq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0", + (VCVTs2fq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0", + (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>; + +def : NEONInstAlias<"vcvt${p}.s16.f16 $Dd, $Dm, #0", + (VCVTh2sd DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.u16.f16 $Dd, $Dm, #0", + (VCVTh2ud DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.s16 $Dd, $Dm, #0", + (VCVTs2hd DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.u16 $Dd, $Dm, #0", + (VCVTu2hd DPR:$Dd, DPR:$Dm, pred:$p)>; + +def : NEONInstAlias<"vcvt${p}.s16.f16 $Qd, $Qm, #0", + (VCVTh2sq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.u16.f16 $Qd, $Qm, 
#0", + (VCVTh2uq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.s16 $Qd, $Qm, #0", + (VCVTs2hq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.u16 $Qd, $Qm, #0", + (VCVTu2hq QPR:$Qd, QPR:$Qm, pred:$p)>; + + +// VCVT : Vector Convert Between Half-Precision and Single-Precision. +def VCVTf2h : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0, + IIC_VUNAQ, "vcvt", "f16.f32", + v4i16, v4f32, int_arm_neon_vcvtfp2hf>, + Requires<[HasNEON, HasFP16]>; +def VCVTh2f : N2VLInt<0b11, 0b11, 0b01, 0b10, 0b01110, 0, 0, + IIC_VUNAQ, "vcvt", "f32.f16", + v4f32, v4i16, int_arm_neon_vcvthf2fp>, + Requires<[HasNEON, HasFP16]>; + +// Vector Reverse. + +// VREV64 : Vector Reverse elements within 64-bit doublewords + +class VREV64D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VMOVD, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (Ty (NEONvrev64 (Ty DPR:$Vm))))]>; +class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VMOVQ, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (Ty (NEONvrev64 (Ty QPR:$Vm))))]>; + +def VREV64d8 : VREV64D<0b00, "vrev64", "8", v8i8>; +def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>; +def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>; +def : Pat<(v2f32 (NEONvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>; + +def VREV64q8 : VREV64Q<0b00, "vrev64", "8", v16i8>; +def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>; +def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>; +def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>; + +// VREV32 : Vector Reverse elements within 32-bit words + +class VREV32D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VMOVD, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (Ty (NEONvrev32 (Ty DPR:$Vm))))]>; +class VREV32Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VMOVQ, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (Ty (NEONvrev32 (Ty QPR:$Vm))))]>; + +def VREV32d8 : VREV32D<0b00, "vrev32", "8", v8i8>; +def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>; + +def VREV32q8 : VREV32Q<0b00, "vrev32", "8", v16i8>; +def VREV32q16 : VREV32Q<0b01, "vrev32", "16", v8i16>; + +// VREV16 : Vector Reverse elements within 16-bit halfwords + +class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VMOVD, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (Ty (NEONvrev16 (Ty DPR:$Vm))))]>; +class VREV16Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VMOVQ, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (Ty (NEONvrev16 (Ty QPR:$Vm))))]>; + +def VREV16d8 : VREV16D<0b00, "vrev16", "8", v8i8>; +def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>; + +// Other Vector Shuffles. 
+ +// Aligned extractions: really just dropping registers + +class AlignedVEXTq<ValueType DestTy, ValueType SrcTy, SDNodeXForm LaneCVT> + : Pat<(DestTy (vector_extract_subvec (SrcTy QPR:$src), (i32 imm:$start))), + (EXTRACT_SUBREG (SrcTy QPR:$src), (LaneCVT imm:$start))>; + +def : AlignedVEXTq<v8i8, v16i8, DSubReg_i8_reg>; + +def : AlignedVEXTq<v4i16, v8i16, DSubReg_i16_reg>; + +def : AlignedVEXTq<v2i32, v4i32, DSubReg_i32_reg>; + +def : AlignedVEXTq<v1i64, v2i64, DSubReg_f64_reg>; + +def : AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>; + + +// VEXT : Vector Extract + + +// All of these have a two-operand InstAlias. +let TwoOperandAliasConstraint = "$Vn = $Vd" in { +class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy> + : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$Vm, immTy:$index), NVExtFrm, + IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "", + [(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn), + (Ty DPR:$Vm), imm:$index)))]> { + bits<3> index; + let Inst{11} = 0b0; + let Inst{10-8} = index{2-0}; +} + +class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy> + : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$Vd), + (ins QPR:$Vn, QPR:$Vm, imm0_15:$index), NVExtFrm, + IIC_VEXTQ, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "", + [(set QPR:$Vd, (Ty (NEONvext (Ty QPR:$Vn), + (Ty QPR:$Vm), imm:$index)))]> { + bits<4> index; + let Inst{11-8} = index{3-0}; +} +} + +def VEXTd8 : VEXTd<"vext", "8", v8i8, imm0_7> { + let Inst{10-8} = index{2-0}; +} +def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> { + let Inst{10-9} = index{1-0}; + let Inst{8} = 0b0; +} +def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> { + let Inst{10} = index{0}; + let Inst{9-8} = 0b00; +} +def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn), + (v2f32 DPR:$Vm), + (i32 imm:$index))), + (VEXTd32 DPR:$Vn, DPR:$Vm, imm:$index)>; + +def VEXTq8 : VEXTq<"vext", "8", v16i8, imm0_15> { + let Inst{11-8} = index{3-0}; +} +def VEXTq16 : VEXTq<"vext", "16", v8i16, imm0_7> { + let Inst{11-9} = index{2-0}; + let Inst{8} = 0b0; +} +def VEXTq32 : VEXTq<"vext", "32", v4i32, imm0_3> { + let Inst{11-10} = index{1-0}; + let Inst{9-8} = 0b00; +} +def VEXTq64 : VEXTq<"vext", "64", v2i64, imm0_1> { + let Inst{11} = index{0}; + let Inst{10-8} = 0b000; +} +def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn), + (v4f32 QPR:$Vm), + (i32 imm:$index))), + (VEXTq32 QPR:$Vn, QPR:$Vm, imm:$index)>; + +// VTRN : Vector Transpose + +def VTRNd8 : N2VDShuffle<0b00, 0b00001, "vtrn", "8">; +def VTRNd16 : N2VDShuffle<0b01, 0b00001, "vtrn", "16">; +def VTRNd32 : N2VDShuffle<0b10, 0b00001, "vtrn", "32">; + +def VTRNq8 : N2VQShuffle<0b00, 0b00001, IIC_VPERMQ, "vtrn", "8">; +def VTRNq16 : N2VQShuffle<0b01, 0b00001, IIC_VPERMQ, "vtrn", "16">; +def VTRNq32 : N2VQShuffle<0b10, 0b00001, IIC_VPERMQ, "vtrn", "32">; + +// VUZP : Vector Unzip (Deinterleave) + +def VUZPd8 : N2VDShuffle<0b00, 0b00010, "vuzp", "8">; +def VUZPd16 : N2VDShuffle<0b01, 0b00010, "vuzp", "16">; +// vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. +def : NEONInstAlias<"vuzp${p}.32 $Dd, $Dm", + (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>; + +def VUZPq8 : N2VQShuffle<0b00, 0b00010, IIC_VPERMQ3, "vuzp", "8">; +def VUZPq16 : N2VQShuffle<0b01, 0b00010, IIC_VPERMQ3, "vuzp", "16">; +def VUZPq32 : N2VQShuffle<0b10, 0b00010, IIC_VPERMQ3, "vuzp", "32">; + +// VZIP : Vector Zip (Interleave) + +def VZIPd8 : N2VDShuffle<0b00, 0b00011, "vzip", "8">; +def VZIPd16 : N2VDShuffle<0b01, 0b00011, "vzip", "16">; +// vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. 
+def : NEONInstAlias<"vzip${p}.32 $Dd, $Dm", + (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>; + +def VZIPq8 : N2VQShuffle<0b00, 0b00011, IIC_VPERMQ3, "vzip", "8">; +def VZIPq16 : N2VQShuffle<0b01, 0b00011, IIC_VPERMQ3, "vzip", "16">; +def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">; + +// Vector Table Lookup and Table Extension. + +// VTBL : Vector Table Lookup +let DecoderMethod = "DecodeTBLInstruction" in { +def VTBL1 + : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$Vd), + (ins VecListOneD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB1, + "vtbl", "8", "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbl1 VecListOneD:$Vn, DPR:$Vm)))]>; +let hasExtraSrcRegAllocReq = 1 in { +def VTBL2 + : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$Vd), + (ins VecListDPair:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB2, + "vtbl", "8", "$Vd, $Vn, $Vm", "", []>; +def VTBL3 + : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$Vd), + (ins VecListThreeD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB3, + "vtbl", "8", "$Vd, $Vn, $Vm", "", []>; +def VTBL4 + : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$Vd), + (ins VecListFourD:$Vn, DPR:$Vm), + NVTBLFrm, IIC_VTB4, + "vtbl", "8", "$Vd, $Vn, $Vm", "", []>; +} // hasExtraSrcRegAllocReq = 1 + +def VTBL3Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB3, "", []>; +def VTBL4Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB4, "", []>; + +// VTBX : Vector Table Extension +def VTBX1 + : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$Vd), + (ins DPR:$orig, VecListOneD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX1, + "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd", + [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbx1 + DPR:$orig, VecListOneD:$Vn, DPR:$Vm)))]>; +let hasExtraSrcRegAllocReq = 1 in { +def VTBX2 + : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$Vd), + (ins DPR:$orig, VecListDPair:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX2, + "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd", []>; +def VTBX3 + : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$Vd), + (ins DPR:$orig, VecListThreeD:$Vn, DPR:$Vm), + NVTBLFrm, IIC_VTBX3, + "vtbx", "8", "$Vd, $Vn, $Vm", + "$orig = $Vd", []>; +def VTBX4 + : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$Vd), + (ins DPR:$orig, VecListFourD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX4, + "vtbx", "8", "$Vd, $Vn, $Vm", + "$orig = $Vd", []>; +} // hasExtraSrcRegAllocReq = 1 + +def VTBX3Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src), + IIC_VTBX3, "$orig = $dst", []>; +def VTBX4Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src), + IIC_VTBX4, "$orig = $dst", []>; +} // DecoderMethod = "DecodeTBLInstruction" + +// VRINT : Vector Rounding +multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> { + let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { + def Df : N2VDIntnp<0b10, 0b10, 0b100, 0, NoItinerary, + !strconcat("vrint", op), "f32", + v2f32, v2f32, Int>, Requires<[HasV8, HasNEON]> { + let Inst{9-7} = op9_7; + } + def Qf : N2VQIntnp<0b10, 0b10, 0b100, 0, NoItinerary, + !strconcat("vrint", op), "f32", + v4f32, v4f32, Int>, Requires<[HasV8, HasNEON]> { + let Inst{9-7} = op9_7; + } + def Dh : N2VDIntnp<0b01, 0b10, 0b100, 0, NoItinerary, + !strconcat("vrint", op), "f16", + v4f16, v4f16, Int>, + Requires<[HasV8, HasNEON, HasFullFP16]> { + let Inst{9-7} = op9_7; + } + def Qh : N2VQIntnp<0b01, 0b10, 0b100, 0, NoItinerary, + !strconcat("vrint", op), "f16", + v8f16, v8f16, Int>, + Requires<[HasV8, HasNEON, HasFullFP16]> { + let Inst{9-7} = op9_7; + } + } + + def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"), + 
(!cast<Instruction>(NAME#"Df") DPR:$Dd, DPR:$Dm)>; + def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"), + (!cast<Instruction>(NAME#"Qf") QPR:$Qd, QPR:$Qm)>; + let Predicates = [HasNEON, HasFullFP16] in { + def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Dd, $Dm"), + (!cast<Instruction>(NAME#"Dh") DPR:$Dd, DPR:$Dm)>; + def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Qd, $Qm"), + (!cast<Instruction>(NAME#"Qh") QPR:$Qd, QPR:$Qm)>; + } +} + +defm VRINTNN : VRINT_FPI<"n", 0b000, int_arm_neon_vrintn>; +defm VRINTXN : VRINT_FPI<"x", 0b001, int_arm_neon_vrintx>; +defm VRINTAN : VRINT_FPI<"a", 0b010, int_arm_neon_vrinta>; +defm VRINTZN : VRINT_FPI<"z", 0b011, int_arm_neon_vrintz>; +defm VRINTMN : VRINT_FPI<"m", 0b101, int_arm_neon_vrintm>; +defm VRINTPN : VRINT_FPI<"p", 0b111, int_arm_neon_vrintp>; + +// Cryptography instructions +let PostEncoderMethod = "NEONThumb2DataIPostEncoder", + DecoderNamespace = "v8Crypto", hasSideEffects = 0 in { + class AES<string op, bit op7, bit op6, SDPatternOperator Int> + : N2VQIntXnp<0b00, 0b00, 0b011, op6, op7, NoItinerary, + !strconcat("aes", op), "8", v16i8, v16i8, Int>, + Requires<[HasV8, HasCrypto]>; + class AES2Op<string op, bit op7, bit op6, SDPatternOperator Int> + : N2VQIntX2np<0b00, 0b00, 0b011, op6, op7, NoItinerary, + !strconcat("aes", op), "8", v16i8, v16i8, Int>, + Requires<[HasV8, HasCrypto]>; + class N2SHA<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6, + SDPatternOperator Int> + : N2VQIntXnp<0b10, op17_16, op10_8, op6, op7, NoItinerary, + !strconcat("sha", op), "32", v4i32, v4i32, Int>, + Requires<[HasV8, HasCrypto]>; + class N2SHA2Op<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6, + SDPatternOperator Int> + : N2VQIntX2np<0b10, op17_16, op10_8, op6, op7, NoItinerary, + !strconcat("sha", op), "32", v4i32, v4i32, Int>, + Requires<[HasV8, HasCrypto]>; + class N3SHA3Op<string op, bits<5> op27_23, bits<2> op21_20, SDPatternOperator Int> + : N3VQInt3np<op27_23, op21_20, 0b1100, 1, 0, N3RegFrm, NoItinerary, + !strconcat("sha", op), "32", v4i32, v4i32, Int, 0>, + Requires<[HasV8, HasCrypto]>; +} + +def AESD : AES2Op<"d", 0, 1, int_arm_neon_aesd>; +def AESE : AES2Op<"e", 0, 0, int_arm_neon_aese>; +def AESIMC : AES<"imc", 1, 1, int_arm_neon_aesimc>; +def AESMC : AES<"mc", 1, 0, int_arm_neon_aesmc>; + +def SHA1H : N2SHA<"1h", 0b01, 0b010, 1, 1, null_frag>; +def SHA1SU1 : N2SHA2Op<"1su1", 0b10, 0b011, 1, 0, int_arm_neon_sha1su1>; +def SHA256SU0 : N2SHA2Op<"256su0", 0b10, 0b011, 1, 1, int_arm_neon_sha256su0>; +def SHA1C : N3SHA3Op<"1c", 0b00100, 0b00, null_frag>; +def SHA1M : N3SHA3Op<"1m", 0b00100, 0b10, null_frag>; +def SHA1P : N3SHA3Op<"1p", 0b00100, 0b01, null_frag>; +def SHA1SU0 : N3SHA3Op<"1su0", 0b00100, 0b11, int_arm_neon_sha1su0>; +def SHA256H : N3SHA3Op<"256h", 0b00110, 0b00, int_arm_neon_sha256h>; +def SHA256H2 : N3SHA3Op<"256h2", 0b00110, 0b01, int_arm_neon_sha256h2>; +def SHA256SU1 : N3SHA3Op<"256su1", 0b00110, 0b10, int_arm_neon_sha256su1>; + +def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)), + (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG + (SHA1H (SUBREG_TO_REG (i64 0), + (f32 (COPY_TO_REGCLASS i32:$Rn, SPR)), + ssub_0)), + ssub_0)), GPR)>; + +def : Pat<(v4i32 (int_arm_neon_sha1c v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)), + (SHA1C v4i32:$hash_abcd, + (SUBREG_TO_REG (i64 0), + (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)), + ssub_0), + v4i32:$wk)>; + +def : Pat<(v4i32 (int_arm_neon_sha1m v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)), + (SHA1M v4i32:$hash_abcd, + (SUBREG_TO_REG (i64 0), + (f32 
(COPY_TO_REGCLASS i32:$hash_e, SPR)), + ssub_0), + v4i32:$wk)>; + +def : Pat<(v4i32 (int_arm_neon_sha1p v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)), + (SHA1P v4i32:$hash_abcd, + (SUBREG_TO_REG (i64 0), + (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)), + ssub_0), + v4i32:$wk)>; + +//===----------------------------------------------------------------------===// +// NEON instructions for single-precision FP math +//===----------------------------------------------------------------------===// + +class N2VSPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$a)), + (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (Inst + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$a, ssub_0)), DPR_VFP2)), ssub_0)>; + +class N3VSPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)), + (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (Inst + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$a, ssub_0), + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>; + +class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))), + (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (Inst + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$acc, ssub_0), + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$a, ssub_0), + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>; + +class NVCVTIFPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode GPR:$a)), + (f32 (EXTRACT_SUBREG + (v2f32 (Inst + (INSERT_SUBREG + (v2f32 (IMPLICIT_DEF)), + (i32 (COPY_TO_REGCLASS GPR:$a, SPR)), ssub_0))), + ssub_0))>; +class NVCVTFIPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(i32 (OpNode SPR:$a)), + (i32 (EXTRACT_SUBREG + (v2f32 (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + SPR:$a, ssub_0))), + ssub_0))>; + +def : N3VSPat<fadd, VADDfd>; +def : N3VSPat<fsub, VSUBfd>; +def : N3VSPat<fmul, VMULfd>; +def : N3VSMulOpPat<fmul, fadd, VMLAfd>, + Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>; +def : N3VSMulOpPat<fmul, fsub, VMLSfd>, + Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>; +def : N3VSMulOpPat<fmul, fadd, VFMAfd>, + Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>; +def : N3VSMulOpPat<fmul, fsub, VFMSfd>, + Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>; +def : N2VSPat<fabs, VABSfd>; +def : N2VSPat<fneg, VNEGfd>; +def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>; +def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>; +def : NVCVTFIPat<fp_to_sint, VCVTf2sd>; +def : NVCVTFIPat<fp_to_uint, VCVTf2ud>; +def : NVCVTIFPat<sint_to_fp, VCVTs2fd>; +def : NVCVTIFPat<uint_to_fp, VCVTu2fd>; + +// NEON doesn't have any f64 conversions, so provide patterns to make +// sure the VFP conversions match when extracting from a vector. 
+def : VFPPat<(f64 (sint_to_fp (extractelt (v2i32 DPR:$src), imm:$lane))), + (VSITOD (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>; +def : VFPPat<(f64 (sint_to_fp (extractelt (v4i32 QPR:$src), imm:$lane))), + (VSITOD (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane)))>; +def : VFPPat<(f64 (uint_to_fp (extractelt (v2i32 DPR:$src), imm:$lane))), + (VUITOD (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>; +def : VFPPat<(f64 (uint_to_fp (extractelt (v4i32 QPR:$src), imm:$lane))), + (VUITOD (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane)))>; + + +// Prefer VMOVDRR for i32 -> f32 bitcasts, it can write all DPR registers. +def : Pat<(f32 (bitconvert GPR:$a)), + (EXTRACT_SUBREG (VMOVDRR GPR:$a, GPR:$a), ssub_0)>, + Requires<[HasNEON, DontUseVMOVSR]>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// bit_convert +let Predicates = [IsLE] in { + def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>; +} +def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>; +} +def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>; +} +def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>; +} +def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>; +} + +let Predicates = [IsLE] in { + def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; +} +def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; +let Predicates = [IsLE] in { 
+ def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; +} +def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; +} +def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; +} +def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; +} + +let Predicates = [IsBE] in { + // 64 bit conversions + def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (VREV32d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (VREV16d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>; + def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), 
(VREV64d32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; + + // 128 bit conversions + def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>; +} + +// Fold extracting an element out of a v2i32 into a vfp register. +def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))), + (f32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>; + +// Vector lengthening move with load, matching extending loads. + +// extload, zextload and sextload for a standard lengthening load. 
Example: +// Lengthen_Single<"8", "i16", "8"> = +// Pat<(v8i16 (extloadvi8 addrmode6:$addr)) +// (VMOVLuv8i16 (VLD1d8 addrmode6:$addr, +// (f64 (IMPLICIT_DEF)), (i32 0)))>; +multiclass Lengthen_Single<string DestLanes, string DestTy, string SrcTy> { + let AddedComplexity = 10 in { + def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("extloadvi" # SrcTy) addrmode6:$addr)), + (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy) + (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>; + + def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("zextloadvi" # SrcTy) addrmode6:$addr)), + (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy) + (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>; + + def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("sextloadvi" # SrcTy) addrmode6:$addr)), + (!cast<Instruction>("VMOVLsv" # DestLanes # DestTy) + (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>; + } +} + +// extload, zextload and sextload for a lengthening load which only uses +// half the lanes available. Example: +// Lengthen_HalfSingle<"4", "i16", "8", "i16", "i8"> = +// Pat<(v4i16 (extloadvi8 addrmode6oneL32:$addr)), +// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, +// (f64 (IMPLICIT_DEF)), (i32 0))), +// dsub_0)>; +multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy, + string InsnLanes, string InsnTy> { + def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0)>; + def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0)>; + def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0)>; +} + +// The following class definition is basically a copy of the +// Lengthen_HalfSingle definition above, however with an additional parameter +// "RevLanes" to select the correct VREV32dXX instruction. This is to convert +// data loaded by VLD1LN into proper vector format in big endian mode. 
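+// For example (zextload case, cf. the Lengthen_HalfSingle example above):
+// Lengthen_HalfSingle_Big_Endian<"4", "i16", "i8", "8", "i16", "8"> =
+// Pat<(v4i16 (zextloadvi8 addrmode6oneL32:$addr)),
+//     (EXTRACT_SUBREG (VMOVLuv8i16 (VREV32d8
+//        (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+//      dsub_0)>;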
+multiclass Lengthen_HalfSingle_Big_Endian<string DestLanes, string DestTy, string SrcTy, + string InsnLanes, string InsnTy, string RevLanes> { + def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy) + (!cast<Instruction>("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)>; + def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy) + (!cast<Instruction>("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)>; + def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy) + (!cast<Instruction>("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)>; +} + +// extload, zextload and sextload for a lengthening load followed by another +// lengthening load, to quadruple the initial length. +// +// Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32"> = +// Pat<(v4i32 (extloadvi8 addrmode6oneL32:$addr)) +// (EXTRACT_SUBREG (VMOVLuv4i32 +// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, +// (f64 (IMPLICIT_DEF)), +// (i32 0))), +// dsub_0)), +// dsub_0)>; +multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy, + string Insn1Lanes, string Insn1Ty, string Insn2Lanes, + string Insn2Ty> { + def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), + (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0))>; + def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), + (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0))>; + def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), + (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0))>; +} + +// The following class definition is basically a copy of the +// Lengthen_Double definition above, however with an additional parameter +// "RevLanes" to select the correct VREV32dXX instruction. This is to convert +// data loaded by VLD1LN into proper vector format in big endian mode. 
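+// For example (sextload case):
+// Lengthen_Double_Big_Endian<"4", "i32", "i8", "8", "i16", "4", "i32", "8"> =
+// Pat<(v4i32 (sextloadvi8 addrmode6oneL32:$addr)),
+//     (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16 (VREV32d8
+//        (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+//      dsub_0))>;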
+multiclass Lengthen_Double_Big_Endian<string DestLanes, string DestTy, string SrcTy, + string Insn1Lanes, string Insn1Ty, string Insn2Lanes, + string Insn2Ty, string RevLanes> { + def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), + (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) + (!cast<Instruction>("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0))>; + def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), + (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) + (!cast<Instruction>("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0))>; + def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), + (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty) + (!cast<Instruction>("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0))>; +} + +// extload, zextload and sextload for a lengthening load followed by another +// lengthening load, to quadruple the initial length, but which ends up only +// requiring half the available lanes (a 64-bit outcome instead of a 128-bit). +// +// Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32"> = +// Pat<(v2i32 (extloadvi8 addrmode6:$addr)) +// (EXTRACT_SUBREG (VMOVLuv4i32 +// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd16 addrmode6:$addr, +// (f64 (IMPLICIT_DEF)), (i32 0))), +// dsub_0)), +// dsub_0)>; +multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy, + string Insn1Lanes, string Insn1Ty, string Insn2Lanes, + string Insn2Ty> { + def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0)), + dsub_0)>; + def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0)), + dsub_0)>; + def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty) + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), + dsub_0)), + dsub_0)>; +} + +// The following class definition is basically a copy of the +// Lengthen_HalfDouble definition above, however with an additional VREV16d8 +// instruction to convert data loaded by VLD1LN into proper vector format +// in big endian mode. 
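+// For example (anyext case):
+// Lengthen_HalfDouble_Big_Endian<"2", "i32", "i8", "8", "i16", "4", "i32"> =
+// Pat<(v2i32 (extloadvi8 addrmode6:$addr)),
+//     (EXTRACT_SUBREG (VMOVLuv4i32
+//        (EXTRACT_SUBREG (VMOVLuv8i16 (VREV16d8
+//           (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+//         dsub_0)),
+//      dsub_0)>;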
+multiclass Lengthen_HalfDouble_Big_Endian<string DestLanes, string DestTy, string SrcTy, + string Insn1Lanes, string Insn1Ty, string Insn2Lanes, + string Insn2Ty> { + def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) + (!cast<Instruction>("VREV16d8") + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)), + dsub_0)>; + def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) + (!cast<Instruction>("VREV16d8") + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)), + dsub_0)>; + def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty) + (!cast<Instruction>("VREV16d8") + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)), + dsub_0)>; +} + +defm : Lengthen_Single<"8", "i16", "8">; // v8i8 -> v8i16 +defm : Lengthen_Single<"4", "i32", "16">; // v4i16 -> v4i32 +defm : Lengthen_Single<"2", "i64", "32">; // v2i32 -> v2i64 + +let Predicates = [IsLE] in { + defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16 + defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32 + + // Double lengthening - v4i8 -> v4i16 -> v4i32 + defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">; + // v2i8 -> v2i16 -> v2i32 + defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">; + // v2i16 -> v2i32 -> v2i64 + defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">; +} + +let Predicates = [IsBE] in { + defm : Lengthen_HalfSingle_Big_Endian<"4", "i16", "i8", "8", "i16", "8">; // v4i8 -> v4i16 + defm : Lengthen_HalfSingle_Big_Endian<"2", "i32", "i16", "4", "i32", "16">; // v2i16 -> v2i32 + + // Double lengthening - v4i8 -> v4i16 -> v4i32 + defm : Lengthen_Double_Big_Endian<"4", "i32", "i8", "8", "i16", "4", "i32", "8">; + // v2i8 -> v2i16 -> v2i32 + defm : Lengthen_HalfDouble_Big_Endian<"2", "i32", "i8", "8", "i16", "4", "i32">; + // v2i16 -> v2i32 -> v2i64 + defm : Lengthen_Double_Big_Endian<"2", "i64", "i16", "4", "i32", "2", "i64", "16">; +} + +// Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64 +let Predicates = [IsLE] in { + def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)), + (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; + def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)), + (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; + def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)), + (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16 + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; +} +// The following patterns are basically a copy of the patterns above, +// however with an additional VREV16d instruction to convert data +// loaded by VLD1LN into proper vector format in big endian mode. 
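+// For a zero-extending v2i8 -> v2i64 load, the big-endian expansion is
+// roughly (illustrative registers):
+//   vld1.16   {d0[0]}, [r0]
+//   vrev16.8  d0, d0
+//   vmovl.u8  q0, d0
+//   vmovl.u16 q0, d0
+//   vmovl.u32 q0, d0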
+let Predicates = [IsBE] in { + def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)), + (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 + (!cast<Instruction>("VREV16d8") + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>; + def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)), + (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 + (!cast<Instruction>("VREV16d8") + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>; + def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)), + (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16 + (!cast<Instruction>("VREV16d8") + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>; +} + +//===----------------------------------------------------------------------===// +// Assembler aliases +// + +def : VFP2InstAlias<"fmdhr${p} $Dd, $Rn", + (VSETLNi32 DPR:$Dd, GPR:$Rn, 1, pred:$p)>; +def : VFP2InstAlias<"fmdlr${p} $Dd, $Rn", + (VSETLNi32 DPR:$Dd, GPR:$Rn, 0, pred:$p)>; + +// VAND/VBIC/VEOR/VORR accept but do not require a type suffix. +defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm", + (VANDd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm", + (VANDq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vbic${p}", "$Vd, $Vn, $Vm", + (VBICd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vbic${p}", "$Vd, $Vn, $Vm", + (VBICq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"veor${p}", "$Vd, $Vn, $Vm", + (VEORd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"veor${p}", "$Vd, $Vn, $Vm", + (VEORq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm", + (VORRd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm", + (VORRq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>; +// ... two-operand aliases +defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm", + (VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm", + (VANDq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"veor${p}", "$Vdn, $Vm", + (VEORd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"veor${p}", "$Vdn, $Vm", + (VEORq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm", + (VORRd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm", + (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +// ... immediates +def : NEONInstAlias<"vand${p}.i16 $Vd, $imm", + (VBICiv4i16 DPR:$Vd, nImmSplatNotI16:$imm, pred:$p)>; +def : NEONInstAlias<"vand${p}.i32 $Vd, $imm", + (VBICiv2i32 DPR:$Vd, nImmSplatNotI32:$imm, pred:$p)>; +def : NEONInstAlias<"vand${p}.i16 $Vd, $imm", + (VBICiv8i16 QPR:$Vd, nImmSplatNotI16:$imm, pred:$p)>; +def : NEONInstAlias<"vand${p}.i32 $Vd, $imm", + (VBICiv4i32 QPR:$Vd, nImmSplatNotI32:$imm, pred:$p)>; + + +// VLD1 single-lane pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. 
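+// For example, "vld1.8 {d1[3]}, [r0]" must encode the lane number (3) into
+// the instruction; the asm matcher resolves it through these pseudos rather
+// than through a plain InstAlias (operands illustrative).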
+def VLD1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr", + (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VLD1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr", + (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr, + pred:$p)>; +def VLD1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr", + (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; + +def VLD1LNdWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr!", + (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VLD1LNdWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr!", + (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr, + pred:$p)>; +def VLD1LNdWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr!", + (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VLD1LNdWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr, $Rm", + (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr, + rGPR:$Rm, pred:$p)>; +def VLD1LNdWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr, $Rm", + (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr, + rGPR:$Rm, pred:$p)>; +def VLD1LNdWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr, $Rm", + (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, + rGPR:$Rm, pred:$p)>; + + +// VST1 single-lane pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. +def VST1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr", + (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VST1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr", + (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr, + pred:$p)>; +def VST1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr", + (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; + +def VST1LNdWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr!", + (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VST1LNdWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr!", + (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr, + pred:$p)>; +def VST1LNdWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr!", + (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VST1LNdWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr, $Rm", + (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr, + rGPR:$Rm, pred:$p)>; +def VST1LNdWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr, $Rm", + (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr, + rGPR:$Rm, pred:$p)>; +def VST1LNdWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr, $Rm", + (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, + rGPR:$Rm, pred:$p)>; + +// VLD2 single-lane pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. 
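+// e.g. "vld2.16 {d0[2], d1[2]}, [r0]", where the same lane of both registers
+// is loaded (operands illustrative).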
+def VLD2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr", + (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr, + pred:$p)>; +def VLD2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr", + (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VLD2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr", + (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, pred:$p)>; +def VLD2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr", + (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VLD2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr", + (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; + +def VLD2LNdWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr!", + (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr, + pred:$p)>; +def VLD2LNdWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!", + (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VLD2LNdWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!", + (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VLD2LNqWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!", + (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VLD2LNqWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!", + (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VLD2LNdWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr, $Rm", + (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr, + rGPR:$Rm, pred:$p)>; +def VLD2LNdWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm", + (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr, + rGPR:$Rm, pred:$p)>; +def VLD2LNdWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm", + (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VLD2LNqWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm", + (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr, + rGPR:$Rm, pred:$p)>; +def VLD2LNqWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm", + (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; + + +// VST2 single-lane pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. 
+def VST2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr", + (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr, + pred:$p)>; +def VST2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr", + (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VST2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr", + (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VST2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr", + (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VST2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr", + (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; + +def VST2LNdWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr!", + (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr, + pred:$p)>; +def VST2LNdWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!", + (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VST2LNdWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!", + (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VST2LNqWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!", + (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VST2LNqWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!", + (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VST2LNdWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr, $Rm", + (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr, + rGPR:$Rm, pred:$p)>; +def VST2LNdWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm", + (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr, + rGPR:$Rm, pred:$p)>; +def VST2LNdWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm", + (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VST2LNqWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm", + (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr, + rGPR:$Rm, pred:$p)>; +def VST2LNqWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm", + (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; + +// VLD3 all-lanes pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. 
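+// e.g. "vld3.8 {d0[], d1[], d2[]}, [r0]", which replicates each loaded
+// element across all lanes of its register (operands illustrative).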
+def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; +def VLD3DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; +def VLD3DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; +def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; +def VLD3DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; +def VLD3DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; + +def VLD3DUPdWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!", + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; +def VLD3DUPdWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!", + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; +def VLD3DUPdWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!", + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; +def VLD3DUPqWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!", + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; +def VLD3DUPqWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!", + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; +def VLD3DUPqWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!", + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; +def VLD3DUPdWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm", + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + rGPR:$Rm, pred:$p)>; +def VLD3DUPdWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm", + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + rGPR:$Rm, pred:$p)>; +def VLD3DUPdWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm", + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + rGPR:$Rm, pred:$p)>; +def VLD3DUPqWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm", + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + rGPR:$Rm, pred:$p)>; +def VLD3DUPqWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm", + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + rGPR:$Rm, pred:$p)>; +def VLD3DUPqWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm", + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + rGPR:$Rm, pred:$p)>; + + +// VLD3 single-lane pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. 
+def VLD3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", + (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VLD3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", + (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VLD3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", + (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VLD3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", + (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VLD3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", + (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; + +def VLD3LNdWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!", + (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VLD3LNdWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!", + (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VLD3LNdWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!", + (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VLD3LNqWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!", + (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VLD3LNqWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!", + (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VLD3LNdWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm", + (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr, + rGPR:$Rm, pred:$p)>; +def VLD3LNdWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm", + (ins VecListThreeDHWordIndexed:$list, + addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; +def VLD3LNdWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm", + (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr, + rGPR:$Rm, pred:$p)>; +def VLD3LNqWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm", + (ins VecListThreeQHWordIndexed:$list, + addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; +def VLD3LNqWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm", + (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr, + rGPR:$Rm, pred:$p)>; + +// VLD3 multiple structure pseudo-instructions. These need special handling for +// the vector operands that the normal instructions don't yet model. +// FIXME: Remove these when the register classes and instructions are updated. 
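+// For example, the plain three-register form
+//   vld3.8 {d0, d1, d2}, [r0]
+// uses a list operand (VecListThreeD) that the real instruction definitions
+// do not model yet, so it is routed through these pseudos as well.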
+def VLD3dAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; +def VLD3dAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; +def VLD3dAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; +def VLD3qAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; +def VLD3qAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; +def VLD3qAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; + +def VLD3dWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!", + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; +def VLD3dWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!", + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; +def VLD3dWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!", + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; +def VLD3qWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!", + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; +def VLD3qWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!", + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; +def VLD3qWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!", + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; +def VLD3dWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm", + (ins VecListThreeD:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VLD3dWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm", + (ins VecListThreeD:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VLD3dWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm", + (ins VecListThreeD:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VLD3qWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm", + (ins VecListThreeQ:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VLD3qWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm", + (ins VecListThreeQ:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VLD3qWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm", + (ins VecListThreeQ:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; + +// VST3 single-lane pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. 
+def VST3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr", + (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VST3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr", + (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VST3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr", + (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VST3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr", + (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VST3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr", + (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; + +def VST3LNdWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!", + (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VST3LNdWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!", + (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VST3LNdWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!", + (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VST3LNqWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!", + (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VST3LNqWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!", + (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; +def VST3LNdWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm", + (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr, + rGPR:$Rm, pred:$p)>; +def VST3LNdWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm", + (ins VecListThreeDHWordIndexed:$list, + addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; +def VST3LNdWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm", + (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr, + rGPR:$Rm, pred:$p)>; +def VST3LNqWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm", + (ins VecListThreeQHWordIndexed:$list, + addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; +def VST3LNqWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm", + (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr, + rGPR:$Rm, pred:$p)>; + + +// VST3 multiple structure pseudo-instructions. These need special handling for +// the vector operands that the normal instructions don't yet model. +// FIXME: Remove these when the register classes and instructions are updated. 
+def VST3dAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr", + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; +def VST3dAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr", + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; +def VST3dAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr", + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; +def VST3qAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr", + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; +def VST3qAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr", + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; +def VST3qAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr", + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; + +def VST3dWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!", + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; +def VST3dWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!", + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; +def VST3dWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!", + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; +def VST3qWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!", + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; +def VST3qWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!", + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; +def VST3qWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!", + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; +def VST3dWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm", + (ins VecListThreeD:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VST3dWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm", + (ins VecListThreeD:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VST3dWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm", + (ins VecListThreeD:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VST3qWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm", + (ins VecListThreeQ:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VST3qWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm", + (ins VecListThreeQ:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VST3qWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm", + (ins VecListThreeQ:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; + +// VLD4 all-lanes pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. 
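+// For example, unlike the VLD3 all-lanes forms above (which take no
+// alignment), these accept an address alignment qualifier:
+//   vld4.32 {d0[], d1[], d2[], d3[]}, [r0:64]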
+def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", + (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr, + pred:$p)>; +def VLD4DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", + (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr, + pred:$p)>; +def VLD4DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", + (ins VecListFourDAllLanes:$list, addrmode6dupalign64or128:$addr, + pred:$p)>; +def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", + (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr, + pred:$p)>; +def VLD4DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", + (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr, + pred:$p)>; +def VLD4DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", + (ins VecListFourQAllLanes:$list, addrmode6dupalign64or128:$addr, + pred:$p)>; + +def VLD4DUPdWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!", + (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr, + pred:$p)>; +def VLD4DUPdWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!", + (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr, + pred:$p)>; +def VLD4DUPdWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!", + (ins VecListFourDAllLanes:$list, addrmode6dupalign64or128:$addr, + pred:$p)>; +def VLD4DUPqWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!", + (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr, + pred:$p)>; +def VLD4DUPqWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!", + (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr, + pred:$p)>; +def VLD4DUPqWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!", + (ins VecListFourQAllLanes:$list, addrmode6dupalign64or128:$addr, + pred:$p)>; +def VLD4DUPdWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm", + (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr, + rGPR:$Rm, pred:$p)>; +def VLD4DUPdWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm", + (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr, + rGPR:$Rm, pred:$p)>; +def VLD4DUPdWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm", + (ins VecListFourDAllLanes:$list, + addrmode6dupalign64or128:$addr, rGPR:$Rm, pred:$p)>; +def VLD4DUPqWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm", + (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr, + rGPR:$Rm, pred:$p)>; +def VLD4DUPqWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm", + (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr, + rGPR:$Rm, pred:$p)>; +def VLD4DUPqWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm", + (ins VecListFourQAllLanes:$list, + addrmode6dupalign64or128:$addr, rGPR:$Rm, pred:$p)>; + + +// VLD4 single-lane pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. 
+def VLD4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", + (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VLD4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", + (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VLD4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", + (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; +def VLD4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", + (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VLD4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", + (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; + +def VLD4LNdWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!", + (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VLD4LNdWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!", + (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VLD4LNdWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!", + (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; +def VLD4LNqWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!", + (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VLD4LNqWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!", + (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; +def VLD4LNdWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm", + (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr, + rGPR:$Rm, pred:$p)>; +def VLD4LNdWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm", + (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VLD4LNdWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm", + (ins VecListFourDWordIndexed:$list, + addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>; +def VLD4LNqWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm", + (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VLD4LNqWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm", + (ins VecListFourQWordIndexed:$list, + addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>; + + + +// VLD4 multiple structure pseudo-instructions. These need special handling for +// the vector operands that the normal instructions don't yet model. +// FIXME: Remove these when the register classes and instructions are updated. 
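+// For example, the four-register multiples accept the widest alignment
+// qualifiers, up to 256 bits:
+//   vld4.8 {d0, d1, d2, d3}, [r0:256]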
+def VLD4dAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; +def VLD4dAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; +def VLD4dAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; +def VLD4qAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; +def VLD4qAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; +def VLD4qAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; + +def VLD4dWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!", + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; +def VLD4dWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!", + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; +def VLD4dWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!", + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; +def VLD4qWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!", + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; +def VLD4qWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!", + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; +def VLD4qWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!", + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; +def VLD4dWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm", + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + rGPR:$Rm, pred:$p)>; +def VLD4dWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm", + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + rGPR:$Rm, pred:$p)>; +def VLD4dWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm", + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + rGPR:$Rm, pred:$p)>; +def VLD4qWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm", + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + rGPR:$Rm, pred:$p)>; +def VLD4qWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm", + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + rGPR:$Rm, pred:$p)>; +def VLD4qWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm", + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + rGPR:$Rm, pred:$p)>; + +// VST4 single-lane pseudo-instructions. These need special handling for +// the lane index that an InstAlias can't handle, so we use these instead. 
+def VST4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr", + (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VST4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr", + (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VST4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr", + (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; +def VST4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr", + (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VST4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr", + (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; + +def VST4LNdWB_fixed_Asm_8 : + NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!", + (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr, + pred:$p)>; +def VST4LNdWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!", + (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VST4LNdWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!", + (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; +def VST4LNqWB_fixed_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!", + (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; +def VST4LNqWB_fixed_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!", + (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; +def VST4LNdWB_register_Asm_8 : + NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm", + (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr, + rGPR:$Rm, pred:$p)>; +def VST4LNdWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm", + (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VST4LNdWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm", + (ins VecListFourDWordIndexed:$list, + addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>; +def VST4LNqWB_register_Asm_16 : + NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm", + (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr, + rGPR:$Rm, pred:$p)>; +def VST4LNqWB_register_Asm_32 : + NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm", + (ins VecListFourQWordIndexed:$list, + addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>; + + +// VST4 multiple structure pseudo-instructions. These need special handling for +// the vector operands that the normal instructions don't yet model. +// FIXME: Remove these when the register classes and instructions are updated. 
+def VST4dAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+
+def VST4dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    rGPR:$Rm, pred:$p)>;
+def VST4dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    rGPR:$Rm, pred:$p)>;
+def VST4dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    rGPR:$Rm, pred:$p)>;
+
+// VMOV/VMVN take an optional datatype suffix.
+defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
+                    (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
+                    (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+                    (VMVNd DPR:$Vd, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+                    (VMVNq QPR:$Vd, QPR:$Vm, pred:$p)>;
+
+// VCLE (register) is an assembler alias for VCGE w/ the operands reversed.
+// D-register versions.
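+// (For example, "vcle.s8 d0, d1, d2" assembles to the same encoding as
+// "vcge.s8 d0, d2, d1".)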
+def : NEONInstAlias<"vcle${p}.s8 $Dd, $Dn, $Dm", + (VCGEsv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.s16 $Dd, $Dn, $Dm", + (VCGEsv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.s32 $Dd, $Dn, $Dm", + (VCGEsv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.u8 $Dd, $Dn, $Dm", + (VCGEuv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.u16 $Dd, $Dn, $Dm", + (VCGEuv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.u32 $Dd, $Dn, $Dm", + (VCGEuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.f32 $Dd, $Dn, $Dm", + (VCGEfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vcle${p}.f16 $Dd, $Dn, $Dm", + (VCGEhd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +// Q-register versions. +def : NEONInstAlias<"vcle${p}.s8 $Qd, $Qn, $Qm", + (VCGEsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.s16 $Qd, $Qn, $Qm", + (VCGEsv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.s32 $Qd, $Qn, $Qm", + (VCGEsv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.u8 $Qd, $Qn, $Qm", + (VCGEuv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.u16 $Qd, $Qn, $Qm", + (VCGEuv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.u32 $Qd, $Qn, $Qm", + (VCGEuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vcle${p}.f32 $Qd, $Qn, $Qm", + (VCGEfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vcle${p}.f16 $Qd, $Qn, $Qm", + (VCGEhq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; + +// VCLT (register) is an assembler alias for VCGT w/ the operands reversed. +// D-register versions. +def : NEONInstAlias<"vclt${p}.s8 $Dd, $Dn, $Dm", + (VCGTsv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.s16 $Dd, $Dn, $Dm", + (VCGTsv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.s32 $Dd, $Dn, $Dm", + (VCGTsv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.u8 $Dd, $Dn, $Dm", + (VCGTuv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.u16 $Dd, $Dn, $Dm", + (VCGTuv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.u32 $Dd, $Dn, $Dm", + (VCGTuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.f32 $Dd, $Dn, $Dm", + (VCGTfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vclt${p}.f16 $Dd, $Dn, $Dm", + (VCGThd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +// Q-register versions. 
+def : NEONInstAlias<"vclt${p}.s8 $Qd, $Qn, $Qm", + (VCGTsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.s16 $Qd, $Qn, $Qm", + (VCGTsv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.s32 $Qd, $Qn, $Qm", + (VCGTsv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.u8 $Qd, $Qn, $Qm", + (VCGTuv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.u16 $Qd, $Qn, $Qm", + (VCGTuv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm", + (VCGTuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm", + (VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vclt${p}.f16 $Qd, $Qn, $Qm", + (VCGThq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; + +// VSWP allows, but does not require, a type suffix. +defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm", + (VSWPd DPR:$Vd, DPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm", + (VSWPq QPR:$Vd, QPR:$Vm, pred:$p)>; + +// VBIF, VBIT, and VBSL allow, but do not require, a type suffix. +defm : NEONDTAnyInstAlias<"vbif${p}", "$Vd, $Vn, $Vm", + (VBIFd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vbit${p}", "$Vd, $Vn, $Vm", + (VBITd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vbsl${p}", "$Vd, $Vn, $Vm", + (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vbif${p}", "$Vd, $Vn, $Vm", + (VBIFq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vbit${p}", "$Vd, $Vn, $Vm", + (VBITq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>; +defm : NEONDTAnyInstAlias<"vbsl${p}", "$Vd, $Vn, $Vm", + (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>; + +// "vmov Rd, #-imm" can be handled via "vmvn". +def : NEONInstAlias<"vmov${p}.i32 $Vd, $imm", + (VMVNv2i32 DPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>; +def : NEONInstAlias<"vmov${p}.i32 $Vd, $imm", + (VMVNv4i32 QPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>; +def : NEONInstAlias<"vmvn${p}.i32 $Vd, $imm", + (VMOVv2i32 DPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>; +def : NEONInstAlias<"vmvn${p}.i32 $Vd, $imm", + (VMOVv4i32 QPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>; + +// 'gas' compatibility aliases for quad-word instructions. Strictly speaking, +// these should restrict to just the Q register variants, but the register +// classes are enough to match correctly regardless, so we keep it simple +// and just use MnemonicAlias. +def : NEONMnemonicAlias<"vbicq", "vbic">; +def : NEONMnemonicAlias<"vandq", "vand">; +def : NEONMnemonicAlias<"veorq", "veor">; +def : NEONMnemonicAlias<"vorrq", "vorr">; + +def : NEONMnemonicAlias<"vmovq", "vmov">; +def : NEONMnemonicAlias<"vmvnq", "vmvn">; +// Explicit versions for floating point so that the FPImm variants get +// handled early. The parser gets confused otherwise. 
+def : NEONMnemonicAlias<"vmovq.f32", "vmov.f32">; +def : NEONMnemonicAlias<"vmovq.f64", "vmov.f64">; + +def : NEONMnemonicAlias<"vaddq", "vadd">; +def : NEONMnemonicAlias<"vsubq", "vsub">; + +def : NEONMnemonicAlias<"vminq", "vmin">; +def : NEONMnemonicAlias<"vmaxq", "vmax">; + +def : NEONMnemonicAlias<"vmulq", "vmul">; + +def : NEONMnemonicAlias<"vabsq", "vabs">; + +def : NEONMnemonicAlias<"vshlq", "vshl">; +def : NEONMnemonicAlias<"vshrq", "vshr">; + +def : NEONMnemonicAlias<"vcvtq", "vcvt">; + +def : NEONMnemonicAlias<"vcleq", "vcle">; +def : NEONMnemonicAlias<"vceqq", "vceq">; + +def : NEONMnemonicAlias<"vzipq", "vzip">; +def : NEONMnemonicAlias<"vswpq", "vswp">; + +def : NEONMnemonicAlias<"vrecpeq.f32", "vrecpe.f32">; +def : NEONMnemonicAlias<"vrecpeq.u32", "vrecpe.u32">; + + +// Alias for loading floating point immediates that aren't representable +// using the vmov.f32 encoding but the bitpattern is representable using +// the .i32 encoding. +def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm", + (VMOVv4i32 QPR:$Vd, nImmVMOVI32:$imm, pred:$p)>; +def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm", + (VMOVv2i32 DPR:$Vd, nImmVMOVI32:$imm, pred:$p)>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td new file mode 100644 index 0000000..df6f243 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -0,0 +1,1518 @@ +//===-- ARMInstrThumb.td - Thumb support for ARM -----------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Thumb instruction set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Thumb specific DAG Nodes. +// + +def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + +def imm_sr_XFORM: SDNodeXForm<imm, [{ + unsigned Imm = N->getZExtValue(); + return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32); +}]>; +def ThumbSRImmAsmOperand: AsmOperandClass { let Name = "ImmThumbSR"; } +def imm_sr : Operand<i32>, PatLeaf<(imm), [{ + uint64_t Imm = N->getZExtValue(); + return Imm > 0 && Imm <= 32; +}], imm_sr_XFORM> { + let PrintMethod = "printThumbSRImm"; + let ParserMatchClass = ThumbSRImmAsmOperand; +} + +def imm_comp_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), SDLoc(N), + MVT::i32); +}]>; + +def imm0_7_neg : PatLeaf<(i32 imm), [{ + return (uint32_t)-N->getZExtValue() < 8; +}], imm_neg_XFORM>; + +def imm0_255_comp : PatLeaf<(i32 imm), [{ + return ~((uint32_t)N->getZExtValue()) < 256; +}]>; + +def imm8_255 : ImmLeaf<i32, [{ + return Imm >= 8 && Imm < 256; +}]>; +def imm8_255_neg : PatLeaf<(i32 imm), [{ + unsigned Val = -N->getZExtValue(); + return Val >= 8 && Val < 256; +}], imm_neg_XFORM>; + +// Break imm's up into two pieces: an immediate + a left shift. This uses +// thumb_immshifted to match and thumb_immshifted_val and thumb_immshifted_shamt +// to get the val/shift pieces. 
+def thumb_immshifted : PatLeaf<(imm), [{
+    return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue());
+}]>;
+
+def thumb_immshifted_val : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
+}]>;
+
+def thumb_immshifted_shamt : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
+}]>;
+
+// Immediate scaled by 4.
+def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
+def t_imm0_1020s4 : Operand<i32> {
+  let PrintMethod = "printThumbS4ImmOperand";
+  let ParserMatchClass = t_imm0_1020s4_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def t_imm0_508s4_asmoperand: AsmOperandClass { let Name = "Imm0_508s4"; }
+def t_imm0_508s4 : Operand<i32> {
+  let PrintMethod = "printThumbS4ImmOperand";
+  let ParserMatchClass = t_imm0_508s4_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+// Alias use only, so no printer is necessary.
+def t_imm0_508s4_neg_asmoperand: AsmOperandClass { let Name = "Imm0_508s4Neg"; }
+def t_imm0_508s4_neg : Operand<i32> {
+  let ParserMatchClass = t_imm0_508s4_neg_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// Define Thumb-specific addressing modes.
+
+// Unsigned 8-bit memory offset with a two-bit shift (i.e. a multiple of 4).
+class OperandUnsignedOffset_b8s2 : AsmOperandClass {
+  let Name = "UnsignedOffset_b8s2";
+  let PredicateMethod = "isUnsignedOffset<8, 2>";
+}
+
+def UnsignedOffset_b8s2 : OperandUnsignedOffset_b8s2;
+
+// Thumb-style PC-relative operand: signed, 8 bits of magnitude, with a
+// two-bit shift. It can be represented as [pc, #imm], #imm, or a
+// relocatable expression.
+def ThumbMemPC : AsmOperandClass {
+  let Name = "ThumbMemPC";
+}
+
+let OperandType = "OPERAND_PCREL" in {
+def t_brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getThumbBRTargetOpValue";
+  let DecoderMethod = "DecodeThumbBROperand";
+}
+
+// ADR instruction labels.
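+// For example, Thumb "adr r0, label" materializes Align(PC, 4) plus an
+// unsigned 8-bit offset scaled by 4; t_adrlabel below reuses the
+// UnsignedOffset_b8s2 class for exactly that range.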
+def t_adrlabel : Operand<i32> { + let EncoderMethod = "getThumbAdrLabelOpValue"; + let PrintMethod = "printAdrLabelOperand<2>"; + let ParserMatchClass = UnsignedOffset_b8s2; +} + +def t_bcctarget : Operand<i32> { + let EncoderMethod = "getThumbBCCTargetOpValue"; + let DecoderMethod = "DecodeThumbBCCTargetOperand"; +} + +def t_cbtarget : Operand<i32> { + let EncoderMethod = "getThumbCBTargetOpValue"; + let DecoderMethod = "DecodeThumbCmpBROperand"; +} + +def t_bltarget : Operand<i32> { + let EncoderMethod = "getThumbBLTargetOpValue"; + let DecoderMethod = "DecodeThumbBLTargetOperand"; +} + +def t_blxtarget : Operand<i32> { + let EncoderMethod = "getThumbBLXTargetOpValue"; + let DecoderMethod = "DecodeThumbBLXOffset"; +} + +// t_addrmode_pc := <label> => pc + imm8 * 4 +// +def t_addrmode_pc : MemOperand { + let EncoderMethod = "getAddrModePCOpValue"; + let DecoderMethod = "DecodeThumbAddrModePC"; + let PrintMethod = "printThumbLdrLabelOperand"; + let ParserMatchClass = ThumbMemPC; +} +} + +// t_addrmode_rr := reg + reg +// +def t_addrmode_rr_asm_operand : AsmOperandClass { let Name = "MemThumbRR"; } +def t_addrmode_rr : MemOperand, + ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; + let PrintMethod = "printThumbAddrModeRROperand"; + let DecoderMethod = "DecodeThumbAddrModeRR"; + let ParserMatchClass = t_addrmode_rr_asm_operand; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); +} + +// t_addrmode_rrs := reg + reg +// +// We use separate scaled versions because the Select* functions need +// to explicitly check for a matching constant and return false here so that +// the reg+imm forms will match instead. This is a horrible way to do that, +// as it forces tight coupling between the methods, but it's how selectiondag +// currently works. 
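+// For example, for "ldr r0, [r1, #4]" the scaled reg+reg selectors must
+// reject the constant offset so that the reg+imm form (t_addrmode_is4) wins;
+// only something like "ldr r0, [r1, r2]" should select the reg+reg form.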
+def t_addrmode_rrs1 : MemOperand, + ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S1", []> { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; + let PrintMethod = "printThumbAddrModeRROperand"; + let DecoderMethod = "DecodeThumbAddrModeRR"; + let ParserMatchClass = t_addrmode_rr_asm_operand; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); +} +def t_addrmode_rrs2 : MemOperand, + ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S2", []> { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; + let DecoderMethod = "DecodeThumbAddrModeRR"; + let PrintMethod = "printThumbAddrModeRROperand"; + let ParserMatchClass = t_addrmode_rr_asm_operand; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); +} +def t_addrmode_rrs4 : MemOperand, + ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S4", []> { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; + let DecoderMethod = "DecodeThumbAddrModeRR"; + let PrintMethod = "printThumbAddrModeRROperand"; + let ParserMatchClass = t_addrmode_rr_asm_operand; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); +} + +// t_addrmode_is4 := reg + imm5 * 4 +// +def t_addrmode_is4_asm_operand : AsmOperandClass { let Name = "MemThumbRIs4"; } +def t_addrmode_is4 : MemOperand, + ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S4", []> { + let EncoderMethod = "getAddrModeISOpValue"; + let DecoderMethod = "DecodeThumbAddrModeIS"; + let PrintMethod = "printThumbAddrModeImm5S4Operand"; + let ParserMatchClass = t_addrmode_is4_asm_operand; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm); +} + +// t_addrmode_is2 := reg + imm5 * 2 +// +def t_addrmode_is2_asm_operand : AsmOperandClass { let Name = "MemThumbRIs2"; } +def t_addrmode_is2 : MemOperand, + ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S2", []> { + let EncoderMethod = "getAddrModeISOpValue"; + let DecoderMethod = "DecodeThumbAddrModeIS"; + let PrintMethod = "printThumbAddrModeImm5S2Operand"; + let ParserMatchClass = t_addrmode_is2_asm_operand; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm); +} + +// t_addrmode_is1 := reg + imm5 +// +def t_addrmode_is1_asm_operand : AsmOperandClass { let Name = "MemThumbRIs1"; } +def t_addrmode_is1 : MemOperand, + ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S1", []> { + let EncoderMethod = "getAddrModeISOpValue"; + let DecoderMethod = "DecodeThumbAddrModeIS"; + let PrintMethod = "printThumbAddrModeImm5S1Operand"; + let ParserMatchClass = t_addrmode_is1_asm_operand; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm); +} + +// t_addrmode_sp := sp + imm8 * 4 +// +// FIXME: This really shouldn't have an explicit SP operand at all. It should +// be implicit, just like in the instruction encoding itself. +def t_addrmode_sp_asm_operand : AsmOperandClass { let Name = "MemThumbSPI"; } +def t_addrmode_sp : MemOperand, + ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> { + let EncoderMethod = "getAddrModeThumbSPOpValue"; + let DecoderMethod = "DecodeThumbAddrModeSP"; + let PrintMethod = "printThumbAddrModeSPOperand"; + let ParserMatchClass = t_addrmode_sp_asm_operand; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE +// from removing one half of the matched pairs. That breaks PEI, which assumes +// these will always be in pairs, and asserts if it finds otherwise. Better way? 
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { +def tADJCALLSTACKUP : + PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary, + [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, + Requires<[IsThumb, IsThumb1Only]>; + +def tADJCALLSTACKDOWN : + PseudoInst<(outs), (ins i32imm:$amt), NoItinerary, + [(ARMcallseq_start imm:$amt)]>, + Requires<[IsThumb, IsThumb1Only]>; +} + +class T1SystemEncoding<bits<8> opc> + : T1Encoding<0b101111> { + let Inst{9-8} = 0b11; + let Inst{7-0} = opc; +} + +def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm", + [(int_arm_hint imm0_15:$imm)]>, + T1SystemEncoding<0x00>, + Requires<[IsThumb, HasV6M]> { + bits<4> imm; + let Inst{7-4} = imm; +} + +class tHintAlias<string Asm, dag Result> : tInstAlias<Asm, Result> { + let Predicates = [IsThumb, HasV6M]; +} + +def : tHintAlias<"nop$p", (tHINT 0, pred:$p)>; // A8.6.110 +def : tHintAlias<"yield$p", (tHINT 1, pred:$p)>; // A8.6.410 +def : tHintAlias<"wfe$p", (tHINT 2, pred:$p)>; // A8.6.408 +def : tHintAlias<"wfi$p", (tHINT 3, pred:$p)>; // A8.6.409 +def : tHintAlias<"sev$p", (tHINT 4, pred:$p)>; // A8.6.157 +def : tInstAlias<"sevl$p", (tHINT 5, pred:$p)> { + let Predicates = [IsThumb2, HasV8]; +} + +// The imm operand $val can be used by a debugger to store more information +// about the breakpoint. +def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val", + []>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b10; + // A8.6.22 + bits<8> val; + let Inst{7-0} = val; +} +// default immediate for breakpoint mnemonic +def : InstAlias<"bkpt", (tBKPT 0)>, Requires<[IsThumb]>; + +def tHLT : T1I<(outs), (ins imm0_63:$val), NoItinerary, "hlt\t$val", + []>, T1Encoding<0b101110>, Requires<[IsThumb, HasV8]> { + let Inst{9-6} = 0b1010; + bits<6> val; + let Inst{5-0} = val; +} + +def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end", + []>, T1Encoding<0b101101>, Requires<[IsNotMClass]>, Deprecated<HasV8Ops> { + bits<1> end; + // A8.6.156 + let Inst{9-5} = 0b10010; + let Inst{4} = 1; + let Inst{3} = end; + let Inst{2-0} = 0b000; +} + +// Change Processor State is a system instruction -- for disassembly only. +def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags), + NoItinerary, "cps$imod $iflags", []>, + T1Misc<0b0110011> { + // A8.6.38 & B6.1.1 + bit imod; + bits<3> iflags; + + let Inst{4} = imod; + let Inst{3} = 0; + let Inst{2-0} = iflags; + let DecoderMethod = "DecodeThumbCPS"; +} + +// For both thumb1 and thumb2. +let isNotDuplicable = 1, isCodeGenOnly = 1 in +def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "", + [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>, + T1Special<{0,0,?,?}>, Sched<[WriteALU]> { + // A8.6.6 + bits<3> dst; + let Inst{6-3} = 0b1111; // Rm = pc + let Inst{2-0} = dst; +} + +// ADD <Rd>, sp, #<imm8> +// FIXME: This should not be marked as having side effects, and it should be +// rematerializable. Clearing the side effect bit causes miscompilations, +// probably because the instruction can be moved around. +def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm), + IIC_iALUi, "add", "\t$dst, $sp, $imm", []>, + T1Encoding<{1,0,1,0,1,?}>, Sched<[WriteALU]> { + // A6.2 & A8.6.8 + bits<3> dst; + bits<8> imm; + let Inst{10-8} = dst; + let Inst{7-0} = imm; + let DecoderMethod = "DecodeThumbAddSpecialReg"; +} + +// Thumb1 frame lowering is rather fragile, we hope to be able to use +// tADDrSPi, but we may need to insert a sequence that clobbers CPSR. 
+def tADDframe : PseudoInst<(outs tGPR:$dst), (ins i32imm:$base, i32imm:$offset), + NoItinerary, []>, + Requires<[IsThumb, IsThumb1Only]> { + let Defs = [CPSR]; +} + +// ADD sp, sp, #<imm7> +def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm), + IIC_iALUi, "add", "\t$Rdn, $imm", []>, + T1Misc<{0,0,0,0,0,?,?}>, Sched<[WriteALU]> { + // A6.2.5 & A8.6.8 + bits<7> imm; + let Inst{6-0} = imm; + let DecoderMethod = "DecodeThumbAddSPImm"; +} + +// SUB sp, sp, #<imm7> +// FIXME: The encoding and the ASM string don't match up. +def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm), + IIC_iALUi, "sub", "\t$Rdn, $imm", []>, + T1Misc<{0,0,0,0,1,?,?}>, Sched<[WriteALU]> { + // A6.2.5 & A8.6.214 + bits<7> imm; + let Inst{6-0} = imm; + let DecoderMethod = "DecodeThumbAddSPImm"; +} + +def : tInstAlias<"add${p} sp, $imm", + (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>; +def : tInstAlias<"add${p} sp, sp, $imm", + (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>; + +// Can optionally specify SP as a three operand instruction. +def : tInstAlias<"add${p} sp, sp, $imm", + (tADDspi SP, t_imm0_508s4:$imm, pred:$p)>; +def : tInstAlias<"sub${p} sp, sp, $imm", + (tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>; + +// ADD <Rm>, sp +def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr, + "add", "\t$Rdn, $sp, $Rn", []>, + T1Special<{0,0,?,?}>, Sched<[WriteALU]> { + // A8.6.9 Encoding T1 + bits<4> Rdn; + let Inst{7} = Rdn{3}; + let Inst{6-3} = 0b1101; + let Inst{2-0} = Rdn{2-0}; + let DecoderMethod = "DecodeThumbAddSPReg"; +} + +// ADD sp, <Rm> +def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr, + "add", "\t$Rdn, $Rm", []>, + T1Special<{0,0,?,?}>, Sched<[WriteALU]> { + // A8.6.9 Encoding T2 + bits<4> Rm; + let Inst{7} = 1; + let Inst{6-3} = Rm; + let Inst{2-0} = 0b101; + let DecoderMethod = "DecodeThumbAddSPReg"; +} + +//===----------------------------------------------------------------------===// +// Control Flow Instructions. +// + +// Indirect branches +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>, + T1Special<{1,1,0,?}>, Sched<[WriteBr]> { + // A6.2.3 & A8.6.25 + bits<4> Rm; + let Inst{6-3} = Rm; + let Inst{2-0} = 0b000; + let Unpredictable{2-0} = 0b111; + } +} + +let isReturn = 1, isTerminator = 1, isBarrier = 1 in { + def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br, + [(ARMretflag)], (tBX LR, pred:$p)>, Sched<[WriteBr]>; + + // Alternative return instruction used by vararg functions. + def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p), + 2, IIC_Br, [], + (tBX GPR:$Rm, pred:$p)>, Sched<[WriteBr]>; +} + +// All calls clobber the non-callee saved registers. SP is marked as a use to +// prevent stack-pointer assignments that appear immediately before calls from +// potentially appearing dead. 
+let isCall = 1, + Defs = [LR], Uses = [SP] in { + // Also used for Thumb2 + def tBL : TIx2<0b11110, 0b11, 1, + (outs), (ins pred:$p, t_bltarget:$func), IIC_Br, + "bl${p}\t$func", + [(ARMtcall tglobaladdr:$func)]>, + Requires<[IsThumb]>, Sched<[WriteBrL]> { + bits<24> func; + let Inst{26} = func{23}; + let Inst{25-16} = func{20-11}; + let Inst{13} = func{22}; + let Inst{11} = func{21}; + let Inst{10-0} = func{10-0}; + } + + // ARMv5T and above, also used for Thumb2 + def tBLXi : TIx2<0b11110, 0b11, 0, + (outs), (ins pred:$p, t_blxtarget:$func), IIC_Br, + "blx${p}\t$func", + [(ARMcall tglobaladdr:$func)]>, + Requires<[IsThumb, HasV5T, IsNotMClass]>, Sched<[WriteBrL]> { + bits<24> func; + let Inst{26} = func{23}; + let Inst{25-16} = func{20-11}; + let Inst{13} = func{22}; + let Inst{11} = func{21}; + let Inst{10-1} = func{10-1}; + let Inst{0} = 0; // func{0} is assumed zero + } + + // Also used for Thumb2 + def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br, + "blx${p}\t$func", + [(ARMtcall GPR:$func)]>, + Requires<[IsThumb, HasV5T]>, + T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24; + bits<4> func; + let Inst{6-3} = func; + let Inst{2-0} = 0b000; + } + + // ARMv4T + def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func), + 4, IIC_Br, + [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>; +} + +let isBranch = 1, isTerminator = 1, isBarrier = 1 in { + let isPredicable = 1 in + def tB : T1pI<(outs), (ins t_brtarget:$target), IIC_Br, + "b", "\t$target", [(br bb:$target)]>, + T1Encoding<{1,1,1,0,0,?}>, Sched<[WriteBr]> { + bits<11> target; + let Inst{10-0} = target; + let AsmMatchConverter = "cvtThumbBranches"; + } + + // Far jump + // Just a pseudo for a tBL instruction. Needed to let regalloc know about + // the clobber of LR. + let Defs = [LR] in + def tBfar : tPseudoExpand<(outs), (ins t_bltarget:$target, pred:$p), + 4, IIC_Br, [], (tBL pred:$p, t_bltarget:$target)>, + Sched<[WriteBrTbl]>; + + def tBR_JTr : tPseudoInst<(outs), + (ins tGPR:$target, i32imm:$jt), + 0, IIC_Br, + [(ARMbrjt tGPR:$target, tjumptable:$jt)]>, + Sched<[WriteBrTbl]> { + let Size = 2; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; + } +} + +// FIXME: should be able to write a pattern for ARMBrcond, but can't use +// a two-value operand where a dag node expects two operands. :( +let isBranch = 1, isTerminator = 1 in + def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br, + "b${p}\t$target", + [/*(ARMbrcond bb:$target, imm:$cc)*/]>, + T1BranchCond<{1,1,0,1}>, Sched<[WriteBr]> { + bits<4> p; + bits<8> target; + let Inst{11-8} = p; + let Inst{7-0} = target; + let AsmMatchConverter = "cvtThumbBranches"; +} + + +// Tail calls +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { + // IOS versions. + let Uses = [SP] in { + def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst), + 4, IIC_Br, [], + (tBX GPR:$dst, (ops 14, zero_reg))>, + Requires<[IsThumb]>, Sched<[WriteBr]>; + } + // tTAILJMPd: MachO version uses a Thumb2 branch (no Thumb1 tail calls + // on MachO), so it's in ARMInstrThumb2.td. 
+ // Non-MachO version: + let Uses = [SP] in { + def tTAILJMPdND : tPseudoExpand<(outs), + (ins t_brtarget:$dst, pred:$p), + 4, IIC_Br, [], + (tB t_brtarget:$dst, pred:$p)>, + Requires<[IsThumb, IsNotMachO]>, Sched<[WriteBr]>; + } +} + + +// A8.6.218 Supervisor Call (Software Interrupt) +// A8.6.16 B: Encoding T1 +// If Inst{11-8} == 0b1111 then SEE SVC +let isCall = 1, Uses = [SP] in +def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br, + "svc", "\t$imm", []>, Encoding16, Sched<[WriteBr]> { + bits<8> imm; + let Inst{15-12} = 0b1101; + let Inst{11-8} = 0b1111; + let Inst{7-0} = imm; +} + +// The assembler uses 0xDEFE for a trap instruction. +let isBarrier = 1, isTerminator = 1 in +def tTRAP : TI<(outs), (ins), IIC_Br, + "trap", [(trap)]>, Encoding16, Sched<[WriteBr]> { + let Inst = 0xdefe; +} + +//===----------------------------------------------------------------------===// +// Load Store Instructions. +// + +// PC-relative loads need to be matched first as constant pool accesses need to +// always be PC-relative. We do this using AddedComplexity, as the pattern is +// simpler than the patterns of the other load instructions. +let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 10 in +def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>, + T1Encoding<{0,1,0,0,1,?}> { + // A6.2 & A8.6.59 + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + +// SP-relative loads should be matched before standard immediate-offset loads as +// it means we avoid having to move SP to another register. +let canFoldAsLoad = 1 in +def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>, + T1LdStSP<{1,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + +// Loads: reg/reg and reg/imm5 +let canFoldAsLoad = 1, isReMaterializable = 1 in +multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, + Operand AddrMode_r, Operand AddrMode_i, + AddrMode am, InstrItinClass itin_r, + InstrItinClass itin_i, string asm, + PatFrag opnode> { + // Immediate-offset loads should be matched before register-offset loads as + // when the offset is a constant it's simpler to first check if it fits in the + // immediate offset field then fall back to register-offset if it doesn't. + def i : // reg/imm5 + T1pILdStEncodeImm<imm_opc, 1 /* Load */, + (outs tGPR:$Rt), (ins AddrMode_i:$addr), + am, itin_i, asm, "\t$Rt, $addr", + [(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>; + // Register-offset loads are matched last. 
+  def r : // reg/reg
+    T1pILdStEncode<reg_opc,
+                   (outs tGPR:$Rt), (ins AddrMode_r:$addr),
+                   am, itin_r, asm, "\t$Rt, $addr",
+                   [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>;
+}
+// Stores: reg/reg and reg/imm5
+multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
+                              Operand AddrMode_r, Operand AddrMode_i,
+                              AddrMode am, InstrItinClass itin_r,
+                              InstrItinClass itin_i, string asm,
+                              PatFrag opnode> {
+  def i : // reg/imm5
+    T1pILdStEncodeImm<imm_opc, 0 /* Store */,
+                      (outs), (ins tGPR:$Rt, AddrMode_i:$addr),
+                      am, itin_i, asm, "\t$Rt, $addr",
+                      [(opnode tGPR:$Rt, AddrMode_i:$addr)]>;
+  def r : // reg/reg
+    T1pILdStEncode<reg_opc,
+                   (outs), (ins tGPR:$Rt, AddrMode_r:$addr),
+                   am, itin_r, asm, "\t$Rt, $addr",
+                   [(opnode tGPR:$Rt, AddrMode_r:$addr)]>;
+}
+
+// A8.6.57 & A8.6.60
+defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr,
+                               t_addrmode_is4, AddrModeT1_4,
+                               IIC_iLoad_r, IIC_iLoad_i, "ldr",
+                               UnOpFrag<(load node:$Src)>>;
+
+// A8.6.64 & A8.6.61
+defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr,
+                                t_addrmode_is1, AddrModeT1_1,
+                                IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb",
+                                UnOpFrag<(zextloadi8 node:$Src)>>;
+
+// A8.6.76 & A8.6.73
+defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr,
+                                t_addrmode_is2, AddrModeT1_2,
+                                IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh",
+                                UnOpFrag<(zextloadi16 node:$Src)>>;
+
+let AddedComplexity = 10 in
+def tLDRSB :                    // A8.6.80
+  T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
+                 AddrModeT1_1, IIC_iLoad_bh_r,
+                 "ldrsb", "\t$Rt, $addr",
+                 [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr:$addr))]>;
+
+let AddedComplexity = 10 in
+def tLDRSH :                    // A8.6.84
+  T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
+                 AddrModeT1_2, IIC_iLoad_bh_r,
+                 "ldrsh", "\t$Rt, $addr",
+                 [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>;
+
+
+def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
+                    "str", "\t$Rt, $addr",
+                    [(store tGPR:$Rt, t_addrmode_sp:$addr)]>,
+              T1LdStSP<{0,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0} = addr;
+}
+
+// A8.6.194 & A8.6.192
+defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr,
+                               t_addrmode_is4, AddrModeT1_4,
+                               IIC_iStore_r, IIC_iStore_i, "str",
+                               BinOpFrag<(store node:$LHS, node:$RHS)>>;
+
+// A8.6.197 & A8.6.195
+defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr,
+                                t_addrmode_is1, AddrModeT1_1,
+                                IIC_iStore_bh_r, IIC_iStore_bh_i, "strb",
+                                BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+
+// A8.6.207 & A8.6.205
+defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr,
+                                t_addrmode_is2, AddrModeT1_2,
+                                IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
+                                BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+// These require the base register either to be written back or to be one of
+// the loaded regs.
+let hasSideEffects = 0 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+            IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> {
+  bits<3> Rn;
+  bits<8> regs;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = regs;
+}
+
+// Writeback version is just a pseudo, as there's no encoding difference.
+// Writeback happens iff the base register is not in the destination register
+// list.
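+// For example, "ldm r0!, {r1, r2, r3}" advances r0 past the loaded words,
+// whereas "ldm r0, {r0, r1, r2}" performs no writeback because r0 is itself
+// overwritten by the load.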
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +def tLDMIA_UPD : + InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain, + "$Rn = $wb", IIC_iLoad_mu>, + PseudoInstExpansion<(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)> { + let Size = 2; + let OutOperandList = (outs GPR:$wb); + let InOperandList = (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops); + let Pattern = []; + let isCodeGenOnly = 1; + let isPseudo = 1; + list<Predicate> Predicates = [IsThumb]; +} + +// There is no non-writeback version of STM for Thumb. +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +def tSTMIA_UPD : Thumb1I<(outs GPR:$wb), + (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops), + AddrModeNone, 2, IIC_iStore_mu, + "stm${p}\t$Rn!, $regs", "$Rn = $wb", []>, + T1Encoding<{1,1,0,0,0,?}> { + bits<3> Rn; + bits<8> regs; + let Inst{10-8} = Rn; + let Inst{7-0} = regs; +} + +} // hasSideEffects + +def : InstAlias<"ldm${p} $Rn!, $regs", + (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)>, + Requires<[IsThumb, IsThumb1Only]>; + +let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in +def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), + IIC_iPop, + "pop${p}\t$regs", []>, + T1Misc<{1,1,0,?,?,?,?}> { + bits<16> regs; + let Inst{8} = regs{15}; + let Inst{7-0} = regs{7-0}; +} + +let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in +def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), + IIC_iStore_m, + "push${p}\t$regs", []>, + T1Misc<{0,1,0,?,?,?,?}> { + bits<16> regs; + let Inst{8} = regs{14}; + let Inst{7-0} = regs{7-0}; +} + +//===----------------------------------------------------------------------===// +// Arithmetic Instructions. +// + +// Helper classes for encoding T1pI patterns: +class T1pIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1pI<oops, iops, itin, opc, asm, pattern>, + T1DataProcessing<opA> { + bits<3> Rm; + bits<3> Rn; + let Inst{5-3} = Rm; + let Inst{2-0} = Rn; +} +class T1pIMiscEncode<bits<7> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1pI<oops, iops, itin, opc, asm, pattern>, + T1Misc<opA> { + bits<3> Rm; + bits<3> Rd; + let Inst{5-3} = Rm; + let Inst{2-0} = Rd; +} + +// Helper classes for encoding T1sI patterns: +class T1sIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sI<oops, iops, itin, opc, asm, pattern>, + T1DataProcessing<opA> { + bits<3> Rd; + bits<3> Rn; + let Inst{5-3} = Rn; + let Inst{2-0} = Rd; +} +class T1sIGenEncode<bits<5> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sI<oops, iops, itin, opc, asm, pattern>, + T1General<opA> { + bits<3> Rm; + bits<3> Rn; + bits<3> Rd; + let Inst{8-6} = Rm; + let Inst{5-3} = Rn; + let Inst{2-0} = Rd; +} +class T1sIGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sI<oops, iops, itin, opc, asm, pattern>, + T1General<opA> { + bits<3> Rd; + bits<3> Rm; + let Inst{5-3} = Rm; + let Inst{2-0} = Rd; +} + +// Helper classes for encoding T1sIt patterns: +class T1sItDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sIt<oops, iops, itin, opc, asm, pattern>, + T1DataProcessing<opA> { + bits<3> Rdn; + bits<3> Rm; + let Inst{5-3} = Rm; + let Inst{2-0} = Rdn; +} +class T1sItGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass 
itin, + string opc, string asm, list<dag> pattern> + : T1sIt<oops, iops, itin, opc, asm, pattern>, + T1General<opA> { + bits<3> Rdn; + bits<8> imm8; + let Inst{10-8} = Rdn; + let Inst{7-0} = imm8; +} + +// Add with carry register +let isCommutable = 1, Uses = [CPSR] in +def tADC : // A8.6.2 + T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr, + "adc", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>; + +// Add immediate +def tADDi3 : // A8.6.4 T1 + T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3), + IIC_iALUi, + "add", "\t$Rd, $Rm, $imm3", + [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]>, + Sched<[WriteALU]> { + bits<3> imm3; + let Inst{8-6} = imm3; +} + +def tADDi8 : // A8.6.4 T2 + T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn), + (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi, + "add", "\t$Rdn, $imm8", + [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>, + Sched<[WriteALU]>; + +// Add register +let isCommutable = 1 in +def tADDrr : // A8.6.6 T1 + T1sIGenEncode<0b01100, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iALUr, + "add", "\t$Rd, $Rn, $Rm", + [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>; + +let hasSideEffects = 0 in +def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr, + "add", "\t$Rdn, $Rm", []>, + T1Special<{0,0,?,?}>, Sched<[WriteALU]> { + // A8.6.6 T2 + bits<4> Rdn; + bits<4> Rm; + let Inst{7} = Rdn{3}; + let Inst{6-3} = Rm; + let Inst{2-0} = Rdn{2-0}; +} + +// AND register +let isCommutable = 1 in +def tAND : // A8.6.12 + T1sItDPEncode<0b0000, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iBITr, + "and", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>; + +// ASR immediate +def tASRri : // A8.6.14 + T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5), + IIC_iMOVsi, + "asr", "\t$Rd, $Rm, $imm5", + [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]>, + Sched<[WriteALU]> { + bits<5> imm5; + let Inst{10-6} = imm5; +} + +// ASR register +def tASRrr : // A8.6.15 + T1sItDPEncode<0b0100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMOVsr, + "asr", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>; + +// BIC register +def tBIC : // A8.6.20 + T1sItDPEncode<0b1110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iBITr, + "bic", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>, + Sched<[WriteALU]>; + +// CMN register +let isCompare = 1, Defs = [CPSR] in { +//FIXME: Disable CMN, as CCodes are backwards from compare expectations +// Compare-to-zero still works out, just not the relationals +//def tCMN : // A8.6.33 +// T1pIDPEncode<0b1011, (outs), (ins tGPR:$lhs, tGPR:$rhs), +// IIC_iCMPr, +// "cmn", "\t$lhs, $rhs", +// [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>; + +def tCMNz : // A8.6.33 + T1pIDPEncode<0b1011, (outs), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iCMPr, + "cmn", "\t$Rn, $Rm", + [(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>, Sched<[WriteCMP]>; + +} // isCompare = 1, Defs = [CPSR] + +// CMP immediate +let isCompare = 1, Defs = [CPSR] in { +def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, imm0_255:$imm8), IIC_iCMPi, + "cmp", "\t$Rn, $imm8", + [(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>, + T1General<{1,0,1,?,?}>, Sched<[WriteCMP]> { + // A8.6.35 + bits<3> Rn; + bits<8> imm8; + let Inst{10-8} = Rn; + let Inst{7-0} = imm8; +} + +// CMP register +def tCMPr : // A8.6.36 T1 + T1pIDPEncode<0b1010, (outs), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iCMPr, + "cmp", "\t$Rn, $Rm", + [(ARMcmp 
tGPR:$Rn, tGPR:$Rm)]>, Sched<[WriteCMP]>; + +def tCMPhir : T1pI<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_iCMPr, + "cmp", "\t$Rn, $Rm", []>, + T1Special<{0,1,?,?}>, Sched<[WriteCMP]> { + // A8.6.36 T2 + bits<4> Rm; + bits<4> Rn; + let Inst{7} = Rn{3}; + let Inst{6-3} = Rm; + let Inst{2-0} = Rn{2-0}; +} +} // isCompare = 1, Defs = [CPSR] + + +// XOR register +let isCommutable = 1 in +def tEOR : // A8.6.45 + T1sItDPEncode<0b0001, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iBITr, + "eor", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>; + +// LSL immediate +def tLSLri : // A8.6.88 + T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_31:$imm5), + IIC_iMOVsi, + "lsl", "\t$Rd, $Rm, $imm5", + [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]>, + Sched<[WriteALU]> { + bits<5> imm5; + let Inst{10-6} = imm5; +} + +// LSL register +def tLSLrr : // A8.6.89 + T1sItDPEncode<0b0010, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMOVsr, + "lsl", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>; + +// LSR immediate +def tLSRri : // A8.6.90 + T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5), + IIC_iMOVsi, + "lsr", "\t$Rd, $Rm, $imm5", + [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]>, + Sched<[WriteALU]> { + bits<5> imm5; + let Inst{10-6} = imm5; +} + +// LSR register +def tLSRrr : // A8.6.91 + T1sItDPEncode<0b0011, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMOVsr, + "lsr", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>; + +// Move register +let isMoveImm = 1 in +def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255:$imm8), IIC_iMOVi, + "mov", "\t$Rd, $imm8", + [(set tGPR:$Rd, imm0_255:$imm8)]>, + T1General<{1,0,0,?,?}>, Sched<[WriteALU]> { + // A8.6.96 + bits<3> Rd; + bits<8> imm8; + let Inst{10-8} = Rd; + let Inst{7-0} = imm8; +} +// Because we have an explicit tMOVSr below, we need an alias to handle +// the immediate "movs" form here. Blech. +def : tInstAlias <"movs $Rdn, $imm", + (tMOVi8 tGPR:$Rdn, CPSR, imm0_255:$imm, 14, 0)>; + +// A7-73: MOV(2) - mov setting flag. 
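+// For example, "movs r0, r1" both copies r1 into r0 and updates the N and Z
+// flags from the result, which is why tMOVSr below lists CPSR in Defs.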
+ +let hasSideEffects = 0 in { +def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone, + 2, IIC_iMOVr, + "mov", "\t$Rd, $Rm", "", []>, + T1Special<{1,0,?,?}>, Sched<[WriteALU]> { + // A8.6.97 + bits<4> Rd; + bits<4> Rm; + let Inst{7} = Rd{3}; + let Inst{6-3} = Rm; + let Inst{2-0} = Rd{2-0}; +} +let Defs = [CPSR] in +def tMOVSr : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr, + "movs\t$Rd, $Rm", []>, Encoding16, Sched<[WriteALU]> { + // A8.6.97 + bits<3> Rd; + bits<3> Rm; + let Inst{15-6} = 0b0000000000; + let Inst{5-3} = Rm; + let Inst{2-0} = Rd; +} +} // hasSideEffects + +// Multiply register +let isCommutable = 1 in +def tMUL : // A8.6.105 T1 + Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2, + IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd", + [(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>, + T1DataProcessing<0b1101> { + bits<3> Rd; + bits<3> Rn; + let Inst{5-3} = Rn; + let Inst{2-0} = Rd; + let AsmMatchConverter = "cvtThumbMultiply"; +} + +def :tInstAlias<"mul${s}${p} $Rdm, $Rn", (tMUL tGPR:$Rdm, s_cc_out:$s, tGPR:$Rn, + pred:$p)>; + +// Move inverse register +def tMVN : // A8.6.107 + T1sIDPEncode<0b1111, (outs tGPR:$Rd), (ins tGPR:$Rn), IIC_iMVNr, + "mvn", "\t$Rd, $Rn", + [(set tGPR:$Rd, (not tGPR:$Rn))]>, Sched<[WriteALU]>; + +// Bitwise or register +let isCommutable = 1 in +def tORR : // A8.6.114 + T1sItDPEncode<0b1100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iBITr, + "orr", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>; + +// Swaps +def tREV : // A8.6.134 + T1pIMiscEncode<{1,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "rev", "\t$Rd, $Rm", + [(set tGPR:$Rd, (bswap tGPR:$Rm))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>; + +def tREV16 : // A8.6.135 + T1pIMiscEncode<{1,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "rev16", "\t$Rd, $Rm", + [(set tGPR:$Rd, (rotr (bswap tGPR:$Rm), (i32 16)))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>; + +def tREVSH : // A8.6.136 + T1pIMiscEncode<{1,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "revsh", "\t$Rd, $Rm", + [(set tGPR:$Rd, (sra (bswap tGPR:$Rm), (i32 16)))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>; + +// Rotate right register +def tROR : // A8.6.139 + T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMOVsr, + "ror", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>, + Sched<[WriteALU]>; + +// Negate register +def tRSB : // A8.6.141 + T1sIDPEncode<0b1001, (outs tGPR:$Rd), (ins tGPR:$Rn), + IIC_iALUi, + "rsb", "\t$Rd, $Rn, #0", + [(set tGPR:$Rd, (ineg tGPR:$Rn))]>, Sched<[WriteALU]>; + +// Subtract with carry register +let Uses = [CPSR] in +def tSBC : // A8.6.151 + T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iALUr, + "sbc", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>, + Sched<[WriteALU]>; + +// Subtract immediate +def tSUBi3 : // A8.6.210 T1 + T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3), + IIC_iALUi, + "sub", "\t$Rd, $Rm, $imm3", + [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]>, + Sched<[WriteALU]> { + bits<3> imm3; + let Inst{8-6} = imm3; +} + +def tSUBi8 : // A8.6.210 T2 + T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn), + (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi, + "sub", "\t$Rdn, $imm8", + [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>, + Sched<[WriteALU]>; + +// Subtract register +def tSUBrr : // A8.6.212 + T1sIGenEncode<0b01101, 
(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iALUr, + "sub", "\t$Rd, $Rn, $Rm", + [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>, + Sched<[WriteALU]>; + +// Sign-extend byte +def tSXTB : // A8.6.222 + T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "sxtb", "\t$Rd, $Rm", + [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i8))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>, + Sched<[WriteALU]>; + +// Sign-extend short +def tSXTH : // A8.6.224 + T1pIMiscEncode<{0,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "sxth", "\t$Rd, $Rm", + [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i16))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>, + Sched<[WriteALU]>; + +// Test +let isCompare = 1, isCommutable = 1, Defs = [CPSR] in +def tTST : // A8.6.230 + T1pIDPEncode<0b1000, (outs), (ins tGPR:$Rn, tGPR:$Rm), IIC_iTSTr, + "tst", "\t$Rn, $Rm", + [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>, + Sched<[WriteALU]>; + +// A8.8.247 UDF - Undefined (Encoding T1) +def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8", + [(int_arm_undefined imm0_255:$imm8)]>, Encoding16 { + bits<8> imm8; + let Inst{15-12} = 0b1101; + let Inst{11-8} = 0b1110; + let Inst{7-0} = imm8; +} + +// Zero-extend byte +def tUXTB : // A8.6.262 + T1pIMiscEncode<{0,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "uxtb", "\t$Rd, $Rm", + [(set tGPR:$Rd, (and tGPR:$Rm, 0xFF))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>, + Sched<[WriteALU]>; + +// Zero-extend short +def tUXTH : // A8.6.264 + T1pIMiscEncode<{0,0,1,0,1,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "uxth", "\t$Rd, $Rm", + [(set tGPR:$Rd, (and tGPR:$Rm, 0xFFFF))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>; + +// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation. +// Expanded after instruction selection into a branch sequence. +let usesCustomInserter = 1 in // Expanded after instruction selection. + def tMOVCCr_pseudo : + PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, cmovpred:$p), + NoItinerary, + [(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, cmovpred:$p))]>; + +// tLEApcrel - Load a pc-relative address into a register without offending the +// assembler. + +def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p), + IIC_iALUi, "adr{$p}\t$Rd, $addr", []>, + T1Encoding<{1,0,1,0,0,?}>, Sched<[WriteALU]> { + bits<3> Rd; + bits<8> addr; + let Inst{10-8} = Rd; + let Inst{7-0} = addr; + let DecoderMethod = "DecodeThumbAddSpecialReg"; +} + +let hasSideEffects = 0, isReMaterializable = 1 in +def tLEApcrel : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p), + 2, IIC_iALUi, []>, Sched<[WriteALU]>; + +let hasSideEffects = 1 in +def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), + (ins i32imm:$label, pred:$p), + 2, IIC_iALUi, []>, Sched<[WriteALU]>; + +//===----------------------------------------------------------------------===// +// TLS Instructions +// + +// __aeabi_read_tp preserves the registers r1-r3. +// This is a pseudo inst so that we can get the encoding right, +// complete with fixup for the aeabi_read_tp function. +let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in +def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br, + [(set R0, ARMthread_pointer)]>, + Sched<[WriteBr]>; + +//===----------------------------------------------------------------------===// +// SJLJ Exception handling intrinsics +// + +// eh_sjlj_setjmp() is an instruction sequence to store the return address and +// save #0 in R0 for the non-longjmp case. 
Since by its nature we may be coming
+// from some other function to get here, and we're using the stack frame for the
+// containing function to save/restore registers, we can't keep anything live in
+// regs across the eh_sjlj_setjmp(), else it will almost certainly have been
+// tromped upon when we get here from a longjmp(). We force everything out of
+// registers except for our own input by listing the relevant registers in
+// Defs. By doing so, we also cause the prologue/epilogue code to actively
+// preserve all of the callee-saved registers, which is exactly what we want.
+// $val is a scratch register for our use.
+let Defs = [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7, R12, CPSR ],
+    hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+    usesCustomInserter = 1 in
+def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
+                                  AddrModeNone, 0, NoItinerary, "","",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>;
+
+// FIXME: Non-IOS version(s)
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
+    Defs = [ R7, LR, SP ] in
+def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
+                              AddrModeNone, 0, IndexModeNone,
+                              Pseudo, NoItinerary, "", "",
+                              [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+                             Requires<[IsThumb]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// Comparisons
+def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8),
+            (tCMPi8  tGPR:$Rn, imm0_255:$imm8)>;
+def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm),
+            (tCMPr   tGPR:$Rn, tGPR:$Rm)>;
+
+// Add with carry
+def : T1Pat<(addc   tGPR:$lhs, imm0_7:$rhs),
+            (tADDi3 tGPR:$lhs, imm0_7:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, imm8_255:$rhs),
+            (tADDi8 tGPR:$lhs, imm8_255:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, tGPR:$rhs),
+            (tADDrr tGPR:$lhs, tGPR:$rhs)>;
+
+// Subtract with carry
+def : T1Pat<(addc   tGPR:$lhs, imm0_7_neg:$rhs),
+            (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, imm8_255_neg:$rhs),
+            (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>;
+def : T1Pat<(subc   tGPR:$lhs, tGPR:$rhs),
+            (tSUBrr tGPR:$lhs, tGPR:$rhs)>;
+
+// Bswap 16 with load/store
+def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)),
+            (tREV16 (tLDRHi t_addrmode_is2:$addr))>;
+def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rr:$addr)), (i32 16)),
+            (tREV16 (tLDRHr t_addrmode_rr:$addr))>;
+def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
+                           t_addrmode_is2:$addr),
+            (tSTRHi (tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>;
+def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
+                           t_addrmode_rr:$addr),
+            (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rr:$addr)>;
+
+// ConstantPool
+def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>;
+
+// GlobalAddress
+def tLDRLIT_ga_pcrel : PseudoInst<(outs tGPR:$dst), (ins i32imm:$addr),
+                                  IIC_iLoadiALU,
+                                  [(set tGPR:$dst,
+                                        (ARMWrapperPIC tglobaladdr:$addr))]>,
+                       Requires<[IsThumb, DontUseMovt]>;
+
+def tLDRLIT_ga_abs : PseudoInst<(outs tGPR:$dst), (ins i32imm:$src),
+                                IIC_iLoad_i,
+                                [(set tGPR:$dst,
+                                      (ARMWrapper tglobaladdr:$src))]>,
+                     Requires<[IsThumb, DontUseMovt]>;
+
+
+// JumpTable
+def : T1Pat<(ARMWrapperJT tjumptable:$dst),
+            (tLEApcrelJT tjumptable:$dst)>;
+
+// Direct calls
+def : T1Pat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>,
+      Requires<[IsThumb]>;
+
+def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>,
+      Requires<[IsThumb, HasV5T, IsNotMClass]>;
+
+// Indirect calls to ARM routines
+def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr
GPR:$dst)>, + Requires<[IsThumb, HasV5T]>; + +// zextload i1 -> zextload i8 +def : T1Pat<(zextloadi1 t_addrmode_is1:$addr), + (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(zextloadi1 t_addrmode_rr:$addr), + (tLDRBr t_addrmode_rr:$addr)>; + +// extload from the stack -> word load from the stack, as it avoids having to +// materialize the base in a separate register. This only works when a word +// load puts the byte/halfword value in the same place in the register that the +// byte/halfword load would, i.e. when little-endian. +def : T1Pat<(extloadi1 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>, + Requires<[IsThumb, IsThumb1Only, IsLE]>; +def : T1Pat<(extloadi8 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>, + Requires<[IsThumb, IsThumb1Only, IsLE]>; +def : T1Pat<(extloadi16 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>, + Requires<[IsThumb, IsThumb1Only, IsLE]>; + +// extload -> zextload +def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(extloadi1 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>; + +// If it's impossible to use [r,r] address mode for sextload, select to +// ldr{b|h} + sxt{b|h} instead. +def : T1Pat<(sextloadi8 t_addrmode_is1:$addr), + (tSXTB (tLDRBi t_addrmode_is1:$addr))>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; +def : T1Pat<(sextloadi8 t_addrmode_rr:$addr), + (tSXTB (tLDRBr t_addrmode_rr:$addr))>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; +def : T1Pat<(sextloadi16 t_addrmode_is2:$addr), + (tSXTH (tLDRHi t_addrmode_is2:$addr))>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; +def : T1Pat<(sextloadi16 t_addrmode_rr:$addr), + (tSXTH (tLDRHr t_addrmode_rr:$addr))>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +def : T1Pat<(sextloadi8 t_addrmode_is1:$addr), + (tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>; +def : T1Pat<(sextloadi8 t_addrmode_rr:$addr), + (tASRri (tLSLri (tLDRBr t_addrmode_rr:$addr), 24), 24)>; +def : T1Pat<(sextloadi16 t_addrmode_is2:$addr), + (tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>; +def : T1Pat<(sextloadi16 t_addrmode_rr:$addr), + (tASRri (tLSLri (tLDRHr t_addrmode_rr:$addr), 16), 16)>; + +def : T1Pat<(atomic_load_8 t_addrmode_is1:$src), + (tLDRBi t_addrmode_is1:$src)>; +def : T1Pat<(atomic_load_8 t_addrmode_rr:$src), + (tLDRBr t_addrmode_rr:$src)>; +def : T1Pat<(atomic_load_16 t_addrmode_is2:$src), + (tLDRHi t_addrmode_is2:$src)>; +def : T1Pat<(atomic_load_16 t_addrmode_rr:$src), + (tLDRHr t_addrmode_rr:$src)>; +def : T1Pat<(atomic_load_32 t_addrmode_is4:$src), + (tLDRi t_addrmode_is4:$src)>; +def : T1Pat<(atomic_load_32 t_addrmode_rr:$src), + (tLDRr t_addrmode_rr:$src)>; +def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val), + (tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>; +def : T1Pat<(atomic_store_8 t_addrmode_rr:$ptr, tGPR:$val), + (tSTRBr tGPR:$val, t_addrmode_rr:$ptr)>; +def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val), + (tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>; +def : T1Pat<(atomic_store_16 t_addrmode_rr:$ptr, tGPR:$val), + (tSTRHr tGPR:$val, t_addrmode_rr:$ptr)>; +def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val), + (tSTRi tGPR:$val, t_addrmode_is4:$ptr)>; +def : T1Pat<(atomic_store_32 t_addrmode_rr:$ptr, tGPR:$val), + 
(tSTRr tGPR:$val, t_addrmode_rr:$ptr)>;
+
+// Large immediate handling.
+
+// Two piece imms.
+def : T1Pat<(i32 thumb_immshifted:$src),
+            (tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)),
+                    (thumb_immshifted_shamt imm:$src))>;
+
+def : T1Pat<(i32 imm0_255_comp:$src),
+            (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
+
+// Pseudo instruction that combines ldr from constpool and add pc. This should
+// be expanded into two instructions late to allow if-conversion and
+// scheduling.
+let isReMaterializable = 1 in
+def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+                             NoItinerary,
+                 [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+                                             imm:$cp))]>,
+                Requires<[IsThumb, IsThumb1Only]>;
+
+// Pseudo-instruction for merged POP and return.
+// FIXME: remove when we have a way to mark a MI with these properties.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1 in
+def tPOP_RET : tPseudoExpand<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+                             2, IIC_iPop_Br, [],
+                             (tPOP pred:$p, reglist:$regs)>, Sched<[WriteBrL]>;
+
+// Indirect branch using "mov pc, $Rm"
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def tBRIND : tPseudoExpand<(outs), (ins GPR:$Rm, pred:$p),
+                             2, IIC_Br, [(brind GPR:$Rm)],
+                             (tMOVr PC, GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
+}
+
+
+// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
+// encoding is available on ARMv6K, but we don't differentiate that finely.
+def : InstAlias<"nop", (tMOVr R8, R8, 14, 0)>, Requires<[IsThumb, IsThumb1Only]>;
+
+
+// For round-trip assembly/disassembly, we have to handle a CPS instruction
+// without any iflags. That's not, strictly speaking, valid syntax, but it's
+// a useful extension and assembles to defined behaviour (the insn does
+// nothing).
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
+
+// "neg" is an alias for "rsb rd, rn, #0".
+def : tInstAlias<"neg${s}${p} $Rd, $Rm",
+                 (tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>;
+
+
+// Implied destination operand forms for shifts.
+def : tInstAlias<"lsl${s}${p} $Rdm, $imm",
+             (tLSLri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm0_31:$imm, pred:$p)>;
+def : tInstAlias<"lsr${s}${p} $Rdm, $imm",
+             (tLSRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
+def : tInstAlias<"asr${s}${p} $Rdm, $imm",
+             (tASRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
new file mode 100644
index 0000000..d460d33
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -0,0 +1,4765 @@
+//===-- ARMInstrThumb2.td - Thumb2 support for ARM ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb2 instruction set.
+// +//===----------------------------------------------------------------------===// + +// IT block predicate field +def it_pred_asmoperand : AsmOperandClass { + let Name = "ITCondCode"; + let ParserMethod = "parseITCondCode"; +} +def it_pred : Operand<i32> { + let PrintMethod = "printMandatoryPredicateOperand"; + let ParserMatchClass = it_pred_asmoperand; +} + +// IT block condition mask +def it_mask_asmoperand : AsmOperandClass { let Name = "ITMask"; } +def it_mask : Operand<i32> { + let PrintMethod = "printThumbITMask"; + let ParserMatchClass = it_mask_asmoperand; +} + +// t2_shift_imm: An integer that encodes a shift amount and the type of shift +// (asr or lsl). The 6-bit immediate encodes as: +// {5} 0 ==> lsl +// 1 asr +// {4-0} imm5 shift amount. +// asr #32 not allowed +def t2_shift_imm : Operand<i32> { + let PrintMethod = "printShiftImmOperand"; + let ParserMatchClass = ShifterImmAsmOperand; + let DecoderMethod = "DecodeT2ShifterImmOperand"; +} + +// Shifted operands. No register controlled shifts for Thumb2. +// Note: We do not support rrx shifted operands yet. +def t2_so_reg : Operand<i32>, // reg imm + ComplexPattern<i32, 2, "SelectShiftImmShifterOperand", + [shl,srl,sra,rotr]> { + let EncoderMethod = "getT2SORegOpValue"; + let PrintMethod = "printT2SOOperand"; + let DecoderMethod = "DecodeSORegImmOperand"; + let ParserMatchClass = ShiftedImmAsmOperand; + let MIOperandInfo = (ops rGPR, i32imm); +} + +// t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value +def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), SDLoc(N), + MVT::i32); +}]>; + +// t2_so_imm_neg_XFORM - Return the negation of a t2_so_imm value +def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(-((int)N->getZExtValue()), SDLoc(N), + MVT::i32); +}]>; + +// so_imm_notSext_XFORM - Return a so_imm value packed into the format +// described for so_imm_notSext def below, with sign extension from 16 +// bits. +def t2_so_imm_notSext16_XFORM : SDNodeXForm<imm, [{ + APInt apIntN = N->getAPIntValue(); + unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue(); + return CurDAG->getTargetConstant(~N16bitSignExt, SDLoc(N), MVT::i32); +}]>; + +// t2_so_imm - Match a 32-bit immediate operand, which is an +// 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit +// immediate splatted into multiple bytes of the word. +def t2_so_imm_asmoperand : ImmAsmOperand { let Name = "T2SOImm"; } +def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{ + return ARM_AM::getT2SOImmVal(Imm) != -1; + }]> { + let ParserMatchClass = t2_so_imm_asmoperand; + let EncoderMethod = "getT2SOImmOpValue"; + let DecoderMethod = "DecodeT2SOImm"; +} + +// t2_so_imm_not - Match an immediate that is a complement +// of a t2_so_imm. +// Note: this pattern doesn't require an encoder method and such, as it's +// only used on aliases (Pat<> and InstAlias<>). The actual encoding +// is handled by the destination instructions, which use t2_so_imm. +def t2_so_imm_not_asmoperand : AsmOperandClass { let Name = "T2SOImmNot"; } +def t2_so_imm_not : Operand<i32>, PatLeaf<(imm), [{ + return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1; +}], t2_so_imm_not_XFORM> { + let ParserMatchClass = t2_so_imm_not_asmoperand; +} + +// t2_so_imm_notSext - match an immediate that is a complement of a t2_so_imm +// if the upper 16 bits are zero. 
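+// For example, the 16-bit value 0xFF00 sign-extends to 0xFFFFFF00, whose
+// complement 0x000000FF is a valid modified immediate, so 0xFF00 matches
+// this leaf.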
+def t2_so_imm_notSext : Operand<i32>, PatLeaf<(imm), [{
+    APInt apIntN = N->getAPIntValue();
+    if (!apIntN.isIntN(16)) return false;
+    unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue();
+    return ARM_AM::getT2SOImmVal(~N16bitSignExt) != -1;
+  }], t2_so_imm_notSext16_XFORM> {
+  let ParserMatchClass = t2_so_imm_not_asmoperand;
+}
+
+// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
+def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; }
+def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
+  int64_t Value = -(int)N->getZExtValue();
+  return Value && ARM_AM::getT2SOImmVal(Value) != -1;
+}], t2_so_imm_neg_XFORM> {
+  let ParserMatchClass = t2_so_imm_neg_asmoperand;
+}
+
+/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0,4095].
+def imm0_4095_asmoperand: ImmAsmOperand { let Name = "Imm0_4095"; }
+def imm0_4095 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 4096;
+}]> {
+  let ParserMatchClass = imm0_4095_asmoperand;
+}
+
+def imm0_4095_neg_asmoperand: AsmOperandClass { let Name = "Imm0_4095Neg"; }
+def imm0_4095_neg : Operand<i32>, PatLeaf<(i32 imm), [{
+ return (uint32_t)(-N->getZExtValue()) < 4096;
+}], imm_neg_XFORM> {
+  let ParserMatchClass = imm0_4095_neg_asmoperand;
+}
+
+def imm1_255_neg : PatLeaf<(i32 imm), [{
+  uint32_t Val = -N->getZExtValue();
+  return (Val > 0 && Val < 255);
+}], imm_neg_XFORM>;
+
+def imm0_255_not : PatLeaf<(i32 imm), [{
+  return (uint32_t)(~N->getZExtValue()) < 255;
+}], imm_comp_XFORM>;
+
+def lo5AllOne : PatLeaf<(i32 imm), [{
+  // Returns true if all low 5 bits are 1.
+  return (((uint32_t)N->getZExtValue()) & 0x1FUL) == 0x1FUL;
+}]>;
+
+// Define Thumb2 specific addressing modes.
+
+// t2addrmode_imm12  := reg + imm12
+def t2addrmode_imm12_asmoperand : AsmOperandClass {let Name="MemUImm12Offset";}
+def t2addrmode_imm12 : MemOperand,
+                       ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> {
+  let PrintMethod = "printAddrModeImm12Operand<false>";
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm12";
+  let ParserMatchClass = t2addrmode_imm12_asmoperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2ldrlabel  := imm12
+def t2ldrlabel : Operand<i32> {
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let PrintMethod = "printThumbLdrLabelOperand";
+}
+
+def t2ldr_pcrel_imm12_asmoperand : AsmOperandClass {let Name = "MemPCRelImm12";}
+def t2ldr_pcrel_imm12 : Operand<i32> {
+  let ParserMatchClass = t2ldr_pcrel_imm12_asmoperand;
+  // used for assembler pseudo instruction and maps to t2ldrlabel, so
+  // doesn't need encoder or print methods of its own.
+}
+
+// ADR instruction labels.
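+// An ADR computes a PC-relative address, e.g. "adr r0, label" sets r0 to
+// Align(PC,4) plus the encoded offset of "label".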
+def t2adrlabel : Operand<i32> { + let EncoderMethod = "getT2AdrLabelOpValue"; + let PrintMethod = "printAdrLabelOperand<0>"; +} + +// t2addrmode_posimm8 := reg + imm8 +def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";} +def t2addrmode_posimm8 : MemOperand { + let PrintMethod = "printT2AddrModeImm8Operand<false>"; + let EncoderMethod = "getT2AddrModeImm8OpValue"; + let DecoderMethod = "DecodeT2AddrModeImm8"; + let ParserMatchClass = MemPosImm8OffsetAsmOperand; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +// t2addrmode_negimm8 := reg - imm8 +def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";} +def t2addrmode_negimm8 : MemOperand, + ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> { + let PrintMethod = "printT2AddrModeImm8Operand<false>"; + let EncoderMethod = "getT2AddrModeImm8OpValue"; + let DecoderMethod = "DecodeT2AddrModeImm8"; + let ParserMatchClass = MemNegImm8OffsetAsmOperand; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +// t2addrmode_imm8 := reg +/- imm8 +def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; } +class T2AddrMode_Imm8 : MemOperand, + ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> { + let EncoderMethod = "getT2AddrModeImm8OpValue"; + let DecoderMethod = "DecodeT2AddrModeImm8"; + let ParserMatchClass = MemImm8OffsetAsmOperand; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +def t2addrmode_imm8 : T2AddrMode_Imm8 { + let PrintMethod = "printT2AddrModeImm8Operand<false>"; +} + +def t2addrmode_imm8_pre : T2AddrMode_Imm8 { + let PrintMethod = "printT2AddrModeImm8Operand<true>"; +} + +def t2am_imm8_offset : MemOperand, + ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset", + [], [SDNPWantRoot]> { + let PrintMethod = "printT2AddrModeImm8OffsetOperand"; + let EncoderMethod = "getT2AddrModeImm8OffsetOpValue"; + let DecoderMethod = "DecodeT2Imm8"; +} + +// t2addrmode_imm8s4 := reg +/- (imm8 << 2) +def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";} +class T2AddrMode_Imm8s4 : MemOperand { + let EncoderMethod = "getT2AddrModeImm8s4OpValue"; + let DecoderMethod = "DecodeT2AddrModeImm8s4"; + let ParserMatchClass = MemImm8s4OffsetAsmOperand; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +def t2addrmode_imm8s4 : T2AddrMode_Imm8s4 { + let PrintMethod = "printT2AddrModeImm8s4Operand<false>"; +} + +def t2addrmode_imm8s4_pre : T2AddrMode_Imm8s4 { + let PrintMethod = "printT2AddrModeImm8s4Operand<true>"; +} + +def t2am_imm8s4_offset_asmoperand : AsmOperandClass { let Name = "Imm8s4"; } +def t2am_imm8s4_offset : MemOperand { + let PrintMethod = "printT2AddrModeImm8s4OffsetOperand"; + let EncoderMethod = "getT2Imm8s4OpValue"; + let DecoderMethod = "DecodeT2Imm8S4"; +} + +// t2addrmode_imm0_1020s4 := reg + (imm8 << 2) +def MemImm0_1020s4OffsetAsmOperand : AsmOperandClass { + let Name = "MemImm0_1020s4Offset"; +} +def t2addrmode_imm0_1020s4 : MemOperand, + ComplexPattern<i32, 2, "SelectT2AddrModeExclusive"> { + let PrintMethod = "printT2AddrModeImm0_1020s4Operand"; + let EncoderMethod = "getT2AddrModeImm0_1020s4OpValue"; + let DecoderMethod = "DecodeT2AddrModeImm0_1020s4"; + let ParserMatchClass = MemImm0_1020s4OffsetAsmOperand; + let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm); +} + +// t2addrmode_so_reg := reg + (reg << imm2) +def t2addrmode_so_reg_asmoperand : AsmOperandClass {let Name="T2MemRegOffset";} +def t2addrmode_so_reg : MemOperand, + ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> { + let 
PrintMethod = "printT2AddrModeSoRegOperand"; + let EncoderMethod = "getT2AddrModeSORegOpValue"; + let DecoderMethod = "DecodeT2AddrModeSOReg"; + let ParserMatchClass = t2addrmode_so_reg_asmoperand; + let MIOperandInfo = (ops GPR:$base, rGPR:$offsreg, i32imm:$offsimm); +} + +// Addresses for the TBB/TBH instructions. +def addrmode_tbb_asmoperand : AsmOperandClass { let Name = "MemTBB"; } +def addrmode_tbb : MemOperand { + let PrintMethod = "printAddrModeTBB"; + let ParserMatchClass = addrmode_tbb_asmoperand; + let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm); +} +def addrmode_tbh_asmoperand : AsmOperandClass { let Name = "MemTBH"; } +def addrmode_tbh : MemOperand { + let PrintMethod = "printAddrModeTBH"; + let ParserMatchClass = addrmode_tbh_asmoperand; + let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm); +} + +//===----------------------------------------------------------------------===// +// Multiclass helpers... +// + + +class T2OneRegImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<12> imm; + + let Inst{11-8} = Rd; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + + +class T2sOneRegImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + + let Inst{11-8} = Rd; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + +class T2OneRegCmpImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rn; + bits<12> imm; + + let Inst{19-16} = Rn; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + + +class T2OneRegShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<12> ShiftedRm; + + let Inst{11-8} = Rd; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2sOneRegShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<12> ShiftedRm; + + let Inst{11-8} = Rd; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2OneRegCmpShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rn; + bits<12> ShiftedRm; + + let Inst{19-16} = Rn; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2TwoReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; +} + +class T2sTwoReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; +} + +class T2TwoRegCmp<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + 
bits<4> Rn; + bits<4> Rm; + + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; +} + + +class T2TwoRegImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + +class T2sTwoRegImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + +class T2TwoRegShiftImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + bits<5> imm; + + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; + let Inst{14-12} = imm{4-2}; + let Inst{7-6} = imm{1-0}; +} + +class T2sTwoRegShiftImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + bits<5> imm; + + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; + let Inst{14-12} = imm{4-2}; + let Inst{7-6} = imm{1-0}; +} + +class T2ThreeReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; +} + +class T2ThreeRegNoP<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : T2XI<oops, iops, itin, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; +} + +class T2sThreeReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; +} + +class T2TwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> ShiftedRm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2sTwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> ShiftedRm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2FourReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + bits<4> Ra; + + let Inst{19-16} = Rn; + let Inst{15-12} = Ra; + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; +} + +class T2MulLong<bits<3> opc22_20, bits<4> opc7_4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rn; + bits<4> Rm; + + let Inst{31-23} = 
0b111110111; + let Inst{22-20} = opc22_20; + let Inst{19-16} = Rn; + let Inst{15-12} = RdLo; + let Inst{11-8} = RdHi; + let Inst{7-4} = opc7_4; + let Inst{3-0} = Rm; +} +class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rn; + bits<4> Rm; + + let Inst{31-23} = 0b111110111; + let Inst{22-20} = opc22_20; + let Inst{19-16} = Rn; + let Inst{15-12} = RdLo; + let Inst{11-8} = RdHi; + let Inst{7-4} = opc7_4; + let Inst{3-0} = Rm; +} + + +/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a +/// binary operation that produces a value. These are predicable and can be +/// changed to modify CPSR. +multiclass T2I_bin_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0, + string wide = ""> { + // shifted imm + def ri : T2sTwoRegImm< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), iii, + opc, "\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>, + Sched<[WriteALU, ReadALU]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{15} = 0; + } + // register + def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), iir, + opc, !strconcat(wide, "\t$Rd, $Rn, $Rm"), + [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>, + Sched<[WriteALU, ReadALU, ReadALU]> { + let isCommutable = Commutable; + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2sTwoRegShiftedReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), iis, + opc, !strconcat(wide, "\t$Rd, $Rn, $ShiftedRm"), + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>, + Sched<[WriteALUsi, ReadALU]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + } + // Assembly aliases for optional destination operand when it's the same + // as the source operand. + def : t2InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"), + (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, + t2_so_imm:$imm, pred:$p, + cc_out:$s)>; + def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $Rm"), + (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, + rGPR:$Rm, pred:$p, + cc_out:$s)>; + def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $shift"), + (!cast<Instruction>(NAME#"rs") rGPR:$Rdn, rGPR:$Rdn, + t2_so_reg:$shift, pred:$p, + cc_out:$s)>; +} + +/// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need +// the ".w" suffix to indicate that they are wide. +multiclass T2I_bin_w_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0> : + T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, Commutable, ".w"> { + // Assembler aliases w/ the ".w" suffix. + def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rd, $Rn, $imm"), + (!cast<Instruction>(NAME#"ri") rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, + cc_out:$s)>; + // Assembler aliases w/o the ".w" suffix. 
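+  // For example, plain "and r0, r1, r2" is accepted here and assembles to
+  // the same 32-bit encoding as the explicit "and.w r0, r1, r2".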
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p,
+         cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $shift"),
+     (!cast<Instruction>(NAME#"rs") rGPR:$Rd, rGPR:$Rn, t2_so_reg:$shift,
+         pred:$p, cc_out:$s)>;
+
+  // and with the optional destination operand, too.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm,
+         pred:$p, cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+         cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $shift"),
+     (!cast<Instruction>(NAME#"rs") rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$shift,
+         pred:$p, cc_out:$s)>;
+}
+
+/// T2I_rbin_irs - Same as T2I_bin_irs except the order of operands is
+/// reversed. The 'rr' form is only defined for the disassembler; for codegen
+/// it is equivalent to the T2I_bin_irs counterpart.
+multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
+   // shifted imm
+   def ri : T2sTwoRegImm<
+                  (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
+                  opc, ".w\t$Rd, $Rn, $imm",
+                  [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]>,
+            Sched<[WriteALU, ReadALU]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{15} = 0;
+   }
+   // register
+   def rr : T2sThreeReg<
+                  (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+                  opc, "\t$Rd, $Rn, $Rm",
+                  [/* For disassembly only; pattern left blank */]>,
+            Sched<[WriteALU, ReadALU, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00;    // imm2
+     let Inst{5-4} = 0b00;    // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                  (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                  IIC_iALUsir, opc, "\t$Rd, $Rn, $ShiftedRm",
+                  [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]>,
+            Sched<[WriteALUsi, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+   }
+}
+
+/// T2I_bin_s_irs - Similar to T2I_bin_irs except it sets the 's' bit so the
+/// instruction modifies the CPSR register.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
+                         InstrItinClass iis, PatFrag opnode,
+                         bit Commutable = 0> {
+   // shifted imm
+   def ri : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins GPRnopc:$Rn, t2_so_imm:$imm, pred:$p),
+                         4, iii,
+                         [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+                                                t2_so_imm:$imm))]>,
+            Sched<[WriteALU, ReadALU]>;
+   // register
+   def rr : t2PseudoInst<(outs rGPR:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm, pred:$p),
+                         4, iir,
+                         [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+                                                rGPR:$Rm))]>,
+            Sched<[WriteALU, ReadALU, ReadALU]> {
+     let isCommutable = Commutable;
+   }
+   // shifted register
+   def rs : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
+                         4, iis,
+                         [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+                                                t2_so_reg:$ShiftedRm))]>,
+            Sched<[WriteALUsi, ReadALUsr]>;
+}
+}
+
+/// T2I_rbin_s_is - Same as T2I_bin_s_irs, except selection DAG
+/// operands are reversed.
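+/// Used for reversed-operand operations such as RSB, where e.g.
+/// "rsbs r0, r1, #0" computes 0 - r1, i.e. the immediate is the
+/// left-hand operand of the subtraction.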
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass T2I_rbin_s_is<PatFrag opnode> {
+   // shifted imm
+   def ri : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins rGPR:$Rn, t2_so_imm:$imm, pred:$p),
+                         4, IIC_iALUi,
+                         [(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm,
+                                                rGPR:$Rn))]>,
+            Sched<[WriteALU, ReadALU]>;
+   // shifted register
+   def rs : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
+                         4, IIC_iALUsi,
+                         [(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm,
+                                                rGPR:$Rn))]>,
+            Sched<[WriteALUsi, ReadALU]>;
+}
+}
+
+/// T2I_bin_ii12rs - Defines a set of (op reg, {so_imm|imm0_4095|r|so_reg})
+/// patterns for a binary operation that produces a value.
+multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
+                          bit Commutable = 0> {
+  // shifted imm
+  // The register-immediate version is re-materializable. This is useful
+  // in particular for taking the address of a local.
+  let isReMaterializable = 1 in {
+  def ri : T2sTwoRegImm<
+                (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi,
+                opc, ".w\t$Rd, $Rn, $imm",
+                [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]>,
+           Sched<[WriteALU, ReadALU]> {
+    let Inst{31-27} = 0b11110;
+    let Inst{25} = 0;
+    let Inst{24} = 1;
+    let Inst{23-21} = op23_21;
+    let Inst{15} = 0;
+  }
+  }
+  // 12-bit imm
+  def ri12 : T2I<
+                  (outs GPRnopc:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
+                  !strconcat(opc, "w"), "\t$Rd, $Rn, $imm",
+                  [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]>,
+             Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{31-27} = 0b11110;
+    let Inst{26}    = imm{11};
+    let Inst{25-24} = 0b10;
+    let Inst{23-21} = op23_21;
+    let Inst{20}    = 0; // The S bit.
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14-12} = imm{10-8};
+    let Inst{11-8}  = Rd;
+    let Inst{7-0}   = imm{7-0};
+  }
+  // register
+  def rr : T2sThreeReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm),
+                IIC_iALUr, opc, ".w\t$Rd, $Rn, $Rm",
+                [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, rGPR:$Rm))]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    let isCommutable = Commutable;
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b01;
+    let Inst{24} = 1;
+    let Inst{23-21} = op23_21;
+    let Inst{14-12} = 0b000; // imm3
+    let Inst{7-6} = 0b00;    // imm2
+    let Inst{5-4} = 0b00;    // type
+  }
+  // shifted register
+  def rs : T2sTwoRegShiftedReg<
+                (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm),
+                IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+                [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm))]>,
+           Sched<[WriteALUsi, ReadALU]> {
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b01;
+    let Inst{24} = 1;
+    let Inst{23-21} = op23_21;
+  }
+}
+
+/// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns
+/// for a binary operation that produces a value and uses the carry
+/// bit. It's not predicable.
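+/// For example, "adcs r0, r1, r2" computes r0 = r1 + r2 + carry and also
+/// produces a new carry, which is why CPSR appears in both Defs and Uses
+/// below.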
+let Defs = [CPSR], Uses = [CPSR] in {
+multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
+                             bit Commutable = 0> {
+   // shifted imm
+   def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
+                 IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+                 [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_imm:$imm, CPSR))]>,
+            Requires<[IsThumb2]>, Sched<[WriteALU, ReadALU]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{15} = 0;
+   }
+   // register
+   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+                 opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, rGPR:$Rm, CPSR))]>,
+            Requires<[IsThumb2]>, Sched<[WriteALU, ReadALU, ReadALU]> {
+     let isCommutable = Commutable;
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00;    // imm2
+     let Inst{5-4} = 0b00;    // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+                 [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm, CPSR))]>,
+            Requires<[IsThumb2]>, Sched<[WriteALUsi, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+   }
+}
+}
+
+/// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift /
+/// rotate operation that produces a value.
+multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode> {
+   // 5-bit imm
+   def ri : T2sTwoRegShiftImm<
+                 (outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi,
+                 opc, ".w\t$Rd, $Rm, $imm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rm, (i32 ty:$imm)))]>,
+            Sched<[WriteALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-21} = 0b010010;
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{5-4} = opcod;
+   }
+   // register
+   def rr : T2sThreeReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMOVsr,
+                 opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+            Sched<[WriteALU]> {
+     let Inst{31-27} = 0b11111;
+     let Inst{26-23} = 0b0100;
+     let Inst{22-21} = opcod;
+     let Inst{15-12} = 0b1111;
+     let Inst{7-4} = 0b0000;
+   }
+
+  // Optional destination register
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, ty:$imm, pred:$p,
+         cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+         cc_out:$s)>;
+
+  // Assembler aliases w/o the ".w" suffix.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rd, rGPR:$Rn, ty:$imm, pred:$p,
+         cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p,
+         cc_out:$s)>;
+
+  // and with the optional destination operand, too.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, ty:$imm, pred:$p,
+         cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+         cc_out:$s)>;
+}
+
+/// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// patterns. Similar to T2I_bin_irs except the instruction does not produce
+/// an explicit result, only implicitly sets CPSR.
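+/// For example, "cmp.w r0, #42" writes only the flags; the Rd field is
+/// hard-wired to 0b1111 in each encoding below.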
+multiclass T2I_cmp_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode> { +let isCompare = 1, Defs = [CPSR] in { + // shifted imm + def ri : T2OneRegCmpImm< + (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), iii, + opc, ".w\t$Rn, $imm", + [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]>, Sched<[WriteCMP]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + let Inst{15} = 0; + let Inst{11-8} = 0b1111; // Rd + } + // register + def rr : T2TwoRegCmp< + (outs), (ins GPRnopc:$Rn, rGPR:$Rm), iir, + opc, ".w\t$Rn, $Rm", + [(opnode GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + let Inst{14-12} = 0b000; // imm3 + let Inst{11-8} = 0b1111; // Rd + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2OneRegCmpShiftedReg< + (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), iis, + opc, ".w\t$Rn, $ShiftedRm", + [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>, + Sched<[WriteCMPsi]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + let Inst{11-8} = 0b1111; // Rd + } +} + + // Assembler aliases w/o the ".w" suffix. + // No alias here for 'rr' version as not all instantiations of this + // multiclass want one (CMP in particular, does not). + def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $imm"), + (!cast<Instruction>(NAME#"ri") GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>; + def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $shift"), + (!cast<Instruction>(NAME#"rs") GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>; +} + +/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns. +multiclass T2I_ld<bit signed, bits<2> opcod, string opc, + InstrItinClass iii, InstrItinClass iis, RegisterClass target, + PatFrag opnode> { + def i12 : T2Ii12<(outs target:$Rt), (ins t2addrmode_imm12:$addr), iii, + opc, ".w\t$Rt, $addr", + [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]> { + bits<4> Rt; + bits<17> addr; + let Inst{31-25} = 0b1111100; + let Inst{24} = signed; + let Inst{23} = 1; + let Inst{22-21} = opcod; + let Inst{20} = 1; // load + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm + + let DecoderMethod = "DecodeT2LoadImm12"; + } + def i8 : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii, + opc, "\t$Rt, $addr", + [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]> { + bits<4> Rt; + bits<13> addr; + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = opcod; + let Inst{20} = 1; // load + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Rt; + let Inst{11} = 1; + // Offset: index==TRUE, wback==FALSE + let Inst{10} = 1; // The P bit. + let Inst{9} = addr{8}; // U + let Inst{8} = 0; // The W bit. 
+ let Inst{7-0} = addr{7-0}; // imm + + let DecoderMethod = "DecodeT2LoadImm8"; + } + def s : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis, + opc, ".w\t$Rt, $addr", + [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]> { + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = opcod; + let Inst{20} = 1; // load + let Inst{11-6} = 0b000000; + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<10> addr; + let Inst{19-16} = addr{9-6}; // Rn + let Inst{3-0} = addr{5-2}; // Rm + let Inst{5-4} = addr{1-0}; // imm + + let DecoderMethod = "DecodeT2LoadShift"; + } + + // pci variant is very similar to i12, but supports negative offsets + // from the PC. + def pci : T2Ipc <(outs target:$Rt), (ins t2ldrlabel:$addr), iii, + opc, ".w\t$Rt, $addr", + [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> { + let isReMaterializable = 1; + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{22-21} = opcod; + let Inst{20} = 1; // load + let Inst{19-16} = 0b1111; // Rn + + bits<4> Rt; + let Inst{15-12} = Rt{3-0}; + + bits<13> addr; + let Inst{23} = addr{12}; // add = (U == '1') + let Inst{11-0} = addr{11-0}; + + let DecoderMethod = "DecodeT2LoadLabel"; + } +} + +/// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns. +multiclass T2I_st<bits<2> opcod, string opc, + InstrItinClass iii, InstrItinClass iis, RegisterClass target, + PatFrag opnode> { + def i12 : T2Ii12<(outs), (ins target:$Rt, t2addrmode_imm12:$addr), iii, + opc, ".w\t$Rt, $addr", + [(opnode target:$Rt, t2addrmode_imm12:$addr)]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0001; + let Inst{22-21} = opcod; + let Inst{20} = 0; // !load + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<17> addr; + let addr{12} = 1; // add = TRUE + let Inst{19-16} = addr{16-13}; // Rn + let Inst{23} = addr{12}; // U + let Inst{11-0} = addr{11-0}; // imm + } + def i8 : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii, + opc, "\t$Rt, $addr", + [(opnode target:$Rt, t2addrmode_negimm8:$addr)]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0000; + let Inst{22-21} = opcod; + let Inst{20} = 0; // !load + let Inst{11} = 1; + // Offset: index==TRUE, wback==FALSE + let Inst{10} = 1; // The P bit. + let Inst{8} = 0; // The W bit. + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<13> addr; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{9} = addr{8}; // U + let Inst{7-0} = addr{7-0}; // imm + } + def s : T2Iso <(outs), (ins target:$Rt, t2addrmode_so_reg:$addr), iis, + opc, ".w\t$Rt, $addr", + [(opnode target:$Rt, t2addrmode_so_reg:$addr)]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0000; + let Inst{22-21} = opcod; + let Inst{20} = 0; // !load + let Inst{11-6} = 0b000000; + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<10> addr; + let Inst{19-16} = addr{9-6}; // Rn + let Inst{3-0} = addr{5-2}; // Rm + let Inst{5-4} = addr{1-0}; // imm + } +} + +/// T2I_ext_rrot - A unary operation with two forms: one whose operand is a +/// register and one whose operand is a register rotated by 8/16/24. 
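+///
+/// For example, "sxtb.w rd, rm" uses the plain register form, while
+/// "sxtb.w rd, rm, ror #16" sign-extends bits 23:16 of rm; the rotation is
+/// encoded as a multiple of 8 in the two 'rot' bits.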
+class T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+             opc, ".w\t$Rd, $Rm$rot",
+             [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+    Requires<[IsThumb2]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+
+  bits<2> rot;
+  let Inst{5-4} = rot{1-0}; // rotate
+}
+
+// UXTB16 - Requires T2ExtractPack, does not need the .w qualifier.
+class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot),
+             IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
+             [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+    Requires<[HasT2ExtractPack, IsThumb2]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+// SXTB16 - Requires T2ExtractPack, does not need the .w qualifier, no pattern
+// supported yet.
+class T2I_ext_rrot_sxtb16<bits<3> opcod, string opc>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+             opc, "\t$Rd, $Rm$rot", []>,
+    Requires<[IsThumb2, HasT2ExtractPack]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+/// T2I_exta_rrot - A binary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+class T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode>
+  : T2ThreeReg<(outs rGPR:$Rd),
+               (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
+               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot",
+               [(set rGPR:$Rd, (opnode rGPR:$Rn, (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+    Requires<[HasT2ExtractPack, IsThumb2]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+class T2I_exta_rrot_np<bits<3> opcod, string opc>
+  : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
+               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
+    Requires<[HasT2ExtractPack, IsThumb2]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//
+
+class T2PCOneRegImm<dag oops, dag iops, InstrItinClass itin,
+                    string asm, list<dag> pattern>
+  : T2XI<oops, iops, itin, asm, pattern> {
+  bits<4> Rd;
+  bits<12> label;
+
+  let Inst{11-8} = Rd;
+  let Inst{26} = label{11};
+  let Inst{14-12} = label{10-8};
+  let Inst{7-0} = label{7-0};
+}
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
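+//
+// For example, "adr.w r0, label" materializes PC +/- offset into r0; whether
+// the offset is added or subtracted is carried in addr{12}, which feeds both
+// Inst{23} and Inst{21} in the encoding below.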
+def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd), + (ins t2adrlabel:$addr, pred:$p), + IIC_iALUi, "adr{$p}.w\t$Rd, $addr", []>, + Sched<[WriteALU, ReadALU]> { + let Inst{31-27} = 0b11110; + let Inst{25-24} = 0b10; + // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE) + let Inst{22} = 0; + let Inst{20} = 0; + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; + + bits<4> Rd; + bits<13> addr; + let Inst{11-8} = Rd; + let Inst{23} = addr{12}; + let Inst{21} = addr{12}; + let Inst{26} = addr{11}; + let Inst{14-12} = addr{10-8}; + let Inst{7-0} = addr{7-0}; + + let DecoderMethod = "DecodeT2Adr"; +} + +let hasSideEffects = 0, isReMaterializable = 1 in +def t2LEApcrel : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p), + 4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>; +let hasSideEffects = 1 in +def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd), + (ins i32imm:$label, pred:$p), + 4, IIC_iALUi, + []>, Sched<[WriteALU, ReadALU]>; + + +//===----------------------------------------------------------------------===// +// Load / store Instructions. +// + +// Load +let canFoldAsLoad = 1, isReMaterializable = 1 in +defm t2LDR : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR, + UnOpFrag<(load node:$Src)>>; + +// Loads with zero extension +defm t2LDRH : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si, + GPRnopc, UnOpFrag<(zextloadi16 node:$Src)>>; +defm t2LDRB : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si, + GPRnopc, UnOpFrag<(zextloadi8 node:$Src)>>; + +// Loads with sign extension +defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si, + GPRnopc, UnOpFrag<(sextloadi16 node:$Src)>>; +defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si, + GPRnopc, UnOpFrag<(sextloadi8 node:$Src)>>; + +let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { +// Load doubleword +def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2), + (ins t2addrmode_imm8s4:$addr), + IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>; +} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 + +// zextload i1 -> zextload i8 +def : T2Pat<(zextloadi1 t2addrmode_imm12:$addr), + (t2LDRBi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(zextloadi1 t2addrmode_negimm8:$addr), + (t2LDRBi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(zextloadi1 t2addrmode_so_reg:$addr), + (t2LDRBs t2addrmode_so_reg:$addr)>; +def : T2Pat<(zextloadi1 (ARMWrapper tconstpool:$addr)), + (t2LDRBpci tconstpool:$addr)>; + +// extload -> zextload +// FIXME: Reduce the number of patterns by legalizing extload to zextload +// earlier? 
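+//
+// (An anyext load leaves the upper bits unspecified, so selecting the
+// zero-extending LDRB/LDRH forms for extloadi1/i8/i16, as below, is safe.)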
+def : T2Pat<(extloadi1 t2addrmode_imm12:$addr), + (t2LDRBi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(extloadi1 t2addrmode_negimm8:$addr), + (t2LDRBi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(extloadi1 t2addrmode_so_reg:$addr), + (t2LDRBs t2addrmode_so_reg:$addr)>; +def : T2Pat<(extloadi1 (ARMWrapper tconstpool:$addr)), + (t2LDRBpci tconstpool:$addr)>; + +def : T2Pat<(extloadi8 t2addrmode_imm12:$addr), + (t2LDRBi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(extloadi8 t2addrmode_negimm8:$addr), + (t2LDRBi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(extloadi8 t2addrmode_so_reg:$addr), + (t2LDRBs t2addrmode_so_reg:$addr)>; +def : T2Pat<(extloadi8 (ARMWrapper tconstpool:$addr)), + (t2LDRBpci tconstpool:$addr)>; + +def : T2Pat<(extloadi16 t2addrmode_imm12:$addr), + (t2LDRHi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(extloadi16 t2addrmode_negimm8:$addr), + (t2LDRHi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(extloadi16 t2addrmode_so_reg:$addr), + (t2LDRHs t2addrmode_so_reg:$addr)>; +def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)), + (t2LDRHpci tconstpool:$addr)>; + +// FIXME: The destination register of the loads and stores can't be PC, but +// can be SP. We need another regclass (similar to rGPR) to represent +// that. Not a pressing issue since these are selected manually, +// not via pattern. + +// Indexed loads + +let mayLoad = 1, hasSideEffects = 0 in { +def t2LDR_PRE : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins t2addrmode_imm8_pre:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoad_iu, + "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>; + +def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoad_iu, + "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>; + +def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins t2addrmode_imm8_pre:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu, + "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>; + +def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, + "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>; + +def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins t2addrmode_imm8_pre:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu, + "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>; + +def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, + "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>; + +def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins t2addrmode_imm8_pre:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu, + "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", + []>; + +def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, + "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>; + +def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins t2addrmode_imm8_pre:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu, + "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", + []>; + +def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, + 
"ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>; +} // mayLoad = 1, hasSideEffects = 0 + +// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110). +// Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4 +class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii> + : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc, + "\t$Rt, $addr", []> { + bits<4> Rt; + bits<13> addr; + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = type; + let Inst{20} = 1; // load + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = Rt; + let Inst{11} = 1; + let Inst{10-8} = 0b110; // PUW. + let Inst{7-0} = addr{7-0}; + + let DecoderMethod = "DecodeT2LoadT"; +} + +def t2LDRT : T2IldT<0, 0b10, "ldrt", IIC_iLoad_i>; +def t2LDRBT : T2IldT<0, 0b00, "ldrbt", IIC_iLoad_bh_i>; +def t2LDRHT : T2IldT<0, 0b01, "ldrht", IIC_iLoad_bh_i>; +def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>; +def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>; + +class T2Ildacq<bits<4> bits23_20, bits<2> bit54, dag oops, dag iops, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, + opc, asm, "", pattern>, Requires<[IsThumb, HasV8]> { + bits<4> Rt; + bits<4> addr; + + let Inst{31-27} = 0b11101; + let Inst{26-24} = 0b000; + let Inst{23-20} = bits23_20; + let Inst{11-6} = 0b111110; + let Inst{5-4} = bit54; + let Inst{3-0} = 0b1111; + + // Encode instruction operands + let Inst{19-16} = addr; + let Inst{15-12} = Rt; +} + +def t2LDA : T2Ildacq<0b1101, 0b10, (outs rGPR:$Rt), + (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>; +def t2LDAB : T2Ildacq<0b1101, 0b00, (outs rGPR:$Rt), + (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>; +def t2LDAH : T2Ildacq<0b1101, 0b01, (outs rGPR:$Rt), + (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>; + +// Store +defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR, + BinOpFrag<(store node:$LHS, node:$RHS)>>; +defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si, + rGPR, BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; +defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si, + rGPR, BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; + +// Store doubleword +let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in +def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs), + (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr), + IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>; + +// Indexed stores + +let mayStore = 1, hasSideEffects = 0 in { +def t2STR_PRE : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb), + (ins GPRnopc:$Rt, t2addrmode_imm8_pre:$addr), + AddrModeT2_i8, IndexModePre, IIC_iStore_iu, + "str", "\t$Rt, $addr!", + "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>; + +def t2STRH_PRE : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr), + AddrModeT2_i8, IndexModePre, IIC_iStore_iu, + "strh", "\t$Rt, $addr!", + "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>; + +def t2STRB_PRE : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr), + AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu, + "strb", "\t$Rt, $addr!", + "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>; +} // mayStore = 1, hasSideEffects = 0 + +def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb), + (ins GPRnopc:$Rt, addr_offset_none:$Rn, + t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iStore_iu, + "str", "\t$Rt, $Rn$offset", + "$Rn 
= $Rn_wb,@earlyclobber $Rn_wb", + [(set GPRnopc:$Rn_wb, + (post_store GPRnopc:$Rt, addr_offset_none:$Rn, + t2am_imm8_offset:$offset))]>; + +def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, addr_offset_none:$Rn, + t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu, + "strh", "\t$Rt, $Rn$offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", + [(set GPRnopc:$Rn_wb, + (post_truncsti16 rGPR:$Rt, addr_offset_none:$Rn, + t2am_imm8_offset:$offset))]>; + +def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, addr_offset_none:$Rn, + t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu, + "strb", "\t$Rt, $Rn$offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", + [(set GPRnopc:$Rn_wb, + (post_truncsti8 rGPR:$Rt, addr_offset_none:$Rn, + t2am_imm8_offset:$offset))]>; + +// Pseudo-instructions for pattern matching the pre-indexed stores. We can't +// put the patterns on the instruction definitions directly as ISel wants +// the address base and offset to be separate operands, not a single +// complex operand like we represent the instructions themselves. The +// pseudos map between the two. +let usesCustomInserter = 1, + Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in { +def t2STR_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPRnopc:$Rn_wb, + (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>; +def t2STRB_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPRnopc:$Rn_wb, + (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>; +def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPRnopc:$Rn_wb, + (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>; +} + +// STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly +// only. 
+// Ref: A8.6.193 STR (immediate, Thumb) Encoding T4
+class T2IstT<bits<2> type, string opc, InstrItinClass ii>
+  : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
+          "\t$Rt, $addr", []> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24} = 0; // not signed
+  let Inst{23} = 0;
+  let Inst{22-21} = type;
+  let Inst{20} = 0; // store
+  let Inst{11} = 1;
+  let Inst{10-8} = 0b110; // PUW
+
+  bits<4> Rt;
+  bits<13> addr;
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = addr{12-9};
+  let Inst{7-0} = addr{7-0};
+}
+
+def t2STRT   : T2IstT<0b10, "strt", IIC_iStore_i>;
+def t2STRBT  : T2IstT<0b00, "strbt", IIC_iStore_bh_i>;
+def t2STRHT  : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
+
+// ldrd / strd pre / post variants
+
+let mayLoad = 1 in
+def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+                 (ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru,
+                 "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
+  let DecoderMethod = "DecodeT2LDRDPreInstruction";
+}
+
+let mayLoad = 1 in
+def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+                 (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
+                 IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
+                 "$addr.base = $wb", []>;
+
+let mayStore = 1 in
+def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
+                 (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr),
+                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
+                 "$addr.base = $wb", []> {
+  let DecoderMethod = "DecodeT2STRDPreInstruction";
+}
+
+let mayStore = 1 in
+def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
+                 (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
+                  t2am_imm8s4_offset:$imm),
+                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
+                 "$addr.base = $wb", []>;
+
+class T2Istrrel<bits<2> bit54, dag oops, dag iops,
+                string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, opc,
+            asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt;
+  bits<4> addr;
+
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{11-6} = 0b111110;
+  let Inst{5-4} = bit54;
+  let Inst{3-0} = 0b1111;
+
+  // Encode instruction operands
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+
+def t2STL   : T2Istrrel<0b10, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                        "stl", "\t$Rt, $addr", []>;
+def t2STLB  : T2Istrrel<0b00, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                        "stlb", "\t$Rt, $addr", []>;
+def t2STLH  : T2Istrrel<0b01, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                        "stlh", "\t$Rt, $addr", []>;
+
+// T2Ipl (Preload Data/Instruction) signals the memory system about possible
+// future data/instruction accesses.
+// instr_write is inverted for Thumb mode: (prefetch 3) -> (preload 0),
+// (prefetch 1) -> (preload 2),  (prefetch 2) -> (preload 1).
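+//
+// For example, "defm t2PLD : T2Ipl<0, 0, "pld">" below yields the three
+// addressing forms "pld [rn, #imm12]", "pld [rn, #-imm8]" and
+// "pld [rn, rm, lsl #imm2]"; write=1 instead selects pldw, instr=1 pli.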
+multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> { + + def i12 : T2Ii12<(outs), (ins t2addrmode_imm12:$addr), IIC_Preload, opc, + "\t$addr", + [(ARMPreload t2addrmode_imm12:$addr, (i32 write), (i32 instr))]>, + Sched<[WritePreLd]> { + let Inst{31-25} = 0b1111100; + let Inst{24} = instr; + let Inst{23} = 1; + let Inst{22} = 0; + let Inst{21} = write; + let Inst{20} = 1; + let Inst{15-12} = 0b1111; + + bits<17> addr; + let Inst{19-16} = addr{16-13}; // Rn + let Inst{11-0} = addr{11-0}; // imm12 + + let DecoderMethod = "DecodeT2LoadImm12"; + } + + def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc, + "\t$addr", + [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]>, + Sched<[WritePreLd]> { + let Inst{31-25} = 0b1111100; + let Inst{24} = instr; + let Inst{23} = 0; // U = 0 + let Inst{22} = 0; + let Inst{21} = write; + let Inst{20} = 1; + let Inst{15-12} = 0b1111; + let Inst{11-8} = 0b1100; + + bits<13> addr; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{7-0} = addr{7-0}; // imm8 + + let DecoderMethod = "DecodeT2LoadImm8"; + } + + def s : T2Iso<(outs), (ins t2addrmode_so_reg:$addr), IIC_Preload, opc, + "\t$addr", + [(ARMPreload t2addrmode_so_reg:$addr, (i32 write), (i32 instr))]>, + Sched<[WritePreLd]> { + let Inst{31-25} = 0b1111100; + let Inst{24} = instr; + let Inst{23} = 0; // add = TRUE for T1 + let Inst{22} = 0; + let Inst{21} = write; + let Inst{20} = 1; + let Inst{15-12} = 0b1111; + let Inst{11-6} = 0b000000; + + bits<10> addr; + let Inst{19-16} = addr{9-6}; // Rn + let Inst{3-0} = addr{5-2}; // Rm + let Inst{5-4} = addr{1-0}; // imm2 + + let DecoderMethod = "DecodeT2LoadShift"; + } +} + +defm t2PLD : T2Ipl<0, 0, "pld">, Requires<[IsThumb2]>; +defm t2PLDW : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>; +defm t2PLI : T2Ipl<0, 1, "pli">, Requires<[IsThumb2,HasV7]>; + +// pci variant is very similar to i12, but supports negative offsets +// from the PC. Only PLD and PLI have pci variants (not PLDW) +class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr), + IIC_Preload, opc, "\t$addr", + [(ARMPreload (ARMWrapper tconstpool:$addr), + (i32 0), (i32 inst))]>, Sched<[WritePreLd]> { + let Inst{31-25} = 0b1111100; + let Inst{24} = inst; + let Inst{22-20} = 0b001; + let Inst{19-16} = 0b1111; + let Inst{15-12} = 0b1111; + + bits<13> addr; + let Inst{23} = addr{12}; // add = (U == '1') + let Inst{11-0} = addr{11-0}; // imm12 + + let DecoderMethod = "DecodeT2LoadLabel"; +} + +def t2PLDpci : T2Iplpci<0, "pld">, Requires<[IsThumb2]>; +def t2PLIpci : T2Iplpci<1, "pli">, Requires<[IsThumb2,HasV7]>; + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. 
+// + +multiclass thumb2_ld_mult<string asm, InstrItinClass itin, + InstrItinClass itin_upd, bit L_bit> { + def IA : + T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b01; // Increment After + let Inst{22} = 0; + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15-0} = regs; + } + def IA_UPD : + T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b01; // Increment After + let Inst{22} = 0; + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15-0} = regs; + } + def DB : + T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b10; // Decrement Before + let Inst{22} = 0; + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15-0} = regs; + } + def DB_UPD : + T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b10; // Decrement Before + let Inst{22} = 0; + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15-0} = regs; + } +} + +let hasSideEffects = 0 in { + +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +defm t2LDM : thumb2_ld_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>; + +multiclass thumb2_st_mult<string asm, InstrItinClass itin, + InstrItinClass itin_upd, bit L_bit> { + def IA : + T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b01; // Increment After + let Inst{22} = 0; + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14} = regs{14}; + let Inst{13} = 0; + let Inst{12-0} = regs{12-0}; + } + def IA_UPD : + T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b01; // Increment After + let Inst{22} = 0; + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14} = regs{14}; + let Inst{13} = 0; + let Inst{12-0} = regs{12-0}; + } + def DB : + T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b10; // Decrement Before + let Inst{22} = 0; + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14} = regs{14}; + let Inst{13} = 0; + let Inst{12-0} = regs{12-0}; + } + def DB_UPD : + T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin_upd, 
!strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b10; // Decrement Before + let Inst{22} = 0; + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14} = regs{14}; + let Inst{13} = 0; + let Inst{12-0} = regs{12-0}; + } +} + + +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>; + +} // hasSideEffects + + +//===----------------------------------------------------------------------===// +// Move Instructions. +// + +let hasSideEffects = 0 in +def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr, + "mov", ".w\t$Rd, $Rm", []>, Sched<[WriteALU]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{19-16} = 0b1111; // Rn + let Inst{14-12} = 0b000; + let Inst{7-4} = 0b0000; +} +def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm, + pred:$p, zero_reg)>; +def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm, + pred:$p, CPSR)>; +def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm, + pred:$p, CPSR)>; + +// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16. +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1, + AddedComplexity = 1 in +def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi, + "mov", ".w\t$Rd, $imm", + [(set rGPR:$Rd, t2_so_imm:$imm)]>, Sched<[WriteALU]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = 0b0010; + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; +} + +// cc_out is handled as part of the explicit mnemonic in the parser for 'mov'. +// Use aliases to get that to play nice here. +def : t2InstAlias<"movs${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm, + pred:$p, CPSR)>; +def : t2InstAlias<"movs${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm, + pred:$p, CPSR)>; + +def : t2InstAlias<"mov${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm, + pred:$p, zero_reg)>; +def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm, + pred:$p, zero_reg)>; + +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in +def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi, + "movw", "\t$Rd, $imm", + [(set rGPR:$Rd, imm0_65535:$imm)]>, Sched<[WriteALU]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-21} = 0b0010; + let Inst{20} = 0; // The S bit. + let Inst{15} = 0; + + bits<4> Rd; + bits<16> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = imm{15-12}; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; + let DecoderMethod = "DecodeT2MOVTWInstruction"; +} + +def : t2InstAlias<"mov${p} $Rd, $imm", + (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p)>; + +def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd), + (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>; + +let Constraints = "$src = $Rd" in { +def t2MOVTi16 : T2I<(outs rGPR:$Rd), + (ins rGPR:$src, imm0_65535_expr:$imm), IIC_iMOVi, + "movt", "\t$Rd, $imm", + [(set rGPR:$Rd, + (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]>, + Sched<[WriteALU]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-21} = 0b0110; + let Inst{20} = 0; // The S bit. 
+ let Inst{15} = 0; + + bits<4> Rd; + bits<16> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = imm{15-12}; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; + let DecoderMethod = "DecodeT2MOVTWInstruction"; +} + +def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd), + (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>, + Sched<[WriteALU]>; +} // Constraints + +def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>; + +//===----------------------------------------------------------------------===// +// Extend Instructions. +// + +// Sign extenders + +def t2SXTB : T2I_ext_rrot<0b100, "sxtb", + UnOpFrag<(sext_inreg node:$Src, i8)>>; +def t2SXTH : T2I_ext_rrot<0b000, "sxth", + UnOpFrag<(sext_inreg node:$Src, i16)>>; +def t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">; + +def t2SXTAB : T2I_exta_rrot<0b100, "sxtab", + BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; +def t2SXTAH : T2I_exta_rrot<0b000, "sxtah", + BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>; +def t2SXTAB16 : T2I_exta_rrot_np<0b010, "sxtab16">; + +// A simple right-shift can also be used in most cases (the exception is the +// SXTH operations with a rotate of 24: there the non-contiguous bits are +// relevant). +def : Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, rot_imm:$rot), i8)), + (t2SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, imm8_or_16:$rot), i16)), + (t2SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +// Zero extenders + +let AddedComplexity = 16 in { +def t2UXTB : T2I_ext_rrot<0b101, "uxtb", + UnOpFrag<(and node:$Src, 0x000000FF)>>; +def t2UXTH : T2I_ext_rrot<0b001, "uxth", + UnOpFrag<(and node:$Src, 0x0000FFFF)>>; +def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16", + UnOpFrag<(and node:$Src, 0x00FF00FF)>>; + +// FIXME: This pattern incorrectly assumes the shl operator is a rotate. +// The transformation should probably be done as a combiner action +// instead so we can include a check for masking back in the upper +// eight bits of the source into the lower eight bits of the result. +//def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF), +// (t2UXTB16 rGPR:$Src, 3)>, +// Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF), + (t2UXTB16 rGPR:$Src, 1)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +def t2UXTAB : T2I_exta_rrot<0b101, "uxtab", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; +def t2UXTAH : T2I_exta_rrot<0b001, "uxtah", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>; +def t2UXTAB16 : T2I_exta_rrot_np<0b011, "uxtab16">; + +def : Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot), 0xFF)), + (t2UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot), 0xFFFF)), + (t2UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +} + + +//===----------------------------------------------------------------------===// +// Arithmetic Instructions. +// + +defm t2ADD : T2I_bin_ii12rs<0b000, "add", + BinOpFrag<(add node:$LHS, node:$RHS)>, 1>; +defm t2SUB : T2I_bin_ii12rs<0b101, "sub", + BinOpFrag<(sub node:$LHS, node:$RHS)>>; + +// ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants. +// +// Currently, t2ADDS/t2SUBS are pseudo opcodes that exist only in the +// selection DAG. 
They are "lowered" to real t2ADD/t2SUB opcodes by +// AdjustInstrPostInstrSelection where we determine whether or not to +// set the "s" bit based on CPSR liveness. +// +// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen +// support for an optional CPSR definition that corresponds to the DAG +// node's second value. We can then eliminate the implicit def of CPSR. +defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, + BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>; +defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, + BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>; + +let hasPostISelHook = 1 in { +defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", + BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>; +defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", + BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>; +} + +// RSB +defm t2RSB : T2I_rbin_irs <0b1110, "rsb", + BinOpFrag<(sub node:$LHS, node:$RHS)>>; + +// FIXME: Eliminate them if we can write def : Pat patterns which defines +// CPSR and the implicit def of CPSR is not needed. +defm t2RSBS : T2I_rbin_s_is <BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>; + +// (sub X, imm) gets canonicalized to (add X, -imm). Match this form. +// The assume-no-carry-in form uses the negation of the input since add/sub +// assume opposite meanings of the carry flag (i.e., carry == !borrow). +// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory +// details. +// The AddedComplexity preferences the first variant over the others since +// it can be shrunk to a 16-bit wide encoding, while the others cannot. +let AddedComplexity = 1 in +def : T2Pat<(add GPR:$src, imm1_255_neg:$imm), + (t2SUBri GPR:$src, imm1_255_neg:$imm)>; +def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm), + (t2SUBri GPR:$src, t2_so_imm_neg:$imm)>; +def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm), + (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>; +def : T2Pat<(add GPR:$src, imm0_65535_neg:$imm), + (t2SUBrr GPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>; + +let AddedComplexity = 1 in +def : T2Pat<(ARMaddc rGPR:$src, imm1_255_neg:$imm), + (t2SUBSri rGPR:$src, imm1_255_neg:$imm)>; +def : T2Pat<(ARMaddc rGPR:$src, t2_so_imm_neg:$imm), + (t2SUBSri rGPR:$src, t2_so_imm_neg:$imm)>; +def : T2Pat<(ARMaddc rGPR:$src, imm0_65535_neg:$imm), + (t2SUBSrr rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>; +// The with-carry-in form matches bitwise not instead of the negation. +// Effectively, the inverse interpretation of the carry flag already accounts +// for part of the negation. 
+let AddedComplexity = 1 in +def : T2Pat<(ARMadde rGPR:$src, imm0_255_not:$imm, CPSR), + (t2SBCri rGPR:$src, imm0_255_not:$imm)>; +def : T2Pat<(ARMadde rGPR:$src, t2_so_imm_not:$imm, CPSR), + (t2SBCri rGPR:$src, t2_so_imm_not:$imm)>; +def : T2Pat<(ARMadde rGPR:$src, imm0_65535_neg:$imm, CPSR), + (t2SBCrr rGPR:$src, (t2MOVi16 (imm_not_XFORM imm:$imm)))>; + +// Select Bytes -- for disassembly only + +def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-24} = 0b010; + let Inst{23} = 0b1; + let Inst{22-20} = 0b010; + let Inst{15-12} = 0b1111; + let Inst{7} = 0b1; + let Inst{6-4} = 0b000; +} + +// A6.3.13, A6.3.14, A6.3.15 Parallel addition and subtraction (signed/unsigned) +// And Miscellaneous operations -- for disassembly only +class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc, + list<dag> pat = [/* For disassembly only; pattern left blank */], + dag iops = (ins rGPR:$Rn, rGPR:$Rm), + string asm = "\t$Rd, $Rn, $Rm"> + : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0101; + let Inst{22-20} = op22_20; + let Inst{15-12} = 0b1111; + let Inst{7-4} = op7_4; + + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; +} + +// Saturating add/subtract -- for disassembly only + +def t2QADD : T2I_pam<0b000, 0b1000, "qadd", + [(set rGPR:$Rd, (int_arm_qadd rGPR:$Rn, rGPR:$Rm))], + (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">; +def t2QADD16 : T2I_pam<0b001, 0b0001, "qadd16">; +def t2QADD8 : T2I_pam<0b000, 0b0001, "qadd8">; +def t2QASX : T2I_pam<0b010, 0b0001, "qasx">; +def t2QDADD : T2I_pam<0b000, 0b1001, "qdadd", [], + (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">; +def t2QDSUB : T2I_pam<0b000, 0b1011, "qdsub", [], + (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">; +def t2QSAX : T2I_pam<0b110, 0b0001, "qsax">; +def t2QSUB : T2I_pam<0b000, 0b1010, "qsub", + [(set rGPR:$Rd, (int_arm_qsub rGPR:$Rn, rGPR:$Rm))], + (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">; +def t2QSUB16 : T2I_pam<0b101, 0b0001, "qsub16">; +def t2QSUB8 : T2I_pam<0b100, 0b0001, "qsub8">; +def t2UQADD16 : T2I_pam<0b001, 0b0101, "uqadd16">; +def t2UQADD8 : T2I_pam<0b000, 0b0101, "uqadd8">; +def t2UQASX : T2I_pam<0b010, 0b0101, "uqasx">; +def t2UQSAX : T2I_pam<0b110, 0b0101, "uqsax">; +def t2UQSUB16 : T2I_pam<0b101, 0b0101, "uqsub16">; +def t2UQSUB8 : T2I_pam<0b100, 0b0101, "uqsub8">; + +// Signed/Unsigned add/subtract -- for disassembly only + +def t2SASX : T2I_pam<0b010, 0b0000, "sasx">; +def t2SADD16 : T2I_pam<0b001, 0b0000, "sadd16">; +def t2SADD8 : T2I_pam<0b000, 0b0000, "sadd8">; +def t2SSAX : T2I_pam<0b110, 0b0000, "ssax">; +def t2SSUB16 : T2I_pam<0b101, 0b0000, "ssub16">; +def t2SSUB8 : T2I_pam<0b100, 0b0000, "ssub8">; +def t2UASX : T2I_pam<0b010, 0b0100, "uasx">; +def t2UADD16 : T2I_pam<0b001, 0b0100, "uadd16">; +def t2UADD8 : T2I_pam<0b000, 0b0100, "uadd8">; +def t2USAX : T2I_pam<0b110, 0b0100, "usax">; +def t2USUB16 : T2I_pam<0b101, 0b0100, "usub16">; +def t2USUB8 : T2I_pam<0b100, 0b0100, "usub8">; + +// Signed/Unsigned halving add/subtract -- for disassembly only + +def t2SHASX : T2I_pam<0b010, 0b0010, "shasx">; +def t2SHADD16 : T2I_pam<0b001, 0b0010, "shadd16">; +def t2SHADD8 : T2I_pam<0b000, 0b0010, "shadd8">; +def t2SHSAX : T2I_pam<0b110, 0b0010, "shsax">; +def t2SHSUB16 : T2I_pam<0b101, 0b0010, "shsub16">; +def t2SHSUB8 : T2I_pam<0b100, 0b0010, 
"shsub8">; +def t2UHASX : T2I_pam<0b010, 0b0110, "uhasx">; +def t2UHADD16 : T2I_pam<0b001, 0b0110, "uhadd16">; +def t2UHADD8 : T2I_pam<0b000, 0b0110, "uhadd8">; +def t2UHSAX : T2I_pam<0b110, 0b0110, "uhsax">; +def t2UHSUB16 : T2I_pam<0b101, 0b0110, "uhsub16">; +def t2UHSUB8 : T2I_pam<0b100, 0b0110, "uhsub8">; + +// Helper class for disassembly only +// A6.3.16 & A6.3.17 +// T2Imac - Thumb2 multiply [accumulate, and absolute difference] instructions. +class T2ThreeReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> + : T2ThreeReg<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11111; + let Inst{26-24} = 0b011; + let Inst{23} = long; + let Inst{22-20} = op22_20; + let Inst{7-4} = op7_4; +} + +class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> + : T2FourReg<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11111; + let Inst{26-24} = 0b011; + let Inst{23} = long; + let Inst{22-20} = op22_20; + let Inst{7-4} = op7_4; +} + +// Unsigned Sum of Absolute Differences [and Accumulate]. +def t2USAD8 : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm), + NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{15-12} = 0b1111; +} +def t2USADA8 : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary, + "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasDSP]>; + +// Signed/Unsigned saturate. +class T2SatI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<5> sat_imm; + bits<7> sh; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{4-0} = sat_imm; + let Inst{21} = sh{5}; + let Inst{14-12} = sh{4-2}; + let Inst{7-6} = sh{1-0}; +} + +def t2SSAT: T2SatI< + (outs rGPR:$Rd), + (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), + NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1100; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{5} = 0; +} + +def t2SSAT16: T2SatI< + (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary, + "ssat16", "\t$Rd, $sat_imm, $Rn", []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1100; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 1; // sh = '1' + let Inst{14-12} = 0b000; // imm3 = '000' + let Inst{7-6} = 0b00; // imm2 = '00' + let Inst{5-4} = 0b00; +} + +def t2USAT: T2SatI< + (outs rGPR:$Rd), + (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), + NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1110; + let Inst{20} = 0; + let Inst{15} = 0; +} + +def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn), + NoItinerary, + "usat16", "\t$Rd, $sat_imm, $Rn", []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-22} = 0b1111001110; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 1; // sh = '1' + let Inst{14-12} = 0b000; // imm3 = '000' + let Inst{7-6} = 0b00; // imm2 = '00' + let Inst{5-4} = 0b00; +} + +def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>; +def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos), (t2USAT imm0_31:$pos, GPR:$a, 0)>; + +//===----------------------------------------------------------------------===// +// 
Shift and rotate Instructions. +// + +defm t2LSL : T2I_sh_ir<0b00, "lsl", imm0_31, + BinOpFrag<(shl node:$LHS, node:$RHS)>>; +defm t2LSR : T2I_sh_ir<0b01, "lsr", imm_sr, + BinOpFrag<(srl node:$LHS, node:$RHS)>>; +defm t2ASR : T2I_sh_ir<0b10, "asr", imm_sr, + BinOpFrag<(sra node:$LHS, node:$RHS)>>; +defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31, + BinOpFrag<(rotr node:$LHS, node:$RHS)>>; + +// (rotr x, (and y, 0x...1f)) ==> (ROR x, y) +def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)), + (t2RORrr rGPR:$lhs, rGPR:$rhs)>; + +let Uses = [CPSR] in { +def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, + "rrx", "\t$Rd, $Rm", + [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]>, Sched<[WriteALU]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{19-16} = 0b1111; // Rn + let Inst{14-12} = 0b000; + let Inst{7-4} = 0b0011; +} +} + +let isCodeGenOnly = 1, Defs = [CPSR] in { +def t2MOVsrl_flag : T2TwoRegShiftImm< + (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, + "lsrs", ".w\t$Rd, $Rm, #1", + [(set rGPR:$Rd, (ARMsrl_flag rGPR:$Rm))]>, + Sched<[WriteALU]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{20} = 1; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{5-4} = 0b01; // Shift type. + // Shift amount = Inst{14-12:7-6} = 1. + let Inst{14-12} = 0b000; + let Inst{7-6} = 0b01; +} +def t2MOVsra_flag : T2TwoRegShiftImm< + (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, + "asrs", ".w\t$Rd, $Rm, #1", + [(set rGPR:$Rd, (ARMsra_flag rGPR:$Rm))]>, + Sched<[WriteALU]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{20} = 1; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{5-4} = 0b10; // Shift type. + // Shift amount = Inst{14-12:7-6} = 1. + let Inst{14-12} = 0b000; + let Inst{7-6} = 0b01; +} +} + +//===----------------------------------------------------------------------===// +// Bitwise Instructions. +// + +defm t2AND : T2I_bin_w_irs<0b0000, "and", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, + BinOpFrag<(and node:$LHS, node:$RHS)>, 1>; +defm t2ORR : T2I_bin_w_irs<0b0010, "orr", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, + BinOpFrag<(or node:$LHS, node:$RHS)>, 1>; +defm t2EOR : T2I_bin_w_irs<0b0100, "eor", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, + BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>; + +defm t2BIC : T2I_bin_w_irs<0b0001, "bic", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, + BinOpFrag<(and node:$LHS, (not node:$RHS))>>; + +class T2BitFI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<5> msb; + bits<5> lsb; + + let Inst{11-8} = Rd; + let Inst{4-0} = msb{4-0}; + let Inst{14-12} = lsb{4-2}; + let Inst{7-6} = lsb{1-0}; +} + +class T2TwoRegBitFI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2BitFI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rn; + + let Inst{19-16} = Rn; +} + +let Constraints = "$src = $Rd" in +def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm), + IIC_iUNAsi, "bfc", "\t$Rd, $imm", + [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. + let Inst{25} = 1; + let Inst{24-20} = 0b10110; + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; + let Inst{5} = 0; // should be 0. 
+ + bits<10> imm; + let msb{4-0} = imm{9-5}; + let lsb{4-0} = imm{4-0}; +} + +def t2SBFX: T2TwoRegBitFI< + (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb), + IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-20} = 0b10100; + let Inst{15} = 0; +} + +def t2UBFX: T2TwoRegBitFI< + (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb), + IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-20} = 0b11100; + let Inst{15} = 0; +} + +// A8.8.247 UDF - Undefined (Encoding T2) +def t2UDF : T2XI<(outs), (ins imm0_65535:$imm16), IIC_Br, "udf.w\t$imm16", + [(int_arm_undefined imm0_65535:$imm16)]> { + bits<16> imm16; + let Inst{31-29} = 0b111; + let Inst{28-27} = 0b10; + let Inst{26-20} = 0b1111111; + let Inst{19-16} = imm16{15-12}; + let Inst{15} = 0b1; + let Inst{14-12} = 0b010; + let Inst{11-0} = imm16{11-0}; +} + +// A8.6.18 BFI - Bitfield insert (Encoding T1) +let Constraints = "$src = $Rd" in { + def t2BFI : T2TwoRegBitFI<(outs rGPR:$Rd), + (ins rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm), + IIC_iBITi, "bfi", "\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn, + bf_inv_mask_imm:$imm))]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. + let Inst{25} = 1; + let Inst{24-20} = 0b10110; + let Inst{15} = 0; + let Inst{5} = 0; // should be 0. + + bits<10> imm; + let msb{4-0} = imm{9-5}; + let lsb{4-0} = imm{4-0}; + } +} + +defm t2ORN : T2I_bin_irs<0b0011, "orn", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, + BinOpFrag<(or node:$LHS, (not node:$RHS))>, 0, "">; + +/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a +/// unary operation that produces a value. These are predicable and can be +/// changed to modify CPSR. 
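+///
+/// For example, "defm t2MVN : T2I_un_irs<0b0011, "mvn", ...>" below expands
+/// to t2MVNi ("mvn rd, #imm"), t2MVNr ("mvn.w rd, rm") and t2MVNs
+/// ("mvn.w rd, rm, lsl #n").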
+multiclass T2I_un_irs<bits<4> opcod, string opc,
+                      InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                      PatFrag opnode,
+                      bit Cheap = 0, bit ReMat = 0, bit MoveImm = 0> {
+  // shifted imm
+  def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
+               opc, "\t$Rd, $imm",
+               [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]>, Sched<[WriteALU]> {
+    let isAsCheapAsAMove = Cheap;
+    let isReMaterializable = ReMat;
+    let isMoveImm = MoveImm;
+    let Inst{31-27} = 0b11110;
+    let Inst{25} = 0;
+    let Inst{24-21} = opcod;
+    let Inst{19-16} = 0b1111; // Rn
+    let Inst{15} = 0;
+  }
+  // register
+  def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir,
+               opc, ".w\t$Rd, $Rm",
+               [(set rGPR:$Rd, (opnode rGPR:$Rm))]>, Sched<[WriteALU]> {
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b01;
+    let Inst{24-21} = opcod;
+    let Inst{19-16} = 0b1111; // Rn
+    let Inst{14-12} = 0b000; // imm3
+    let Inst{7-6} = 0b00; // imm2
+    let Inst{5-4} = 0b00; // type
+  }
+  // shifted register
+  def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis,
+               opc, ".w\t$Rd, $ShiftedRm",
+               [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]>,
+               Sched<[WriteALU]> {
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b01;
+    let Inst{24-21} = opcod;
+    let Inst{19-16} = 0b1111; // Rn
+  }
+}
+
+// Preferred over "t2EORri ra, rb, -1" because mvn has a 16-bit version.
+let AddedComplexity = 1 in
+defm t2MVN  : T2I_un_irs <0b0011, "mvn",
+                          IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
+                          UnOpFrag<(not node:$Src)>, 1, 1, 1>;
+
+let AddedComplexity = 1 in
+def : T2Pat<(and     rGPR:$src, t2_so_imm_not:$imm),
+            (t2BICri rGPR:$src, t2_so_imm_not:$imm)>;
+
+// top16Zero - answers true if the upper 16 bits of $src are 0, false otherwise
+def top16Zero: PatLeaf<(i32 rGPR:$src), [{
+    return CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16));
+  }]>;
+
+// so_imm_notSext is needed instead of so_imm_not, as the value of imm
+// will match the extended, not the original bitWidth for $src.
+def : T2Pat<(and top16Zero:$src, t2_so_imm_notSext:$imm),
+            (t2BICri rGPR:$src, t2_so_imm_notSext:$imm)>;
+
+
+// FIXME: Disable this pattern on Darwin to work around an assembler bug.
+def : T2Pat<(or rGPR:$src, t2_so_imm_not:$imm),
+            (t2ORNri rGPR:$src, t2_so_imm_not:$imm)>,
+            Requires<[IsThumb2]>;
+
+def : T2Pat<(t2_so_imm_not:$src),
+            (t2MVNi t2_so_imm_not:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Multiply Instructions.
+// +let isCommutable = 1 in +def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, + "mul", "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b000; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-4} = 0b0000; // Multiply +} + +def t2MLA: T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "mla", "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add (mul rGPR:$Rn, rGPR:$Rm), rGPR:$Ra))]>, + Requires<[IsThumb2, UseMulOps]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b000; + let Inst{7-4} = 0b0000; // Multiply +} + +def t2MLS: T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "mls", "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (sub rGPR:$Ra, (mul rGPR:$Rn, rGPR:$Rm)))]>, + Requires<[IsThumb2, UseMulOps]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b000; + let Inst{7-4} = 0b0001; // Multiply and Subtract +} + +// Extra precision multiplies with low / high results +let hasSideEffects = 0 in { +let isCommutable = 1 in { +def t2SMULL : T2MulLong<0b000, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64, + "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>; + +def t2UMULL : T2MulLong<0b010, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64, + "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>; +} // isCommutable + +// Multiply + accumulate +def t2SMLAL : T2MlaLong<0b100, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64, + "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">; + +def t2UMLAL : T2MlaLong<0b110, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64, + "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">; + +def t2UMAAL : T2MulLong<0b110, 0b0110, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, + "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsThumb2, HasDSP]>; +} // hasSideEffects + +// Rounding variants of the below included for disassembly only + +// Most significant word multiply +def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, + "smmul", "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (mulhs rGPR:$Rn, rGPR:$Rm))]>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) +} + +def t2SMMULR : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, + "smmulr", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) +} + +def t2SMMLA : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "smmla", "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>, + Requires<[IsThumb2, HasDSP, UseMulOps]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) +} + +def t2SMMLAR: T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + 
"smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) +} + +def t2SMMLS: T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "smmls", "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (sub rGPR:$Ra, (mulhs rGPR:$Rn, rGPR:$Rm)))]>, + Requires<[IsThumb2, HasDSP, UseMulOps]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b110; + let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) +} + +def t2SMMLSR:T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b110; + let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) +} + +multiclass T2I_smul<string opc, PatFrag opnode> { + def BB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16), + (sext_inreg rGPR:$Rm, i16)))]>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b00; + } + + def BT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16), + (sra rGPR:$Rm, (i32 16))))]>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b01; + } + + def TB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)), + (sext_inreg rGPR:$Rm, i16)))]>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b10; + } + + def TT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)), + (sra rGPR:$Rm, (i32 16))))]>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b11; + } + + def WB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm", + []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b011; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b00; + } + + def WT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm", + []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b011; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b01; + } +} + + +multiclass T2I_smla<string opc, PatFrag opnode> { + def BB : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), 
IIC_iMAC16,
+             !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
+             [(set rGPR:$Rd, (add rGPR:$Ra,
+                               (opnode (sext_inreg rGPR:$Rn, i16),
+                                       (sext_inreg rGPR:$Rm, i16))))]>,
+           Requires<[IsThumb2, HasDSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def BT : T2FourReg<
+       (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+       !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
+       [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16),
+                                              (sra rGPR:$Rm, (i32 16)))))]>,
+       Requires<[IsThumb2, HasDSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+
+  def TB : T2FourReg<
+       (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+       !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
+       [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
+                                              (sext_inreg rGPR:$Rm, i16))))]>,
+       Requires<[IsThumb2, HasDSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b10;
+  }
+
+  def TT : T2FourReg<
+       (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+       !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
+       [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
+                                              (sra rGPR:$Rm, (i32 16)))))]>,
+       Requires<[IsThumb2, HasDSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b11;
+  }
+
+  def WB : T2FourReg<
+       (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+       !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
+       []>,
+       Requires<[IsThumb2, HasDSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def WT : T2FourReg<
+       (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+       !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
+       []>,
+       Requires<[IsThumb2, HasDSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+}
+
+defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+
+// Halfword multiply accumulate long: SMLAL<x><y>
+def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd),
+      (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbb", "\t$Ra, $Rd, $Rn, $Rm",
+      [/* For disassembly only; pattern left blank */]>,
+      Requires<[IsThumb2, HasDSP]>;
+def t2SMLALBT : T2FourReg_mac<1, 0b100, 0b1001, (outs rGPR:$Ra,rGPR:$Rd),
+      (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbt", "\t$Ra, $Rd, $Rn, $Rm",
+      [/* For disassembly only; pattern left blank */]>,
+      Requires<[IsThumb2, HasDSP]>;
+def t2SMLALTB : T2FourReg_mac<1, 0b100, 0b1010, (outs rGPR:$Ra,rGPR:$Rd),
+      (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltb", "\t$Ra, $Rd, $Rn, $Rm",
+      [/* For disassembly only; pattern left blank */]>,
+      Requires<[IsThumb2, HasDSP]>;
+def t2SMLALTT : T2FourReg_mac<1, 0b100, 0b1011, (outs rGPR:$Ra,rGPR:$Rd),
+      (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltt", "\t$Ra, $Rd, $Rn, $Rm",
+      [/* For disassembly only; pattern left blank */]>,
+      Requires<[IsThumb2, HasDSP]>;
+
+// Dual halfword multiply: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
+def t2SMUAD: T2ThreeReg_mac<
+            0, 0b010, 0b0000, (outs rGPR:$Rd), (ins
rGPR:$Rn, rGPR:$Rm), + IIC_iMAC32, "smuad", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{15-12} = 0b1111; +} +def t2SMUADX:T2ThreeReg_mac< + 0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + IIC_iMAC32, "smuadx", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{15-12} = 0b1111; +} +def t2SMUSD: T2ThreeReg_mac< + 0, 0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + IIC_iMAC32, "smusd", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{15-12} = 0b1111; +} +def t2SMUSDX:T2ThreeReg_mac< + 0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + IIC_iMAC32, "smusdx", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasDSP]> { + let Inst{15-12} = 0b1111; +} +def t2SMLAD : T2FourReg_mac< + 0, 0b010, 0b0000, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlad", + "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasDSP]>; +def t2SMLADX : T2FourReg_mac< + 0, 0b010, 0b0001, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smladx", + "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasDSP]>; +def t2SMLSD : T2FourReg_mac<0, 0b100, 0b0000, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsd", + "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasDSP]>; +def t2SMLSDX : T2FourReg_mac<0, 0b100, 0b0001, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsdx", + "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasDSP]>; +def t2SMLALD : T2FourReg_mac<1, 0b100, 0b1100, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "smlald", + "\t$Ra, $Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasDSP]>; +def t2SMLALDX : T2FourReg_mac<1, 0b100, 0b1101, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaldx", + "\t$Ra, $Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasDSP]>; +def t2SMLSLD : T2FourReg_mac<1, 0b101, 0b1100, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlsld", + "\t$Ra, $Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasDSP]>; +def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsldx", + "\t$Ra, $Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasDSP]>; + +//===----------------------------------------------------------------------===// +// Division Instructions. +// Signed and unsigned division on v7-M +// +def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV, + "sdiv", "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>, + Requires<[HasDivide, IsThumb2]> { + let Inst{31-27} = 0b11111; + let Inst{26-21} = 0b011100; + let Inst{20} = 0b1; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b1111; +} + +def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV, + "udiv", "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>, + Requires<[HasDivide, IsThumb2]> { + let Inst{31-27} = 0b11111; + let Inst{26-21} = 0b011101; + let Inst{20} = 0b1; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b1111; +} + +//===----------------------------------------------------------------------===// +// Misc. Arithmetic Instructions. 
+// + +class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : T2ThreeReg<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11111; + let Inst{26-22} = 0b01010; + let Inst{21-20} = op1; + let Inst{15-12} = 0b1111; + let Inst{7-6} = 0b10; + let Inst{5-4} = op2; + let Rn{3-0} = Rm; +} + +def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "clz", "\t$Rd, $Rm", [(set rGPR:$Rd, (ctlz rGPR:$Rm))]>, + Sched<[WriteALU]>; + +def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "rbit", "\t$Rd, $Rm", + [(set rGPR:$Rd, (bitreverse rGPR:$Rm))]>, + Sched<[WriteALU]>; + +def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "rev", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (bswap rGPR:$Rm))]>, + Sched<[WriteALU]>; + +def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "rev16", ".w\t$Rd, $Rm", + [(set rGPR:$Rd, (rotr (bswap rGPR:$Rm), (i32 16)))]>, + Sched<[WriteALU]>; + +def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "revsh", ".w\t$Rd, $Rm", + [(set rGPR:$Rd, (sra (bswap rGPR:$Rm), (i32 16)))]>, + Sched<[WriteALU]>; + +def : T2Pat<(or (sra (shl rGPR:$Rm, (i32 24)), (i32 16)), + (and (srl rGPR:$Rm, (i32 8)), 0xFF)), + (t2REVSH rGPR:$Rm)>; + +def t2PKHBT : T2ThreeReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_lsl_amt:$sh), + IIC_iBITsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh", + [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF), + (and (shl rGPR:$Rm, pkh_lsl_amt:$sh), + 0xFFFF0000)))]>, + Requires<[HasT2ExtractPack, IsThumb2]>, + Sched<[WriteALUsi, ReadALU]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-20} = 0b01100; + let Inst{5} = 0; // BT form + let Inst{4} = 0; + + bits<5> sh; + let Inst{14-12} = sh{4-2}; + let Inst{7-6} = sh{1-0}; +} + +// Alternate cases for PKHBT where identities eliminate some nodes. +def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)), + (t2PKHBT rGPR:$src1, rGPR:$src2, 0)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)), + (t2PKHBT rGPR:$src1, rGPR:$src2, imm16_31:$sh)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and +// will match the pattern below. +def t2PKHTB : T2ThreeReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_asr_amt:$sh), + IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh", + [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000), + (and (sra rGPR:$Rm, pkh_asr_amt:$sh), + 0xFFFF)))]>, + Requires<[HasT2ExtractPack, IsThumb2]>, + Sched<[WriteALUsi, ReadALU]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-20} = 0b01100; + let Inst{5} = 1; // TB form + let Inst{4} = 0; + + bits<5> sh; + let Inst{14-12} = sh{4-2}; + let Inst{7-6} = sh{1-0}; +} + +// Alternate cases for PKHTB where identities eliminate some nodes. Note that +// a shift amount of 0 is *not legal* here, it is PKHBT instead. +// We also can not replace a srl (17..31) by an arithmetic shift we would use in +// pkhtb src1, src2, asr (17..31). 
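+// Illustration (values assumed): for $src2 = 0xFFFF8000 and a shift of 17,
+// srl gives 0x00007FFF (low half 0x7FFF) while asr gives 0xFFFFFFFF (low
+// half 0xFFFF), so substituting pkhtb's asr would pack the wrong halfword
+// whenever $src2 is negative.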
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16:$sh)), + (t2PKHTB rGPR:$src1, rGPR:$src2, imm16:$sh)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (sra rGPR:$src2, imm16_31:$sh)), + (t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), + (and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)), + (t2PKHTB rGPR:$src1, rGPR:$src2, imm1_15:$sh)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +//===----------------------------------------------------------------------===// +// CRC32 Instructions +// +// Polynomials: +// + CRC32{B,H,W} 0x04C11DB7 +// + CRC32C{B,H,W} 0x1EDC6F41 +// + +class T2I_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin> + : T2ThreeRegNoP<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), NoItinerary, + !strconcat("crc32", suffix, "\t$Rd, $Rn, $Rm"), + [(set rGPR:$Rd, (builtin rGPR:$Rn, rGPR:$Rm))]>, + Requires<[IsThumb2, HasV8, HasCRC]> { + let Inst{31-27} = 0b11111; + let Inst{26-21} = 0b010110; + let Inst{20} = C; + let Inst{15-12} = 0b1111; + let Inst{7-6} = 0b10; + let Inst{5-4} = sz; +} + +def t2CRC32B : T2I_crc32<0, 0b00, "b", int_arm_crc32b>; +def t2CRC32CB : T2I_crc32<1, 0b00, "cb", int_arm_crc32cb>; +def t2CRC32H : T2I_crc32<0, 0b01, "h", int_arm_crc32h>; +def t2CRC32CH : T2I_crc32<1, 0b01, "ch", int_arm_crc32ch>; +def t2CRC32W : T2I_crc32<0, 0b10, "w", int_arm_crc32w>; +def t2CRC32CW : T2I_crc32<1, 0b10, "cw", int_arm_crc32cw>; + +//===----------------------------------------------------------------------===// +// Comparison Instructions... +// +defm t2CMP : T2I_cmp_irs<0b1101, "cmp", + IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, + BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; + +def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_imm:$imm), + (t2CMPri GPRnopc:$lhs, t2_so_imm:$imm)>; +def : T2Pat<(ARMcmpZ GPRnopc:$lhs, rGPR:$rhs), + (t2CMPrr GPRnopc:$lhs, rGPR:$rhs)>; +def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_reg:$rhs), + (t2CMPrs GPRnopc:$lhs, t2_so_reg:$rhs)>; + +let isCompare = 1, Defs = [CPSR] in { + // shifted imm + def t2CMNri : T2OneRegCmpImm< + (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iCMPi, + "cmn", ".w\t$Rn, $imm", + [(ARMcmn GPRnopc:$Rn, (ineg t2_so_imm:$imm))]>, + Sched<[WriteCMP, ReadALU]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = 0b1000; + let Inst{20} = 1; // The S bit. + let Inst{15} = 0; + let Inst{11-8} = 0b1111; // Rd + } + // register + def t2CMNzrr : T2TwoRegCmp< + (outs), (ins GPRnopc:$Rn, rGPR:$Rm), IIC_iCMPr, + "cmn", ".w\t$Rn, $Rm", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b1000; + let Inst{20} = 1; // The S bit. + let Inst{14-12} = 0b000; // imm3 + let Inst{11-8} = 0b1111; // Rd + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def t2CMNzrs : T2OneRegCmpShiftedReg< + (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), IIC_iCMPsi, + "cmn", ".w\t$Rn, $ShiftedRm", + [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))> + GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>, + Sched<[WriteCMPsi, ReadALU, ReadALU]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b1000; + let Inst{20} = 1; // The S bit. + let Inst{11-8} = 0b1111; // Rd + } +} + +// Assembler aliases w/o the ".w" suffix. 
+// No alias here for 'rr' version as not all instantiations of this multiclass +// want one (CMP in particular, does not). +def : t2InstAlias<"cmn${p} $Rn, $imm", + (t2CMNri GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>; +def : t2InstAlias<"cmn${p} $Rn, $shift", + (t2CMNzrs GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>; + +def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm), + (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>; + +def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm), + (t2CMNri GPRnopc:$src, t2_so_imm_neg:$imm)>; + +defm t2TST : T2I_cmp_irs<0b0000, "tst", + IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi, + BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>>; +defm t2TEQ : T2I_cmp_irs<0b0100, "teq", + IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi, + BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>; + +// Conditional moves +let hasSideEffects = 0 in { + +let isCommutable = 1, isSelect = 1 in +def t2MOVCCr : t2PseudoInst<(outs rGPR:$Rd), + (ins rGPR:$false, rGPR:$Rm, cmovpred:$p), + 4, IIC_iCMOVr, + [(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; + +let isMoveImm = 1 in +def t2MOVCCi + : t2PseudoInst<(outs rGPR:$Rd), + (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p), + 4, IIC_iCMOVi, + [(set rGPR:$Rd, (ARMcmov rGPR:$false,t2_so_imm:$imm, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; + +let isCodeGenOnly = 1 in { +let isMoveImm = 1 in +def t2MOVCCi16 + : t2PseudoInst<(outs rGPR:$Rd), + (ins rGPR:$false, imm0_65535_expr:$imm, cmovpred:$p), + 4, IIC_iCMOVi, + [(set rGPR:$Rd, (ARMcmov rGPR:$false, imm0_65535:$imm, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; + +let isMoveImm = 1 in +def t2MVNCCi + : t2PseudoInst<(outs rGPR:$Rd), + (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p), + 4, IIC_iCMOVi, + [(set rGPR:$Rd, + (ARMcmov rGPR:$false, t2_so_imm_not:$imm, + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; + +class MOVCCShPseudo<SDPatternOperator opnode, Operand ty> + : t2PseudoInst<(outs rGPR:$Rd), + (ins rGPR:$false, rGPR:$Rm, i32imm:$imm, cmovpred:$p), + 4, IIC_iCMOVsi, + [(set rGPR:$Rd, (ARMcmov rGPR:$false, + (opnode rGPR:$Rm, (i32 ty:$imm)), + cmovpred:$p))]>, + RegConstraint<"$false = $Rd">, Sched<[WriteALU]>; + +def t2MOVCClsl : MOVCCShPseudo<shl, imm0_31>; +def t2MOVCClsr : MOVCCShPseudo<srl, imm_sr>; +def t2MOVCCasr : MOVCCShPseudo<sra, imm_sr>; +def t2MOVCCror : MOVCCShPseudo<rotr, imm0_31>; + +let isMoveImm = 1 in +def t2MOVCCi32imm + : t2PseudoInst<(outs rGPR:$dst), + (ins rGPR:$false, i32imm:$src, cmovpred:$p), + 8, IIC_iCMOVix2, + [(set rGPR:$dst, (ARMcmov rGPR:$false, imm:$src, + cmovpred:$p))]>, + RegConstraint<"$false = $dst">; +} // isCodeGenOnly = 1 + +} // hasSideEffects + +//===----------------------------------------------------------------------===// +// Atomic operations intrinsics +// + +// memory barriers protect the atomic sequences +let hasSideEffects = 1 in { +def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary, + "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>, + Requires<[IsThumb, HasDB]> { + bits<4> opt; + let Inst{31-4} = 0xf3bf8f5; + let Inst{3-0} = opt; +} + +def t2DSB : T2I<(outs), (ins memb_opt:$opt), NoItinerary, + "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>, + Requires<[IsThumb, HasDB]> { + bits<4> opt; + let Inst{31-4} = 0xf3bf8f4; + let Inst{3-0} = opt; +} + +def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary, + "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>, + Requires<[IsThumb, HasDB]> { + bits<4> opt; 
+ let Inst{31-4} = 0xf3bf8f6; + let Inst{3-0} = opt; +} +} + +class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern, bits<4> rt2 = 0b1111> + : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001101; + let Inst{11-8} = rt2; + let Inst{7-4} = opcod; + let Inst{3-0} = 0b1111; + + bits<4> addr; + bits<4> Rt; + let Inst{19-16} = addr; + let Inst{15-12} = Rt; +} +class T2I_strex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern, bits<4> rt2 = 0b1111> + : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001100; + let Inst{11-8} = rt2; + let Inst{7-4} = opcod; + + bits<4> Rd; + bits<4> addr; + bits<4> Rt; + let Inst{3-0} = Rd; + let Inst{19-16} = addr; + let Inst{15-12} = Rt; +} + +let mayLoad = 1 in { +def t2LDREXB : T2I_ldrex<0b0100, (outs rGPR:$Rt), (ins addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "ldrexb", "\t$Rt, $addr", "", + [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>; +def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "ldrexh", "\t$Rt, $addr", "", + [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>; +def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr), + AddrModeNone, 4, NoItinerary, + "ldrex", "\t$Rt, $addr", "", + [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]> { + bits<4> Rt; + bits<12> addr; + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000101; + let Inst{19-16} = addr{11-8}; + let Inst{15-12} = Rt; + let Inst{11-8} = 0b1111; + let Inst{7-0} = addr{7-0}; +} +let hasExtraDefRegAllocReq = 1 in +def t2LDREXD : T2I_ldrex<0b0111, (outs rGPR:$Rt, rGPR:$Rt2), + (ins addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "ldrexd", "\t$Rt, $Rt2, $addr", "", + [], {?, ?, ?, ?}>, + Requires<[IsThumb2, IsNotMClass]> { + bits<4> Rt2; + let Inst{11-8} = Rt2; +} +def t2LDAEXB : T2I_ldrex<0b1100, (outs rGPR:$Rt), (ins addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "ldaexb", "\t$Rt, $addr", "", + [(set rGPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>, + Requires<[IsThumb, HasV8]>; +def t2LDAEXH : T2I_ldrex<0b1101, (outs rGPR:$Rt), (ins addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "ldaexh", "\t$Rt, $addr", "", + [(set rGPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>, + Requires<[IsThumb, HasV8]>; +def t2LDAEX : Thumb2I<(outs rGPR:$Rt), (ins addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "ldaex", "\t$Rt, $addr", "", + [(set rGPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>, + Requires<[IsThumb, HasV8]> { + bits<4> Rt; + bits<4> addr; + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001101; + let Inst{19-16} = addr; + let Inst{15-12} = Rt; + let Inst{11-8} = 0b1111; + let Inst{7-0} = 0b11101111; +} +let hasExtraDefRegAllocReq = 1 in +def t2LDAEXD : T2I_ldrex<0b1111, (outs rGPR:$Rt, rGPR:$Rt2), + (ins addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "ldaexd", "\t$Rt, $Rt2, $addr", "", + [], {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> { + bits<4> Rt2; + let Inst{11-8} = Rt2; + + let Inst{7} = 1; +} +} + +let mayStore = 1, Constraints = "@earlyclobber $Rd" in { +def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd), + (ins rGPR:$Rt, addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "strexb", "\t$Rd, $Rt, $addr", "", + [(set 
rGPR:$Rd, + (strex_1 rGPR:$Rt, addr_offset_none:$addr))]>; +def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd), + (ins rGPR:$Rt, addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "strexh", "\t$Rd, $Rt, $addr", "", + [(set rGPR:$Rd, + (strex_2 rGPR:$Rt, addr_offset_none:$addr))]>; + +def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, + t2addrmode_imm0_1020s4:$addr), + AddrModeNone, 4, NoItinerary, + "strex", "\t$Rd, $Rt, $addr", "", + [(set rGPR:$Rd, + (strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]> { + bits<4> Rd; + bits<4> Rt; + bits<12> addr; + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000100; + let Inst{19-16} = addr{11-8}; + let Inst{15-12} = Rt; + let Inst{11-8} = Rd; + let Inst{7-0} = addr{7-0}; +} +let hasExtraSrcRegAllocReq = 1 in +def t2STREXD : T2I_strex<0b0111, (outs rGPR:$Rd), + (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [], + {?, ?, ?, ?}>, + Requires<[IsThumb2, IsNotMClass]> { + bits<4> Rt2; + let Inst{11-8} = Rt2; +} +def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd), + (ins rGPR:$Rt, addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "stlexb", "\t$Rd, $Rt, $addr", "", + [(set rGPR:$Rd, + (stlex_1 rGPR:$Rt, addr_offset_none:$addr))]>, + Requires<[IsThumb, HasV8]>; + +def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd), + (ins rGPR:$Rt, addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "stlexh", "\t$Rd, $Rt, $addr", "", + [(set rGPR:$Rd, + (stlex_2 rGPR:$Rt, addr_offset_none:$addr))]>, + Requires<[IsThumb, HasV8]>; + +def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, + addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "stlex", "\t$Rd, $Rt, $addr", "", + [(set rGPR:$Rd, + (stlex_4 rGPR:$Rt, addr_offset_none:$addr))]>, + Requires<[IsThumb, HasV8]> { + bits<4> Rd; + bits<4> Rt; + bits<4> addr; + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001100; + let Inst{19-16} = addr; + let Inst{15-12} = Rt; + let Inst{11-4} = 0b11111110; + let Inst{3-0} = Rd; +} +let hasExtraSrcRegAllocReq = 1 in +def t2STLEXD : T2I_strex<0b1111, (outs rGPR:$Rd), + (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "stlexd", "\t$Rd, $Rt, $Rt2, $addr", "", [], + {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> { + bits<4> Rt2; + let Inst{11-8} = Rt2; +} +} + +def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", [(int_arm_clrex)]>, + Requires<[IsThumb2, HasV7]> { + let Inst{31-16} = 0xf3bf; + let Inst{15-14} = 0b10; + let Inst{13} = 0; + let Inst{12} = 0; + let Inst{11-8} = 0b1111; + let Inst{7-4} = 0b0010; + let Inst{3-0} = 0b1111; +} + +def : T2Pat<(and (ldrex_1 addr_offset_none:$addr), 0xff), + (t2LDREXB addr_offset_none:$addr)>; +def : T2Pat<(and (ldrex_2 addr_offset_none:$addr), 0xffff), + (t2LDREXH addr_offset_none:$addr)>; +def : T2Pat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr), + (t2STREXB GPR:$Rt, addr_offset_none:$addr)>; +def : T2Pat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr), + (t2STREXH GPR:$Rt, addr_offset_none:$addr)>; + +def : T2Pat<(and (ldaex_1 addr_offset_none:$addr), 0xff), + (t2LDAEXB addr_offset_none:$addr)>; +def : T2Pat<(and (ldaex_2 addr_offset_none:$addr), 0xffff), + (t2LDAEXH addr_offset_none:$addr)>; +def : T2Pat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr), + (t2STLEXB GPR:$Rt, addr_offset_none:$addr)>; +def : T2Pat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr), + (t2STLEXH GPR:$Rt, addr_offset_none:$addr)>; + 
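+// Illustration (assumed lowering, not a pattern from this file): an i8
+// atomic swap is built from the exclusives above as a retry loop, roughly:
+//   1: ldrexb r2, [r0]
+//      strexb r3, r1, [r0]   @ r3 = 0 on success, 1 if the reservation was lost
+//      cmp    r3, #0
+//      bne    1b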
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everything out of registers
+//   except for our own input by listing the relevant registers in Defs. By
+//   doing so, we also cause the prologue/epilogue code to actively preserve
+//   all of the callee-saved registers, which is exactly what we want.
+//   $val is a scratch register for our use.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR,
+    Q0, Q1, Q2, Q3, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
+  hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+  usesCustomInserter = 1 in {
+  def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
+                               AddrModeNone, 0, NoItinerary, "", "",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
+                             Requires<[IsThumb2, HasVFP2]>;
+}
+
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR ],
+  hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+  usesCustomInserter = 1 in {
+  def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
+                               AddrModeNone, 0, NoItinerary, "", "",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
+                                Requires<[IsThumb2, NoVFP]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Control-Flow Instructions
+//
+
+// FIXME: remove when we have a way to mark an MI with these properties.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in
+def t2LDMIA_RET: t2PseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
+                                                 reglist:$regs, variable_ops),
+                                4, IIC_iLoad_mBr, [],
+                  (t2LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>,
+                 RegConstraint<"$Rn = $wb">;
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+let isPredicable = 1 in
+def t2B   : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
+                "b", ".w\t$target",
+                [(br bb:$target)]>, Sched<[WriteBr]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{15-14} = 0b10;
+  let Inst{12} = 1;
+
+  bits<24> target;
+  let Inst{26} = target{23};
+  let Inst{13} = target{22};
+  let Inst{11} = target{21};
+  let Inst{25-16} = target{20-11};
+  let Inst{10-0} = target{10-0};
+  let DecoderMethod = "DecodeT2BInstruction";
+  let AsmMatchConverter = "cvtThumbBranches";
+}
+
+let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in {
+def t2BR_JT : t2PseudoInst<(outs),
+          (ins GPR:$target, GPR:$index, i32imm:$jt),
+           0, IIC_Br,
+          [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt)]>,
+           Sched<[WriteBr]>;
+
+// FIXME: Add a case that can be predicated.
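+// Illustration (registers assumed, not from this file): a tbb-based jump
+// table dispatches as
+//   tbb [pc, r0]    @ pc := pc + 2 * table[r0]
+// with the byte-sized (halved) offset table emitted immediately after the
+// instruction; the pseudos below are expanded to that form.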
+def t2TBB_JT : t2PseudoInst<(outs),
+        (ins GPR:$base, GPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
+         Sched<[WriteBr]>;
+
+def t2TBH_JT : t2PseudoInst<(outs),
+        (ins GPR:$base, GPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
+         Sched<[WriteBr]>;
+
+def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br,
+                    "tbb", "\t$addr", []>, Sched<[WriteBrTbl]> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{31-20} = 0b111010001101;
+  let Inst{19-16} = Rn;
+  let Inst{15-5} = 0b11110000000;
+  let Inst{4} = 0; // B form
+  let Inst{3-0} = Rm;
+
+  let DecoderMethod = "DecodeThumbTableBranch";
+}
+
+def t2TBH : T2I<(outs), (ins addrmode_tbh:$addr), IIC_Br,
+                   "tbh", "\t$addr", []>, Sched<[WriteBrTbl]> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{31-20} = 0b111010001101;
+  let Inst{19-16} = Rn;
+  let Inst{15-5} = 0b11110000000;
+  let Inst{4} = 1; // H form
+  let Inst{3-0} = Rm;
+
+  let DecoderMethod = "DecodeThumbTableBranch";
+}
+} // isNotDuplicable, isIndirectBranch
+
+} // isBranch, isTerminator, isBarrier
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects two operands. :(
+let isBranch = 1, isTerminator = 1 in
+def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
+                "b", ".w\t$target",
+                [/*(ARMbrcond bb:$target, imm:$cc)*/]>, Sched<[WriteBr]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{15-14} = 0b10;
+  let Inst{12} = 0;
+
+  bits<4> p;
+  let Inst{25-22} = p;
+
+  bits<21> target;
+  let Inst{26} = target{20};
+  let Inst{11} = target{19};
+  let Inst{13} = target{18};
+  let Inst{21-16} = target{17-12};
+  let Inst{10-0} = target{11-1};
+
+  let DecoderMethod = "DecodeThumb2BCCInstruction";
+  let AsmMatchConverter = "cvtThumbBranches";
+}
+
+// Tail calls. The MachO version of thumb tail calls uses a t2 branch, so
+// it goes here.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  // IOS version.
+  let Uses = [SP] in
+  def tTAILJMPd: tPseudoExpand<(outs),
+                   (ins uncondbrtarget:$dst, pred:$p),
+                   4, IIC_Br, [],
+                   (t2B uncondbrtarget:$dst, pred:$p)>,
+                 Requires<[IsThumb2, IsMachO]>, Sched<[WriteBr]>;
+}
+
+// IT block
+let Defs = [ITSTATE] in
+def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
+                    AddrModeNone, 2,  IIC_iALUx,
+                    "it$mask\t$cc", "", []>,
+           ComplexDeprecationPredicate<"IT"> {
+  // 16-bit instruction.
+ let Inst{31-16} = 0x0000; + let Inst{15-8} = 0b10111111; + + bits<4> cc; + bits<4> mask; + let Inst{7-4} = cc; + let Inst{3-0} = mask; + + let DecoderMethod = "DecodeIT"; +} + +// Branch and Exchange Jazelle -- for disassembly only +// Rm = Inst{19-16} +def t2BXJ : T2I<(outs), (ins GPRnopc:$func), NoItinerary, "bxj", "\t$func", []>, + Sched<[WriteBr]>, Requires<[IsThumb2, IsNotMClass]> { + bits<4> func; + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-20} = 0b111100; + let Inst{19-16} = func; + let Inst{15-0} = 0b1000111100000000; +} + +// Compare and branch on zero / non-zero +let isBranch = 1, isTerminator = 1 in { + def tCBZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br, + "cbz\t$Rn, $target", []>, + T1Misc<{0,0,?,1,?,?,?}>, + Requires<[IsThumb2]>, Sched<[WriteBr]> { + // A8.6.27 + bits<6> target; + bits<3> Rn; + let Inst{9} = target{5}; + let Inst{7-3} = target{4-0}; + let Inst{2-0} = Rn; + } + + def tCBNZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br, + "cbnz\t$Rn, $target", []>, + T1Misc<{1,0,?,1,?,?,?}>, + Requires<[IsThumb2]>, Sched<[WriteBr]> { + // A8.6.27 + bits<6> target; + bits<3> Rn; + let Inst{9} = target{5}; + let Inst{7-3} = target{4-0}; + let Inst{2-0} = Rn; + } +} + + +// Change Processor State is a system instruction. +// FIXME: Since the asm parser has currently no clean way to handle optional +// operands, create 3 versions of the same instruction. Once there's a clean +// framework to represent optional operands, change this behavior. +class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary, + !strconcat("cps", asm_op), []>, + Requires<[IsThumb2, IsNotMClass]> { + bits<2> imod; + bits<3> iflags; + bits<5> mode; + bit M; + + let Inst{31-11} = 0b111100111010111110000; + let Inst{10-9} = imod; + let Inst{8} = M; + let Inst{7-5} = iflags; + let Inst{4-0} = mode; + let DecoderMethod = "DecodeT2CPSInstruction"; +} + +let M = 1 in + def t2CPS3p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode), + "$imod\t$iflags, $mode">; +let mode = 0, M = 0 in + def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags), + "$imod.w\t$iflags">; +let imod = 0, iflags = 0, M = 1 in + def t2CPS1p : t2CPS<(ins imm0_31:$mode), "\t$mode">; + +def : t2InstAlias<"cps$imod.w $iflags, $mode", + (t2CPS3p imod_op:$imod, iflags_op:$iflags, i32imm:$mode), 0>; +def : t2InstAlias<"cps.w $mode", (t2CPS1p imm0_31:$mode), 0>; + +// A6.3.4 Branches and miscellaneous control +// Table A6-14 Change Processor State, and hint instructions +def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm", + [(int_arm_hint imm0_239:$imm)]> { + bits<8> imm; + let Inst{31-3} = 0b11110011101011111000000000000; + let Inst{7-0} = imm; +} + +def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p)>; +def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p)>; +def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p)>; +def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p)>; +def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p)>; +def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p)>; +def : t2InstAlias<"sevl$p.w", (t2HINT 5, pred:$p)> { + let Predicates = [IsThumb2, HasV8]; +} + +def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", + [(int_arm_dbg imm0_15:$opt)]> { + bits<4> opt; + let Inst{31-20} = 0b111100111010; + let Inst{19-16} = 0b1111; + let Inst{15-8} = 0b10000000; + let Inst{7-4} = 0b1111; + let Inst{3-0} = opt; +} + +// Secure Monitor Call is a system instruction. 
+// Option = Inst{19-16} +def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", + []>, Requires<[IsThumb2, HasTrustZone]> { + let Inst{31-27} = 0b11110; + let Inst{26-20} = 0b1111111; + let Inst{15-12} = 0b1000; + + bits<4> opt; + let Inst{19-16} = opt; +} + +class T2DCPS<bits<2> opt, string opc> + : T2I<(outs), (ins), NoItinerary, opc, "", []>, Requires<[IsThumb2, HasV8]> { + let Inst{31-27} = 0b11110; + let Inst{26-20} = 0b1111000; + let Inst{19-16} = 0b1111; + let Inst{15-12} = 0b1000; + let Inst{11-2} = 0b0000000000; + let Inst{1-0} = opt; +} + +def t2DCPS1 : T2DCPS<0b01, "dcps1">; +def t2DCPS2 : T2DCPS<0b10, "dcps2">; +def t2DCPS3 : T2DCPS<0b11, "dcps3">; + +class T2SRS<bits<2> Op, bit W, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern>, + Requires<[IsThumb2,IsNotMClass]> { + bits<5> mode; + let Inst{31-25} = 0b1110100; + let Inst{24-23} = Op; + let Inst{22} = 0; + let Inst{21} = W; + let Inst{20-16} = 0b01101; + let Inst{15-5} = 0b11000000000; + let Inst{4-0} = mode{4-0}; +} + +// Store Return State is a system instruction. +def t2SRSDB_UPD : T2SRS<0b00, 1, (outs), (ins imm0_31:$mode), NoItinerary, + "srsdb", "\tsp!, $mode", []>; +def t2SRSDB : T2SRS<0b00, 0, (outs), (ins imm0_31:$mode), NoItinerary, + "srsdb","\tsp, $mode", []>; +def t2SRSIA_UPD : T2SRS<0b11, 1, (outs), (ins imm0_31:$mode), NoItinerary, + "srsia","\tsp!, $mode", []>; +def t2SRSIA : T2SRS<0b11, 0, (outs), (ins imm0_31:$mode), NoItinerary, + "srsia","\tsp, $mode", []>; + + +def : t2InstAlias<"srsdb${p} $mode", (t2SRSDB imm0_31:$mode, pred:$p)>; +def : t2InstAlias<"srsdb${p} $mode!", (t2SRSDB_UPD imm0_31:$mode, pred:$p)>; + +def : t2InstAlias<"srsia${p} $mode", (t2SRSIA imm0_31:$mode, pred:$p)>; +def : t2InstAlias<"srsia${p} $mode!", (t2SRSIA_UPD imm0_31:$mode, pred:$p)>; + +// Return From Exception is a system instruction. +class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern>, + Requires<[IsThumb2,IsNotMClass]> { + let Inst{31-20} = op31_20{11-0}; + + bits<4> Rn; + let Inst{19-16} = Rn; + let Inst{15-0} = 0xc000; +} + +def t2RFEDBW : T2RFE<0b111010000011, + (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn!", + [/* For disassembly only; pattern left blank */]>; +def t2RFEDB : T2RFE<0b111010000001, + (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn", + [/* For disassembly only; pattern left blank */]>; +def t2RFEIAW : T2RFE<0b111010011011, + (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn!", + [/* For disassembly only; pattern left blank */]>; +def t2RFEIA : T2RFE<0b111010011001, + (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn", + [/* For disassembly only; pattern left blank */]>; + +// B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction. +// Exception return instruction is "subs pc, lr, #imm". +let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in +def t2SUBS_PC_LR : T2I <(outs), (ins imm0_255:$imm), NoItinerary, + "subs", "\tpc, lr, $imm", + [(ARMintretflag imm0_255:$imm)]>, + Requires<[IsThumb2,IsNotMClass]> { + let Inst{31-8} = 0b111100111101111010001111; + + bits<8> imm; + let Inst{7-0} = imm; +} + +// Hypervisor Call is a system instruction. 
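+// For example (illustrative), PSCI-style firmware calls trap to the
+// hypervisor with "hvc.w #0", passing the function ID and arguments in r0-r3.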
+let isCall = 1 in {
+def t2HVC : T2XI <(outs), (ins imm0_65535:$imm16), IIC_Br, "hvc.w\t$imm16", []>,
+            Requires<[IsThumb2, HasVirtualization]>, Sched<[WriteBr]> {
+  bits<16> imm16;
+  let Inst{31-20} = 0b111101111110;
+  let Inst{19-16} = imm16{15-12};
+  let Inst{15-12} = 0b1000;
+  let Inst{11-0} = imm16{11-0};
+}
+}
+
+// Alias for HVC without the ".w" optional width specifier
+def : t2InstAlias<"hvc\t$imm16", (t2HVC imm0_65535:$imm16)>;
+
+// ERET - Return from exception in Hypervisor mode.
+// B9.3.3, B9.3.20: ERET is an alias for "SUBS PC, LR, #0" in an implementation
+// that includes virtualization extensions.
+def t2ERET : InstAlias<"eret${p}", (t2SUBS_PC_LR 0, pred:$p)>,
+             Requires<[IsThumb2, HasVirtualization]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// 32-bit immediate using movw + movt.
+// This is a single pseudo instruction to make it re-materializable.
+// FIXME: Remove this when we can do generalized remat.
+let isReMaterializable = 1, isMoveImm = 1 in
+def t2MOVi32imm : PseudoInst<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
+                            [(set rGPR:$dst, (i32 imm:$src))]>,
+                            Requires<[IsThumb, UseMovt]>;
+
+// Pseudo instruction that combines movw + movt + add pc (if pic).
+// It also makes it possible to rematerialize the instructions.
+// FIXME: Remove this when we can do generalized remat and when machine licm
+// can properly hoist the instructions.
+let isReMaterializable = 1 in {
+def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr),
+                                IIC_iMOVix2addpc,
+                        [(set rGPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
+                        Requires<[IsThumb2, UseMovt]>;
+
+}
+
+// ConstantPool, GlobalAddress, and JumpTable
+def : T2Pat<(ARMWrapper  tconstpool  :$dst), (t2LEApcrel tconstpool  :$dst)>;
+def : T2Pat<(ARMWrapper  tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>,
+           Requires<[IsThumb2, UseMovt]>;
+
+def : T2Pat<(ARMWrapperJT tjumptable:$dst),
+            (t2LEApcrelJT tjumptable:$dst)>;
+
+// Pseudo instruction that combines ldr from constpool and add pc. This should
+// be expanded into two instructions late to allow if-conversion and
+// scheduling.
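+// Illustratively (register and label names assumed), the expansion is:
+//   ldr  r0, .LCPI0_0     @ pc-relative load of the constant-pool entry
+// .LPC0_0:
+//   add  r0, pc           @ ARMpic_add: complete the PIC address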
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+                   IIC_iLoadiALU,
+              [(set rGPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+                                           imm:$cp))]>,
+               Requires<[IsThumb2]>;
+
+// Pseudo instruction that combines movs + predicated rsbmi
+// to implement integer ABS
+let usesCustomInserter = 1, Defs = [CPSR] in {
+def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
+                       NoItinerary, []>, Requires<[IsThumb2]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Coprocessor load/store -- for disassembly only
+//
+class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm>
+  : T2I<oops, iops, NoItinerary, opc, asm, []> {
+  let Inst{31-28} = op31_28;
+  let Inst{27-25} = 0b110;
+}
+
+multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
+  def _OFFSET : T2CI<op31_28,
+                     (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                     asm, "\t$cop, $CRd, $addr"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _PRE : T2CI<op31_28,
+                  (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
+                  asm, "\t$cop, $CRd, $addr!"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _POST: T2CI<op31_28,
+                  (outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                          postidx_imm8s4:$offset),
+                  asm, "\t$cop, $CRd, $addr, $offset"> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _OPTION : T2CI<op31_28, (outs),
+                     (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                          coproc_option_imm:$option),
+                     asm, "\t$cop, $CRd, $addr, $option"> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+}
+
+defm t2LDC   : t2LdStCop<0b1110, 1, 0, "ldc">;
+defm t2LDCL  : t2LdStCop<0b1110, 1, 1, "ldcl">;
+defm t2STC   : t2LdStCop<0b1110, 0, 0, "stc">;
+defm t2STCL  : t2LdStCop<0b1110, 0, 1, "stcl">;
+defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2">, Requires<[PreV8,IsThumb2]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">, Requires<[PreV8,IsThumb2]>;
+defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2">, Requires<[PreV8,IsThumb2]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">, Requires<[PreV8,IsThumb2]>;
+
+
+//===----------------------------------------------------------------------===//
+// Move between special register and ARM core register -- for disassembly only
+//
+// Move to ARM core register from Special Register
+
+// A/R class MRS.
+//
+// A/R class can only move from CPSR or SPSR.
+def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr",
+                   []>, Requires<[IsThumb2,IsNotMClass]> {
+  bits<4> Rd;
+  let Inst{31-12} = 0b11110011111011111000;
+  let Inst{11-8} = Rd;
+  let Inst{7-0} = 0b00000000;
+}
+
+def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>;
+
+def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr",
+                     []>, Requires<[IsThumb2,IsNotMClass]> {
+  bits<4> Rd;
+  let Inst{31-12} = 0b11110011111111111000;
+  let Inst{11-8} = Rd;
+  let Inst{7-0} = 0b00000000;
+}
+
+def t2MRSbanked : T2I<(outs rGPR:$Rd), (ins banked_reg:$banked),
+                      NoItinerary, "mrs", "\t$Rd, $banked", []>,
+                  Requires<[IsThumb, HasVirtualization]> {
+  bits<6> banked;
+  bits<4> Rd;
+
+  let Inst{31-21} = 0b11110011111;
+  let Inst{20} = banked{5}; // R bit
+  let Inst{19-16} = banked{3-0};
+  let Inst{15-12} = 0b1000;
+  let Inst{11-8} = Rd;
+  let Inst{7-5} = 0b001;
+  let Inst{4} = banked{4};
+  let Inst{3-0} = 0b0000;
+}
+
+
+// M class MRS.
+//
+// This MRS has a mask field in bits 7-0 and can take more values than
+// the A/R class (a full msr_mask).
+def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$SYSm), NoItinerary,
+                  "mrs", "\t$Rd, $SYSm", []>,
+              Requires<[IsThumb,IsMClass]> {
+  bits<4> Rd;
+  bits<8> SYSm;
+  let Inst{31-12} = 0b11110011111011111000;
+  let Inst{11-8} = Rd;
+  let Inst{7-0} = SYSm;
+
+  let Unpredictable{20-16} = 0b11111;
+  let Unpredictable{13} = 0b1;
+}
+
+
+// Move from ARM core register to Special Register
+//
+// A/R class MSR.
+//
+// No need to have both system and application versions; the encodings are the
+// same and the assembly parser has no way to distinguish between them. The
+// mask operand contains the special register (R Bit) in bit 4, and bits 3-0
+// contain the mask with the fields to be accessed in the special register.
+def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
+                   NoItinerary, "msr", "\t$mask, $Rn", []>,
+               Requires<[IsThumb2,IsNotMClass]> {
+  bits<5> mask;
+  bits<4> Rn;
+  let Inst{31-21} = 0b11110011100;
+  let Inst{20} = mask{4}; // R Bit
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-8} = mask{3-0};
+  let Inst{7-0} = 0;
+}
+
+// However, the MSR (banked register) system instruction (ARMv7VE) *does* have
+// a separate encoding (distinguished by bit 5).
+def t2MSRbanked : T2I<(outs), (ins banked_reg:$banked, rGPR:$Rn),
+                      NoItinerary, "msr", "\t$banked, $Rn", []>,
+                  Requires<[IsThumb, HasVirtualization]> {
+  bits<6> banked;
+  bits<4> Rn;
+
+  let Inst{31-21} = 0b11110011100;
+  let Inst{20} = banked{5}; // R bit
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-8} = banked{3-0};
+  let Inst{7-5} = 0b001;
+  let Inst{4} = banked{4};
+  let Inst{3-0} = 0b0000;
+}
+
+
+// M class MSR.
+// +// Move from ARM core register to Special Register +def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn), + NoItinerary, "msr", "\t$SYSm, $Rn", []>, + Requires<[IsThumb,IsMClass]> { + bits<12> SYSm; + bits<4> Rn; + let Inst{31-21} = 0b11110011100; + let Inst{20} = 0b0; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b1000; + let Inst{11-10} = SYSm{11-10}; + let Inst{9-8} = 0b00; + let Inst{7-0} = SYSm{7-0}; + + let Unpredictable{20} = 0b1; + let Unpredictable{13} = 0b1; + let Unpredictable{9-8} = 0b11; +} + + +//===----------------------------------------------------------------------===// +// Move between coprocessor and ARM core register +// + +class t2MovRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops, + list<dag> pattern> + : T2Cop<Op, oops, iops, opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", + pattern> { + let Inst{27-24} = 0b1110; + let Inst{20} = direction; + let Inst{4} = 1; + + bits<4> Rt; + bits<4> cop; + bits<3> opc1; + bits<3> opc2; + bits<4> CRm; + bits<4> CRn; + + let Inst{15-12} = Rt; + let Inst{11-8} = cop; + let Inst{23-21} = opc1; + let Inst{7-5} = opc2; + let Inst{3-0} = CRm; + let Inst{19-16} = CRn; +} + +class t2MovRRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops, + list<dag> pattern = []> + : T2Cop<Op, oops, iops, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> { + let Inst{27-24} = 0b1100; + let Inst{23-21} = 0b010; + let Inst{20} = direction; + + bits<4> Rt; + bits<4> Rt2; + bits<4> cop; + bits<4> opc1; + bits<4> CRm; + + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + let Inst{11-8} = cop; + let Inst{7-4} = opc1; + let Inst{3-0} = CRm; +} + +/* from ARM core register to coprocessor */ +def t2MCR : t2MovRCopro<0b1110, "mcr", 0, + (outs), + (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, imm0_7:$opc2), + [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, + imm:$CRm, imm:$opc2)]>, + ComplexDeprecationPredicate<"MCR">; +def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", + (t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, 0, pred:$p)>; +def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0, + (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, imm0_7:$opc2), + [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, + imm:$CRm, imm:$opc2)]> { + let Predicates = [IsThumb2, PreV8]; +} +def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm", + (t2MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, 0, pred:$p)>; + +/* from coprocessor to ARM core register */ +def t2MRC : t2MovRCopro<0b1110, "mrc", 1, + (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + c_imm:$CRm, imm0_7:$opc2), []>; +def : t2InstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm", + (t2MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + c_imm:$CRm, 0, pred:$p)>; + +def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1, + (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + c_imm:$CRm, imm0_7:$opc2), []> { + let Predicates = [IsThumb2, PreV8]; +} +def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm", + (t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + c_imm:$CRm, 0, pred:$p)>; + +def : T2v6Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), + (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; + +def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), + (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; + + +/* from ARM core register to coprocessor */ +def t2MCRR : t2MovRRCopro<0b1110, 
"mcrr", 0, (outs), + (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2, + c_imm:$CRm), + [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2, + imm:$CRm)]>; +def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0, (outs), + (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2, + c_imm:$CRm), + [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt, + GPR:$Rt2, imm:$CRm)]> { + let Predicates = [IsThumb2, PreV8]; +} + +/* from coprocessor to ARM core register */ +def t2MRRC : t2MovRRCopro<0b1110, "mrrc", 1, (outs GPR:$Rt, GPR:$Rt2), + (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm)>; + +def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1, (outs GPR:$Rt, GPR:$Rt2), + (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm)> { + let Predicates = [IsThumb2, PreV8]; +} + +//===----------------------------------------------------------------------===// +// Other Coprocessor Instructions. +// + +def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, + c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), + "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", + [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, + imm:$CRm, imm:$opc2)]> { + let Inst{27-24} = 0b1110; + + bits<4> opc1; + bits<4> CRn; + bits<4> CRd; + bits<4> cop; + bits<3> opc2; + bits<4> CRm; + + let Inst{3-0} = CRm; + let Inst{4} = 0; + let Inst{7-5} = opc2; + let Inst{11-8} = cop; + let Inst{15-12} = CRd; + let Inst{19-16} = CRn; + let Inst{23-20} = opc1; + + let Predicates = [IsThumb2, PreV8]; +} + +def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1, + c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), + "cdp2", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", + [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, + imm:$CRm, imm:$opc2)]> { + let Inst{27-24} = 0b1110; + + bits<4> opc1; + bits<4> CRn; + bits<4> CRd; + bits<4> cop; + bits<3> opc2; + bits<4> CRm; + + let Inst{3-0} = CRm; + let Inst{4} = 0; + let Inst{7-5} = opc2; + let Inst{11-8} = cop; + let Inst{15-12} = CRd; + let Inst{19-16} = CRn; + let Inst{23-20} = opc1; + + let Predicates = [IsThumb2, PreV8]; +} + + + +//===----------------------------------------------------------------------===// +// ARMv8.1 Privilege Access Never extension +// +// SETPAN #imm1 + +def t2SETPAN : T1I<(outs), (ins imm0_1:$imm), NoItinerary, "setpan\t$imm", []>, + T1Misc<0b0110000>, Requires<[IsThumb2, HasV8, HasV8_1a]> { + bits<1> imm; + + let Inst{4} = 0b1; + let Inst{3} = imm; + let Inst{2-0} = 0b000; + + let Unpredictable{4} = 0b1; + let Unpredictable{2-0} = 0b111; +} + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +// + +// SXT/UXT with no rotate +let AddedComplexity = 16 in { +def : T2Pat<(and rGPR:$Rm, 0x000000FF), (t2UXTB rGPR:$Rm, 0)>, + Requires<[IsThumb2]>; +def : T2Pat<(and rGPR:$Rm, 0x0000FFFF), (t2UXTH rGPR:$Rm, 0)>, + Requires<[IsThumb2]>; +def : T2Pat<(and rGPR:$Rm, 0x00FF00FF), (t2UXTB16 rGPR:$Rm, 0)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0x00FF)), + (t2UXTAB rGPR:$Rn, rGPR:$Rm, 0)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0xFFFF)), + (t2UXTAH rGPR:$Rn, rGPR:$Rm, 0)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +} + +def : T2Pat<(sext_inreg rGPR:$Src, i8), (t2SXTB rGPR:$Src, 0)>, + Requires<[IsThumb2]>; +def : T2Pat<(sext_inreg rGPR:$Src, i16), (t2SXTH rGPR:$Src, 0)>, + Requires<[IsThumb2]>; +def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i8)), + (t2SXTAB rGPR:$Rn, rGPR:$Rm, 0)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(add 
rGPR:$Rn, (sext_inreg rGPR:$Rm, i16)), + (t2SXTAH rGPR:$Rn, rGPR:$Rm, 0)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +// Atomic load/store patterns +def : T2Pat<(atomic_load_8 t2addrmode_imm12:$addr), + (t2LDRBi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(atomic_load_8 t2addrmode_negimm8:$addr), + (t2LDRBi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(atomic_load_8 t2addrmode_so_reg:$addr), + (t2LDRBs t2addrmode_so_reg:$addr)>; +def : T2Pat<(atomic_load_16 t2addrmode_imm12:$addr), + (t2LDRHi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(atomic_load_16 t2addrmode_negimm8:$addr), + (t2LDRHi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(atomic_load_16 t2addrmode_so_reg:$addr), + (t2LDRHs t2addrmode_so_reg:$addr)>; +def : T2Pat<(atomic_load_32 t2addrmode_imm12:$addr), + (t2LDRi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(atomic_load_32 t2addrmode_negimm8:$addr), + (t2LDRi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(atomic_load_32 t2addrmode_so_reg:$addr), + (t2LDRs t2addrmode_so_reg:$addr)>; +def : T2Pat<(atomic_store_8 t2addrmode_imm12:$addr, GPR:$val), + (t2STRBi12 GPR:$val, t2addrmode_imm12:$addr)>; +def : T2Pat<(atomic_store_8 t2addrmode_negimm8:$addr, GPR:$val), + (t2STRBi8 GPR:$val, t2addrmode_negimm8:$addr)>; +def : T2Pat<(atomic_store_8 t2addrmode_so_reg:$addr, GPR:$val), + (t2STRBs GPR:$val, t2addrmode_so_reg:$addr)>; +def : T2Pat<(atomic_store_16 t2addrmode_imm12:$addr, GPR:$val), + (t2STRHi12 GPR:$val, t2addrmode_imm12:$addr)>; +def : T2Pat<(atomic_store_16 t2addrmode_negimm8:$addr, GPR:$val), + (t2STRHi8 GPR:$val, t2addrmode_negimm8:$addr)>; +def : T2Pat<(atomic_store_16 t2addrmode_so_reg:$addr, GPR:$val), + (t2STRHs GPR:$val, t2addrmode_so_reg:$addr)>; +def : T2Pat<(atomic_store_32 t2addrmode_imm12:$addr, GPR:$val), + (t2STRi12 GPR:$val, t2addrmode_imm12:$addr)>; +def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val), + (t2STRi8 GPR:$val, t2addrmode_negimm8:$addr)>; +def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val), + (t2STRs GPR:$val, t2addrmode_so_reg:$addr)>; + +let AddedComplexity = 8 in { + def : T2Pat<(atomic_load_acquire_8 addr_offset_none:$addr), (t2LDAB addr_offset_none:$addr)>; + def : T2Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>; + def : T2Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA addr_offset_none:$addr)>; + def : T2Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (t2STLB GPR:$val, addr_offset_none:$addr)>; + def : T2Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>; + def : T2Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL GPR:$val, addr_offset_none:$addr)>; +} + + +//===----------------------------------------------------------------------===// +// Assembler aliases +// + +// Aliases for ADC without the ".w" optional width specifier. +def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $Rm", + (t2ADCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $ShiftedRm", + (t2ADCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm, + pred:$p, cc_out:$s)>; + +// Aliases for SBC without the ".w" optional width specifier. +def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $Rm", + (t2SBCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm", + (t2SBCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm, + pred:$p, cc_out:$s)>; + +// Aliases for ADD without the ".w" optional width specifier. 
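+// e.g. (illustrative): "add r0, r1, #400" has no 16-bit encoding, so the
+// aliases below let it match t2ADDri exactly as if "add.w" had been written.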
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm", + (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, + cc_out:$s)>; +def : t2InstAlias<"add${p} $Rd, $Rn, $imm", + (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>; +def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm", + (t2ADDrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"add${s}${p} $Rd, $Rn, $ShiftedRm", + (t2ADDrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm, + pred:$p, cc_out:$s)>; +// ... and with the destination and source register combined. +def : t2InstAlias<"add${s}${p} $Rdn, $imm", + (t2ADDri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"add${p} $Rdn, $imm", + (t2ADDri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>; +def : t2InstAlias<"add${s}${p} $Rdn, $Rm", + (t2ADDrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm", + (t2ADDrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm, + pred:$p, cc_out:$s)>; + +// add w/ negative immediates is just a sub. +def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm", + (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstAlias<"add${p} $Rd, $Rn, $imm", + (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>; +def : t2InstAlias<"add${s}${p} $Rdn, $imm", + (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstAlias<"add${p} $Rdn, $imm", + (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>; + +def : t2InstAlias<"add${s}${p}.w $Rd, $Rn, $imm", + (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstAlias<"addw${p} $Rd, $Rn, $imm", + (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>; +def : t2InstAlias<"add${s}${p}.w $Rdn, $imm", + (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstAlias<"addw${p} $Rdn, $imm", + (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>; + + +// Aliases for SUB without the ".w" optional width specifier. +def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm", + (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"sub${p} $Rd, $Rn, $imm", + (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>; +def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $Rm", + (t2SUBrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $ShiftedRm", + (t2SUBrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm, + pred:$p, cc_out:$s)>; +// ... and with the destination and source register combined. +def : t2InstAlias<"sub${s}${p} $Rdn, $imm", + (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"sub${p} $Rdn, $imm", + (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>; +def : t2InstAlias<"sub${s}${p}.w $Rdn, $Rm", + (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"sub${s}${p} $Rdn, $Rm", + (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"sub${s}${p} $Rdn, $ShiftedRm", + (t2SUBrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm, + pred:$p, cc_out:$s)>; + +// Alias for compares without the ".w" optional width specifier. 
+def : t2InstAlias<"cmn${p} $Rn, $Rm", + (t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; +def : t2InstAlias<"teq${p} $Rn, $Rm", + (t2TEQrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; +def : t2InstAlias<"tst${p} $Rn, $Rm", + (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; + +// Memory barriers +def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p)>, Requires<[HasDB]>; +def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p)>, Requires<[HasDB]>; +def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p)>, Requires<[HasDB]>; + +// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional +// width specifier. +def : t2InstAlias<"ldr${p} $Rt, $addr", + (t2LDRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"ldrb${p} $Rt, $addr", + (t2LDRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"ldrh${p} $Rt, $addr", + (t2LDRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"ldrsb${p} $Rt, $addr", + (t2LDRSBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"ldrsh${p} $Rt, $addr", + (t2LDRSHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; + +def : t2InstAlias<"ldr${p} $Rt, $addr", + (t2LDRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; +def : t2InstAlias<"ldrb${p} $Rt, $addr", + (t2LDRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; +def : t2InstAlias<"ldrh${p} $Rt, $addr", + (t2LDRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; +def : t2InstAlias<"ldrsb${p} $Rt, $addr", + (t2LDRSBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; +def : t2InstAlias<"ldrsh${p} $Rt, $addr", + (t2LDRSHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; + +def : t2InstAlias<"ldr${p} $Rt, $addr", + (t2LDRpci GPRnopc:$Rt, t2ldrlabel:$addr, pred:$p)>; +def : t2InstAlias<"ldrb${p} $Rt, $addr", + (t2LDRBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>; +def : t2InstAlias<"ldrh${p} $Rt, $addr", + (t2LDRHpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>; +def : t2InstAlias<"ldrsb${p} $Rt, $addr", + (t2LDRSBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>; +def : t2InstAlias<"ldrsh${p} $Rt, $addr", + (t2LDRSHpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>; + +// Alias for MVN with(out) the ".w" optional width specifier. +def : t2InstAlias<"mvn${s}${p}.w $Rd, $imm", + (t2MVNi rGPR:$Rd, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"mvn${s}${p} $Rd, $Rm", + (t2MVNr rGPR:$Rd, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm", + (t2MVNs rGPR:$Rd, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>; + +// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the +// shift amount is zero (i.e., unspecified). 
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm", + (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm", + (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +// PUSH/POP aliases for STM/LDM +def : t2InstAlias<"push${p}.w $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>; +def : t2InstAlias<"push${p} $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>; +def : t2InstAlias<"pop${p}.w $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>; +def : t2InstAlias<"pop${p} $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>; + +// STMIA/STMIA_UPD aliases w/o the optional .w suffix +def : t2InstAlias<"stm${p} $Rn, $regs", + (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>; +def : t2InstAlias<"stm${p} $Rn!, $regs", + (t2STMIA_UPD GPR:$Rn, pred:$p, reglist:$regs)>; + +// LDMIA/LDMIA_UPD aliases w/o the optional .w suffix +def : t2InstAlias<"ldm${p} $Rn, $regs", + (t2LDMIA GPR:$Rn, pred:$p, reglist:$regs)>; +def : t2InstAlias<"ldm${p} $Rn!, $regs", + (t2LDMIA_UPD GPR:$Rn, pred:$p, reglist:$regs)>; + +// STMDB/STMDB_UPD aliases w/ the optional .w suffix +def : t2InstAlias<"stmdb${p}.w $Rn, $regs", + (t2STMDB GPR:$Rn, pred:$p, reglist:$regs)>; +def : t2InstAlias<"stmdb${p}.w $Rn!, $regs", + (t2STMDB_UPD GPR:$Rn, pred:$p, reglist:$regs)>; + +// LDMDB/LDMDB_UPD aliases w/ the optional .w suffix +def : t2InstAlias<"ldmdb${p}.w $Rn, $regs", + (t2LDMDB GPR:$Rn, pred:$p, reglist:$regs)>; +def : t2InstAlias<"ldmdb${p}.w $Rn!, $regs", + (t2LDMDB_UPD GPR:$Rn, pred:$p, reglist:$regs)>; + +// Alias for REV/REV16/REVSH without the ".w" optional width specifier. +def : t2InstAlias<"rev${p} $Rd, $Rm", (t2REV rGPR:$Rd, rGPR:$Rm, pred:$p)>; +def : t2InstAlias<"rev16${p} $Rd, $Rm", (t2REV16 rGPR:$Rd, rGPR:$Rm, pred:$p)>; +def : t2InstAlias<"revsh${p} $Rd, $Rm", (t2REVSH rGPR:$Rd, rGPR:$Rm, pred:$p)>; + + +// Alias for RSB without the ".w" optional width specifier, and with optional +// implied destination register. +def : t2InstAlias<"rsb${s}${p} $Rd, $Rn, $imm", + (t2RSBri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"rsb${s}${p} $Rdn, $imm", + (t2RSBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"rsb${s}${p} $Rdn, $Rm", + (t2RSBrr rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"rsb${s}${p} $Rdn, $ShiftedRm", + (t2RSBrs rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$ShiftedRm, pred:$p, + cc_out:$s)>; + +// SSAT/USAT optional shift operand. +def : t2InstAlias<"ssat${p} $Rd, $sat_imm, $Rn", + (t2SSAT rGPR:$Rd, imm1_32:$sat_imm, rGPR:$Rn, 0, pred:$p)>; +def : t2InstAlias<"usat${p} $Rd, $sat_imm, $Rn", + (t2USAT rGPR:$Rd, imm0_31:$sat_imm, rGPR:$Rn, 0, pred:$p)>; + +// STM w/o the .w suffix. +def : t2InstAlias<"stm${p} $Rn, $regs", + (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>; + +// Alias for STR, STRB, and STRH without the ".w" optional +// width specifier. 
+def : t2InstAlias<"str${p} $Rt, $addr", + (t2STRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"strb${p} $Rt, $addr", + (t2STRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"strh${p} $Rt, $addr", + (t2STRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; + +def : t2InstAlias<"str${p} $Rt, $addr", + (t2STRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; +def : t2InstAlias<"strb${p} $Rt, $addr", + (t2STRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; +def : t2InstAlias<"strh${p} $Rt, $addr", + (t2STRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; + +// Extend instruction optional rotate operand. +def : InstAlias<"sxtab${p} $Rd, $Rn, $Rm", + (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : InstAlias<"sxtah${p} $Rd, $Rn, $Rm", + (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : InstAlias<"sxtab16${p} $Rd, $Rn, $Rm", + (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : InstAlias<"sxtb16${p} $Rd, $Rm", + (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +def : t2InstAlias<"sxtb${p} $Rd, $Rm", + (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"sxth${p} $Rd, $Rm", + (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"sxtb${p}.w $Rd, $Rm", + (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"sxth${p}.w $Rd, $Rm", + (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; + +def : InstAlias<"uxtab${p} $Rd, $Rn, $Rm", + (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : InstAlias<"uxtah${p} $Rd, $Rn, $Rm", + (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : InstAlias<"uxtab16${p} $Rd, $Rn, $Rm", + (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : InstAlias<"uxtb16${p} $Rd, $Rm", + (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +def : t2InstAlias<"uxtb${p} $Rd, $Rm", + (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"uxth${p} $Rd, $Rm", + (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"uxtb${p}.w $Rd, $Rm", + (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"uxth${p}.w $Rd, $Rm", + (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; + +// Extend instruction w/o the ".w" optional width specifier. +def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot", + (t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>; +def : InstAlias<"uxtb16${p} $Rd, $Rm$rot", + (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : t2InstAlias<"uxth${p} $Rd, $Rm$rot", + (t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>; + +def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot", + (t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>; +def : InstAlias<"sxtb16${p} $Rd, $Rm$rot", + (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : t2InstAlias<"sxth${p} $Rd, $Rm$rot", + (t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>; + + +// "mov Rd, t2_so_imm_not" can be handled via "mvn" in assembly, just like +// for isel. 
+def : t2InstAlias<"mov${p} $Rd, $imm", + (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>; +def : t2InstAlias<"mvn${p} $Rd, $imm", + (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>; +// Same for AND <--> BIC +def : t2InstAlias<"bic${s}${p} $Rd, $Rn, $imm", + (t2ANDri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm, + pred:$p, cc_out:$s)>; +def : t2InstAlias<"bic${s}${p} $Rdn, $imm", + (t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm, + pred:$p, cc_out:$s)>; +def : t2InstAlias<"and${s}${p} $Rd, $Rn, $imm", + (t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm, + pred:$p, cc_out:$s)>; +def : t2InstAlias<"and${s}${p} $Rdn, $imm", + (t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm, + pred:$p, cc_out:$s)>; +// Likewise, "add Rd, t2_so_imm_neg" -> sub +def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm", + (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, + pred:$p, cc_out:$s)>; +def : t2InstAlias<"add${s}${p} $Rd, $imm", + (t2SUBri GPRnopc:$Rd, GPRnopc:$Rd, t2_so_imm_neg:$imm, + pred:$p, cc_out:$s)>; +// Same for CMP <--> CMN via t2_so_imm_neg +def : t2InstAlias<"cmp${p} $Rd, $imm", + (t2CMNri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>; +def : t2InstAlias<"cmn${p} $Rd, $imm", + (t2CMPri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>; + + +// Wide 'mul' encoding can be specified with only two operands. +def : t2InstAlias<"mul${p} $Rn, $Rm", + (t2MUL rGPR:$Rn, rGPR:$Rm, rGPR:$Rn, pred:$p)>; + +// "neg" is and alias for "rsb rd, rn, #0" +def : t2InstAlias<"neg${s}${p} $Rd, $Rm", + (t2RSBri rGPR:$Rd, rGPR:$Rm, 0, pred:$p, cc_out:$s)>; + +// MOV so_reg assembler pseudos. InstAlias isn't expressive enough for +// these, unfortunately. +def t2MOVsi: t2AsmPseudo<"mov${p} $Rd, $shift", + (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>; +def t2MOVSsi: t2AsmPseudo<"movs${p} $Rd, $shift", + (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>; + +def t2MOVsr: t2AsmPseudo<"mov${p} $Rd, $shift", + (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>; +def t2MOVSsr: t2AsmPseudo<"movs${p} $Rd, $shift", + (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>; + +// ADR w/o the .w suffix +def : t2InstAlias<"adr${p} $Rd, $addr", + (t2ADR rGPR:$Rd, t2adrlabel:$addr, pred:$p)>; + +// LDR(literal) w/ alternate [pc, #imm] syntax. +def t2LDRpcrel : t2AsmPseudo<"ldr${p} $Rt, $addr", + (ins GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>; +def t2LDRBpcrel : t2AsmPseudo<"ldrb${p} $Rt, $addr", + (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>; +def t2LDRHpcrel : t2AsmPseudo<"ldrh${p} $Rt, $addr", + (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>; +def t2LDRSBpcrel : t2AsmPseudo<"ldrsb${p} $Rt, $addr", + (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>; +def t2LDRSHpcrel : t2AsmPseudo<"ldrsh${p} $Rt, $addr", + (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>; + // Version w/ the .w suffix. +def : t2InstAlias<"ldr${p}.w $Rt, $addr", + (t2LDRpcrel GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p), 0>; +def : t2InstAlias<"ldrb${p}.w $Rt, $addr", + (t2LDRBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>; +def : t2InstAlias<"ldrh${p}.w $Rt, $addr", + (t2LDRHpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>; +def : t2InstAlias<"ldrsb${p}.w $Rt, $addr", + (t2LDRSBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>; +def : t2InstAlias<"ldrsh${p}.w $Rt, $addr", + (t2LDRSHpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>; + +def : t2InstAlias<"add${p} $Rd, pc, $imm", + (t2ADR rGPR:$Rd, imm0_4095:$imm, pred:$p)>; + +// PLD/PLDW/PLI with alternate literal form. 
+def : t2InstAlias<"pld${p} $addr", + (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>; +def : InstAlias<"pli${p} $addr", + (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p)>, + Requires<[IsThumb2,HasV7]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td new file mode 100644 index 0000000..050cd1a --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -0,0 +1,1894 @@ +//===-- ARMInstrVFP.td - VFP support for ARM ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the ARM VFP instruction set. +// +//===----------------------------------------------------------------------===// + +def SDT_CMPFP0 : SDTypeProfile<0, 1, [SDTCisFP<0>]>; +def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>]>; + +def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>; +def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>; +def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>; +def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>; + +//===----------------------------------------------------------------------===// +// Operand Definitions. +// + +// 8-bit floating-point immediate encodings. +def FPImmOperand : AsmOperandClass { + let Name = "FPImm"; + let ParserMethod = "parseFPImm"; +} + +def vfp_f32imm : Operand<f32>, + PatLeaf<(f32 fpimm), [{ + return ARM_AM::getFP32Imm(N->getValueAPF()) != -1; + }], SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = ARM_AM::getFP32Imm(InVal); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>> { + let PrintMethod = "printFPImmOperand"; + let ParserMatchClass = FPImmOperand; +} + +def vfp_f64imm : Operand<f64>, + PatLeaf<(f64 fpimm), [{ + return ARM_AM::getFP64Imm(N->getValueAPF()) != -1; + }], SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = ARM_AM::getFP64Imm(InVal); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>> { + let PrintMethod = "printFPImmOperand"; + let ParserMatchClass = FPImmOperand; +} + +def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 4; +}]>; + +def alignedstore32 : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 4; +}]>; + +// The VCVT to/from fixed-point instructions encode the 'fbits' operand +// (the number of fixed bits) differently than it appears in the assembly +// source. It's encoded as "Size - fbits" where Size is the size of the +// fixed-point representation (32 or 16) and fbits is the value appearing +// in the assembly source, an integer in [0,16] or (0,32], depending on size. +def fbits32_asm_operand : AsmOperandClass { let Name = "FBits32"; } +def fbits32 : Operand<i32> { + let PrintMethod = "printFBits32"; + let ParserMatchClass = fbits32_asm_operand; +} + +def fbits16_asm_operand : AsmOperandClass { let Name = "FBits16"; } +def fbits16 : Operand<i32> { + let PrintMethod = "printFBits16"; + let ParserMatchClass = fbits16_asm_operand; +} + +//===----------------------------------------------------------------------===// +// Load / store Instructions. 
+// + +let canFoldAsLoad = 1, isReMaterializable = 1 in { + +def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr), + IIC_fpLoad64, "vldr", "\t$Dd, $addr", + [(set DPR:$Dd, (f64 (alignedload32 addrmode5:$addr)))]>; + +def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr), + IIC_fpLoad32, "vldr", "\t$Sd, $addr", + [(set SPR:$Sd, (alignedload32 addrmode5:$addr))]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} + +} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in' + +def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr), + IIC_fpStore64, "vstr", "\t$Dd, $addr", + [(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>; + +def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr), + IIC_fpStore32, "vstr", "\t$Sd, $addr", + [(alignedstore32 SPR:$Sd, addrmode5:$addr)]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// + +multiclass vfp_ldst_mult<string asm, bit L_bit, + InstrItinClass itin, InstrItinClass itin_upd> { + // Double Precision + def DIA : + AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), + IndexModeNone, itin, + !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + } + def DIA_UPD : + AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, + variable_ops), + IndexModeUpd, itin_upd, + !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + } + def DDB_UPD : + AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, + variable_ops), + IndexModeUpd, itin_upd, + !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b10; // Decrement Before + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + } + + // Single Precision + def SIA : + AXSI4<(outs), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops), + IndexModeNone, itin, + !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. + let D = VFPNeonDomain; + } + def SIA_UPD : + AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, + variable_ops), + IndexModeUpd, itin_upd, + !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. + let D = VFPNeonDomain; + } + def SDB_UPD : + AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, + variable_ops), + IndexModeUpd, itin_upd, + !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b10; // Decrement Before + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. 
+    let D = VFPNeonDomain;
+  }
+}
+
+let hasSideEffects = 0 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm VLDM : vfp_ldst_mult<"vldm", 1, IIC_fpLoad_m, IIC_fpLoad_mu>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpStore_m, IIC_fpStore_mu>;
+
+} // hasSideEffects
+
+def : MnemonicAlias<"vldm", "vldmia">;
+def : MnemonicAlias<"vstm", "vstmia">;
+
+// FLDM/FSTM - Load / Store multiple single / double precision registers for
+// pre-ARMv6 cores.
+// These instructions are deprecated!
+def : VFP2MnemonicAlias<"fldmias", "vldmia">;
+def : VFP2MnemonicAlias<"fldmdbs", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmeas", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmfds", "vldmia">;
+def : VFP2MnemonicAlias<"fldmiad", "vldmia">;
+def : VFP2MnemonicAlias<"fldmdbd", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmead", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmfdd", "vldmia">;
+
+def : VFP2MnemonicAlias<"fstmias", "vstmia">;
+def : VFP2MnemonicAlias<"fstmdbs", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmeas", "vstmia">;
+def : VFP2MnemonicAlias<"fstmfds", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmiad", "vstmia">;
+def : VFP2MnemonicAlias<"fstmdbd", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmead", "vstmia">;
+def : VFP2MnemonicAlias<"fstmfdd", "vstmdb">;
+
+def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>,
+                Requires<[HasVFP2]>;
+def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>,
+                Requires<[HasVFP2]>;
+def : InstAlias<"vpop${p} $r",  (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>,
+                Requires<[HasVFP2]>;
+def : InstAlias<"vpop${p} $r",  (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>,
+                Requires<[HasVFP2]>;
+defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
+                         (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
+                         (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
+                         (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
+                         (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>;
+
+// FLDMX, FSTMX - Load and store multiple unknown precision registers for
+// pre-ARMv6 cores.
+// These instructions are deprecated so we don't want them to get selected.
+multiclass vfp_ldstx_mult<string asm, bit L_bit> {
+  // Unknown precision
+  def XIA :
+    AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeNone, !strconcat(asm, "iax${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def XIA_UPD :
+    AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeUpd, !strconcat(asm, "iax${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+  def XDB_UPD :
+    AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeUpd, !strconcat(asm, "dbx${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+}
+
+defm FLDM : vfp_ldstx_mult<"fldm", 1>;
+defm FSTM : vfp_ldstx_mult<"fstm", 0>;
+
+def : VFP2MnemonicAlias<"fldmeax", "fldmdbx">;
+def : VFP2MnemonicAlias<"fldmfdx", "fldmiax">;
+
+def : VFP2MnemonicAlias<"fstmeax", "fstmiax">;
+def : VFP2MnemonicAlias<"fstmfdx", "fstmdbx">;
+
+//===----------------------------------------------------------------------===//
+// FP Binary Operations.
+// + +let TwoOperandAliasConstraint = "$Dn = $Dd" in +def VADDD : ADbI<0b11100, 0b11, 0, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>; + +let TwoOperandAliasConstraint = "$Sn = $Sd" in +def VADDS : ASbIn<0b11100, 0b11, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +let TwoOperandAliasConstraint = "$Dn = $Dd" in +def VSUBD : ADbI<0b11100, 0b11, 1, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>; + +let TwoOperandAliasConstraint = "$Sn = $Sd" in +def VSUBS : ASbIn<0b11100, 0b11, 1, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +let TwoOperandAliasConstraint = "$Dn = $Dd" in +def VDIVD : ADbI<0b11101, 0b00, 0, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>; + +let TwoOperandAliasConstraint = "$Sn = $Sd" in +def VDIVS : ASbI<0b11101, 0b00, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>; + +let TwoOperandAliasConstraint = "$Dn = $Dd" in +def VMULD : ADbI<0b11100, 0b10, 0, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>; + +let TwoOperandAliasConstraint = "$Sn = $Sd" in +def VMULS : ASbIn<0b11100, 0b10, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VNMULD : ADbI<0b11100, 0b10, 1, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>; + +def VNMULS : ASbI<0b11100, 0b10, 1, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +multiclass vsel_inst<string op, bits<2> opc, int CC> { + let DecoderNamespace = "VFPV8", PostEncoderMethod = "", + Uses = [CPSR], AddedComplexity = 4 in { + def S : ASbInp<0b11100, opc, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + NoItinerary, !strconcat("vsel", op, ".f32\t$Sd, $Sn, $Sm"), + [(set SPR:$Sd, (ARMcmov SPR:$Sm, SPR:$Sn, CC))]>, + Requires<[HasFPARMv8]>; + + def D : ADbInp<0b11100, opc, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + NoItinerary, !strconcat("vsel", op, ".f64\t$Dd, $Dn, $Dm"), + [(set DPR:$Dd, (ARMcmov (f64 DPR:$Dm), (f64 DPR:$Dn), CC))]>, + Requires<[HasFPARMv8, HasDPVFP]>; + } +} + +// The CC constants here match ARMCC::CondCodes. 
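+// For reference, ARMCC::CondCodes numbers EQ = 0, VS = 6, GE = 10 and
+// GT = 12, which is where the integer arguments below come from.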
+defm VSELGT : vsel_inst<"gt", 0b11, 12>; +defm VSELGE : vsel_inst<"ge", 0b10, 10>; +defm VSELEQ : vsel_inst<"eq", 0b00, 0>; +defm VSELVS : vsel_inst<"vs", 0b01, 6>; + +multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { + let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in { + def S : ASbInp<0b11101, 0b00, opc, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + NoItinerary, !strconcat(op, ".f32\t$Sd, $Sn, $Sm"), + [(set SPR:$Sd, (SD SPR:$Sn, SPR:$Sm))]>, + Requires<[HasFPARMv8]>; + + def D : ADbInp<0b11101, 0b00, opc, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + NoItinerary, !strconcat(op, ".f64\t$Dd, $Dn, $Dm"), + [(set DPR:$Dd, (f64 (SD (f64 DPR:$Dn), (f64 DPR:$Dm))))]>, + Requires<[HasFPARMv8, HasDPVFP]>; + } +} + +defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>; +defm VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>; + +// Match reassociated forms only if not sign dependent rounding. +def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), + (VNMULD DPR:$a, DPR:$b)>, + Requires<[NoHonorSignDependentRounding,HasDPVFP]>; +def : Pat<(fmul (fneg SPR:$a), SPR:$b), + (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>; + +// These are encoded as unary instructions. +let Defs = [FPSCR_NZCV] in { +def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, + (outs), (ins DPR:$Dd, DPR:$Dm), + IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm", + [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>; + +def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, + (outs), (ins SPR:$Sd, SPR:$Sm), + IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm", + [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +// FIXME: Verify encoding after integrated assembler is working. +def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0, + (outs), (ins DPR:$Dd, DPR:$Dm), + IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm", + [/* For disassembly only; pattern left blank */]>; + +def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, + (outs), (ins SPR:$Sd, SPR:$Sm), + IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm", + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} +} // Defs = [FPSCR_NZCV] + +//===----------------------------------------------------------------------===// +// FP Unary Operations. +// + +def VABSD : ADuI<0b11101, 0b11, 0b0000, 0b11, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + IIC_fpUNA64, "vabs", ".f64\t$Dd, $Dm", + [(set DPR:$Dd, (fabs (f64 DPR:$Dm)))]>; + +def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpUNA32, "vabs", ".f32\t$Sd, $Sm", + [(set SPR:$Sd, (fabs SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +let Defs = [FPSCR_NZCV] in { +def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, + (outs), (ins DPR:$Dd), + IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0", + [(arm_cmpfp0 (f64 DPR:$Dd))]> { + let Inst{3-0} = 0b0000; + let Inst{5} = 0; +} + +def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, + (outs), (ins SPR:$Sd), + IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0", + [(arm_cmpfp0 SPR:$Sd)]> { + let Inst{3-0} = 0b0000; + let Inst{5} = 0; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +// FIXME: Verify encoding after integrated assembler is working. 
+def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, + (outs), (ins DPR:$Dd), + IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0", + [/* For disassembly only; pattern left blank */]> { + let Inst{3-0} = 0b0000; + let Inst{5} = 0; +} + +def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, + (outs), (ins SPR:$Sd), + IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0", + [/* For disassembly only; pattern left blank */]> { + let Inst{3-0} = 0b0000; + let Inst{5} = 0; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} +} // Defs = [FPSCR_NZCV] + +def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, + (outs DPR:$Dd), (ins SPR:$Sm), + IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", + [(set DPR:$Dd, (fextend SPR:$Sm))]> { + // Instruction operands. + bits<5> Dd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; + + let Predicates = [HasVFP2, HasDPVFP]; +} + +// Special case encoding: bits 11-8 is 0b1011. +def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, + IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (fround DPR:$Dm))]> { + // Instruction operands. + bits<5> Sd; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = 0b11101; + let Inst{21-16} = 0b110111; + let Inst{11-8} = 0b1011; + let Inst{7-6} = 0b11; + let Inst{4} = 0; + + let Predicates = [HasVFP2, HasDPVFP]; +} + +// Between half, single and double-precision. For disassembly only. + +// FIXME: Verify encoding after integrated assembler is working. +def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), + /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; + +def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), + /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; + +def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), + /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; + +def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), + /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; + +def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, + (outs DPR:$Dd), (ins SPR:$Sm), + NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm", + []>, Requires<[HasFPARMv8, HasDPVFP]> { + // Instruction operands. + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; +} + +def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, + (outs SPR:$Sd), (ins DPR:$Dm), + NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm", + []>, Requires<[HasFPARMv8, HasDPVFP]> { + // Instruction operands. + bits<5> Sd; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; +} + +def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0, + (outs DPR:$Dd), (ins SPR:$Sm), + NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm", + []>, Requires<[HasFPARMv8, HasDPVFP]> { + // Instruction operands. + bits<5> Sm; + + // Encode instruction operands. 
+ let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; +} + +def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0, + (outs SPR:$Sd), (ins DPR:$Dm), + NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm", + []>, Requires<[HasFPARMv8, HasDPVFP]> { + // Instruction operands. + bits<5> Sd; + bits<5> Dm; + + // Encode instruction operands. + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; +} + +def : Pat<(fp_to_f16 SPR:$a), + (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; + +def : Pat<(fp_to_f16 (f64 DPR:$a)), + (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>; + +def : Pat<(f16_to_fp GPR:$a), + (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; + +def : Pat<(f64 (f16_to_fp GPR:$a)), + (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>; + +multiclass vcvt_inst<string opc, bits<2> rm, + SDPatternOperator node = null_frag> { + let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in { + def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + NoItinerary, !strconcat("vcvt", opc, ".s32.f32\t$Sd, $Sm"), + []>, + Requires<[HasFPARMv8]> { + let Inst{17-16} = rm; + } + + def US : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + NoItinerary, !strconcat("vcvt", opc, ".u32.f32\t$Sd, $Sm"), + []>, + Requires<[HasFPARMv8]> { + let Inst{17-16} = rm; + } + + def SD : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0, + (outs SPR:$Sd), (ins DPR:$Dm), + NoItinerary, !strconcat("vcvt", opc, ".s32.f64\t$Sd, $Dm"), + []>, + Requires<[HasFPARMv8, HasDPVFP]> { + bits<5> Dm; + + let Inst{17-16} = rm; + + // Encode instruction operands + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{8} = 1; + } + + def UD : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0, + (outs SPR:$Sd), (ins DPR:$Dm), + NoItinerary, !strconcat("vcvt", opc, ".u32.f64\t$Sd, $Dm"), + []>, + Requires<[HasFPARMv8, HasDPVFP]> { + bits<5> Dm; + + let Inst{17-16} = rm; + + // Encode instruction operands + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{8} = 1; + } + } + + let Predicates = [HasFPARMv8] in { + def : Pat<(i32 (fp_to_sint (node SPR:$a))), + (COPY_TO_REGCLASS + (!cast<Instruction>(NAME#"SS") SPR:$a), + GPR)>; + def : Pat<(i32 (fp_to_uint (node SPR:$a))), + (COPY_TO_REGCLASS + (!cast<Instruction>(NAME#"US") SPR:$a), + GPR)>; + } + let Predicates = [HasFPARMv8, HasDPVFP] in { + def : Pat<(i32 (fp_to_sint (node (f64 DPR:$a)))), + (COPY_TO_REGCLASS + (!cast<Instruction>(NAME#"SD") DPR:$a), + GPR)>; + def : Pat<(i32 (fp_to_uint (node (f64 DPR:$a)))), + (COPY_TO_REGCLASS + (!cast<Instruction>(NAME#"UD") DPR:$a), + GPR)>; + } +} + +defm VCVTA : vcvt_inst<"a", 0b00, frnd>; +defm VCVTN : vcvt_inst<"n", 0b01>; +defm VCVTP : vcvt_inst<"p", 0b10, fceil>; +defm VCVTM : vcvt_inst<"m", 0b11, ffloor>; + +def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm", + [(set DPR:$Dd, (fneg (f64 DPR:$Dm)))]>; + +def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm", + [(set SPR:$Sd, (fneg SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. 
+ let D = VFPNeonA8Domain; +} + +multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> { + def S : ASuI<0b11101, 0b11, 0b0110, 0b11, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm", + [(set (f32 SPR:$Sd), (node (f32 SPR:$Sm)))]>, + Requires<[HasFPARMv8]> { + let Inst{7} = op2; + let Inst{16} = op; + } + def D : ADuI<0b11101, 0b11, 0b0110, 0b11, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + NoItinerary, !strconcat("vrint", opc), ".f64\t$Dd, $Dm", + [(set (f64 DPR:$Dd), (node (f64 DPR:$Dm)))]>, + Requires<[HasFPARMv8, HasDPVFP]> { + let Inst{7} = op2; + let Inst{16} = op; + } + + def : InstAlias<!strconcat("vrint", opc, "$p.f32.f32\t$Sd, $Sm"), + (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm, pred:$p)>, + Requires<[HasFPARMv8]>; + def : InstAlias<!strconcat("vrint", opc, "$p.f64.f64\t$Dd, $Dm"), + (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm, pred:$p)>, + Requires<[HasFPARMv8,HasDPVFP]>; +} + +defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc>; +defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint>; +defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint>; + +multiclass vrint_inst_anpm<string opc, bits<2> rm, + SDPatternOperator node = null_frag> { + let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in { + def S : ASuInp<0b11101, 0b11, 0b1000, 0b01, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + NoItinerary, !strconcat("vrint", opc, ".f32\t$Sd, $Sm"), + [(set (f32 SPR:$Sd), (node (f32 SPR:$Sm)))]>, + Requires<[HasFPARMv8]> { + let Inst{17-16} = rm; + } + def D : ADuInp<0b11101, 0b11, 0b1000, 0b01, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + NoItinerary, !strconcat("vrint", opc, ".f64\t$Dd, $Dm"), + [(set (f64 DPR:$Dd), (node (f64 DPR:$Dm)))]>, + Requires<[HasFPARMv8, HasDPVFP]> { + let Inst{17-16} = rm; + } + } + + def : InstAlias<!strconcat("vrint", opc, ".f32.f32\t$Sd, $Sm"), + (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm)>, + Requires<[HasFPARMv8]>; + def : InstAlias<!strconcat("vrint", opc, ".f64.f64\t$Dd, $Dm"), + (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>, + Requires<[HasFPARMv8,HasDPVFP]>; +} + +defm VRINTA : vrint_inst_anpm<"a", 0b00, frnd>; +defm VRINTN : vrint_inst_anpm<"n", 0b01>; +defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>; +defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>; + +def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", + [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>; + +def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", + [(set SPR:$Sd, (fsqrt SPR:$Sm))]>; + +let hasSideEffects = 0 in { +def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>; + +def VMOVS : ASuI<0b11101, 0b11, 0b0000, 0b01, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>; +} // hasSideEffects + +//===----------------------------------------------------------------------===// +// FP <-> GPR Copies. Int <-> FP Conversions. +// + +def VMOVRS : AVConv2I<0b11100001, 0b1010, + (outs GPR:$Rt), (ins SPR:$Sn), + IIC_fpMOVSI, "vmov", "\t$Rt, $Sn", + [(set GPR:$Rt, (bitconvert SPR:$Sn))]> { + // Instruction operands. + bits<4> Rt; + bits<5> Sn; + + // Encode instruction operands. + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Rt; + + let Inst{6-5} = 0b00; + let Inst{3-0} = 0b0000; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. 
+  let D = VFPNeonDomain;
+}
+
+// Bitcast i32 -> f32. NEON prefers to use VMOVDRR.
+def VMOVSR : AVConv4I<0b11100000, 0b1010,
+                      (outs SPR:$Sn), (ins GPR:$Rt),
+                      IIC_fpMOVIS, "vmov", "\t$Sn, $Rt",
+                      [(set SPR:$Sn, (bitconvert GPR:$Rt))]>,
+             Requires<[HasVFP2, UseVMOVSR]> {
+  // Instruction operands.
+  bits<5> Sn;
+  bits<4> Rt;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Rt;
+
+  let Inst{6-5}   = 0b00;
+  let Inst{3-0}   = 0b0000;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+let hasSideEffects = 0 in {
+def VMOVRRD : AVConv3I<0b11000101, 0b1011,
+                       (outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm),
+                       IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm",
+                       [/* FIXME: Can't write pattern for multiple result instr*/]> {
+  // Instruction operands.
+  bits<5> Dm;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+
+  // This instruction is equivalent to
+  // $Rt = EXTRACT_SUBREG $Dm, ssub_0
+  // $Rt2 = EXTRACT_SUBREG $Dm, ssub_1
+  let isExtractSubreg = 1;
+}
+
+def VMOVRRS : AVConv3I<0b11000101, 0b1010,
+                       (outs GPR:$Rt, GPR:$Rt2), (ins SPR:$src1, SPR:$src2),
+                       IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $src1, $src2",
+                       [/* For disassembly only; pattern left blank */]> {
+  bits<5> src1;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = src1{4-1};
+  let Inst{5}     = src1{0};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+  let DecoderMethod = "DecodeVMOVRRS";
+}
+} // hasSideEffects
+
+// FMDHR: GPR -> SPR
+// FMDLR: GPR -> SPR
+
+def VMOVDRR : AVConv5I<0b11000100, 0b1011,
+                       (outs DPR:$Dm), (ins GPR:$Rt, GPR:$Rt2),
+                       IIC_fpMOVID, "vmov", "\t$Dm, $Rt, $Rt2",
+                       [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]> {
+  // Instruction operands.
+  bits<5> Dm;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6}   = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+
+  // This instruction is equivalent to
+  // $Dm = REG_SEQUENCE $Rt, ssub_0, $Rt2, ssub_1
+  let isRegSequence = 1;
+}
+
+// Hoist an fabs or a fneg of a value coming from integer registers
+// and do the fabs/fneg on the integer value. This is never a loss
+// and could enable the conversion to float to be removed completely.
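+// For instance, for a double built as (arm_fmdrr Rlo, Rhi), only bit 31 of
+// the high word carries the sign:
+//   fneg -> Rhi ^ 0x80000000  (EORri/t2EORri below)
+//   fabs -> clear bit 31      (BFC/t2BFC below)
+// so the FP operation folds into a single integer instruction.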
+def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (BFC GPR:$Rh, (i32 0x7FFFFFFF)))>, + Requires<[IsARM]>; +def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (t2BFC GPR:$Rh, (i32 0x7FFFFFFF)))>, + Requires<[IsThumb2]>; +def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (EORri GPR:$Rh, (i32 0x80000000)))>, + Requires<[IsARM]>; +def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (t2EORri GPR:$Rh, (i32 0x80000000)))>, + Requires<[IsThumb2]>; + +let hasSideEffects = 0 in +def VMOVSRR : AVConv5I<0b11000100, 0b1010, + (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2), + IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2", + [/* For disassembly only; pattern left blank */]> { + // Instruction operands. + bits<5> dst1; + bits<4> src1; + bits<4> src2; + + // Encode instruction operands. + let Inst{3-0} = dst1{4-1}; + let Inst{5} = dst1{0}; + let Inst{15-12} = src1; + let Inst{19-16} = src2; + + let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; + + let DecoderMethod = "DecodeVMOVSRR"; +} + +// FMRDH: SPR -> GPR +// FMRDL: SPR -> GPR +// FMRRS: SPR -> GPR +// FMRX: SPR system reg -> GPR +// FMSRR: GPR -> SPR +// FMXR: GPR -> VFP system reg + + +// Int -> FP: + +class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, + bits<4> opcod4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + // Instruction operands. + bits<5> Dd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; + + let Predicates = [HasVFP2, HasDPVFP]; +} + +class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, + bits<4> opcod4, dag oops, dag iops,InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; +} + +def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, + (outs DPR:$Dd), (ins SPR:$Sm), + IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm", + []> { + let Inst{7} = 1; // s32 +} + +let Predicates=[HasVFP2, HasDPVFP] in { + def : VFPPat<(f64 (sint_to_fp GPR:$a)), + (VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; + + def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), + (VSITOD (VLDRS addrmode5:$a))>; +} + +def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, + (outs SPR:$Sd),(ins SPR:$Sm), + IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm", + []> { + let Inst{7} = 1; // s32 + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. 
+ let D = VFPNeonA8Domain; +} + +def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), + (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; + +def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), + (VSITOS (VLDRS addrmode5:$a))>; + +def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, + (outs DPR:$Dd), (ins SPR:$Sm), + IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm", + []> { + let Inst{7} = 0; // u32 +} + +let Predicates=[HasVFP2, HasDPVFP] in { + def : VFPPat<(f64 (uint_to_fp GPR:$a)), + (VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; + + def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), + (VUITOD (VLDRS addrmode5:$a))>; +} + +def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm", + []> { + let Inst{7} = 0; // u32 + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)), + (VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; + +def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), + (VUITOS (VLDRS addrmode5:$a))>; + +// FP -> Int: + +class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, + bits<4> opcod4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Predicates = [HasVFP2, HasDPVFP]; +} + +class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, + bits<4> opcod4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; +} + +// Always set Z bit in the instruction, i.e. "round towards zero" variants. +def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, + (outs SPR:$Sd), (ins DPR:$Dm), + IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm", + []> { + let Inst{7} = 1; // Z bit +} + +let Predicates=[HasVFP2, HasDPVFP] in { + def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))), + (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; + + def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), + (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; +} + +def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm", + []> { + let Inst{7} = 1; // Z bit + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. 
+ let D = VFPNeonA8Domain; +} + +def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)), + (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; + +def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))), + addrmode5:$ptr), + (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>; + +def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, + (outs SPR:$Sd), (ins DPR:$Dm), + IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm", + []> { + let Inst{7} = 1; // Z bit +} + +let Predicates=[HasVFP2, HasDPVFP] in { + def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))), + (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; + + def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), + (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; +} + +def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm", + []> { + let Inst{7} = 1; // Z bit + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)), + (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; + +def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))), + addrmode5:$ptr), + (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>; + +// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. +let Uses = [FPSCR] in { +// FIXME: Verify encoding after integrated assembler is working. +def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, + (outs SPR:$Sd), (ins DPR:$Dm), + IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (int_arm_vcvtr (f64 DPR:$Dm)))]>{ + let Inst{7} = 0; // Z bit +} + +def VTOSIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTSI, "vcvtr", ".s32.f32\t$Sd, $Sm", + [(set SPR:$Sd, (int_arm_vcvtr SPR:$Sm))]> { + let Inst{7} = 0; // Z bit +} + +def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, + (outs SPR:$Sd), (ins DPR:$Dm), + IIC_fpCVTDI, "vcvtr", ".u32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (int_arm_vcvtru(f64 DPR:$Dm)))]>{ + let Inst{7} = 0; // Z bit +} + +def VTOUIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTSI, "vcvtr", ".u32.f32\t$Sd, $Sm", + [(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]> { + let Inst{7} = 0; // Z bit +} +} + +// Convert between floating-point and fixed-point +// Data type for fixed-point naming convention: +// S16 (U=0, sx=0) -> SH +// U16 (U=1, sx=0) -> UH +// S32 (U=0, sx=1) -> SL +// U32 (U=1, sx=1) -> UL + +let Constraints = "$a = $dst" in { + +// FP to Fixed-Point: + +// Single Precision register +class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, + bit op5, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>, + Sched<[WriteCvtFP]> { + bits<5> dst; + // if dp_operation then UInt(D:Vd) else UInt(Vd:D); + let Inst{22} = dst{0}; + let Inst{15-12} = dst{4-1}; +} + +// Double Precision register +class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, + bit op5, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>, + Sched<[WriteCvtFP]> { + bits<5> dst; + // if dp_operation then UInt(D:Vd) else UInt(Vd:D); + let Inst{22} = dst{4}; + let Inst{15-12} = dst{3-0}; + + let Predicates = [HasVFP2, HasDPVFP]; +} + +def VTOSHS : 
AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0, + (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), + IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0, + (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), + IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1, + (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), + IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1, + (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), + IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VTOSHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 0, + (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits), + IIC_fpCVTDI, "vcvt", ".s16.f64\t$dst, $a, $fbits", []>; + +def VTOUHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 0, + (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits), + IIC_fpCVTDI, "vcvt", ".u16.f64\t$dst, $a, $fbits", []>; + +def VTOSLD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 1, + (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits), + IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a, $fbits", []>; + +def VTOULD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 1, + (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits), + IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", []>; + +// Fixed-Point to FP: + +def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0, + (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), + IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VUHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 0, + (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), + IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", []> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VSLTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 1, + (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), + IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", []> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VULTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 1, + (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), + IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", []> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. 
+ let D = VFPNeonA8Domain; +} + +def VSHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 0, + (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits), + IIC_fpCVTID, "vcvt", ".f64.s16\t$dst, $a, $fbits", []>; + +def VUHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 0, + (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits), + IIC_fpCVTID, "vcvt", ".f64.u16\t$dst, $a, $fbits", []>; + +def VSLTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 1, + (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits), + IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a, $fbits", []>; + +def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1, + (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits), + IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>; + +} // End of 'let Constraints = "$a = $dst" in' + +//===----------------------------------------------------------------------===// +// FP Multiply-Accumulate Operations. +// + +def VMLAD : ADbI<0b11100, 0b00, 0, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + +def VMLAS : ASbIn<0b11100, 0b00, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), + (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; +def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), + (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>; + +def VMLSD : ADbI<0b11100, 0b00, 1, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + +def VMLSS : ASbIn<0b11100, 0b00, 1, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. 
+ let D = VFPNeonA8Domain; +} + +def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), + (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; +def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), + (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + +def VNMLAD : ADbI<0b11100, 0b01, 1, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + +def VNMLAS : ASbI<0b11100, 0b01, 1, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), + (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; +def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), + (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + +def VNMLSD : ADbI<0b11100, 0b01, 0, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; + +def VNMLSS : ASbI<0b11100, 0b01, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), + (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>; +def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), + (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; + +//===----------------------------------------------------------------------===// +// Fused FP Multiply-Accumulate Operations. +// +def VFMAD : ADbI<0b11101, 0b10, 0, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpFMAC64, "vfma", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>; + +def VFMAS : ASbIn<0b11101, 0b10, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpFMAC32, "vfma", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. 
+}
+
+def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VFMAD DPR:$dstin, DPR:$a, DPR:$b)>,
+      Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
+      Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fma x, y, z) -> (vfma z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
+          (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
+          (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+def VFMSD : ADbI<0b11101, 0b10, 1, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpFMAC64, "vfms", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfms", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VFMSD DPR:$dstin, DPR:$a, DPR:$b)>,
+      Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
+      Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fma (fneg x), y, z) -> (vfms z, x, y)
+def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
+          (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
+          (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fma x, (fneg y), z) -> (vfms z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
+          (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
+          (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpFMAC64, "vfnma", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+                RegConstraint<"$Ddin = $Dd">,
+                Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfnma", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+                RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+} + +def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), + (VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>; +def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), + (VFNMAS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; + +// Match @llvm.fma.* intrinsics +// (fneg (fma x, y, z)) -> (vfnma z, x, y) +def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), + (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4,HasDPVFP]>; +def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))), + (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; +// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y) +def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), + (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4,HasDPVFP]>; +def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), + (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; + +def VFNMSD : ADbI<0b11101, 0b01, 0, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpFMAC64, "vfnms", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>; + +def VFNMSS : ASbI<0b11101, 0b01, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. +} + +def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), + (VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>; +def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), + (VFNMSS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; + +// Match @llvm.fma.* intrinsics + +// (fma x, y, (fneg z)) -> (vfnms z, x, y)) +def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), + (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4,HasDPVFP]>; +def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), + (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; +// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y) +def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), + (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4,HasDPVFP]>; +def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), + (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; +// (fneg (fma x, (fneg y), z) -> (vfnms z, x, y) +def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))), + (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4,HasDPVFP]>; +def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))), + (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; + +//===----------------------------------------------------------------------===// +// FP Conditional moves. 
+//
+
+let hasSideEffects = 0 in {
+def VMOVDcc : PseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, cmovpred:$p),
+                    IIC_fpUNA64,
+                    [(set (f64 DPR:$Dd),
+                          (ARMcmov DPR:$Dn, DPR:$Dm, cmovpred:$p))]>,
+               RegConstraint<"$Dn = $Dd">, Requires<[HasVFP2,HasDPVFP]>;
+
+def VMOVScc : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
+                    IIC_fpUNA32,
+                    [(set (f32 SPR:$Sd),
+                          (ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>,
+               RegConstraint<"$Sn = $Sd">, Requires<[HasVFP2]>;
+} // hasSideEffects
+
+//===----------------------------------------------------------------------===//
+// Move from VFP System Register to ARM core register.
+//
+
+class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
+                 list<dag> pattern>:
+  VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
+
+  // Instruction operand.
+  bits<4> Rt;
+
+  let Inst{27-20} = 0b11101111;
+  let Inst{19-16} = opc19_16;
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = 0b1010;
+  let Inst{7}     = 0;
+  let Inst{6-5}   = 0b00;
+  let Inst{4}     = 1;
+  let Inst{3-0}   = 0b0000;
+}
+
+// APSR is the application level alias of CPSR. This moves the FPSCR N, Z, C,
+// V flags to APSR.
+let Defs = [CPSR], Uses = [FPSCR_NZCV], Rt = 0b1111 /* apsr_nzcv */ in
+def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins),
+                        "vmrs", "\tAPSR_nzcv, fpscr", [(arm_fmstat)]>;
+
+// Application level FPSCR -> GPR
+let hasSideEffects = 1, Uses = [FPSCR] in
+def VMRS : MovFromVFP<0b0001 /* fpscr */, (outs GPR:$Rt), (ins),
+                      "vmrs", "\t$Rt, fpscr",
+                      [(set GPR:$Rt, (int_arm_get_fpscr))]>;
+
+// System level FPEXC, FPSID -> GPR
+let Uses = [FPSCR] in {
+  def VMRS_FPEXC : MovFromVFP<0b1000 /* fpexc */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpexc", []>;
+  def VMRS_FPSID : MovFromVFP<0b0000 /* fpsid */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpsid", []>;
+  def VMRS_MVFR0 : MovFromVFP<0b0111 /* mvfr0 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr0", []>;
+  def VMRS_MVFR1 : MovFromVFP<0b0110 /* mvfr1 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr1", []>;
+  def VMRS_MVFR2 : MovFromVFP<0b0101 /* mvfr2 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr2", []>, Requires<[HasFPARMv8]>;
+  def VMRS_FPINST : MovFromVFP<0b1001 /* fpinst */, (outs GPR:$Rt), (ins),
+                               "vmrs", "\t$Rt, fpinst", []>;
+  def VMRS_FPINST2 : MovFromVFP<0b1010 /* fpinst2 */, (outs GPR:$Rt), (ins),
+                                "vmrs", "\t$Rt, fpinst2", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move from ARM core register to VFP System Register.
+//
+
+class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
+               list<dag> pattern>:
+  VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
+
+  // Instruction operand.
+  bits<4> src;
+
+  // Encode instruction operand.
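+  // (The transferred GPR is encoded in bits{15-12}, mirroring the Rt field of
+  // MovFromVFP above.)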
+  let Inst{15-12} = src;
+
+  let Inst{27-20} = 0b11101110;
+  let Inst{19-16} = opc19_16;
+  let Inst{11-8}  = 0b1010;
+  let Inst{7}     = 0;
+  let Inst{4}     = 1;
+}
+
+let Defs = [FPSCR] in {
+  // Application level GPR -> FPSCR
+  def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpscr, $src", [(int_arm_set_fpscr GPR:$src)]>;
+  // System level GPR -> FPEXC
+  def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPR:$src),
+                            "vmsr", "\tfpexc, $src", []>;
+  // System level GPR -> FPSID
+  def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPR:$src),
+                            "vmsr", "\tfpsid, $src", []>;
+
+  def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPR:$src),
+                             "vmsr", "\tfpinst, $src", []>;
+  def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPR:$src),
+                              "vmsr", "\tfpinst2, $src", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Misc.
+//
+
+// Materialize FP immediates. VFP3 only.
+let isReMaterializable = 1 in {
+def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm),
+                    VFPMiscFrm, IIC_fpUNA64,
+                    "vmov", ".f64\t$Dd, $imm",
+                    [(set DPR:$Dd, vfp_f64imm:$imm)]>,
+              Requires<[HasVFP3,HasDPVFP]> {
+  bits<5> Dd;
+  bits<8> imm;
+
+  let Inst{27-23} = 0b11101;
+  let Inst{22}    = Dd{4};
+  let Inst{21-20} = 0b11;
+  let Inst{19-16} = imm{7-4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision.
+  let Inst{7-4}   = 0b0000;
+  let Inst{3-0}   = imm{3-0};
+}
+
+def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
+                    VFPMiscFrm, IIC_fpUNA32,
+                    "vmov", ".f32\t$Sd, $imm",
+                    [(set SPR:$Sd, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> {
+  bits<5> Sd;
+  bits<8> imm;
+
+  let Inst{27-23} = 0b11101;
+  let Inst{22}    = Sd{0};
+  let Inst{21-20} = 0b11;
+  let Inst{19-16} = imm{7-4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision.
+  let Inst{7-4}   = 0b0000;
+  let Inst{3-0}   = imm{3-0};
+}
+}
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases.
+//
+// A few mnemonic aliases for pre-unified syntax. We don't guarantee to
+// support them all, but supporting at least some of the basics is
+// good to be friendly.
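+//
+// For example, the pre-unified "flds s0, [r0]" and "fmrs r0, s0" below are
+// accepted and assembled exactly as "vldr s0, [r0]" and "vmov r0, s0".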
+def : VFP2MnemonicAlias<"flds", "vldr">;
+def : VFP2MnemonicAlias<"fldd", "vldr">;
+def : VFP2MnemonicAlias<"fmrs", "vmov">;
+def : VFP2MnemonicAlias<"fmsr", "vmov">;
+def : VFP2MnemonicAlias<"fsqrts", "vsqrt">;
+def : VFP2MnemonicAlias<"fsqrtd", "vsqrt">;
+def : VFP2MnemonicAlias<"fadds", "vadd.f32">;
+def : VFP2MnemonicAlias<"faddd", "vadd.f64">;
+def : VFP2MnemonicAlias<"fmrdd", "vmov">;
+def : VFP2MnemonicAlias<"fmrds", "vmov">;
+def : VFP2MnemonicAlias<"fmrrd", "vmov">;
+def : VFP2MnemonicAlias<"fmdrr", "vmov">;
+def : VFP2MnemonicAlias<"fmuls", "vmul.f32">;
+def : VFP2MnemonicAlias<"fmuld", "vmul.f64">;
+def : VFP2MnemonicAlias<"fnegs", "vneg.f32">;
+def : VFP2MnemonicAlias<"fnegd", "vneg.f64">;
+def : VFP2MnemonicAlias<"ftosizd", "vcvt.s32.f64">;
+def : VFP2MnemonicAlias<"ftosid", "vcvtr.s32.f64">;
+def : VFP2MnemonicAlias<"ftosizs", "vcvt.s32.f32">;
+def : VFP2MnemonicAlias<"ftosis", "vcvtr.s32.f32">;
+def : VFP2MnemonicAlias<"ftouizd", "vcvt.u32.f64">;
+def : VFP2MnemonicAlias<"ftouid", "vcvtr.u32.f64">;
+def : VFP2MnemonicAlias<"ftouizs", "vcvt.u32.f32">;
+def : VFP2MnemonicAlias<"ftouis", "vcvtr.u32.f32">;
+def : VFP2MnemonicAlias<"fsitod", "vcvt.f64.s32">;
+def : VFP2MnemonicAlias<"fsitos", "vcvt.f32.s32">;
+def : VFP2MnemonicAlias<"fuitod", "vcvt.f64.u32">;
+def : VFP2MnemonicAlias<"fuitos", "vcvt.f32.u32">;
+def : VFP2MnemonicAlias<"fsts", "vstr">;
+def : VFP2MnemonicAlias<"fstd", "vstr">;
+def : VFP2MnemonicAlias<"fmacd", "vmla.f64">;
+def : VFP2MnemonicAlias<"fmacs", "vmla.f32">;
+def : VFP2MnemonicAlias<"fcpys", "vmov.f32">;
+def : VFP2MnemonicAlias<"fcpyd", "vmov.f64">;
+def : VFP2MnemonicAlias<"fcmps", "vcmp.f32">;
+def : VFP2MnemonicAlias<"fcmpd", "vcmp.f64">;
+def : VFP2MnemonicAlias<"fdivs", "vdiv.f32">;
+def : VFP2MnemonicAlias<"fdivd", "vdiv.f64">;
+def : VFP2MnemonicAlias<"fmrx", "vmrs">;
+def : VFP2MnemonicAlias<"fmxr", "vmsr">;
+
+// Be friendly and accept the old form of zero-compare
+def : VFP2DPInstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>;
+def : VFP2InstAlias<"fcmpzs${p} $val", (VCMPZS SPR:$val, pred:$p)>;
+
+
+def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>;
+def : VFP2InstAlias<"fadds${p} $Sd, $Sn, $Sm",
+                    (VADDS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
+def : VFP2DPInstAlias<"faddd${p} $Dd, $Dn, $Dm",
+                      (VADDD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+def : VFP2InstAlias<"fsubs${p} $Sd, $Sn, $Sm",
+                    (VSUBS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
+def : VFP2DPInstAlias<"fsubd${p} $Dd, $Dn, $Dm",
+                      (VSUBD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+
+// No need for the size suffix on VSQRT. It's implied by the register classes.
+def : VFP2InstAlias<"vsqrt${p} $Sd, $Sm", (VSQRTS SPR:$Sd, SPR:$Sm, pred:$p)>;
+def : VFP2DPInstAlias<"vsqrt${p} $Dd, $Dm", (VSQRTD DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+// VLDR/VSTR accept an optional type suffix.
+def : VFP2InstAlias<"vldr${p}.32 $Sd, $addr",
+                    (VLDRS SPR:$Sd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vstr${p}.32 $Sd, $addr",
+                    (VSTRS SPR:$Sd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vldr${p}.64 $Dd, $addr",
+                    (VLDRD DPR:$Dd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vstr${p}.64 $Dd, $addr",
+                    (VSTRD DPR:$Dd, addrmode5:$addr, pred:$p)>;
+
+// VMOV can accept an optional 32-bit or narrower data type suffix.
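+// The suffix is only syntax: each of the forms below still transfers a full
+// 32-bit register, so e.g. "vmov.8 r0, s0" assembles the same as
+// "vmov.32 r0, s0".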
+def : VFP2InstAlias<"vmov${p}.8 $Rt, $Sn",
+                    (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.16 $Rt, $Sn",
+                    (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.32 $Rt, $Sn",
+                    (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.8 $Sn, $Rt",
+                    (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.16 $Sn, $Rt",
+                    (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.32 $Sn, $Rt",
+                    (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+
+def : VFP2InstAlias<"vmov${p}.f64 $Rt, $Rt2, $Dn",
+                    (VMOVRRD GPR:$Rt, GPR:$Rt2, DPR:$Dn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.f64 $Dn, $Rt, $Rt2",
+                    (VMOVDRR DPR:$Dn, GPR:$Rt, GPR:$Rt2, pred:$p)>;
+
+// VMOVS doesn't need the .f32 to disambiguate from the NEON encoding the way
+// VMOVD does.
+def : VFP2InstAlias<"vmov${p} $Sd, $Sm",
+                    (VMOVS SPR:$Sd, SPR:$Sm, pred:$p)>;
+
+// FCONSTD/FCONSTS alias for vmov.f64/vmov.f32
+// These aliases provide added functionality over vmov.f instructions by
+// allowing users to write assembly containing encoded floating point constants
+// (e.g. #0x70 vs #1.0). Without these aliases there is no way for the
+// assembler to accept encoded fp constants (but the equivalent fp-literal is
+// accepted directly by vmov.f).
+def : VFP3InstAlias<"fconstd${p} $Dd, $val",
+                    (FCONSTD DPR:$Dd, vfp_f64imm:$val, pred:$p)>;
+def : VFP3InstAlias<"fconsts${p} $Sd, $val",
+                    (FCONSTS SPR:$Sd, vfp_f32imm:$val, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
new file mode 100644
index 0000000..6e7e47b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -0,0 +1,2338 @@
+//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a pass that performs load / store related peephole
+/// optimizations. This pass should be run after register allocation.
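+/// The main transformations are merging several loads or stores on a common
+/// base register into a single LDM/STM (or VLDM/VSTM, or LDRD/STRD), and
+/// folding increments or decrements of the base register into the writeback
+/// form of a nearby memory operation.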
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMISelLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "ThumbRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-ldst-opt"
+
+STATISTIC(NumLDMGened , "Number of ldm instructions generated");
+STATISTIC(NumSTMGened , "Number of stm instructions generated");
+STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
+STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
+STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
+STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
+STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
+STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
+STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
+STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
+STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
+
+namespace llvm {
+void initializeARMLoadStoreOptPass(PassRegistry &);
+}
+
+#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass"
+
+namespace {
+  /// Post-register-allocation pass that combines load / store instructions to
+  /// form ldm / stm instructions.
+  struct ARMLoadStoreOpt : public MachineFunctionPass {
+    static char ID;
+    ARMLoadStoreOpt() : MachineFunctionPass(ID) {
+      initializeARMLoadStoreOptPass(*PassRegistry::getPassRegistry());
+    }
+
+    const MachineFunction *MF;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    const TargetLowering *TL;
+    ARMFunctionInfo *AFI;
+    LivePhysRegs LiveRegs;
+    RegisterClassInfo RegClassInfo;
+    MachineBasicBlock::const_iterator LiveRegPos;
+    bool LiveRegsValid;
+    bool RegClassInfoValid;
+    bool isThumb1, isThumb2;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return ARM_LOAD_STORE_OPT_NAME;
+    }
+
+  private:
+    /// A set of load/store MachineInstrs with the same base register sorted
+    /// by offset.
+    struct MemOpQueueEntry {
+      MachineInstr *MI;
+      int Offset;        ///< Load/Store offset.
+      unsigned Position; ///< Position as counted from end of basic block.
+ MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position) + : MI(MI), Offset(Offset), Position(Position) {} + }; + typedef SmallVector<MemOpQueueEntry,8> MemOpQueue; + + /// A set of MachineInstrs that fulfill (nearly all) conditions to get + /// merged into a LDM/STM. + struct MergeCandidate { + /// List of instructions ordered by load/store offset. + SmallVector<MachineInstr*, 4> Instrs; + /// Index in Instrs of the instruction being latest in the schedule. + unsigned LatestMIIdx; + /// Index in Instrs of the instruction being earliest in the schedule. + unsigned EarliestMIIdx; + /// Index into the basic block where the merged instruction will be + /// inserted. (See MemOpQueueEntry.Position) + unsigned InsertPos; + /// Whether the instructions can be merged into a ldm/stm instruction. + bool CanMergeToLSMulti; + /// Whether the instructions can be merged into a ldrd/strd instruction. + bool CanMergeToLSDouble; + }; + SpecificBumpPtrAllocator<MergeCandidate> Allocator; + SmallVector<const MergeCandidate*,4> Candidates; + SmallVector<MachineInstr*,4> MergeBaseCandidates; + + void moveLiveRegsBefore(const MachineBasicBlock &MBB, + MachineBasicBlock::const_iterator Before); + unsigned findFreeReg(const TargetRegisterClass &RegClass); + void UpdateBaseRegUses(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned Base, unsigned WordOffset, + ARMCC::CondCodes Pred, unsigned PredReg); + MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs); + MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const; + void FormCandidates(const MemOpQueue &MemOps); + MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand); + bool FixInvalidRegPairOp(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI); + bool MergeBaseUpdateLoadStore(MachineInstr *MI); + bool MergeBaseUpdateLSMultiple(MachineInstr *MI); + bool MergeBaseUpdateLSDouble(MachineInstr &MI) const; + bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); + bool MergeReturnIntoLDM(MachineBasicBlock &MBB); + bool CombineMovBx(MachineBasicBlock &MBB); + }; + char ARMLoadStoreOpt::ID = 0; +} + +INITIALIZE_PASS(ARMLoadStoreOpt, "arm-load-store-opt", ARM_LOAD_STORE_OPT_NAME, false, false) + +static bool definesCPSR(const MachineInstr *MI) { + for (const auto &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead()) + // If the instruction has live CPSR def, then it's not safe to fold it + // into load / store. 
+ return true; + } + + return false; +} + +static int getMemoryOpOffset(const MachineInstr *MI) { + unsigned Opcode = MI->getOpcode(); + bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD; + unsigned NumOperands = MI->getDesc().getNumOperands(); + unsigned OffField = MI->getOperand(NumOperands-3).getImm(); + + if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 || + Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 || + Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 || + Opcode == ARM::LDRi12 || Opcode == ARM::STRi12) + return OffField; + + // Thumb1 immediate offsets are scaled by 4 + if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi || + Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) + return OffField * 4; + + int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField) + : ARM_AM::getAM5Offset(OffField) * 4; + ARM_AM::AddrOpc Op = isAM3 ? ARM_AM::getAM3Op(OffField) + : ARM_AM::getAM5Op(OffField); + + if (Op == ARM_AM::sub) + return -Offset; + + return Offset; +} + +static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) { + return MI.getOperand(1); +} + +static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) { + return MI.getOperand(0); +} + +static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) { + switch (Opcode) { + default: llvm_unreachable("Unhandled opcode!"); + case ARM::LDRi12: + ++NumLDMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::LDMIA; + case ARM_AM::da: return ARM::LDMDA; + case ARM_AM::db: return ARM::LDMDB; + case ARM_AM::ib: return ARM::LDMIB; + } + case ARM::STRi12: + ++NumSTMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::STMIA; + case ARM_AM::da: return ARM::STMDA; + case ARM_AM::db: return ARM::STMDB; + case ARM_AM::ib: return ARM::STMIB; + } + case ARM::tLDRi: + case ARM::tLDRspi: + // tLDMIA is writeback-only - unless the base register is in the input + // reglist. + ++NumLDMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::tLDMIA; + } + case ARM::tSTRi: + case ARM::tSTRspi: + // There is no non-writeback tSTMIA either. + ++NumSTMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::tSTMIA_UPD; + } + case ARM::t2LDRi8: + case ARM::t2LDRi12: + ++NumLDMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::t2LDMIA; + case ARM_AM::db: return ARM::t2LDMDB; + } + case ARM::t2STRi8: + case ARM::t2STRi12: + ++NumSTMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::t2STMIA; + case ARM_AM::db: return ARM::t2STMDB; + } + case ARM::VLDRS: + ++NumVLDMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VLDMSIA; + case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists. + } + case ARM::VSTRS: + ++NumVSTMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VSTMSIA; + case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists. + } + case ARM::VLDRD: + ++NumVLDMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VLDMDIA; + case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists. 
+ } + case ARM::VSTRD: + ++NumVSTMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VSTMDIA; + case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists. + } + } +} + +static ARM_AM::AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) { + switch (Opcode) { + default: llvm_unreachable("Unhandled opcode!"); + case ARM::LDMIA_RET: + case ARM::LDMIA: + case ARM::LDMIA_UPD: + case ARM::STMIA: + case ARM::STMIA_UPD: + case ARM::tLDMIA: + case ARM::tLDMIA_UPD: + case ARM::tSTMIA_UPD: + case ARM::t2LDMIA_RET: + case ARM::t2LDMIA: + case ARM::t2LDMIA_UPD: + case ARM::t2STMIA: + case ARM::t2STMIA_UPD: + case ARM::VLDMSIA: + case ARM::VLDMSIA_UPD: + case ARM::VSTMSIA: + case ARM::VSTMSIA_UPD: + case ARM::VLDMDIA: + case ARM::VLDMDIA_UPD: + case ARM::VSTMDIA: + case ARM::VSTMDIA_UPD: + return ARM_AM::ia; + + case ARM::LDMDA: + case ARM::LDMDA_UPD: + case ARM::STMDA: + case ARM::STMDA_UPD: + return ARM_AM::da; + + case ARM::LDMDB: + case ARM::LDMDB_UPD: + case ARM::STMDB: + case ARM::STMDB_UPD: + case ARM::t2LDMDB: + case ARM::t2LDMDB_UPD: + case ARM::t2STMDB: + case ARM::t2STMDB_UPD: + case ARM::VLDMSDB_UPD: + case ARM::VSTMSDB_UPD: + case ARM::VLDMDDB_UPD: + case ARM::VSTMDDB_UPD: + return ARM_AM::db; + + case ARM::LDMIB: + case ARM::LDMIB_UPD: + case ARM::STMIB: + case ARM::STMIB_UPD: + return ARM_AM::ib; + } +} + +static bool isT1i32Load(unsigned Opc) { + return Opc == ARM::tLDRi || Opc == ARM::tLDRspi; +} + +static bool isT2i32Load(unsigned Opc) { + return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8; +} + +static bool isi32Load(unsigned Opc) { + return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc) ; +} + +static bool isT1i32Store(unsigned Opc) { + return Opc == ARM::tSTRi || Opc == ARM::tSTRspi; +} + +static bool isT2i32Store(unsigned Opc) { + return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8; +} + +static bool isi32Store(unsigned Opc) { + return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc); +} + +static bool isLoadSingle(unsigned Opc) { + return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD; +} + +static unsigned getImmScale(unsigned Opc) { + switch (Opc) { + default: llvm_unreachable("Unhandled opcode!"); + case ARM::tLDRi: + case ARM::tSTRi: + case ARM::tLDRspi: + case ARM::tSTRspi: + return 1; + case ARM::tLDRHi: + case ARM::tSTRHi: + return 2; + case ARM::tLDRBi: + case ARM::tSTRBi: + return 4; + } +} + +static unsigned getLSMultipleTransferSize(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return 0; + case ARM::LDRi12: + case ARM::STRi12: + case ARM::tLDRi: + case ARM::tSTRi: + case ARM::tLDRspi: + case ARM::tSTRspi: + case ARM::t2LDRi8: + case ARM::t2LDRi12: + case ARM::t2STRi8: + case ARM::t2STRi12: + case ARM::VLDRS: + case ARM::VSTRS: + return 4; + case ARM::VLDRD: + case ARM::VSTRD: + return 8; + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + case ARM::tLDMIA: + case ARM::tLDMIA_UPD: + case ARM::tSTMIA_UPD: + case ARM::t2LDMIA: + case ARM::t2LDMDB: + case ARM::t2STMIA: + case ARM::t2STMDB: + case ARM::VLDMSIA: + case ARM::VSTMSIA: + return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4; + case ARM::VLDMDIA: + case ARM::VSTMDIA: + return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8; + } +} + +/// Update future uses of the base register with the offset introduced +/// due to writeback. This function only works on Thumb1. 
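+/// For example, once "ldm r0, {r1, r2}" has been rewritten to the writeback
+/// form "ldm r0!, {r1, r2}", a following "ldr r3, [r0, #8]" is rewritten to
+/// "ldr r3, [r0]"; where an offset cannot be adjusted, a compensating
+/// "subs r0, #8" is inserted instead.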
+void +ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned Base, + unsigned WordOffset, + ARMCC::CondCodes Pred, unsigned PredReg) { + assert(isThumb1 && "Can only update base register uses for Thumb1!"); + // Start updating any instructions with immediate offsets. Insert a SUB before + // the first non-updateable instruction (if any). + for (; MBBI != MBB.end(); ++MBBI) { + bool InsertSub = false; + unsigned Opc = MBBI->getOpcode(); + + if (MBBI->readsRegister(Base)) { + int Offset; + bool IsLoad = + Opc == ARM::tLDRi || Opc == ARM::tLDRHi || Opc == ARM::tLDRBi; + bool IsStore = + Opc == ARM::tSTRi || Opc == ARM::tSTRHi || Opc == ARM::tSTRBi; + + if (IsLoad || IsStore) { + // Loads and stores with immediate offsets can be updated, but only if + // the new offset isn't negative. + // The MachineOperand containing the offset immediate is the last one + // before predicates. + MachineOperand &MO = + MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3); + // The offsets are scaled by 1, 2 or 4 depending on the Opcode. + Offset = MO.getImm() - WordOffset * getImmScale(Opc); + + // If storing the base register, it needs to be reset first. + unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg(); + + if (Offset >= 0 && !(IsStore && InstrSrcReg == Base)) + MO.setImm(Offset); + else + InsertSub = true; + + } else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) && + !definesCPSR(MBBI)) { + // SUBS/ADDS using this register, with a dead def of the CPSR. + // Merge it with the update; if the merged offset is too large, + // insert a new sub instead. + MachineOperand &MO = + MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3); + Offset = (Opc == ARM::tSUBi8) ? + MO.getImm() + WordOffset * 4 : + MO.getImm() - WordOffset * 4 ; + if (Offset >= 0 && TL->isLegalAddImmediate(Offset)) { + // FIXME: Swap ADDS<->SUBS if Offset < 0, erase instruction if + // Offset == 0. + MO.setImm(Offset); + // The base register has now been reset, so exit early. + return; + } else { + InsertSub = true; + } + + } else { + // Can't update the instruction. + InsertSub = true; + } + + } else if (definesCPSR(MBBI) || MBBI->isCall() || MBBI->isBranch()) { + // Since SUBS sets the condition flags, we can't place the base reset + // after an instruction that has a live CPSR def. + // The base register might also contain an argument for a function call. + InsertSub = true; + } + + if (InsertSub) { + // An instruction above couldn't be updated, so insert a sub. + AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true) + .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg); + return; + } + + if (MBBI->killsRegister(Base) || MBBI->definesRegister(Base)) + // Register got killed. Stop updating. + return; + } + + // End of block was reached. + if (MBB.succ_size() > 0) { + // FIXME: Because of a bug, live registers are sometimes missing from + // the successor blocks' live-in sets. This means we can't trust that + // information and *always* have to reset at the end of a block. + // See PR21029. + if (MBBI != MBB.end()) --MBBI; + AddDefaultT1CC( + BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true) + .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg); + } +} + +/// Return the first register of class \p RegClass that is not in \p Regs. 
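+/// ("Free" here means not currently live according to the LiveRegs set
+/// computed by moveLiveRegsBefore; returns 0 if no such register exists.)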
+unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
+  if (!RegClassInfoValid) {
+    RegClassInfo.runOnMachineFunction(*MF);
+    RegClassInfoValid = true;
+  }
+
+  for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
+    if (!LiveRegs.contains(Reg))
+      return Reg;
+  return 0;
+}
+
+/// Compute live registers just before instruction \p Before (in normal schedule
+/// direction). Computes backwards so multiple queries in the same block must
+/// come in reverse order.
+void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
+                                         MachineBasicBlock::const_iterator Before) {
+  // Initialize if we never queried in this block.
+  if (!LiveRegsValid) {
+    LiveRegs.init(TRI);
+    LiveRegs.addLiveOuts(&MBB, true);
+    LiveRegPos = MBB.end();
+    LiveRegsValid = true;
+  }
+  // Move backward just before the "Before" position.
+  while (LiveRegPos != Before) {
+    --LiveRegPos;
+    LiveRegs.stepBackward(*LiveRegPos);
+  }
+}
+
+static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
+                        unsigned Reg) {
+  for (const std::pair<unsigned, bool> &R : Regs)
+    if (R.first == Reg)
+      return true;
+  return false;
+}
+
+/// Create and insert an LDM or STM with Base as base register and registers in
+/// Regs as the register operands that would be loaded / stored. Returns the
+/// new instruction, or nullptr if the transformation could not be performed.
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) {
+  unsigned NumRegs = Regs.size();
+  assert(NumRegs > 1);
+
+  // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
+  // Compute liveness information for that register to make the decision.
+  bool SafeToClobberCPSR = !isThumb1 ||
+    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
+     MachineBasicBlock::LQR_Dead);
+
+  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.
+
+  // Exception: If the base register is in the input reglist, Thumb1 LDM is
+  // non-writeback.
+  // It's also not possible to merge an STR of the base register in Thumb1.
+  if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
+    assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
+    if (Opcode == ARM::tLDRi) {
+      Writeback = false;
+    } else if (Opcode == ARM::tSTRi) {
+      return nullptr;
+    }
+  }
+
+  ARM_AM::AMSubMode Mode = ARM_AM::ia;
+  // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
+  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+  bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;
+
+  if (Offset == 4 && haveIBAndDA) {
+    Mode = ARM_AM::ib;
+  } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
+    Mode = ARM_AM::da;
+  } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
+    // VLDM/VSTM do not support DB mode without also updating the base reg.
+    Mode = ARM_AM::db;
+  } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
+    // Check if this is a supported opcode before inserting instructions to
+    // calculate a new base register.
+    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;
+
+    // If starting offset isn't zero, insert a MI to materialize a new base.
+    // But only do so if it is cost effective, i.e. merging more than two
+    // loads / stores.
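+    // For example (with the lowest address at r0+#8):
+    //   ldr r1, [r0, #8]; ldr r2, [r0, #12]; ldr r3, [r0, #16]
+    // can become
+    //   add r3, r0, #8
+    //   ldm r3, {r1, r2, r3}
+    // because for a load the highest destination register may double as the
+    // new base.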
+ if (NumRegs <= 2) + return nullptr; + + // On Thumb1, it's not worth materializing a new base register without + // clobbering the CPSR (i.e. not using ADDS/SUBS). + if (!SafeToClobberCPSR) + return nullptr; + + unsigned NewBase; + if (isi32Load(Opcode)) { + // If it is a load, then just use one of the destination registers + // as the new base. Will no longer be writeback in Thumb1. + NewBase = Regs[NumRegs-1].first; + Writeback = false; + } else { + // Find a free register that we can use as scratch register. + moveLiveRegsBefore(MBB, InsertBefore); + // The merged instruction does not exist yet but will use several Regs if + // it is a Store. + if (!isLoadSingle(Opcode)) + for (const std::pair<unsigned, bool> &R : Regs) + LiveRegs.addReg(R.first); + + NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass); + if (NewBase == 0) + return nullptr; + } + + int BaseOpc = + isThumb2 ? ARM::t2ADDri : + (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi : + (isThumb1 && Offset < 8) ? ARM::tADDi3 : + isThumb1 ? ARM::tADDi8 : ARM::ADDri; + + if (Offset < 0) { + Offset = - Offset; + BaseOpc = + isThumb2 ? ARM::t2SUBri : + (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 : + isThumb1 ? ARM::tSUBi8 : ARM::SUBri; + } + + if (!TL->isLegalAddImmediate(Offset)) + // FIXME: Try add with register operand? + return nullptr; // Probably not worth it then. + + // We can only append a kill flag to the add/sub input if the value is not + // used in the register list of the stm as well. + bool KillOldBase = BaseKill && + (!isi32Store(Opcode) || !ContainsReg(Regs, Base)); + + if (isThumb1) { + // Thumb1: depending on immediate size, use either + // ADDS NewBase, Base, #imm3 + // or + // MOV NewBase, Base + // ADDS NewBase, #imm8. + if (Base != NewBase && + (BaseOpc == ARM::tADDi8 || BaseOpc == ARM::tSUBi8)) { + // Need to insert a MOV to the new base first. + if (isARMLowRegister(NewBase) && isARMLowRegister(Base) && + !STI->hasV6Ops()) { + // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr + if (Pred != ARMCC::AL) + return nullptr; + BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase) + .addReg(Base, getKillRegState(KillOldBase)); + } else + BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase) + .addReg(Base, getKillRegState(KillOldBase)) + .addImm(Pred).addReg(PredReg); + + // The following ADDS/SUBS becomes an update. + Base = NewBase; + KillOldBase = true; + } + if (BaseOpc == ARM::tADDrSPi) { + assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4"); + BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase) + .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4) + .addImm(Pred).addReg(PredReg); + } else + AddDefaultT1CC( + BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true) + .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset) + .addImm(Pred).addReg(PredReg); + } else { + BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase) + .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset) + .addImm(Pred).addReg(PredReg).addReg(0); + } + Base = NewBase; + BaseKill = true; // New base is always killed straight away. + } + + bool isDef = isLoadSingle(Opcode); + + // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with + // base register writeback. + Opcode = getLoadStoreMultipleOpcode(Opcode, Mode); + if (!Opcode) + return nullptr; + + // Check if a Thumb1 LDM/STM merge is safe. 
This is the case if:
+  // - There is no writeback (LDM of base register),
+  // - the base register is killed by the merged instruction,
+  // - or it's safe to overwrite the condition flags, i.e. to insert a SUBS
+  //   to reset the base register.
+  // Otherwise, don't merge.
+  // It's safe to return here since the code to materialize a new base register
+  // above is also conditional on SafeToClobberCPSR.
+  if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
+    return nullptr;
+
+  MachineInstrBuilder MIB;
+
+  if (Writeback) {
+    assert(isThumb1 && "expected Writeback only in Thumb1");
+    if (Opcode == ARM::tLDMIA) {
+      assert(!(ContainsReg(Regs, Base)) && "Thumb1 can't LDM ! with Base in Regs");
+      // Update tLDMIA with writeback if necessary.
+      Opcode = ARM::tLDMIA_UPD;
+    }
+
+    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
+
+    // Thumb1: we might need to set base writeback when building the MI.
+    MIB.addReg(Base, getDefRegState(true))
+       .addReg(Base, getKillRegState(BaseKill));
+
+    // The base isn't dead after a merged instruction with writeback.
+    // Insert a sub instruction after the newly formed instruction to reset.
+    if (!BaseKill)
+      UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);
+
+  } else {
+    // No writeback, simply build the MachineInstr.
+    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
+    MIB.addReg(Base, getKillRegState(BaseKill));
+  }
+
+  MIB.addImm(Pred).addReg(PredReg);
+
+  for (const std::pair<unsigned, bool> &R : Regs)
+    MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));
+
+  return MIB.getInstr();
+}
+
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const {
+  bool IsLoad = isi32Load(Opcode);
+  assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
+  unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
+
+  assert(Regs.size() == 2);
+  MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
+                                    TII->get(LoadStoreOpcode));
+  if (IsLoad) {
+    MIB.addReg(Regs[0].first, RegState::Define)
+       .addReg(Regs[1].first, RegState::Define);
+  } else {
+    MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second))
+       .addReg(Regs[1].first, getKillRegState(Regs[1].second));
+  }
+  MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+  return MIB.getInstr();
+}
+
+/// Try to merge the instructions in \p Cand into a single LDM/STM/LDRD/STRD
+/// instruction, updating the surrounding code accordingly on success.
+MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
+  const MachineInstr *First = Cand.Instrs.front();
+  unsigned Opcode = First->getOpcode();
+  bool IsLoad = isLoadSingle(Opcode);
+  SmallVector<std::pair<unsigned, bool>, 8> Regs;
+  SmallVector<unsigned, 4> ImpDefs;
+  DenseSet<unsigned> KilledRegs;
+  DenseSet<unsigned> UsedRegs;
+  // Determine list of registers and list of implicit super-register defs.
+  for (const MachineInstr *MI : Cand.Instrs) {
+    const MachineOperand &MO = getLoadStoreRegOp(*MI);
+    unsigned Reg = MO.getReg();
+    bool IsKill = MO.isKill();
+    if (IsKill)
+      KilledRegs.insert(Reg);
+    Regs.push_back(std::make_pair(Reg, IsKill));
+    UsedRegs.insert(Reg);
+
+    if (IsLoad) {
+      // Collect any implicit defs of super-registers; after merging we can't
+      // be sure anymore that we properly preserved these live ranges and must
+      // remove these implicit operands.
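+      // (A typical case: two VLDRS into s0 and s1 that each carry an
+      // implicit-def of the containing super-register d0.)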
+      for (const MachineOperand &MO : MI->implicit_operands()) {
+        if (!MO.isReg() || !MO.isDef() || MO.isDead())
+          continue;
+        assert(MO.isImplicit());
+        unsigned DefReg = MO.getReg();
+
+        if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end())
+          continue;
+        // We can ignore cases where the super-reg is read and written.
+        if (MI->readsRegister(DefReg))
+          continue;
+        ImpDefs.push_back(DefReg);
+      }
+    }
+  }
+
+  // Attempt the merge.
+  typedef MachineBasicBlock::iterator iterator;
+  MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
+  iterator InsertBefore = std::next(iterator(LatestMI));
+  MachineBasicBlock &MBB = *LatestMI->getParent();
+  unsigned Offset = getMemoryOpOffset(First);
+  unsigned Base = getLoadStoreBaseOp(*First).getReg();
+  bool BaseKill = LatestMI->killsRegister(Base);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg);
+  DebugLoc DL = First->getDebugLoc();
+  MachineInstr *Merged = nullptr;
+  if (Cand.CanMergeToLSDouble)
+    Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
+                                   Opcode, Pred, PredReg, DL, Regs);
+  if (!Merged && Cand.CanMergeToLSMulti)
+    Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
+                                  Opcode, Pred, PredReg, DL, Regs);
+  if (!Merged)
+    return nullptr;
+
+  // Determine earliest instruction that will get removed. We then keep an
+  // iterator just above it so the following erases don't invalidate it.
+  iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
+  bool EarliestAtBegin = false;
+  if (EarliestI == MBB.begin()) {
+    EarliestAtBegin = true;
+  } else {
+    EarliestI = std::prev(EarliestI);
+  }
+
+  // Remove instructions which have been merged.
+  for (MachineInstr *MI : Cand.Instrs)
+    MBB.erase(MI);
+
+  // Determine range between the earliest removed instruction and the new one.
+  if (EarliestAtBegin)
+    EarliestI = MBB.begin();
+  else
+    EarliestI = std::next(EarliestI);
+  auto FixupRange = make_range(EarliestI, iterator(Merged));
+
+  if (isLoadSingle(Opcode)) {
+    // If the previous loads defined a super-reg, then we have to mark earlier
+    // operands undef; replicate the super-reg def on the merged instruction.
+    for (MachineInstr &MI : FixupRange) {
+      for (unsigned &ImpDefReg : ImpDefs) {
+        for (MachineOperand &MO : MI.implicit_operands()) {
+          if (!MO.isReg() || MO.getReg() != ImpDefReg)
+            continue;
+          if (MO.readsReg())
+            MO.setIsUndef();
+          else if (MO.isDef())
+            ImpDefReg = 0;
+        }
+      }
+    }
+
+    MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
+    for (unsigned ImpDef : ImpDefs)
+      MIB.addReg(ImpDef, RegState::ImplicitDefine);
+  } else {
+    // Remove kill flags; we may now be storing the values later than before.
+    assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
+    for (MachineInstr &MI : FixupRange) {
+      for (MachineOperand &MO : MI.uses()) {
+        if (!MO.isReg() || !MO.isKill())
+          continue;
+        if (UsedRegs.count(MO.getReg()))
+          MO.setIsKill(false);
+      }
+    }
+    assert(ImpDefs.empty());
+  }
+
+  return Merged;
+}
+
+static bool isValidLSDoubleOffset(int Offset) {
+  unsigned Value = abs(Offset);
+  // t2LDRDi8/t2STRDi8 supports an 8 bit immediate which is internally
+  // multiplied by 4.
+  return (Value % 4) == 0 && Value < 1024;
+}
+
+/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
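+/// For example, "ldr r1, [r0]; ldr r2, [r0, #4]" yields a candidate that can
+/// be rewritten as "ldm r0, {r1, r2}"; "ldr r2, [r0]; ldr r1, [r0, #4]"
+/// cannot become an LDM, since LDM register numbers must ascend with the
+/// address, though Thumb2 may still pair it as an LDRD.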
+void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
+  const MachineInstr *FirstMI = MemOps[0].MI;
+  unsigned Opcode = FirstMI->getOpcode();
+  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+  unsigned Size = getLSMultipleTransferSize(FirstMI);
+
+  unsigned SIndex = 0;
+  unsigned EIndex = MemOps.size();
+  do {
+    // Look at the first instruction.
+    const MachineInstr *MI = MemOps[SIndex].MI;
+    int Offset = MemOps[SIndex].Offset;
+    const MachineOperand &PMO = getLoadStoreRegOp(*MI);
+    unsigned PReg = PMO.getReg();
+    unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
+    unsigned Latest = SIndex;
+    unsigned Earliest = SIndex;
+    unsigned Count = 1;
+    bool CanMergeToLSDouble =
+      STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset);
+    // ARM errata 602117: LDRD with base in list may result in incorrect base
+    // register when interrupted or faulted.
+    if (STI->isCortexM3() && isi32Load(Opcode) &&
+        PReg == getLoadStoreBaseOp(*MI).getReg())
+      CanMergeToLSDouble = false;
+
+    bool CanMergeToLSMulti = true;
+    // On Swift, avoid vldm/vstm starting with an odd register number, as that
+    // needs more uops than single vldrs.
+    if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
+      CanMergeToLSMulti = false;
+
+    // LDRD/STRD do not allow SP/PC. LDM/STM either do not support them or
+    // have their use deprecated; LDM to PC is fine but cannot happen here.
+    if (PReg == ARM::SP || PReg == ARM::PC)
+      CanMergeToLSMulti = CanMergeToLSDouble = false;
+
+    // Merge following instructions where possible.
+    for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
+      int NewOffset = MemOps[I].Offset;
+      if (NewOffset != Offset + (int)Size)
+        break;
+      const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
+      unsigned Reg = MO.getReg();
+      if (Reg == ARM::SP || Reg == ARM::PC)
+        break;
+
+      // See if the current load/store may be part of a multi load/store.
+      unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
+      bool PartOfLSMulti = CanMergeToLSMulti;
+      if (PartOfLSMulti) {
+        // Register numbers must be in ascending order.
+        if (RegNum <= PRegNum)
+          PartOfLSMulti = false;
+        // For VFP / NEON load/store multiples, the registers must be
+        // consecutive and within the limit on the number of registers per
+        // instruction.
+        else if (!isNotVFP && RegNum != PRegNum+1)
+          PartOfLSMulti = false;
+      }
+      // See if the current load/store may be part of a double load/store.
+      bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1;
+
+      if (!PartOfLSMulti && !PartOfLSDouble)
+        break;
+      CanMergeToLSMulti &= PartOfLSMulti;
+      CanMergeToLSDouble &= PartOfLSDouble;
+      // Track MemOp with latest and earliest position (Positions are
+      // counted in reverse).
+      unsigned Position = MemOps[I].Position;
+      if (Position < MemOps[Latest].Position)
+        Latest = I;
+      else if (Position > MemOps[Earliest].Position)
+        Earliest = I;
+      // Prepare for next MemOp.
+      Offset += Size;
+      PRegNum = RegNum;
+    }
+
+    // Form a candidate from the Ops collected so far.
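+    // Even a single load/store forms a candidate: it cannot become an
+    // LDM/STM, but a base register inc/dec may still be folded into it as
+    // writeback later on.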
+ MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate; + for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C) + Candidate->Instrs.push_back(MemOps[C].MI); + Candidate->LatestMIIdx = Latest - SIndex; + Candidate->EarliestMIIdx = Earliest - SIndex; + Candidate->InsertPos = MemOps[Latest].Position; + if (Count == 1) + CanMergeToLSMulti = CanMergeToLSDouble = false; + Candidate->CanMergeToLSMulti = CanMergeToLSMulti; + Candidate->CanMergeToLSDouble = CanMergeToLSDouble; + Candidates.push_back(Candidate); + // Continue after the chain. + SIndex += Count; + } while (SIndex < EIndex); +} + +static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, + ARM_AM::AMSubMode Mode) { + switch (Opc) { + default: llvm_unreachable("Unhandled opcode!"); + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::LDMIA_UPD; + case ARM_AM::ib: return ARM::LDMIB_UPD; + case ARM_AM::da: return ARM::LDMDA_UPD; + case ARM_AM::db: return ARM::LDMDB_UPD; + } + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::STMIA_UPD; + case ARM_AM::ib: return ARM::STMIB_UPD; + case ARM_AM::da: return ARM::STMDA_UPD; + case ARM_AM::db: return ARM::STMDB_UPD; + } + case ARM::t2LDMIA: + case ARM::t2LDMDB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::t2LDMIA_UPD; + case ARM_AM::db: return ARM::t2LDMDB_UPD; + } + case ARM::t2STMIA: + case ARM::t2STMDB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::t2STMIA_UPD; + case ARM_AM::db: return ARM::t2STMDB_UPD; + } + case ARM::VLDMSIA: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VLDMSIA_UPD; + case ARM_AM::db: return ARM::VLDMSDB_UPD; + } + case ARM::VLDMDIA: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VLDMDIA_UPD; + case ARM_AM::db: return ARM::VLDMDDB_UPD; + } + case ARM::VSTMSIA: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VSTMSIA_UPD; + case ARM_AM::db: return ARM::VSTMSDB_UPD; + } + case ARM::VSTMDIA: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VSTMDIA_UPD; + case ARM_AM::db: return ARM::VSTMDDB_UPD; + } + } +} + +/// Check if the given instruction increments or decrements a register and +/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags +/// generated by the instruction are possibly read as well. 
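+/// For example, "add r0, r0, #8" (with matching predicate and no live flag
+/// def) yields 8, and "sub r0, r0, #8" yields -8.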
+static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
+                                  ARMCC::CondCodes Pred, unsigned PredReg) {
+  bool CheckCPSRDef;
+  int Scale;
+  switch (MI.getOpcode()) {
+  case ARM::tADDi8:  Scale = 4;  CheckCPSRDef = true; break;
+  case ARM::tSUBi8:  Scale = -4; CheckCPSRDef = true; break;
+  case ARM::t2SUBri:
+  case ARM::SUBri:   Scale = -1; CheckCPSRDef = true; break;
+  case ARM::t2ADDri:
+  case ARM::ADDri:   Scale = 1;  CheckCPSRDef = true; break;
+  case ARM::tADDspi: Scale = 4;  CheckCPSRDef = false; break;
+  case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
+  default: return 0;
+  }
+
+  unsigned MIPredReg;
+  if (MI.getOperand(0).getReg() != Reg ||
+      MI.getOperand(1).getReg() != Reg ||
+      getInstrPredicate(&MI, MIPredReg) != Pred ||
+      MIPredReg != PredReg)
+    return 0;
+
+  if (CheckCPSRDef && definesCPSR(&MI))
+    return 0;
+  return MI.getOperand(2).getImm() * Scale;
+}
+
+/// Searches for an increment or decrement of \p Reg before \p MBBI.
+static MachineBasicBlock::iterator
+findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
+                 ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+  Offset = 0;
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+  MachineBasicBlock::iterator EndMBBI = MBB.end();
+  if (MBBI == BeginMBBI)
+    return EndMBBI;
+
+  // Skip debug values.
+  MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
+  while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
+    --PrevMBBI;
+
+  Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
+  return Offset == 0 ? EndMBBI : PrevMBBI;
+}
+
+/// Searches for an increment or decrement of \p Reg after \p MBBI.
+static MachineBasicBlock::iterator
+findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
+                ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+  Offset = 0;
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineBasicBlock::iterator EndMBBI = MBB.end();
+  MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+  // Skip debug values.
+  while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+    ++NextMBBI;
+  if (NextMBBI == EndMBBI)
+    return EndMBBI;
+
+  Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
+  return Offset == 0 ? EndMBBI : NextMBBI;
+}
+
+/// Fold preceding/trailing inc/dec of base register into the
+/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
+///
+/// stmia rn, <ra, rb, rc>
+/// rn := rn + 4 * 3;
+/// =>
+/// stmia rn!, <ra, rb, rc>
+///
+/// rn := rn - 4 * 3;
+/// ldmia rn, <ra, rb, rc>
+/// =>
+/// ldmdb rn!, <ra, rb, rc>
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
+  // Thumb1 is already using updating loads/stores.
+  if (isThumb1) return false;
+
+  const MachineOperand &BaseOP = MI->getOperand(0);
+  unsigned Base = BaseOP.getReg();
+  bool BaseKill = BaseOP.isKill();
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  unsigned Opcode = MI->getOpcode();
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Can't use an updating ld/st if the base register is also a dest
+  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
+  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
+    if (MI->getOperand(i).getReg() == Base)
+      return false;
+
+  int Bytes = getLSMultipleTransferSize(MI);
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineBasicBlock::iterator MBBI(MI);
+  int Offset;
+  MachineBasicBlock::iterator MergeInstr
+    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
+  ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
+  if (Mode == ARM_AM::ia && Offset == -Bytes) {
+    Mode = ARM_AM::db;
+  } else if (Mode == ARM_AM::ib && Offset == -Bytes) {
+    Mode = ARM_AM::da;
+  } else {
+    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+    if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
+        ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes))
+      return false;
+  }
+  MBB.erase(MergeInstr);
+
+  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
+    .addReg(Base, getDefRegState(true)) // WB base register
+    .addReg(Base, getKillRegState(BaseKill))
+    .addImm(Pred).addReg(PredReg);
+
+  // Transfer the rest of operands.
+  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
+    MIB.addOperand(MI->getOperand(OpNum));
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  MBB.erase(MBBI);
+  return true;
+}
+
+static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
+                                             ARM_AM::AddrOpc Mode) {
+  switch (Opc) {
+  case ARM::LDRi12:
+    return ARM::LDR_PRE_IMM;
+  case ARM::STRi12:
+    return ARM::STR_PRE_IMM;
+  case ARM::VLDRS:
+    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
+  case ARM::VLDRD:
+    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
+  case ARM::VSTRS:
+    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
+  case ARM::VSTRD:
+    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDR_PRE;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STR_PRE;
+  default: llvm_unreachable("Unhandled opcode!");
+  }
+}
+
+static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
+                                              ARM_AM::AddrOpc Mode) {
+  switch (Opc) {
+  case ARM::LDRi12:
+    return ARM::LDR_POST_IMM;
+  case ARM::STRi12:
+    return ARM::STR_POST_IMM;
+  case ARM::VLDRS:
+    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
+  case ARM::VLDRD:
+    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
+  case ARM::VSTRS:
+    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
+  case ARM::VSTRD:
+    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDR_POST;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STR_POST;
+  default: llvm_unreachable("Unhandled opcode!");
+  }
+}
+
+/// Fold preceding/trailing inc/dec of base register into the
+/// LDR/STR/FLD{D|S}/FST{D|S} op when possible:
+bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
+  // Thumb1 doesn't have updating LDR/STR.
+  // FIXME: Use LDM/STM with single register instead.
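+  // For illustration (editor's sketch, registers hypothetical):
+  //   add r0, r0, #4 ; ldr r1, [r0]   =>  ldr r1, [r0, #4]!  (pre-indexed)
+  //   ldr r1, [r0] ; add r0, r0, #4   =>  ldr r1, [r0], #4   (post-indexed)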
+  if (isThumb1) return false;
+
+  unsigned Base = getLoadStoreBaseOp(*MI).getReg();
+  bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
+  unsigned Opcode = MI->getOpcode();
+  DebugLoc DL = MI->getDebugLoc();
+  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
+                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
+  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
+  if (isi32Load(Opcode) || isi32Store(Opcode))
+    if (MI->getOperand(2).getImm() != 0)
+      return false;
+  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
+    return false;
+
+  // Can't do the merge if the destination register is the same as the would-be
+  // writeback register.
+  if (MI->getOperand(0).getReg() == Base)
+    return false;
+
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  int Bytes = getLSMultipleTransferSize(MI);
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineBasicBlock::iterator MBBI(MI);
+  int Offset;
+  MachineBasicBlock::iterator MergeInstr
+    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
+  unsigned NewOpc;
+  if (!isAM5 && Offset == Bytes) {
+    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+  } else if (Offset == -Bytes) {
+    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+  } else {
+    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+    if (Offset == Bytes) {
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+    } else if (!isAM5 && Offset == -Bytes) {
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+    } else
+      return false;
+  }
+  MBB.erase(MergeInstr);
+
+  ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
+
+  bool isLd = isLoadSingle(Opcode);
+  if (isAM5) {
+    // VLDM[SD]_UPD, VSTM[SD]_UPD
+    // (There are no base-updating versions of VLDR/VSTR instructions, but the
+    // updating load/store-multiple instructions can be used with only one
+    // register.)
+    MachineOperand &MO = MI->getOperand(0);
+    BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
+      .addReg(Base, getDefRegState(true)) // WB base register
+      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
+      .addImm(Pred).addReg(PredReg)
+      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
+                            getKillRegState(MO.isKill())));
+  } else if (isLd) {
+    if (isAM2) {
+      // LDR_PRE, LDR_POST
+      if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
+        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
+          .addReg(Base, RegState::Define)
+          .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+      } else {
+        int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
+          .addReg(Base, RegState::Define)
+          .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
+      }
+    } else {
+      // t2LDR_PRE, t2LDR_POST
+      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
+        .addReg(Base, RegState::Define)
+        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+    }
+  } else {
+    MachineOperand &MO = MI->getOperand(0);
+    // FIXME: post-indexed stores use am2offset_imm, which still encodes
+    // the vestigial zero-reg offset register. When that's fixed, this clause
+    // can be removed entirely.
+ if (isAM2 && NewOpc == ARM::STR_POST_IMM) { + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + // STR_PRE, STR_POST + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) + .addReg(MO.getReg(), getKillRegState(MO.isKill())) + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); + } else { + // t2STR_PRE, t2STR_POST + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) + .addReg(MO.getReg(), getKillRegState(MO.isKill())) + .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); + } + } + MBB.erase(MBBI); + + return true; +} + +bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) && + "Must have t2STRDi8 or t2LDRDi8"); + if (MI.getOperand(3).getImm() != 0) + return false; + + // Behaviour for writeback is undefined if base register is the same as one + // of the others. + const MachineOperand &BaseOp = MI.getOperand(2); + unsigned Base = BaseOp.getReg(); + const MachineOperand &Reg0Op = MI.getOperand(0); + const MachineOperand &Reg1Op = MI.getOperand(1); + if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) + return false; + + unsigned PredReg; + ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg); + MachineBasicBlock::iterator MBBI(MI); + MachineBasicBlock &MBB = *MI.getParent(); + int Offset; + MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred, + PredReg, Offset); + unsigned NewOpc; + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE; + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST; + } else + return false; + } + MBB.erase(MergeInstr); + + DebugLoc DL = MI.getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) { + MIB.addOperand(Reg0Op).addOperand(Reg1Op) + .addReg(BaseOp.getReg(), RegState::Define); + } else { + assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST); + MIB.addReg(BaseOp.getReg(), RegState::Define) + .addOperand(Reg0Op).addOperand(Reg1Op); + } + MIB.addReg(BaseOp.getReg(), RegState::Kill) + .addImm(Offset).addImm(Pred).addReg(PredReg); + assert(TII->get(Opcode).getNumOperands() == 6 && + TII->get(NewOpc).getNumOperands() == 7 && + "Unexpected number of operands in Opcode specification."); + + // Transfer implicit operands. + for (const MachineOperand &MO : MI.implicit_operands()) + MIB.addOperand(MO); + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + + MBB.erase(MBBI); + return true; +} + +/// Returns true if instruction is a memory operation that this pass is capable +/// of operating on. +static bool isMemoryOp(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case ARM::VLDRS: + case ARM::VSTRS: + case ARM::VLDRD: + case ARM::VSTRD: + case ARM::LDRi12: + case ARM::STRi12: + case ARM::tLDRi: + case ARM::tSTRi: + case ARM::tLDRspi: + case ARM::tSTRspi: + case ARM::t2LDRi8: + case ARM::t2LDRi12: + case ARM::t2STRi8: + case ARM::t2STRi12: + break; + default: + return false; + } + if (!MI.getOperand(1).isReg()) + return false; + + // When no memory operands are present, conservatively assume unaligned, + // volatile, unfoldable. 
+ if (!MI.hasOneMemOperand()) + return false; + + const MachineMemOperand &MMO = **MI.memoperands_begin(); + + // Don't touch volatile memory accesses - we may be changing their order. + if (MMO.isVolatile()) + return false; + + // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is + // not. + if (MMO.getAlignment() < 4) + return false; + + // str <undef> could probably be eliminated entirely, but for now we just want + // to avoid making a mess of it. + // FIXME: Use str <undef> as a wildcard to enable better stm folding. + if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef()) + return false; + + // Likewise don't mess with references to undefined addresses. + if (MI.getOperand(1).isUndef()) + return false; + + return true; +} + +static void InsertLDR_STR(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + int Offset, bool isDef, + DebugLoc DL, unsigned NewOpc, + unsigned Reg, bool RegDeadKill, bool RegUndef, + unsigned BaseReg, bool BaseKill, bool BaseUndef, + bool OffKill, bool OffUndef, + ARMCC::CondCodes Pred, unsigned PredReg, + const TargetInstrInfo *TII, bool isT2) { + if (isDef) { + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), + TII->get(NewOpc)) + .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill)) + .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); + MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + } else { + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), + TII->get(NewOpc)) + .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef)) + .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); + MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + } +} + +bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = &*MBBI; + unsigned Opcode = MI->getOpcode(); + if (Opcode != ARM::LDRD && Opcode != ARM::STRD && Opcode != ARM::t2LDRDi8) + return false; + + const MachineOperand &BaseOp = MI->getOperand(2); + unsigned BaseReg = BaseOp.getReg(); + unsigned EvenReg = MI->getOperand(0).getReg(); + unsigned OddReg = MI->getOperand(1).getReg(); + unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false); + unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false); + + // ARM errata 602117: LDRD with base in list may result in incorrect base + // register when interrupted or faulted. + bool Errata602117 = EvenReg == BaseReg && + (Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8) && STI->isCortexM3(); + // ARM LDRD/STRD needs consecutive registers. + bool NonConsecutiveRegs = (Opcode == ARM::LDRD || Opcode == ARM::STRD) && + (EvenRegNum % 2 != 0 || EvenRegNum + 1 != OddRegNum); + + if (!Errata602117 && !NonConsecutiveRegs) + return false; + + bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8; + bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8; + bool EvenDeadKill = isLd ? + MI->getOperand(0).isDead() : MI->getOperand(0).isKill(); + bool EvenUndef = MI->getOperand(0).isUndef(); + bool OddDeadKill = isLd ? + MI->getOperand(1).isDead() : MI->getOperand(1).isKill(); + bool OddUndef = MI->getOperand(1).isUndef(); + bool BaseKill = BaseOp.isKill(); + bool BaseUndef = BaseOp.isUndef(); + bool OffKill = isT2 ? false : MI->getOperand(3).isKill(); + bool OffUndef = isT2 ? 
false : MI->getOperand(3).isUndef(); + int OffImm = getMemoryOpOffset(MI); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); + + if (OddRegNum > EvenRegNum && OffImm == 0) { + // Ascending register numbers and no offset. It's safe to change it to a + // ldm or stm. + unsigned NewOpc = (isLd) + ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA) + : (isT2 ? ARM::t2STMIA : ARM::STMIA); + if (isLd) { + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) + .addReg(BaseReg, getKillRegState(BaseKill)) + .addImm(Pred).addReg(PredReg) + .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill)) + .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill)); + ++NumLDRD2LDM; + } else { + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) + .addReg(BaseReg, getKillRegState(BaseKill)) + .addImm(Pred).addReg(PredReg) + .addReg(EvenReg, + getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef)) + .addReg(OddReg, + getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); + ++NumSTRD2STM; + } + } else { + // Split into two instructions. + unsigned NewOpc = (isLd) + ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12) + : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12); + // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset, + // so adjust and use t2LDRi12 here for that. + unsigned NewOpc2 = (isLd) + ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12) + : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12); + DebugLoc dl = MBBI->getDebugLoc(); + // If this is a load and base register is killed, it may have been + // re-defed by the load, make sure the first load does not clobber it. + if (isLd && + (BaseKill || OffKill) && + (TRI->regsOverlap(EvenReg, BaseReg))) { + assert(!TRI->regsOverlap(OddReg, BaseReg)); + InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2, + OddReg, OddDeadKill, false, + BaseReg, false, BaseUndef, false, OffUndef, + Pred, PredReg, TII, isT2); + InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, + EvenReg, EvenDeadKill, false, + BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, + Pred, PredReg, TII, isT2); + } else { + if (OddReg == EvenReg && EvenDeadKill) { + // If the two source operands are the same, the kill marker is + // probably on the first one. e.g. + // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0 + EvenDeadKill = false; + OddDeadKill = true; + } + // Never kill the base register in the first instruction. + if (EvenReg == BaseReg) + EvenDeadKill = false; + InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, + EvenReg, EvenDeadKill, EvenUndef, + BaseReg, false, BaseUndef, false, OffUndef, + Pred, PredReg, TII, isT2); + InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2, + OddReg, OddDeadKill, OddUndef, + BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, + Pred, PredReg, TII, isT2); + } + if (isLd) + ++NumLDRD2LDR; + else + ++NumSTRD2STR; + } + + MBBI = MBB.erase(MBBI); + return true; +} + +/// An optimization pass to turn multiple LDR / STR ops of the same base and +/// incrementing offset into LDM / STM ops. 
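+/// For example (editor's illustration, registers hypothetical):
+///   ldr r1, [r0]
+///   ldr r2, [r0, #4]
+///   ldr r3, [r0, #8]
+/// =>
+///   ldmia r0, {r1, r2, r3}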
+bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
+  MemOpQueue MemOps;
+  unsigned CurrBase = 0;
+  unsigned CurrOpc = ~0u;
+  ARMCC::CondCodes CurrPred = ARMCC::AL;
+  unsigned Position = 0;
+  assert(Candidates.size() == 0);
+  assert(MergeBaseCandidates.size() == 0);
+  LiveRegsValid = false;
+
+  for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
+       I = MBBI) {
+    // The instruction in front of the iterator is the one we look at.
+    MBBI = std::prev(I);
+    if (FixInvalidRegPairOp(MBB, MBBI))
+      continue;
+    ++Position;
+
+    if (isMemoryOp(*MBBI)) {
+      unsigned Opcode = MBBI->getOpcode();
+      const MachineOperand &MO = MBBI->getOperand(0);
+      unsigned Reg = MO.getReg();
+      unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
+      unsigned PredReg = 0;
+      ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
+      int Offset = getMemoryOpOffset(MBBI);
+      if (CurrBase == 0) {
+        // Start of a new chain.
+        CurrBase = Base;
+        CurrOpc  = Opcode;
+        CurrPred = Pred;
+        MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
+        continue;
+      }
+      // Note: No need to match PredReg in the next if.
+      if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
+        // Watch out for:
+        //   r4 := ldr [r0, #8]
+        //   r4 := ldr [r0, #4]
+        // or
+        //   r0 := ldr [r0]
+        // If a load overrides the base register or a register loaded by
+        // another load in our chain, we cannot take this instruction.
+        bool Overlap = false;
+        if (isLoadSingle(Opcode)) {
+          Overlap = (Base == Reg);
+          if (!Overlap) {
+            for (const MemOpQueueEntry &E : MemOps) {
+              if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) {
+                Overlap = true;
+                break;
+              }
+            }
+          }
+        }
+
+        if (!Overlap) {
+          // Check offset and sort memory operation into the current chain.
+          if (Offset > MemOps.back().Offset) {
+            MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
+            continue;
+          } else {
+            MemOpQueue::iterator MI, ME;
+            for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) {
+              if (Offset < MI->Offset) {
+                // Found a place to insert.
+                break;
+              }
+              if (Offset == MI->Offset) {
+                // Collision, abort.
+                MI = ME;
+                break;
+              }
+            }
+            if (MI != MemOps.end()) {
+              MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position));
+              continue;
+            }
+          }
+        }
+      }
+
+      // Don't advance the iterator; the op will start a new chain next.
+      MBBI = I;
+      --Position;
+      // Fallthrough to look into existing chain.
+    } else if (MBBI->isDebugValue()) {
+      continue;
+    } else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
+               MBBI->getOpcode() == ARM::t2STRDi8) {
+      // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD
+      // instructions; remember them because we may still be able to merge
+      // add/sub into them.
+      MergeBaseCandidates.push_back(MBBI);
+    }
+
+    // If we are here then the chain is broken; extract candidates for a merge.
+    if (MemOps.size() > 0) {
+      FormCandidates(MemOps);
+      // Reset for the next chain.
+      CurrBase = 0;
+      CurrOpc = ~0u;
+      CurrPred = ARMCC::AL;
+      MemOps.clear();
+    }
+  }
+  if (MemOps.size() > 0)
+    FormCandidates(MemOps);
+
+  // Sort candidates so they get processed from the end to the beginning of
+  // the basic block later; this is necessary for the liveness calculation.
+  auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) {
+    return M0->InsertPos < M1->InsertPos;
+  };
+  std::sort(Candidates.begin(), Candidates.end(), LessThan);
+
+  // Go through list of candidates and merge.
+  bool Changed = false;
+  for (const MergeCandidate *Candidate : Candidates) {
+    if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) {
+      MachineInstr *Merged = MergeOpsUpdate(*Candidate);
+      // Merge preceding/trailing base inc/dec into the merged op.
+      if (Merged) {
+        Changed = true;
+        unsigned Opcode = Merged->getOpcode();
+        if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
+          MergeBaseUpdateLSDouble(*Merged);
+        else
+          MergeBaseUpdateLSMultiple(Merged);
+      } else {
+        for (MachineInstr *MI : Candidate->Instrs) {
+          if (MergeBaseUpdateLoadStore(MI))
+            Changed = true;
+        }
+      }
+    } else {
+      assert(Candidate->Instrs.size() == 1);
+      if (MergeBaseUpdateLoadStore(Candidate->Instrs.front()))
+        Changed = true;
+    }
+  }
+  Candidates.clear();
+  // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt.
+  for (MachineInstr *MI : MergeBaseCandidates)
+    MergeBaseUpdateLSDouble(*MI);
+  MergeBaseCandidates.clear();
+
+  return Changed;
+}
+
+/// If this is an exit BB, try merging the return ops ("bx lr" and
+/// "mov pc, lr") into the preceding stack restore so it directly restores
+/// the value of LR into pc.
+/// ldmfd sp!, {..., lr}
+/// bx lr
+/// or
+/// ldmfd sp!, {..., lr}
+/// mov pc, lr
+/// =>
+/// ldmfd sp!, {..., pc}
+bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
+  // Thumb1 LDM doesn't allow high registers.
+  if (isThumb1) return false;
+  if (MBB.empty()) return false;
+
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  if (MBBI != MBB.begin() &&
+      (MBBI->getOpcode() == ARM::BX_RET ||
+       MBBI->getOpcode() == ARM::tBX_RET ||
+       MBBI->getOpcode() == ARM::MOVPCLR)) {
+    MachineBasicBlock::iterator PrevI = std::prev(MBBI);
+    // Ignore any DBG_VALUE instructions.
+    while (PrevI->isDebugValue() && PrevI != MBB.begin())
+      --PrevI;
+    MachineInstr *PrevMI = PrevI;
+    unsigned Opcode = PrevMI->getOpcode();
+    if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
+        Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
+        Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
+      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
+      if (MO.getReg() != ARM::LR)
+        return false;
+      unsigned NewOpc = (isThumb2 ?
                         ARM::t2LDMIA_RET : ARM::LDMIA_RET);
+      assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
+              Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
+      PrevMI->setDesc(TII->get(NewOpc));
+      MO.setReg(ARM::PC);
+      PrevMI->copyImplicitOps(*MBB.getParent(), &*MBBI);
+      MBB.erase(MBBI);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  if (MBBI == MBB.begin() || MBBI == MBB.end() ||
+      MBBI->getOpcode() != ARM::tBX_RET)
+    return false;
+
+  MachineBasicBlock::iterator Prev = MBBI;
+  --Prev;
+  if (Prev->getOpcode() != ARM::tMOVr || !Prev->definesRegister(ARM::LR))
+    return false;
+
+  for (auto Use : Prev->uses())
+    if (Use.isKill()) {
+      AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
+          .addReg(Use.getReg(), RegState::Kill))
+          .copyImplicitOps(&*MBBI);
+      MBB.erase(MBBI);
+      MBB.erase(Prev);
+      return true;
+    }
+
+  llvm_unreachable("tMOVr doesn't kill a reg before tBX_RET?");
+}
+
+bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  MF = &Fn;
+  STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+  TL = STI->getTargetLowering();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+
+  RegClassInfoValid = false;
+  isThumb2 = AFI->isThumb2Function();
+  isThumb1 = AFI->isThumbFunction() && !isThumb2;
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= LoadStoreMultipleOpti(MBB);
+    if (STI->hasV5TOps())
+      Modified |= MergeReturnIntoLDM(MBB);
+    if (isThumb1)
+      Modified |= CombineMovBx(MBB);
+  }
+
+  Allocator.DestroyAll();
+  return Modified;
+}
+
+namespace llvm {
+void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+}
+
+#define ARM_PREALLOC_LOAD_STORE_OPT_NAME \
+  "ARM pre- register allocation load / store optimization pass"
+
+namespace {
+  /// Pre-register-allocation pass that moves loads / stores from consecutive
+  /// locations close together to make it more likely they will be combined
+  /// later.
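+  /// For example (editor's illustration): loads from [r0] and [r0, #4] that
+  /// are separated by unrelated instructions are moved next to each other,
+  /// so they can later be rewritten as an ldrd or merged into an ldm after
+  /// register allocation.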
+ struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{ + static char ID; + ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) { + initializeARMPreAllocLoadStoreOptPass(*PassRegistry::getPassRegistry()); + } + + const DataLayout *TD; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + const ARMSubtarget *STI; + MachineRegisterInfo *MRI; + MachineFunction *MF; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return ARM_PREALLOC_LOAD_STORE_OPT_NAME; + } + + private: + bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, + unsigned &NewOpc, unsigned &EvenReg, + unsigned &OddReg, unsigned &BaseReg, + int &Offset, + unsigned &PredReg, ARMCC::CondCodes &Pred, + bool &isT2); + bool RescheduleOps(MachineBasicBlock *MBB, + SmallVectorImpl<MachineInstr *> &Ops, + unsigned Base, bool isLd, + DenseMap<MachineInstr*, unsigned> &MI2LocMap); + bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB); + }; + char ARMPreAllocLoadStoreOpt::ID = 0; +} + +INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt", + ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) + +bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + TD = &Fn.getDataLayout(); + STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget()); + TII = STI->getInstrInfo(); + TRI = STI->getRegisterInfo(); + MRI = &Fn.getRegInfo(); + MF = &Fn; + + bool Modified = false; + for (MachineBasicBlock &MFI : Fn) + Modified |= RescheduleLoadStoreInstrs(&MFI); + + return Modified; +} + +static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator E, + SmallPtrSetImpl<MachineInstr*> &MemOps, + SmallSet<unsigned, 4> &MemRegs, + const TargetRegisterInfo *TRI) { + // Are there stores / loads / calls between them? + // FIXME: This is overly conservative. We should make use of alias information + // some day. + SmallSet<unsigned, 4> AddedRegPressure; + while (++I != E) { + if (I->isDebugValue() || MemOps.count(&*I)) + continue; + if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects()) + return false; + if (isLd && I->mayStore()) + return false; + if (!isLd) { + if (I->mayLoad()) + return false; + // It's not safe to move the first 'str' down. + // str r1, [r0] + // strh r5, [r0] + // str r4, [r0, #+4] + if (I->mayStore()) + return false; + } + for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) { + MachineOperand &MO = I->getOperand(j); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (MO.isDef() && TRI->regsOverlap(Reg, Base)) + return false; + if (Reg != Base && !MemRegs.count(Reg)) + AddedRegPressure.insert(Reg); + } + } + + // Estimate register pressure increase due to the transformation. + if (MemRegs.size() <= 4) + // Ok if we are moving small number of instructions. + return true; + return AddedRegPressure.size() <= MemRegs.size() * 2; +} + +bool +ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, + DebugLoc &dl, unsigned &NewOpc, + unsigned &FirstReg, + unsigned &SecondReg, + unsigned &BaseReg, int &Offset, + unsigned &PredReg, + ARMCC::CondCodes &Pred, + bool &isT2) { + // Make sure we're allowed to generate LDRD/STRD. 
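+  // (Editor's note: LDRD and STRD first appeared with the ARMv5TE
+  // extensions, which is what the hasV5TEOps() check below tests.)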
+ if (!STI->hasV5TEOps()) + return false; + + // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD + unsigned Scale = 1; + unsigned Opcode = Op0->getOpcode(); + if (Opcode == ARM::LDRi12) { + NewOpc = ARM::LDRD; + } else if (Opcode == ARM::STRi12) { + NewOpc = ARM::STRD; + } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) { + NewOpc = ARM::t2LDRDi8; + Scale = 4; + isT2 = true; + } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) { + NewOpc = ARM::t2STRDi8; + Scale = 4; + isT2 = true; + } else { + return false; + } + + // Make sure the base address satisfies i64 ld / st alignment requirement. + // At the moment, we ignore the memoryoperand's value. + // If we want to use AliasAnalysis, we should check it accordingly. + if (!Op0->hasOneMemOperand() || + (*Op0->memoperands_begin())->isVolatile()) + return false; + + unsigned Align = (*Op0->memoperands_begin())->getAlignment(); + const Function *Func = MF->getFunction(); + unsigned ReqAlign = STI->hasV6Ops() + ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext())) + : 8; // Pre-v6 need 8-byte align + if (Align < ReqAlign) + return false; + + // Then make sure the immediate offset fits. + int OffImm = getMemoryOpOffset(Op0); + if (isT2) { + int Limit = (1 << 8) * Scale; + if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1))) + return false; + Offset = OffImm; + } else { + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (OffImm < 0) { + AddSub = ARM_AM::sub; + OffImm = - OffImm; + } + int Limit = (1 << 8) * Scale; + if (OffImm >= Limit || (OffImm & (Scale-1))) + return false; + Offset = ARM_AM::getAM3Opc(AddSub, OffImm); + } + FirstReg = Op0->getOperand(0).getReg(); + SecondReg = Op1->getOperand(0).getReg(); + if (FirstReg == SecondReg) + return false; + BaseReg = Op0->getOperand(1).getReg(); + Pred = getInstrPredicate(Op0, PredReg); + dl = Op0->getDebugLoc(); + return true; +} + +bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, + SmallVectorImpl<MachineInstr *> &Ops, + unsigned Base, bool isLd, + DenseMap<MachineInstr*, unsigned> &MI2LocMap) { + bool RetVal = false; + + // Sort by offset (in reverse order). + std::sort(Ops.begin(), Ops.end(), + [](const MachineInstr *LHS, const MachineInstr *RHS) { + int LOffset = getMemoryOpOffset(LHS); + int ROffset = getMemoryOpOffset(RHS); + assert(LHS == RHS || LOffset != ROffset); + return LOffset > ROffset; + }); + + // The loads / stores of the same base are in order. Scan them from first to + // last and check for the following: + // 1. Any def of base. + // 2. Any gaps. + while (Ops.size() > 1) { + unsigned FirstLoc = ~0U; + unsigned LastLoc = 0; + MachineInstr *FirstOp = nullptr; + MachineInstr *LastOp = nullptr; + int LastOffset = 0; + unsigned LastOpcode = 0; + unsigned LastBytes = 0; + unsigned NumMove = 0; + for (int i = Ops.size() - 1; i >= 0; --i) { + MachineInstr *Op = Ops[i]; + unsigned Loc = MI2LocMap[Op]; + if (Loc <= FirstLoc) { + FirstLoc = Loc; + FirstOp = Op; + } + if (Loc >= LastLoc) { + LastLoc = Loc; + LastOp = Op; + } + + unsigned LSMOpcode + = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia); + if (LastOpcode && LSMOpcode != LastOpcode) + break; + + int Offset = getMemoryOpOffset(Op); + unsigned Bytes = getLSMultipleTransferSize(Op); + if (LastBytes) { + if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes)) + break; + } + LastOffset = Offset; + LastBytes = Bytes; + LastOpcode = LSMOpcode; + if (++NumMove == 8) // FIXME: Tune this limit. 
+        break;
+    }
+
+    if (NumMove <= 1)
+      Ops.pop_back();
+    else {
+      SmallPtrSet<MachineInstr*, 4> MemOps;
+      SmallSet<unsigned, 4> MemRegs;
+      for (int i = NumMove-1; i >= 0; --i) {
+        MemOps.insert(Ops[i]);
+        MemRegs.insert(Ops[i]->getOperand(0).getReg());
+      }
+
+      // Be conservative: if the instructions are too far apart, don't
+      // move them. We want to limit the increase of register pressure.
+      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
+      if (DoMove)
+        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
+                                           MemOps, MemRegs, TRI);
+      if (!DoMove) {
+        for (unsigned i = 0; i != NumMove; ++i)
+          Ops.pop_back();
+      } else {
+        // This is the new location for the loads / stores.
+        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
+        while (InsertPos != MBB->end()
+               && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
+          ++InsertPos;
+
+        // If we are moving a pair of loads / stores, see if it makes sense
+        // to try to allocate a pair of registers that can form register pairs.
+        MachineInstr *Op0 = Ops.back();
+        MachineInstr *Op1 = Ops[Ops.size()-2];
+        unsigned FirstReg = 0, SecondReg = 0;
+        unsigned BaseReg = 0, PredReg = 0;
+        ARMCC::CondCodes Pred = ARMCC::AL;
+        bool isT2 = false;
+        unsigned NewOpc = 0;
+        int Offset = 0;
+        DebugLoc dl;
+        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
+                                             FirstReg, SecondReg, BaseReg,
+                                             Offset, PredReg, Pred, isT2)) {
+          Ops.pop_back();
+          Ops.pop_back();
+
+          const MCInstrDesc &MCID = TII->get(NewOpc);
+          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
+          MRI->constrainRegClass(FirstReg, TRC);
+          MRI->constrainRegClass(SecondReg, TRC);
+
+          // Form the pair instruction.
+          if (isLd) {
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
+              .addReg(FirstReg, RegState::Define)
+              .addReg(SecondReg, RegState::Define)
+              .addReg(BaseReg);
+            // FIXME: We're converting from LDRi12 to an insn that still
+            // uses addrmode2, so we need an explicit offset reg. It should
+            // always be reg0 since we're transforming LDRi12s.
+            if (!isT2)
+              MIB.addReg(0);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+            MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
+            DEBUG(dbgs() << "Formed " << *MIB << "\n");
+            ++NumLDRDFormed;
+          } else {
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
+              .addReg(FirstReg)
+              .addReg(SecondReg)
+              .addReg(BaseReg);
+            // FIXME: We're converting from STRi12 to an insn that still
+            // uses addrmode2, so we need an explicit offset reg. It should
+            // always be reg0 since we're transforming STRi12s.
+            if (!isT2)
+              MIB.addReg(0);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+            MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
+            DEBUG(dbgs() << "Formed " << *MIB << "\n");
+            ++NumSTRDFormed;
+          }
+          MBB->erase(Op0);
+          MBB->erase(Op1);
+
+          if (!isT2) {
+            // Add register allocation hints to form register pairs.
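+            // (Editor's note: ARM-mode LDRD/STRD need an even/odd pair such
+            // as r4/r5, so hint the first register to an even physical
+            // register and the second to the odd register following it.)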
+ MRI->setRegAllocationHint(FirstReg, ARMRI::RegPairEven, SecondReg); + MRI->setRegAllocationHint(SecondReg, ARMRI::RegPairOdd, FirstReg); + } + } else { + for (unsigned i = 0; i != NumMove; ++i) { + MachineInstr *Op = Ops.back(); + Ops.pop_back(); + MBB->splice(InsertPos, MBB, Op); + } + } + + NumLdStMoved += NumMove; + RetVal = true; + } + } + } + + return RetVal; +} + +bool +ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { + bool RetVal = false; + + DenseMap<MachineInstr*, unsigned> MI2LocMap; + DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap; + DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap; + SmallVector<unsigned, 4> LdBases; + SmallVector<unsigned, 4> StBases; + + unsigned Loc = 0; + MachineBasicBlock::iterator MBBI = MBB->begin(); + MachineBasicBlock::iterator E = MBB->end(); + while (MBBI != E) { + for (; MBBI != E; ++MBBI) { + MachineInstr *MI = MBBI; + if (MI->isCall() || MI->isTerminator()) { + // Stop at barriers. + ++MBBI; + break; + } + + if (!MI->isDebugValue()) + MI2LocMap[MI] = ++Loc; + + if (!isMemoryOp(*MI)) + continue; + unsigned PredReg = 0; + if (getInstrPredicate(MI, PredReg) != ARMCC::AL) + continue; + + int Opc = MI->getOpcode(); + bool isLd = isLoadSingle(Opc); + unsigned Base = MI->getOperand(1).getReg(); + int Offset = getMemoryOpOffset(MI); + + bool StopHere = false; + if (isLd) { + DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI = + Base2LdsMap.find(Base); + if (BI != Base2LdsMap.end()) { + for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { + if (Offset == getMemoryOpOffset(BI->second[i])) { + StopHere = true; + break; + } + } + if (!StopHere) + BI->second.push_back(MI); + } else { + Base2LdsMap[Base].push_back(MI); + LdBases.push_back(Base); + } + } else { + DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI = + Base2StsMap.find(Base); + if (BI != Base2StsMap.end()) { + for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { + if (Offset == getMemoryOpOffset(BI->second[i])) { + StopHere = true; + break; + } + } + if (!StopHere) + BI->second.push_back(MI); + } else { + Base2StsMap[Base].push_back(MI); + StBases.push_back(Base); + } + } + + if (StopHere) { + // Found a duplicate (a base+offset combination that's seen earlier). + // Backtrack. + --Loc; + break; + } + } + + // Re-schedule loads. + for (unsigned i = 0, e = LdBases.size(); i != e; ++i) { + unsigned Base = LdBases[i]; + SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base]; + if (Lds.size() > 1) + RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap); + } + + // Re-schedule stores. + for (unsigned i = 0, e = StBases.size(); i != e; ++i) { + unsigned Base = StBases[i]; + SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base]; + if (Sts.size() > 1) + RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap); + } + + if (MBBI != E) { + Base2LdsMap.clear(); + Base2StsMap.clear(); + LdBases.clear(); + StBases.clear(); + } + } + + return RetVal; +} + + +/// Returns an instance of the load / store optimization pass. 
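+/// (Editor's note: PreAlloc selects between the two passes defined above;
+/// the pre-register-allocation rescheduler is created with PreAlloc=true and
+/// the post-allocation merging pass with PreAlloc=false.)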
+FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) { + if (PreAlloc) + return new ARMPreAllocLoadStoreOpt(); + return new ARMLoadStoreOpt(); +} + diff --git a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp new file mode 100644 index 0000000..a2aca2d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp @@ -0,0 +1,163 @@ +//===-- ARMMCInstLower.cpp - Convert ARM MachineInstr to an MCInst --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower ARM MachineInstrs to their corresponding +// MCInst records. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMAsmPrinter.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMMCExpr.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +using namespace llvm; + + +MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO, + const MCSymbol *Symbol) { + const MCExpr *Expr; + unsigned Option = MO.getTargetFlags() & ARMII::MO_OPTION_MASK; + switch (Option) { + default: { + Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, + OutContext); + switch (Option) { + default: llvm_unreachable("Unknown target flag on symbol operand"); + case ARMII::MO_NO_FLAG: + break; + case ARMII::MO_LO16: + Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, + OutContext); + Expr = ARMMCExpr::createLower16(Expr, OutContext); + break; + case ARMII::MO_HI16: + Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, + OutContext); + Expr = ARMMCExpr::createUpper16(Expr, OutContext); + break; + } + break; + } + + case ARMII::MO_PLT: + Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_PLT, + OutContext); + break; + } + + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::createAdd(Expr, + MCConstantExpr::create(MO.getOffset(), + OutContext), + OutContext); + return MCOperand::createExpr(Expr); + +} + +bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO, + MCOperand &MCOp) { + switch (MO.getType()) { + default: llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all non-CPSR implicit register operands. 
+ if (MO.isImplicit() && MO.getReg() != ARM::CPSR) + return false; + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + MCOp = MCOperand::createReg(MO.getReg()); + break; + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( + MO.getMBB()->getSymbol(), OutContext)); + break; + case MachineOperand::MO_GlobalAddress: { + MCOp = GetSymbolRef(MO, + GetARMGVSymbol(MO.getGlobal(), MO.getTargetFlags())); + break; + } + case MachineOperand::MO_ExternalSymbol: + MCOp = GetSymbolRef(MO, + GetExternalSymbolSymbol(MO.getSymbolName())); + break; + case MachineOperand::MO_JumpTableIndex: + MCOp = GetSymbolRef(MO, GetJTISymbol(MO.getIndex())); + break; + case MachineOperand::MO_ConstantPoolIndex: + MCOp = GetSymbolRef(MO, GetCPISymbol(MO.getIndex())); + break; + case MachineOperand::MO_BlockAddress: + MCOp = GetSymbolRef(MO, GetBlockAddressSymbol(MO.getBlockAddress())); + break; + case MachineOperand::MO_FPImmediate: { + APFloat Val = MO.getFPImm()->getValueAPF(); + bool ignored; + Val.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); + MCOp = MCOperand::createFPImm(Val.convertToDouble()); + break; + } + case MachineOperand::MO_RegisterMask: + // Ignore call clobbers. + return false; + } + return true; +} + +void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, + ARMAsmPrinter &AP) { + OutMI.setOpcode(MI->getOpcode()); + + // In the MC layer, we keep modified immediates in their encoded form + bool EncodeImms = false; + switch (MI->getOpcode()) { + default: break; + case ARM::MOVi: + case ARM::MVNi: + case ARM::CMPri: + case ARM::CMNri: + case ARM::TSTri: + case ARM::TEQri: + case ARM::MSRi: + case ARM::ADCri: + case ARM::ADDri: + case ARM::ADDSri: + case ARM::SBCri: + case ARM::SUBri: + case ARM::SUBSri: + case ARM::ANDri: + case ARM::ORRri: + case ARM::EORri: + case ARM::BICri: + case ARM::RSBri: + case ARM::RSBSri: + case ARM::RSCri: + EncodeImms = true; + break; + } + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + if (AP.lowerOperand(MO, MCOp)) { + if (MCOp.isImm() && EncodeImms) { + int32_t Enc = ARM_AM::getSOImmVal(MCOp.getImm()); + if (Enc != -1) + MCOp.setImm(Enc); + } + OutMI.addOperand(MCOp); + } + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp new file mode 100644 index 0000000..ac0330f --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -0,0 +1,23 @@ +//===-- ARMMachineFunctionInfo.cpp - ARM machine function info ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void ARMFunctionInfo::anchor() { }
+
+ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
+    : isThumb(MF.getSubtarget<ARMSubtarget>().isThumb()),
+      hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()),
+      StByValParamsPadding(0), ArgRegsSaveSize(0), HasStackFrame(false),
+      RestoreSPFromFP(false), LRSpilledForFarJump(false),
+      FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+      GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+      PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false) {}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
new file mode 100644
index 0000000..d644797
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -0,0 +1,226 @@
+//===-- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares ARM-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
+
+#include "ARMSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+
+/// ARMFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private ARM-specific information for each MachineFunction.
+class ARMFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// isThumb - True if this function is compiled under Thumb mode.
+  /// Used to initialize Align, so must precede it.
+  bool isThumb;
+
+  /// hasThumb2 - True if the target architecture supports Thumb2. Do not use
+  /// to determine if the function is compiled under Thumb mode; for that,
+  /// use 'isThumb'.
+  bool hasThumb2;
+
+  /// StByValParamsPadding - For a parameter that is split between GPRs and
+  /// memory: when recovering the GPR part, if StackAlignment > 4 and the
+  /// size of the GPR part mod StackAlignment != 0, we need to insert a gap
+  /// before the parameter start address so that the GPR part can be
+  /// "attached" to the part that was passed via the stack.
+  unsigned StByValParamsPadding;
+
+  /// ArgRegsSaveSize - Size of the register save area for vararg functions.
+  unsigned ArgRegsSaveSize;
+
+  /// ReturnRegsCount - Number of registers used up in the return.
+  unsigned ReturnRegsCount;
+
+  /// HasStackFrame - True if this function has a stack frame. Set by
+  /// determineCalleeSaves().
+  bool HasStackFrame;
+
+  /// RestoreSPFromFP - True if epilogue should restore SP from FP. Set by
+  /// emitPrologue.
+  bool RestoreSPFromFP;
+
+  /// LRSpilledForFarJump - True if the LR register has been spilled to
+  /// enable a far jump.
+  bool LRSpilledForFarJump;
+
+  /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
+  /// spill stack offset.
+  unsigned FramePtrSpillOffset;
+
+  /// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved
+  /// register spill areas.
For Mac OS X: + /// + /// GPR callee-saved (1) : r4, r5, r6, r7, lr + /// -------------------------------------------- + /// GPR callee-saved (2) : r8, r10, r11 + /// -------------------------------------------- + /// DPR callee-saved : d8 - d15 + /// + /// Also see AlignedDPRCSRegs below. Not all D-regs need to go in area 3. + /// Some may be spilled after the stack has been realigned. + unsigned GPRCS1Offset; + unsigned GPRCS2Offset; + unsigned DPRCSOffset; + + /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills + /// areas. + unsigned GPRCS1Size; + unsigned GPRCS2Size; + unsigned DPRCSAlignGapSize; + unsigned DPRCSSize; + + /// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in + /// the aligned portion of the stack frame. This is always a contiguous + /// sequence of D-registers starting from d8. + /// + /// We do not keep track of the frame indices used for these registers - they + /// behave like any other frame index in the aligned stack frame. These + /// registers also aren't included in DPRCSSize above. + unsigned NumAlignedDPRCS2Regs; + + unsigned PICLabelUId; + + /// VarArgsFrameIndex - FrameIndex for start of varargs area. + int VarArgsFrameIndex; + + /// HasITBlocks - True if IT blocks have been inserted. + bool HasITBlocks; + + /// CPEClones - Track constant pool entries clones created by Constant Island + /// pass. + DenseMap<unsigned, unsigned> CPEClones; + + /// ArgumentStackSize - amount of bytes on stack consumed by the arguments + /// being passed on the stack + unsigned ArgumentStackSize; + + /// CoalescedWeights - mapping of basic blocks to the rolling counter of + /// coalesced weights. + DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights; + +public: + ARMFunctionInfo() : + isThumb(false), + hasThumb2(false), + ArgRegsSaveSize(0), ReturnRegsCount(0), HasStackFrame(false), + RestoreSPFromFP(false), + LRSpilledForFarJump(false), + FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), + GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0), + NumAlignedDPRCS2Regs(0), PICLabelUId(0), + VarArgsFrameIndex(0), HasITBlocks(false) {} + + explicit ARMFunctionInfo(MachineFunction &MF); + + bool isThumbFunction() const { return isThumb; } + bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; } + bool isThumb2Function() const { return isThumb && hasThumb2; } + + unsigned getStoredByValParamsPadding() const { return StByValParamsPadding; } + void setStoredByValParamsPadding(unsigned p) { StByValParamsPadding = p; } + + unsigned getArgRegsSaveSize() const { return ArgRegsSaveSize; } + void setArgRegsSaveSize(unsigned s) { ArgRegsSaveSize = s; } + + unsigned getReturnRegsCount() const { return ReturnRegsCount; } + void setReturnRegsCount(unsigned s) { ReturnRegsCount = s; } + + bool hasStackFrame() const { return HasStackFrame; } + void setHasStackFrame(bool s) { HasStackFrame = s; } + + bool shouldRestoreSPFromFP() const { return RestoreSPFromFP; } + void setShouldRestoreSPFromFP(bool s) { RestoreSPFromFP = s; } + + bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; } + void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; } + + unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; } + void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; } + + unsigned getNumAlignedDPRCS2Regs() const { return NumAlignedDPRCS2Regs; } + void setNumAlignedDPRCS2Regs(unsigned n) { NumAlignedDPRCS2Regs = n; } + + unsigned 
getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
+  unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
+  unsigned getDPRCalleeSavedAreaOffset() const { return DPRCSOffset; }
+
+  void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; }
+  void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
+  void setDPRCalleeSavedAreaOffset(unsigned o) { DPRCSOffset = o; }
+
+  unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
+  unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
+  unsigned getDPRCalleeSavedGapSize() const { return DPRCSAlignGapSize; }
+  unsigned getDPRCalleeSavedAreaSize() const { return DPRCSSize; }
+
+  void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
+  void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
+  void setDPRCalleeSavedGapSize(unsigned s) { DPRCSAlignGapSize = s; }
+  void setDPRCalleeSavedAreaSize(unsigned s) { DPRCSSize = s; }
+
+  unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+  void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+  void initPICLabelUId(unsigned UId) {
+    PICLabelUId = UId;
+  }
+
+  unsigned getNumPICLabels() const {
+    return PICLabelUId;
+  }
+
+  unsigned createPICLabelUId() {
+    return PICLabelUId++;
+  }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  bool hasITBlocks() const { return HasITBlocks; }
+  void setHasITBlocks(bool h) { HasITBlocks = h; }
+
+  void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) {
+    if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second)
+      llvm_unreachable("Duplicate entries!");
+  }
+
+  unsigned getOriginalCPIdx(unsigned CloneIdx) const {
+    DenseMap<unsigned, unsigned>::const_iterator I = CPEClones.find(CloneIdx);
+    if (I != CPEClones.end())
+      return I->second;
+    else
+      return -1U;
+  }
+
+  DenseMap<const MachineBasicBlock*, unsigned>::iterator getCoalescedWeight(
+      MachineBasicBlock* MBB) {
+    auto It = CoalescedWeights.find(MBB);
+    if (It == CoalescedWeights.end()) {
+      It = CoalescedWeights.insert(std::make_pair(MBB, 0)).first;
+    }
+    return It;
+  }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
new file mode 100644
index 0000000..30baf42
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -0,0 +1,99 @@
+//===-- ARMOptimizeBarriersPass - remove the second of two DMBs when no
+// memory access occurs in between ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===------------------------------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "double barriers"
+
+STATISTIC(NumDMBsRemoved, "Number of DMBs removed");
+
+namespace {
+class ARMOptimizeBarriersPass : public MachineFunctionPass {
+public:
+  static char ID;
+  ARMOptimizeBarriersPass() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  const char *getPassName() const override {
+    return "optimise barriers pass";
+  }
+};
+char ARMOptimizeBarriersPass::ID = 0;
+}
+
+// Returns whether the instruction can safely move past a DMB instruction.
+// The current implementation allows this iff MI does not have any possible
+// memory access.
+static bool CanMovePastDMB(const MachineInstr *MI) {
+  return !(MI->mayLoad() ||
+           MI->mayStore() ||
+           MI->hasUnmodeledSideEffects() ||
+           MI->isCall() ||
+           MI->isReturn());
+}
+
+bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) {
+  // Vector to store the DMBs we will remove after the first iteration.
+  std::vector<MachineInstr *> ToRemove;
+  // DMBType is the Imm value of the first operand. It determines whether it's
+  // a dmb ish, dmb sy, dmb osh, etc.
+  int64_t DMBType = -1;
+
+  // Find a dmb. If we can move it until the next dmb, tag the second one for
+  // removal.
+  for (auto &MBB : MF) {
+    // True when we have seen a DMB and every instruction since then can move
+    // past a DMB.
+    bool IsRemovableNextDMB = false;
+    for (auto &MI : MBB) {
+      if (MI.getOpcode() == ARM::DMB) {
+        if (IsRemovableNextDMB) {
+          // If the Imm of this DMB is the same as that of the last DMB, we can
+          // tag this second DMB for removal.
+          if (MI.getOperand(0).getImm() == DMBType) {
+            ToRemove.push_back(&MI);
+          } else {
+            // If it has a different DMBType, we cannot remove it, but will
+            // scan for the next DMB, recording this DMB's type as the last
+            // seen DMB type.
+            DMBType = MI.getOperand(0).getImm();
+          }
+        } else {
+          // After we see a DMB, the next one is a removal candidate.
+          IsRemovableNextDMB = true;
+          DMBType = MI.getOperand(0).getImm();
+        }
+      } else if (!CanMovePastDMB(&MI)) {
+        // If we find an instruction that cannot move past a DMB, the next
+        // DMB is not removable.
+        IsRemovableNextDMB = false;
+      }
+    }
+  }
+  // Remove the tagged DMBs. (Note: the statistic counter is cumulative across
+  // functions, so the modified-flag is derived from this function's removals
+  // only.)
+  for (auto MI : ToRemove) {
+    MI->eraseFromParent();
+    ++NumDMBsRemoved;
+  }
+
+  return !ToRemove.empty();
+}
+
+/// createARMOptimizeBarriersPass - Returns an instance of the remove double
+/// barriers pass.
+FunctionPass *llvm::createARMOptimizeBarriersPass() {
+  return new ARMOptimizeBarriersPass();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
new file mode 100644
index 0000000..3ff0bee
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
@@ -0,0 +1,6591 @@
+//===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using neon instructions.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMPERFECTSHUFFLE_H +#define LLVM_LIB_TARGET_ARM_ARMPERFECTSHUFFLE_H + +// 31 entries have cost 0 +// 242 entries have cost 1 +// 1447 entries have cost 2 +// 3602 entries have cost 3 +// 1237 entries have cost 4 +// 2 entries have cost 5 + +// This table is 6561*4 = 26244 bytes in size. +static const unsigned PerfectShuffleTable[6561+1] = { + 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS + 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS + 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> + 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS + 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> + 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> + 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS + 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> + 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS + 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> + 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> + 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> + 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> + 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> + 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS + 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> + 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> + 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS + 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> + 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> + 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> + 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> + 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS + 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> + 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> + 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> + 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> + 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> + 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> + 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> + 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> + 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> + 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> + 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> + 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS + 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> + 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> + 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> + 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> + 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> + 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> + 2718712274U, // <0,0,5,6>: Cost 3 vext3 
<5,6,7,0>, <0,5,6,7> + 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> + 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> + 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> + 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS + 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> + 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> + 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS + 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> + 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> + 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> + 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> + 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> + 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> + 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> + 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> + 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> + 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> + 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> + 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS + 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS + 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS + 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> + 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS + 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> + 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS + 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> + 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> + 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> + 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> + 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> + 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> + 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> + 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS + 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> + 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> + 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> + 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> + 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS + 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> + 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> + 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> + 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS + 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS + 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> + 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS + 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> + 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> + 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> + 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> + 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> + 2692539367U, // 
<0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> + 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS + 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> + 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> + 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> + 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> + 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS + 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> + 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> + 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> + 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS + 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> + 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS + 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> + 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> + 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> + 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> + 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> + 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> + 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> + 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> + 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> + 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS + 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> + 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> + 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> + 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS + 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> + 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> + 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> + 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> + 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> + 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> + 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> + 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> + 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> + 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> + 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> + 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> + 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> + 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS + 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS + 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> + 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS + 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> + 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> + 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> + 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> + 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> + 
1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS + 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> + 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> + 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> + 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS + 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> + 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> + 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> + 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> + 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> + 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> + 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> + 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS + 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> + 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> + 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> + 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS + 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> + 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> + 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> + 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> + 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> + 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS + 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> + 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> + 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> + 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS + 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS + 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS + 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> + 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> + 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> + 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> + 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> + 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> + 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> + 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS + 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> + 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> + 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> + 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> + 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> + 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> + 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> + 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> + 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> + 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> + 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> + 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> + 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> + 
2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> + 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> + 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> + 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> + 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS + 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS + 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS + 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS + 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> + 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> + 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> + 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> + 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS + 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> + 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> + 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> + 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS + 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> + 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> + 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> + 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> + 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> + 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> + 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> + 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> + 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> + 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS + 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> + 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> + 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS + 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3> + 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2> + 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS + 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> + 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> + 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> + 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3> + 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> + 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> + 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> + 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> + 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> + 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> + 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> + 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> + 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> + 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> + 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> + 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS + 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> + 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> + 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS + 
3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> + 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> + 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> + 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> + 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> + 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> + 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> + 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> + 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> + 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> + 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> + 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> + 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> + 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> + 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> + 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> + 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> + 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> + 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> + 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> + 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> + 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> + 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> + 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> + 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> + 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> + 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS + 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> + 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> + 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS + 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3> + 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> + 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS + 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> + 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> + 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> + 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> + 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> + 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0> + 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS + 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS + 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> + 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> + 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> + 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS + 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS + 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS + 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS + 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS + 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> + 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> + 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> + 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS + 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, 
RHS + 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS + 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> + 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> + 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> + 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> + 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> + 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> + 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS + 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> + 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> + 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> + 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> + 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> + 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> + 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> + 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS + 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS + 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> + 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS + 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS + 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> + 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> + 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> + 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS + 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> + 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> + 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> + 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> + 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> + 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> + 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> + 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> + 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> + 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> + 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> + 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS + 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> + 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> + 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4> + 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS + 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> + 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> + 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS + 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS + 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> + 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS + 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS + 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS + 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> + 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS + 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> + 2625912934U, // <0,5,0,1>: Cost 3 
vext2 <1,4,0,5>, LHS + 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> + 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> + 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS + 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> + 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> + 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS + 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS + 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS + 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> + 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> + 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> + 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> + 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> + 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> + 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> + 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS + 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS + 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> + 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> + 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> + 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS + 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> + 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> + 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS + 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS + 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> + 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> + 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> + 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> + 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> + 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> + 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> + 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> + 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> + 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> + 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> + 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> + 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> + 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> + 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS + 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> + 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> + 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS + 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> + 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> + 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> + 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> + 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> + 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> + 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> + 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> + 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> + 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS + 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> + 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> + 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> + 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS + 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> + 3785748576U, // 
<0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> + 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> + 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> + 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS + 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> + 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> + 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> + 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS + 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> + 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> + 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> + 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS + 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS + 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS + 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> + 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> + 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> + 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS + 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> + 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> + 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> + 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS + 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> + 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> + 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> + 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> + 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> + 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS + 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS + 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS + 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> + 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> + 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> + 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS + 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> + 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> + 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS + 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS + 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> + 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> + 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> + 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2> + 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> + 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> + 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS + 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS + 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> + 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> + 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> + 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> + 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> + 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> + 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> + 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> + 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> + 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS + 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, 
<6,0,1,2> + 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> + 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> + 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS + 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> + 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS + 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS + 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> + 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> + 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> + 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> + 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> + 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> + 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS + 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> + 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> + 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> + 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> + 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> + 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> + 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> + 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> + 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> + 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> + 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> + 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> + 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> + 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> + 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> + 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> + 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> + 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> + 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS + 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> + 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> + 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> + 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> + 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS + 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS + 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> + 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS + 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> + 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> + 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> + 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> + 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> + 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> + 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS + 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> + 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> + 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> + 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> + 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> + 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> + 
3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> + 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> + 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> + 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS + 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> + 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> + 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> + 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS + 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> + 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> + 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> + 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> + 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> + 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> + 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> + 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> + 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> + 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> + 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> + 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> + 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS + 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> + 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> + 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> + 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> + 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS + 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> + 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> + 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS + 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> + 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> + 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> + 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> + 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> + 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> + 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> + 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> + 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> + 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> + 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> + 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> + 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> + 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS + 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> + 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> + 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> + 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> + 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> + 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> + 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> + 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> + 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS + 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> + 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> + 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> + 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> + 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> + 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> + 2256614409U, // 
<0,7,u,2>: Cost 3 vrev <7,0,2,u> + 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> + 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> + 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS + 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7> + 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> + 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> + 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS + 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS + 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2> + 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS + 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6> + 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> + 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS + 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> + 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS + 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS + 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS + 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7> + 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS + 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS + 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> + 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS + 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6> + 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS + 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> + 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> + 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> + 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> + 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS + 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> + 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS + 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS + 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6> + 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS + 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS + 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> + 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7> + 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7> + 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS + 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> + 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS + 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS + 2262496983U, // <0,u,6,1>: Cost 3 vrev 
<u,0,1,6> + 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> + 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7> + 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS + 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> + 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> + 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> + 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> + 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS + 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> + 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> + 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> + 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS + 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6> + 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> + 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> + 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS + 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS + 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS + 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS + 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> + 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> + 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> + 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> + 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> + 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> + 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> + 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> + 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> + 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS + 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> + 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> + 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS + 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> + 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> + 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> + 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS + 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> + 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> + 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0> + 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> + 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> + 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> + 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> + 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> + 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> + 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> + 67944550U, // <1,0,3,2>: Cost 1 vrev LHS + 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> + 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS + 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> + 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> + 2592633923U, // 
<1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> + 68386972U, // <1,0,3,u>: Cost 1 vrev LHS + 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> + 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> + 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> + 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> + 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> + 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS + 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> + 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> + 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS + 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> + 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS + 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> + 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> + 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> + 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> + 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS + 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS + 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> + 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> + 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> + 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> + 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> + 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> + 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> + 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> + 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> + 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> + 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> + 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> + 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> + 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> + 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> + 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> + 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> + 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> + 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> + 67985515U, // <1,0,u,2>: Cost 1 vrev LHS + 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> + 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> + 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS + 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0> + 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u> + 68427937U, // <1,0,u,u>: Cost 1 vrev LHS + 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> + 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS + 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> + 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> + 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> + 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> + 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> + 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> + 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> + 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS + 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> + 2622637016U, // 
<1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> + 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> + 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> + 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS + 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> + 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> + 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> + 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> + 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS + 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> + 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> + 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> + 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> + 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> + 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> + 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> + 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS + 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> + 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> + 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> + 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> + 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS + 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS + 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> + 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> + 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> + 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS + 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS + 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> + 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS + 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> + 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> + 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> + 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> + 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> + 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> + 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> + 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> + 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> + 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> + 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> + 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7> + 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> + 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> + 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> + 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> + 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> + 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> + 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> + 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS + 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> + 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> + 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> + 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> 
+ 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+ 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+ 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+ 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+ 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+ 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+ 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+ 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+ 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+ 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+ 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+ 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+ 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+ 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+ 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+ 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+ 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+ 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+ 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+ 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+ 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+ 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+ 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+ 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+ 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+ 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+ 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+ 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+ 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+ 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+ 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+ 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+ 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+ 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+ 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+ 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+ 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+ 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+ 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+ 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+ 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+ 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+ 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+ 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+ 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+ 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+ 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+ 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+ 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+ 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+ 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+ 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+ 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+ 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+ 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+ 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+ 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+ 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+ 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+ 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+ 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+ 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+ 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+ 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+ 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+ 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+ 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+ 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+ 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+ 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+ 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+ 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+ 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+ 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+ 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+ 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+ 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+ 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+ 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+ 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+ 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+ 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+ 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+ 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+ 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+ 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+ 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+ 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+ 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+ 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+ 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+ 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+ 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+ 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+ 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+ 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+ 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+ 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+ 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+ 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+ 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+ 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+ 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+ 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+ 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+ 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+ 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+ 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+ 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+ 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+ 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+ 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+ 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+ 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+ 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+ 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+ 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+ 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+ 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+ 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+ 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+ 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+ 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+ 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+ 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+ 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+ 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+ 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+ 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+ 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+ 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+ 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+ 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+ 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+ 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+ 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+ 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+ 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+ 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+ 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+ 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+ 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+ 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+ 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+ 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+ 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+ 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+ 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+ 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+ 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+ 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+ 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+ 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+ 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+ 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+ 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+ 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+ 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+ 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+ 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+ 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+ 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+ 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+ 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+ 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+ 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+ 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+ 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+ 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+ 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+ 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+ 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+ 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+ 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+ 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+ 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+ 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+ 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+ 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+ 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+ 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+ 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+ 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+ 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+ 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+ 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+ 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+ 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+ 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+ 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+ 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+ 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+ 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+ 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+ 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+ 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+ 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+ 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+ 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+ 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+ 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+ 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+ 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+ 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+ 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+ 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+ 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+ 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+ 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+ 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+ 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+ 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+ 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+ 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+ 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+ 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+ 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+ 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+ 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+ 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+ 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+ 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+ 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+ 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+ 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+ 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+ 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+ 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+ 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+ 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+ 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+ 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+ 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+ 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+ 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+ 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+ 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+ 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+ 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+ 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+ 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+ 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+ 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+ 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+ 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+ 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+ 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+ 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+ 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+ 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+ 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+ 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+ 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+ 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+ 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+ 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+ 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+ 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+ 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+ 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+ 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+ 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+ 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+ 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+ 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+ 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+ 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+ 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+ 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+ 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+ 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+ 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+ 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+ 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+ 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+ 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+ 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+ 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+ 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+ 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+ 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+ 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+ 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+ 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+ 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+ 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+ 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+ 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+ 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+ 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+ 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+ 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+ 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+ 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+ 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+ 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+ 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+ 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+ 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+ 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+ 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+ 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+ 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+ 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+ 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+ 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+ 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+ 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+ 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+ 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+ 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+ 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+ 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+ 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+ 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+ 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+ 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+ 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+ 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+ 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+ 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+ 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+ 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+ 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+ 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+ 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+ 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+ 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+ 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+ 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+ 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+ 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+ 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+ 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+ 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+ 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+ 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+ 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+ 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+ 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+ 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+ 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+ 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+ 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+ 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+ 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+ 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+ 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+ 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+ 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+ 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+ 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+ 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+ 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+ 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+ 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+ 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+ 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+ 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+ 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+ 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+ 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+ 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+ 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+ 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+ 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+ 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+ 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+ 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+ 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+ 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+ 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+ 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+ 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+ 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+ 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+ 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+ 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+ 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+ 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+ 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+ 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+ 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+ 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+ 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+ 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+ 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+ 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+ 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+ 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+ 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+ 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+ 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+ 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+ 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+ 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+ 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+ 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+ 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+ 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+ 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+ 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+ 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+ 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+ 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+ 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+ 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+ 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+ 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+ 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+ 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+ 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+ 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+ 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+ 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+ 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+ 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+ 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+ 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+ 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+ 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+ 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+ 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+ 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+ 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+ 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+ 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+ 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+ 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+ 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+ 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+ 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+ 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+ 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+ 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+ 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+ 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+ 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+ 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+ 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+ 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+ 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+ 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+ 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+ 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+ 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+ 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+ 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+ 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+ 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+ 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+ 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+ 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+ 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+ 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+ 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+ 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+ 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+ 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+ 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+ 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+ 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+ 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+ 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+ 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+ 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+ 115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+ 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+ 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+ 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+ 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+ 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+ 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+ 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+ 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+ 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+ 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+ 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+ 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+ 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+ 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+ 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+ 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+ 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+ 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+ 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+ 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+ 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+ 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+ 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+ 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+ 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+ 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+ 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+ 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+ 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+ 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+ 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+ 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+ 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+ 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+ 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+ 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+ 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+ 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+ 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+ 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+ 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+ 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+ 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+ 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+ 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+ 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+ 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+ 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+ 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+ 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+ 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+ 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+ 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+ 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+ 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+ 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+ 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+ 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+ 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+ 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+ 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+ 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+ 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+ 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+ 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+ 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+ 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+ 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+ 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+ 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+ 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+ 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+ 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+ 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+ 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+ 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+ 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+ 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+ 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+ 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+ 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+ 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+ 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+ 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+ 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+ 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+ 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+ 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+ 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+ 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+ 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+ 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+ 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+ 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+ 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+ 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+ 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+ 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+ 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+ 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+ 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+ 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+ 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+ 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+ 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+ 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+ 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+ 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+ 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+ 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+ 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+ 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+ 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+ 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+ 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+ 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+ 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+ 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+ 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+ 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+ 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+ 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+ 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+ 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+ 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+ 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+ 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+ 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+ 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+ 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+ 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+ 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+ 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+ 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+ 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+ 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+ 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+ 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+ 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+ 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+ 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+ 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+ 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+ 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+ 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+ 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+ 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+ 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+ 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+ 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+ 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+ 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+ 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+ 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+ 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+ 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+ 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+ 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+ 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+ 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+ 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+ 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+ 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+ 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+ 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+ 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+ 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+ 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+ 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+ 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+ 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+ 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+ 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+ 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+ 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+ 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+ 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+ 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+ 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+ 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+ 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+ 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+ 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+ 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+ 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+ 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+ 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+ 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+ 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+ 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+ 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+ 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+ 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+ 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+ 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+ 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+ 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+ 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+ 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+ 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+ 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+ 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+ 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+ 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+ 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+ 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+ 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+ 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+ 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+ 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+ 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+ 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+ 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+ 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+ 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+ 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+ 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+ 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+ 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+ 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+ 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+ 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+ 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+ 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+ 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+ 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+ 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+ 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+ 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+ 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+ 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+ 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+ 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+ 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+ 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+ 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+ 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+ 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+ 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+ 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+ 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+ 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+ 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+ 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+ 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+ 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+ 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+ 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+ 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+ 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+ 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+ 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+ 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+ 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+ 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+ 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+ 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+ 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+ 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+ 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+ 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+ 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+ 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+ 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+ 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+ 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+ 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+ 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+ 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+ 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+ 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+ 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+ 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+ 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+ 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+ 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+ 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+ 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+ 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+ 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+ 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+ 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+ 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+ 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+ 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+ 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+ 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+ 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+ 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+ 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+ 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+ 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+ 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+ 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+ 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+ 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+ 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+ 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+ 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+ 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+ 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+ 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+ 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+ 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+ 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+ 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+ 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+ 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+ 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+ 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+ 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+ 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+ 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+ 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+ 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+ 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+ 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+ 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+ 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+ 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+ 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+ 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+ 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+ 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+ 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+ 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+ 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+ 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+ 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+ 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+ 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+ 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+ 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+ 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+ 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+ 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+ 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+ 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+ 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+ 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+ 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+ 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+ 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+ 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+ 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+ 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+ 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+ 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+ 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+ 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+ 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+ 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+ 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+ 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+ 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+ 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+ 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+ 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+ 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+ 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+ 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+ 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+ 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+ 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+ 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+ 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+ 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+ 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+ 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+ 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+ 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+ 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+ 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+ 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+ 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+ 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+ 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+ 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+ 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+ 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+ 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+ 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+ 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+ 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+ 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+ 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+ 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+ 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+ 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+ 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+ 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+ 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+ 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+ 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+ 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+ 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+ 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+ 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+ 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+ 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+ 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+ 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+ 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+ 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+ 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+ 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+ 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+ 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+ 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+ 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+ 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+ 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+ 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+ 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+ 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+ 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+ 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+ 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+ 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+ 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+ 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+ 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+ 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+ 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+ 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+ 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+ 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+ 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+ 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+ 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+ 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+ 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+ 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+ 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+ 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+ 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+ 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+ 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+ 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+ 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+ 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+ 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+ 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+ 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+ 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+ 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+ 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+ 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+ 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+ 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+ 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+ 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+ 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+ 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+ 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+ 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+ 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+ 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+ 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+ 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+ 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+ 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+ 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+ 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+ 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+ 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+ 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+ 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+ 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+ 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+ 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+ 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+ 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+ 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+ 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+ 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+ 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+ 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+ 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+ 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+ 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+ 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+ 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+ 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+ 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+ 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+ 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+ 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+ 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+ 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+ 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+ 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+ 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+ 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+ 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+ 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+ 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+ 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+ 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+ 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+ 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+ 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+ 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+ 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+ 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+ 1478328422U, // <2,u,2,0>: Cost 2 vext1 
<0,2,u,2>, LHS + 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS + 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS + 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS + 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> + 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> + 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS + 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> + 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> + 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS + 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS + 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS + 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> + 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> + 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4> + 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS + 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS + 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS + 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> + 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> + 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7> + 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS + 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> + 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7> + 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS + 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS + 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> + 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> + 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> + 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS + 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS + 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS + 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS + 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS + 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS + 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS + 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS + 1611448320U, // <3,0,0,0>: Cost 2 
vext3 LHS, <0,0,0,0> + 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> + 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> + 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> + 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1> + 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> + 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> + 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS + 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> + 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS + 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> + 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS + 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> + 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> + 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> + 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> + 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> + 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> + 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> + 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> + 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> + 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> + 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> + 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> + 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> + 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> + 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> + 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> + 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> + 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> + 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6> + 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> + 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> + 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3> + 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS + 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6> + 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6> + 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5> + 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> + 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7> + 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS + 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> + 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> + 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> + 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> + 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1> + 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> + 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, 
<6,6,6,6> + 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> + 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> + 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2> + 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> + 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> + 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7> + 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6> + 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> + 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> + 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7> + 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> + 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> + 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS + 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7> + 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> + 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS + 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS + 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS + 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> + 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS + 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> + 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> + 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> + 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> + 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> + 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> + 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> + 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> + 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> + 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> + 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> + 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> + 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> + 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS + 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> + 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> + 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> + 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS + 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> + 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS + 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> + 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS + 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> + 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> + 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> + 2552237366U, // 
<3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS + 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS + 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS + 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> + 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> + 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> + 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> + 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> + 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> + 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> + 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> + 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> + 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> + 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> + 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> + 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> + 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> + 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS + 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> + 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> + 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS + 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> + 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> + 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> + 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS + 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS + 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> + 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> + 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS + 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> + 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> + 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> + 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> + 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> + 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> + 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS + 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> + 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> + 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> + 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS + 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> + 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> + 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> + 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> + 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> + 1611449960U, // <3,2,2,2>: Cost 2 
vext3 LHS, <2,2,2,2> + 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> + 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> + 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> + 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> + 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> + 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> + 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> + 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> + 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> + 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> + 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> + 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> + 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> + 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS + 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> + 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> + 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS + 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> + 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> + 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS + 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS + 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> + 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> + 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5> + 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7> + 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> + 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> + 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> + 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> + 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> + 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1> + 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2> + 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> + 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> + 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS + 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6> + 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> + 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> + 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7> + 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS + 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> + 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS + 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> + 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> + 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1> + 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> + 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> + 1611450518U, // <3,3,0,1>: 
Cost 2 vext3 LHS, <3,0,1,2> + 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> + 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> + 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> + 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> + 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> + 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7> + 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> + 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> + 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> + 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> + 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> + 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS + 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> + 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> + 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> + 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> + 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS + 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> + 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> + 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> + 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS + 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> + 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> + 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> + 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> + 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS + 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> + 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> + 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS + 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> + 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> + 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> + 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS + 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS + 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> + 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> + 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> + 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> + 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> + 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS + 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> + 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> + 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS + 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> + 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> + 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> + 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS + 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> + 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0> + 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS + 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> + 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> + 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> + 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> + 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> + 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> + 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> + 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, 
<6,7,3,3> + 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> + 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS + 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> + 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> + 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> + 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS + 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> + 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> + 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> + 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS + 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS + 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> + 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3> + 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS + 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> + 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> + 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS + 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> + 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS + 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> + 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> + 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> + 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> + 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> + 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> + 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS + 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> + 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> + 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> + 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> + 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS + 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> + 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> + 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> + 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS + 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> + 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> + 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> + 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> + 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> + 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> + 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> + 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> + 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> + 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> + 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> + 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> + 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1> + 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS + 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS + 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> + 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> + 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS + 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> + 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> + 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 
1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS + 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> + 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> + 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS + 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS + 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> + 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> + 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS + 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> + 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS + 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> + 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS + 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> + 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> + 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3> + 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> + 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> + 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> + 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> + 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2> + 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> + 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> + 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> + 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6> + 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> + 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> + 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7> + 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> + 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS + 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS + 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> + 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1> + 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS + 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS + 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS + 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> + 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS + 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> + 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS + 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> + 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> + 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> + 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> + 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> + 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS + 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS + 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> + 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> + 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> + 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> + 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> + 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> + 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> + 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3> + 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, 
<2,0,5,3> + 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> + 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> + 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> + 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> + 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> + 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> + 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> + 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> + 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> + 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> + 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> + 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> + 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> + 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> + 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> + 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS + 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS + 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS + 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> + 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> + 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> + 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS + 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS + 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> + 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> + 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS + 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> + 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> + 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> + 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS + 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> + 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> + 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> + 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> + 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> + 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> + 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> + 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> + 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> + 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> + 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> + 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> + 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS + 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> + 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> + 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> + 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS + 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> + 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> + 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS + 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS + 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> + 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> + 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> + 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS + 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> + 
2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3> + 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS + 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS + 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> + 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> + 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4> + 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> + 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> + 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> + 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> + 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> + 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> + 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> + 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> + 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> + 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> + 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> + 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> + 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> + 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS + 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> + 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> + 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0> + 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> + 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> + 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> + 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> + 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> + 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> + 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> + 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> + 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> + 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> + 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> + 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> + 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS + 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6> + 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> + 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> + 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> + 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> + 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> + 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> + 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS + 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> + 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS + 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> + 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> + 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> + 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> + 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> + 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> + 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS + 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> + 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> + 2624819706U, // 
<3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> + 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6> + 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> + 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> + 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> + 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> + 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> + 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> + 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> + 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> + 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> + 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> + 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> + 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS + 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> + 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> + 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> + 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u> + 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> + 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> + 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> + 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> + 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1> + 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> + 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> + 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> + 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> + 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> + 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> + 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> + 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> + 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> + 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> + 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> + 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS + 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> + 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> + 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> + 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> + 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS + 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> + 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> + 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> + 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS + 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> + 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> + 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> + 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> + 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> + 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> + 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> + 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> + 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> + 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> + 2639424147U, // <3,7,3,6>: Cost 3 vext2 
<3,6,3,7>, <3,6,3,7> + 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> + 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> + 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS + 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> + 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> + 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> + 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS + 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS + 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> + 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> + 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS + 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> + 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> + 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> + 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> + 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> + 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> + 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> + 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS + 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS + 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> + 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> + 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> + 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> + 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> + 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> + 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> + 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> + 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> + 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS + 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> + 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> + 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> + 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS + 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> + 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> + 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> + 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2> + 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0> + 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS + 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6> + 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS + 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7> + 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS + 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2> + 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2> + 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1> + 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1> + 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2> + 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2> + 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> + 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> + 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS + 1616541495U, // <3,u,1,3>: Cost 2 
vext3 LHS, <u,1,3,3> + 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS + 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> + 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> + 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3> + 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> + 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> + 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0> + 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7> + 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> + 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3> + 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0> + 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1> + 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> + 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2> + 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS + 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5> + 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7> + 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS + 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS + 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5> + 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5> + 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6> + 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6> + 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6> + 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6> + 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS + 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7> + 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> + 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS + 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS + 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7> + 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS + 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1> + 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> + 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7> + 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7> + 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0> + 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7> + 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS + 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> + 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> + 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS + 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2> + 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS + 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1> + 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2> + 537712997U, // <3,u,u,2>: Cost 1 
vext3 LHS, LHS + 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS + 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5> + 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6> + 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS + 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0> + 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS + 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> + 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> + 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> + 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> + 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> + 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> + 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> + 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> + 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> + 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS + 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> + 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> + 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS + 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> + 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> + 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> + 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> + 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> + 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> + 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> + 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> + 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> + 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> + 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> + 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS + 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> + 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> + 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS + 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> + 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> + 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> + 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS + 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> + 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> + 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> + 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS + 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS + 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> + 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> + 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS + 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS + 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS + 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> + 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> + 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS + 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> + 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> + 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, 
<7,4,0,5> + 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS + 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS + 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> + 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS + 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> + 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS + 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> + 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> + 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> + 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS + 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> + 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS + 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS + 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> + 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> + 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> + 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> + 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> + 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS + 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS + 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS + 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> + 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS + 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS + 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> + 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> + 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS + 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> + 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> + 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> + 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> + 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> + 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> + 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS + 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> + 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> + 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> + 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> + 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> + 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> + 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> + 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> + 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> + 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS + 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> + 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> + 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> + 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS + 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> + 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> + 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> + 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> + 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS + 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> + 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> + 2692572139U, // <4,1,3,3>: Cost 3 vext3 
<1,3,3,4>, <1,3,3,4> + 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS + 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> + 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> + 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> + 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> + 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> + 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> + 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS + 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> + 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS + 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> + 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> + 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS + 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> + 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> + 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> + 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> + 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3> + 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> + 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS + 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS + 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> + 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> + 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS + 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS + 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> + 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> + 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> + 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS + 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> + 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> + 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> + 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> + 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> + 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> + 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> + 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS + 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> + 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> + 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> + 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS + 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS + 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> + 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS + 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> + 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS + 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> + 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> + 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> + 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> + 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> + 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> + 
2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS + 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> + 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> + 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> + 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS + 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> + 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> + 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> + 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> + 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> + 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> + 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> + 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> + 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> + 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> + 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> + 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> + 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> + 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> + 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> + 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> + 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> + 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> + 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> + 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> + 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> + 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS + 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> + 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> + 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> + 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS + 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> + 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> + 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS + 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS + 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> + 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> + 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS + 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS + 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> + 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> + 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS + 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS + 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS + 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> + 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> + 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS + 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS + 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> + 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3> + 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2> + 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS + 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> + 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> + 2701199368U, // <4,2,7,3>: 
Cost 3 vext3 <2,7,3,4>, <2,7,3,4> + 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> + 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> + 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> + 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> + 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS + 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> + 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> + 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS + 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS + 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> + 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2> + 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS + 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> + 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> + 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> + 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> + 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> + 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> + 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> + 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> + 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> + 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> + 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> + 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> + 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> + 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> + 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> + 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> + 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> + 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> + 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS + 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> + 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> + 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> + 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS + 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> + 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> + 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> + 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> + 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> + 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> + 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> + 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> + 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> + 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> + 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> + 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> + 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> + 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> + 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> + 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> + 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> + 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> + 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, 
<3,4,7,2> + 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> + 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS + 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> + 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> + 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> + 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS + 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> + 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> + 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> + 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS + 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS + 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> + 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> + 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> + 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS + 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> + 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> + 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> + 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> + 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> + 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> + 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> + 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> + 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> + 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> + 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> + 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> + 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> + 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS + 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> + 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> + 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> + 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS + 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> + 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> + 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> + 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> + 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> + 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> + 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> + 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> + 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> + 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> + 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS + 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> + 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> + 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> + 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> + 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> + 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> + 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> + 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> + 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> + 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> + 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> + 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> + 2631509709U, // 
<4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> + 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> + 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> + 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> + 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> + 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4> + 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> + 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> + 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> + 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> + 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> + 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> + 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> + 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS + 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> + 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> + 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> + 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS + 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS + 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS + 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS + 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS + 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> + 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> + 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> + 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS + 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS + 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> + 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS + 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> + 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> + 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> + 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS + 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> + 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS + 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> + 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS + 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> + 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> + 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> + 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> + 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> + 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> + 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> + 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS + 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> + 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS + 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS + 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> + 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS + 
2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> + 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> + 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> + 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> + 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> + 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> + 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> + 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS + 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2> + 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> + 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> + 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS + 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> + 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> + 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> + 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> + 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS + 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS + 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> + 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> + 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> + 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> + 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> + 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> + 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS + 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> + 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> + 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> + 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> + 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> + 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> + 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> + 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> + 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> + 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> + 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS + 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> + 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> + 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> + 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS + 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> + 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> + 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS + 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS + 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> + 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> + 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> + 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> + 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> + 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> + 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS + 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS + 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS + 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> + 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> + 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, 
<3,4,5,6> + 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS + 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> + 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS + 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> + 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> + 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> + 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS + 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> + 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> + 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> + 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS + 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS + 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0> + 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> + 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS + 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7> + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> + 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> + 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> + 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> + 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> + 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> + 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS + 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS + 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> + 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> + 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> + 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> + 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> + 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> + 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> + 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS + 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> + 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> + 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> + 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> + 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> + 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> + 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> + 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> + 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> + 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> + 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> + 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> + 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> + 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> + 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> + 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> + 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> + 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> + 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> + 
2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS + 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> + 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> + 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> + 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS + 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS + 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> + 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS + 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS + 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3> + 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> + 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> + 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> + 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> + 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> + 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS + 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS + 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS + 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> + 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> + 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> + 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> + 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> + 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> + 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS + 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS + 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> + 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> + 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> + 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> + 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> + 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> + 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> + 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS + 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS + 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1> + 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> + 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS + 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS + 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS + 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> + 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> + 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> + 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> + 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> + 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> + 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> + 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS + 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> + 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> + 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> + 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> + 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS + 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> + 
3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> + 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> + 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> + 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS + 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> + 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> + 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> + 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> + 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> + 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> + 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> + 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> + 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> + 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> + 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> + 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> + 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> + 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> + 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> + 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> + 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> + 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> + 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> + 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> + 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> + 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> + 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS + 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> + 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> + 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS + 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> + 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> + 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> + 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> + 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> + 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> + 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> + 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> + 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> + 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS + 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> + 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> + 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> + 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS + 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> + 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> + 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS + 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS + 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> + 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> + 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> + 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> + 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> + 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> + 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> + 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> + 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS + 2632865582U, // 
<4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> + 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> + 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS + 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> + 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> + 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS + 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> + 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u> + 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2> + 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> + 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0> + 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> + 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> + 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS + 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> + 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> + 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> + 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3> + 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> + 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> + 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3> + 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS + 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> + 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> + 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> + 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> + 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4> + 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> + 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3> + 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> + 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> + 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> + 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> + 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> + 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> + 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7> + 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> + 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> + 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS + 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2> + 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> + 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> + 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS + 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS + 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6> + 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS + 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS + 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> + 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS + 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS + 1618172058U, // <4,u,5,6>: 
Cost 2 vext3 <1,2,3,4>, RHS + 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS + 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS + 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> + 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS + 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> + 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS + 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> + 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS + 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4> + 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> + 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> + 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS + 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> + 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS + 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS + 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> + 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS + 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> + 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> + 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> + 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> + 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> + 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> + 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> + 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> + 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> + 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS + 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> + 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> + 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS + 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> + 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> + 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> + 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> + 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> + 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> + 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> + 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> + 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> + 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> + 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> + 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> + 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> + 3760349459U, // <5,0,3,3>: Cost 4 vext3 
<0,3,3,5>, <0,3,3,5> + 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> + 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> + 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> + 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> + 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> + 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS + 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> + 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> + 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS + 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS + 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5> + 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> + 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS + 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS + 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> + 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> + 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> + 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> + 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS + 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS + 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> + 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> + 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> + 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> + 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> + 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> + 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> + 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS + 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS + 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> + 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> + 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> + 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS + 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> + 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> + 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> + 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS + 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> + 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> + 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> + 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> + 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS + 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> + 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> + 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> + 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> + 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> + 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> + 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> + 3298141357U, // <5,1,0,7>: Cost 4 vrev 
<1,5,7,0> + 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS + 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> + 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> + 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> + 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> + 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> + 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> + 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> + 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> + 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> + 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> + 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> + 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> + 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> + 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> + 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> + 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> + 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0> + 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> + 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS + 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> + 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> + 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> + 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> + 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> + 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> + 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> + 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> + 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> + 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> + 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> + 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> + 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> + 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> + 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> + 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> + 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> + 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> + 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> + 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> + 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> + 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> + 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> + 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> + 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> + 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS + 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> + 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> + 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> + 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS + 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> + 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> + 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> + 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> + 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS + 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> + 2571503270U, // 
<5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> + 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS + 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS + 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> + 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> + 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> + 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS + 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1> + 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> + 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS + 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> + 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7> + 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1> + 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS + 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> + 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> + 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> + 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> + 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> + 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> + 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> + 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS + 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> + 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> + 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> + 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> + 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS + 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> + 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> + 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> + 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> + 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS + 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> + 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> + 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> + 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> + 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> + 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> + 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> + 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> + 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> + 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> + 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> + 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> + 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> + 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> + 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> + 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> + 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> + 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> + 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> + 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> + 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> + 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> + 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2645503353U, // <5,2,4,6>: Cost 3 vext2 
<4,6,5,2>, <4,6,5,2> + 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> + 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS + 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS + 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> + 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> + 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS + 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS + 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> + 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> + 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> + 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS + 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS + 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> + 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> + 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> + 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> + 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> + 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> + 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> + 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> + 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS + 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> + 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> + 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS + 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS + 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> + 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> + 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> + 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS + 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> + 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> + 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> + 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> + 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> + 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> + 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> + 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> + 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> + 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> + 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> + 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> + 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> + 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> + 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> + 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> + 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> + 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> + 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> + 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> + 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> + 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> + 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> + 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> + 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> + 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> + 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> + 
3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> + 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> + 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> + 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> + 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> + 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3> + 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> + 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> + 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> + 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> + 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> + 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> + 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5> + 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> + 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> + 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> + 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> + 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> + 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> + 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> + 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> + 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> + 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> + 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> + 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> + 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS + 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> + 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> + 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> + 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS + 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> + 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> + 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> + 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS + 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS + 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> + 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> + 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> + 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS + 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> + 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> + 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> + 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS + 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS + 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> + 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> + 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> + 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS + 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> + 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3> + 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> + 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS + 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS + 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> + 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> + 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> + 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS + 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, 
<3,u,5,6> + 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3> + 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> + 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS + 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS + 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS + 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2> + 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> + 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> + 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> + 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> + 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> + 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS + 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> + 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> + 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> + 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> + 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> + 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> + 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> + 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> + 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> + 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> + 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> + 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> + 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> + 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> + 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> + 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> + 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> + 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5> + 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS + 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> + 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> + 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> + 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> + 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> + 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> + 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> + 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> + 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS + 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> + 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> + 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> + 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> + 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> + 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> + 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> + 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> + 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS + 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> + 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> + 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> + 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS + 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> + 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS + 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS + 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS + 2559771648U, // <5,4,6,1>: Cost 3 vext1 
<1,5,4,6>, <1,3,5,7> + 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> + 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> + 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS + 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> + 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> + 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> + 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS + 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS + 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> + 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> + 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> + 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS + 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> + 94817590U, // <5,4,7,6>: Cost 1 vrev RHS + 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> + 94965064U, // <5,4,7,u>: Cost 1 vrev RHS + 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS + 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> + 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> + 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4> + 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS + 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> + 94825783U, // <5,4,u,6>: Cost 1 vrev RHS + 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> + 94973257U, // <5,4,u,u>: Cost 1 vrev RHS + 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> + 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> + 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> + 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> + 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> + 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> + 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS + 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS + 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> + 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> + 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> + 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> + 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> + 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> + 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> + 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> + 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> + 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS + 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> + 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> + 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> + 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> + 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> + 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> + 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> + 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> + 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> + 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> + 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> + 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> + 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> + 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> + 3702680247U, // <5,5,3,6>: Cost 4 vext2 
<1,u,5,5>, <3,6,7,7> + 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> + 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> + 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> + 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> + 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> + 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> + 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS + 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> + 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> + 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> + 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> + 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> + 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> + 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS + 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> + 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> + 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS + 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS + 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> + 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> + 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> + 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> + 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> + 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> + 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> + 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> + 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS + 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> + 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> + 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> + 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS + 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> + 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> + 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS + 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS + 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> + 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> + 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS + 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7> + 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS + 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS + 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> + 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS + 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> + 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> + 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> + 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> + 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> + 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS + 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS + 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> + 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> + 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> + 
2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> + 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> + 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> + 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> + 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6> + 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> + 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> + 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> + 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> + 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> + 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> + 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> + 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3> + 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> + 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> + 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> + 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> + 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> + 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> + 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> + 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> + 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS + 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> + 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS + 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> + 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> + 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> + 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> + 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS + 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> + 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> + 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS + 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS + 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> + 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> + 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> + 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> + 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> + 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> + 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS + 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS + 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS + 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> + 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> + 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> + 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS + 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> + 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> + 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS + 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS + 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS + 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> + 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> + 1504106092U, // <5,6,7,7>: 
Cost 2 vext1 RHS, <7,7,7,7> + 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS + 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS + 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS + 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS + 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS + 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> + 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS + 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> + 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> + 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> + 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5> + 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> + 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> + 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> + 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> + 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> + 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> + 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> + 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS + 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> + 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> + 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> + 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> + 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> + 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> + 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> + 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> + 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> + 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> + 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> + 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> + 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> + 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> + 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> + 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> + 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> + 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> + 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> + 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> + 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> + 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> + 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS + 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> + 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> + 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> + 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS + 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> + 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS + 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS + 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> + 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> + 2625638262U, // 
<5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> + 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS + 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> + 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> + 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS + 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS + 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0> + 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> + 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> + 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> + 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> + 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> + 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> + 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> + 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS + 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> + 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> + 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> + 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS + 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> + 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> + 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> + 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS + 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS + 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0> + 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS + 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS + 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7> + 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS + 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> + 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS + 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> + 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2> + 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1> + 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1> + 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0> + 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> + 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS + 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> + 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> + 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> + 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> + 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0> + 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> + 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> + 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> + 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3> + 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> + 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> + 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> + 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3> + 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, 
<2,u,0,1> + 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1> + 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> + 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> + 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> + 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> + 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0> + 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5> + 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> + 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> + 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> + 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5> + 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6> + 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5> + 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS + 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> + 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS + 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> + 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3> + 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7> + 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS + 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS + 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS + 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS + 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> + 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7> + 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS + 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS + 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7> + 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS + 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> + 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS + 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS + 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 118708378U, // <5,u,7,6>: Cost 1 vrev RHS + 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS + 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS + 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS + 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS + 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS + 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS + 118716571U, // <5,u,u,6>: Cost 1 vrev RHS + 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS + 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS + 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> + 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> + 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> + 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> + 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> + 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> + 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> + 3926675786U, // 
<6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> + 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> + 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS + 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> + 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> + 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS + 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> + 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> + 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> + 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> + 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> + 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> + 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> + 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> + 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> + 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6> + 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> + 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> + 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> + 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> + 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> + 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> + 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> + 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> + 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> + 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> + 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> + 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> + 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> + 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> + 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> + 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> + 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> + 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> + 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> + 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS + 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> + 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> + 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> + 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> + 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> + 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> + 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> + 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS + 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS + 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> + 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> + 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> + 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> + 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> + 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS + 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS + 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, 
<1,2,3,0> + 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> + 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> + 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS + 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> + 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> + 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> + 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS + 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> + 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> + 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> + 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> + 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> + 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS + 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS + 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> + 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> + 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS + 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> + 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> + 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> + 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> + 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> + 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> + 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> + 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> + 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> + 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> + 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> + 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> + 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> + 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS + 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> + 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> + 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> + 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS + 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> + 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> + 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> + 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> + 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS + 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> + 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> + 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> + 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> + 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> + 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> + 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> + 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> + 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> + 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> + 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> + 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> + 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS + 2693399632U, // <6,1,4,5>: Cost 3 vext3 
<1,4,5,6>, <1,4,5,6> + 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> + 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> + 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> + 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> + 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> + 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> + 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> + 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> + 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> + 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> + 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS + 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> + 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS + 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> + 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> + 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS + 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS + 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> + 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> + 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> + 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS + 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS + 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> + 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> + 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS + 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS + 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> + 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> + 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> + 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS + 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS + 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> + 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> + 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> + 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> + 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> + 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> + 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> + 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> + 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> + 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> + 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> + 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> + 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> + 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> + 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> + 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS + 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> + 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> + 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> + 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS + 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> + 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> + 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> + 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> + 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> + 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> 
+ 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> + 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> + 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> + 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> + 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> + 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> + 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> + 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> + 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> + 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> + 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> + 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> + 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> + 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> + 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> + 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> + 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1> + 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> + 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> + 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> + 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> + 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> + 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> + 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> + 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> + 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> + 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> + 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> + 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> + 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> + 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> + 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> + 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> + 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> + 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> + 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> + 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> + 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> + 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> + 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> + 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> + 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS + 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> + 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> + 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS + 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS + 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> + 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> + 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS + 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2> + 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> + 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS + 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, 
<2,u,4,5> + 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> + 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS + 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> + 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> + 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> + 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> + 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> + 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> + 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> + 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> + 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> + 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> + 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> + 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> + 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> + 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> + 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> + 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> + 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> + 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> + 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> + 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> + 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> + 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> + 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> + 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> + 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> + 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> + 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> + 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> + 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> + 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> + 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> + 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> + 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS + 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> + 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> + 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> + 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS + 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> + 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> + 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> + 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> + 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS + 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> + 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> + 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> + 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> + 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> + 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> + 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> + 2706229870U, // 
<6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> + 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS + 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> + 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> + 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> + 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6> + 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> + 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> + 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> + 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS + 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS + 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> + 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> + 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> + 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS + 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3> + 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> + 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS + 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS + 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> + 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> + 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> + 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS + 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> + 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> + 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS + 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> + 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> + 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> + 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> + 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> + 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> + 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> + 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS + 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> + 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> + 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> + 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS + 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS + 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> + 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> + 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> + 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> + 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> + 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> + 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> + 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> + 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> + 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS + 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> + 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> + 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> + 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> + 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> + 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> + 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> + 
2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> + 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> + 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> + 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> + 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6> + 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS + 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> + 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> + 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> + 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> + 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> + 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS + 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> + 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> + 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> + 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS + 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> + 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> + 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> + 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> + 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS + 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> + 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> + 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> + 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS + 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS + 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> + 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> + 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> + 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS + 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> + 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> + 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> + 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS + 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> + 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> + 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> + 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS + 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> + 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS + 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> + 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> + 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> + 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> + 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> + 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> + 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> + 
2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> + 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> + 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> + 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS + 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> + 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7> + 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> + 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> + 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> + 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS + 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> + 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> + 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> + 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS + 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> + 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> + 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS + 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> + 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2> + 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> + 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> + 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> + 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> + 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> + 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> + 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> + 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> + 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS + 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> + 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> + 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> + 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS + 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS + 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> + 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> + 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> + 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS + 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> + 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> + 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> + 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS + 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> + 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> + 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> + 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> + 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> + 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> + 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> + 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> + 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> + 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> + 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> + 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> + 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> + 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS + 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> + 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> + 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, 
<3,0,1,2> + 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS + 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> + 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> + 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS + 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS + 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS + 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> + 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> + 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7> + 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS + 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS + 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> + 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> + 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> + 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS + 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> + 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> + 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> + 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3> + 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> + 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS + 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS + 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> + 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> + 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> + 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> + 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> + 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> + 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> + 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS + 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> + 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> + 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> + 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> + 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> + 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> + 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> + 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> + 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> + 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> + 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> + 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> + 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> + 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> + 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> + 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> + 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS + 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> + 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS + 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> + 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> + 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> + 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS + 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> + 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> + 1573195304U, // <6,6,4,u>: Cost 
2 vext2 <4,u,6,6>, <4,u,6,6> + 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS + 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> + 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> + 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> + 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> + 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> + 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> + 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS + 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS + 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS + 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> + 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> + 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> + 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS + 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> + 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS + 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> + 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS + 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS + 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> + 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> + 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> + 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS + 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> + 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> + 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS + 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS + 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS + 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> + 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS + 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS + 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS + 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS + 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS + 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS + 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> + 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> + 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> + 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS + 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> + 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> + 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> + 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> + 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> + 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> + 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 
2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> + 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> + 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> + 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> + 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> + 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS + 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> + 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS + 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> + 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> + 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> + 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> + 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> + 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> + 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> + 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> + 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> + 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> + 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> + 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS + 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS + 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1> + 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS + 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS + 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2> + 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> + 2644959734U, // <6,u,0,6>: 
Cost 3 vext2 RHS, <0,6,1,7> + 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0> + 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS + 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS + 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3> + 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS + 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS + 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> + 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3> + 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> + 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7> + 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5> + 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6> + 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6> + 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS + 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6> + 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS + 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS + 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> + 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7> + 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS + 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7> + 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS + 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5> + 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS + 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS + 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS + 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7> + 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> + 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS + 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS + 2980495761U, // 
<6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5> + 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7> + 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS + 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS + 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS + 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS + 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS + 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS + 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS + 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS + 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS + 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> + 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> + 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0> + 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> + 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6> + 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> + 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7> + 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> + 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS + 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5> + 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS + 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7> + 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS + 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7> + 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> + 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0> + 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> + 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0> + 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1> + 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7> + 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7> + 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> + 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2> + 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> + 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0> + 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> + 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> + 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> + 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0> + 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7> + 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0> + 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4> + 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> + 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> + 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4> + 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6> + 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS + 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6> + 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5> + 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> + 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS + 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3> + 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 2649616239U, // 
<7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> + 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> + 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> + 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> + 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7> + 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> + 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7> + 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7> + 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> + 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7> + 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> + 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> + 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1> + 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7> + 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2> + 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS + 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS + 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7> + 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> + 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> + 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> + 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7> + 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2> + 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> + 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> + 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS + 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u> + 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> + 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS + 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> + 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1> + 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS + 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS + 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS + 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS + 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> + 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS + 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0> + 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1> + 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> + 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1> + 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6> + 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> + 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> + 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> + 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> + 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5> + 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> + 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1> + 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3> + 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2> + 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0> + 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5> + 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3> + 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2> + 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0> + 1638319043U, // <7,1,2,u>: Cost 2 
vext3 RHS, <1,2,u,0> + 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0> + 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> + 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7> + 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5> + 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7> + 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> + 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3> + 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> + 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> + 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> + 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> + 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5> + 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5> + 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS + 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1> + 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0> + 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS + 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS + 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> + 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6> + 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> + 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS + 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7> + 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1> + 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1> + 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> + 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7> + 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> + 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7> + 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7> + 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> + 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7> + 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1> + 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7> + 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> + 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1> + 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3> + 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS + 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS + 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7> + 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0> + 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7> + 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS + 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS + 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> + 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0> + 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7> + 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS + 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> + 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7> + 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS + 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7> + 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> + 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> + 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0> + 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0> + 2712061404U, // <7,2,0,4>: Cost 3 vext3 
RHS, <2,0,4,6> + 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> + 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> + 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2> + 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7> + 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> + 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0> + 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> + 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> + 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> + 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> + 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> + 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> + 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> + 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> + 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> + 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> + 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> + 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> + 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> + 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> + 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> + 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> + 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> + 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> + 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> + 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> + 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> + 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> + 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> + 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> + 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> + 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> + 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> + 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> + 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> + 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> + 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> + 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7> + 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> + 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> + 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> + 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> + 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> + 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7> + 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> + 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> + 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> + 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS + 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> + 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> + 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> + 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS + 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> + 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> + 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> + 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> + 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> + 3780200434U, 
// <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> + 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> + 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS + 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS + 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7> + 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> + 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7> + 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS + 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> + 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> + 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> + 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> + 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> + 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> + 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> + 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> + 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> + 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> + 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> + 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> + 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> + 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> + 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> + 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> + 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> + 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> + 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> + 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> + 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> + 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> + 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> + 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> + 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> + 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> + 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> + 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> + 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> + 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> + 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5> + 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> + 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> + 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> + 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> + 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> + 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> + 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> + 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> + 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> + 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> + 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> + 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> + 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> + 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> + 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> + 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> + 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> + 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> + 
3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> + 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> + 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS + 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> + 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> + 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> + 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> + 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> + 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> + 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> + 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> + 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> + 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> + 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> + 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> + 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> + 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> + 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> + 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> + 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> + 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> + 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> + 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> + 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> + 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> + 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> + 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> + 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> + 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> + 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> + 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> + 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3> + 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> + 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> + 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> + 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> + 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> + 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> + 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> + 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> + 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> + 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> + 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> + 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> + 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> + 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> + 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> + 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> + 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> + 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> + 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> + 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> + 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> + 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> + 2651637414U, // 
<7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> + 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> + 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> + 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> + 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> + 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> + 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> + 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> + 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> + 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> + 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> + 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> + 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> + 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> + 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> + 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> + 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> + 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> + 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> + 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> + 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5> + 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS + 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> + 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> + 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> + 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS + 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> + 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS + 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> + 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS + 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> + 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> + 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> + 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> + 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> + 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> + 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> + 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> + 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> + 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> + 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> + 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1> + 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> + 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> + 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> + 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS + 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> + 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> + 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> + 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> + 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS + 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> + 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS + 2579038310U, // 
<7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS + 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS + 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> + 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> + 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> + 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> + 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> + 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> + 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> + 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> + 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> + 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> + 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> + 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> + 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> + 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> + 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> + 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> + 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> + 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> + 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> + 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> + 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS + 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> + 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> + 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> + 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS + 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> + 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> + 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> + 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> + 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS + 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> + 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> + 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4> + 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS + 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS + 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> + 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> + 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> + 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3> + 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> + 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> + 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> + 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> + 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> + 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> + 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> + 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> + 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> + 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> + 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712064096U, // 
<7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> + 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> + 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS + 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> + 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> + 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> + 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS + 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> + 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> + 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS + 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> + 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> + 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS + 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> + 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> + 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> + 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> + 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> + 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> + 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> + 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> + 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> + 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> + 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> + 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS + 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> + 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> + 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> + 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> + 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS + 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> + 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> + 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> + 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> + 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> + 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> + 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> + 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> + 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> + 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> + 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> + 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> + 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> + 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> + 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> + 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> + 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> + 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> + 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> + 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> + 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> + 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> + 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> + 2712212100U, // 
<7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> + 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS + 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> + 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> + 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS + 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS + 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> + 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> + 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> + 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> + 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> + 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> + 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> + 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> + 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> + 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> + 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> + 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> + 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> + 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> + 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> + 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> + 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> + 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> + 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> + 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> + 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> + 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> + 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> + 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> + 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> + 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> + 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> + 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> + 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS + 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> + 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> + 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> + 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> + 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> + 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> + 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> + 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> + 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> + 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> + 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> + 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> + 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> + 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> + 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> + 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS + 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> + 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> + 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> + 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> + 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> + 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> + 2712065189U, // 
<7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> + 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> + 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> + 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> + 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> + 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3> + 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3> + 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> + 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> + 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> + 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> + 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> + 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> + 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> + 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> + 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> + 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> + 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> + 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> + 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> + 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> + 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> + 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> + 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> + 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> + 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS + 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> + 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> + 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> + 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> + 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> + 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> + 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> + 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> + 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS + 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> + 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> + 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> + 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS + 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> + 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> + 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> + 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> + 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS + 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> + 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> + 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> + 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS + 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7> + 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> + 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS + 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS + 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> + 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> + 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> + 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS + 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> + 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7> + 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS + 363253046U, // 
<7,7,u,u>: Cost 1 vdup3 RHS + 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2> + 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2> + 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2> + 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1> + 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1> + 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS + 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2> + 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS + 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS + 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3> + 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS + 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3> + 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> + 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0> + 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0> + 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4> + 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3> + 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0> + 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1> + 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6> + 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5> + 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7> + 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7> + 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0> + 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1> + 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1> + 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5> + 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6> + 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5> + 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6> + 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6> + 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6> + 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS + 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> + 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3> + 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7> + 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> + 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> + 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS + 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7> + 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS + 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1> + 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7> + 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> + 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7> + 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7> + 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> + 
1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0> + 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7> + 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1> + 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3> + 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6> + 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7> + 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5> + 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> + 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2> + 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS + 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1> + 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2> + 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS + 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0> + 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5> + 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6> + 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS + 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS + 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS + 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS + 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS + 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3> + 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0> + 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS + 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS + 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS + 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS + 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3> + 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS + 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7> + 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1> + 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2> + 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS + 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS + 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7> + 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 72589981U, // <u,0,3,2>: Cost 1 vrev LHS + 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3> + 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6> + 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0> + 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0> + 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3> + 73032403U, // <u,0,3,u>: Cost 1 vrev LHS + 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u> + 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2> + 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS + 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS + 2651680116U, // <u,0,4,6>: 
Cost 3 vext2 <5,6,u,0>, <4,6,4,6> + 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS + 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS + 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0> + 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5> + 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5> + 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0> + 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0> + 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS + 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS + 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0> + 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS + 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u> + 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6> + 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1> + 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS + 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS + 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0> + 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7> + 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7> + 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS + 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u> + 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7> + 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7> + 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS + 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS + 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS + 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS + 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u> + 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS + 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1> + 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS + 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1> + 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS + 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0> + 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0> + 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS + 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS + 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2> + 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS + 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1> + 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS + 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS + 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1> + 835584U, // <u,1,2,3>: Cost 0 copy LHS + 1499712822U, // <u,1,2,4>: Cost 2 vext1 
<3,u,1,2>, RHS + 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7> + 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2> + 835584U, // <u,1,2,u>: Cost 0 copy LHS + 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS + 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS + 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS + 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1> + 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1> + 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u> + 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u> + 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4> + 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS + 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS + 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1> + 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4> + 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4> + 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS + 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2> + 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2> + 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS + 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1> + 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1> + 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS + 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS + 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS + 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1> + 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1> + 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1> + 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS + 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7> + 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1> + 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS + 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS + 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7> + 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u> + 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7> + 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS + 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS + 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS + 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 835584U, // <u,1,u,3>: Cost 0 copy LHS + 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS + 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u> + 835584U, // <u,1,u,u>: Cost 0 copy LHS + 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2> + 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS + 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2> + 2685232589U, // 
<u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5> + 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7> + 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u> + 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS + 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2> + 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1> + 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0> + 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS + 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7> + 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1> + 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS + 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2> + 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS + 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS + 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7> + 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS + 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS + 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS + 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS + 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS + 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4> + 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5> + 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS + 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2> + 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS + 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS + 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0> + 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5> + 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5> + 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS + 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5> + 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS + 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2> + 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2> + 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS + 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1> + 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2> + 2573566710U, // <u,2,7,1>: Cost 3 
vext1 <3,u,2,7>, <1,0,3,2> + 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7> + 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS + 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS + 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6> + 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7> + 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS + 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS + 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS + 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS + 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS + 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS + 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS + 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS + 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0> + 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0> + 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0> + 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS + 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u> + 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS + 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2> + 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS + 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u> + 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3> + 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3> + 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS + 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3> + 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS + 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS + 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4> + 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2> + 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS + 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS + 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4> + 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS + 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS + 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3> + 2567653106U, 
// <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5> + 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6> + 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS + 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS + 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6> + 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3> + 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u> + 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6> + 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6> + 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS + 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7> + 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7> + 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2> + 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS + 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u> + 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u> + 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS + 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS + 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> + 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS + 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS + 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> + 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS + 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS + 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0> + 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS + 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4> + 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5> + 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS + 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS + 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1> + 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4> + 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3> + 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS + 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS + 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4> + 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS + 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u> + 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2> + 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5> + 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS + 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS + 2625169558U, // <u,4,3,0>: Cost 3 
vext2 <1,2,u,4>, <3,0,1,2> + 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4> + 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3> + 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3> + 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6> + 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> + 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> + 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS + 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4> + 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4> + 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS + 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS + 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS + 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS + 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5> + 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2> + 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS + 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS + 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS + 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS + 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS + 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2> + 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2> + 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2> + 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS + 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS + 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS + 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS + 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4> + 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7> + 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7> + 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS + 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 96808489U, // <u,4,7,6>: Cost 1 vrev RHS + 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7> + 96955963U, // <u,4,7,u>: Cost 1 vrev RHS + 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS + 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS + 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u> + 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2> + 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS + 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS + 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS + 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS + 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0> + 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS + 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5> + 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5> + 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5> + 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0> + 
1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS + 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS + 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5> + 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0> + 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7> + 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS + 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5> + 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3> + 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS + 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7> + 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5> + 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5> + 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS + 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7> + 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS + 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5> + 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS + 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3> + 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u> + 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3> + 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6> + 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5> + 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS + 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS + 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS + 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5> + 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4> + 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5> + 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS + 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5> + 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS + 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS + 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7> + 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2> + 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2> + 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS + 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS + 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS + 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS + 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6> + 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6> + 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6> + 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS + 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 27705344U, // <u,5,6,7>: Cost 0 copy RHS + 27705344U, // <u,5,6,u>: Cost 0 copy RHS + 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS + 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7> + 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2> + 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2> + 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS + 1638330536U, // <u,5,7,5>: 
Cost 2 vext3 RHS, <5,7,5,7> + 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS + 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS + 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS + 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u> + 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2> + 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u> + 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS + 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS + 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 27705344U, // <u,5,u,7>: Cost 0 copy RHS + 27705344U, // <u,5,u,u>: Cost 0 copy RHS + 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0> + 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6> + 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0> + 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6> + 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u> + 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0> + 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS + 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS + 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1> + 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1> + 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0> + 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3> + 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6> + 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7> + 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS + 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1> + 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS + 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2> + 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2> + 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1> + 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS + 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6> + 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7> + 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2> + 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3> + 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3> + 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3> + 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6> + 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6> + 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS + 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS + 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS + 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3> + 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4> + 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u> + 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS + 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> + 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS + 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS + 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS + 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3> + 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7> + 2576501250U, // <u,6,5,3>: Cost 3 
vext1 <4,3,6,5>, <3,4,5,6> + 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6> + 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5> + 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0> + 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS + 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5> + 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS + 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2> + 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3> + 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2> + 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS + 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3> + 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS + 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS + 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS + 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS + 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS + 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS + 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS + 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS + 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS + 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS + 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS + 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS + 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS + 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0> + 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0> + 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7> + 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2> + 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS + 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7> + 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS + 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7> + 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7> + 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u> + 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7> + 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7> + 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2> + 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7> + 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7> + 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7> + 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u> + 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7> + 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u> + 2633820523U, // <u,7,3,2>: 
Cost 3 vext2 <2,6,u,7>, <3,2,6,u> + 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3> + 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3> + 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7> + 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7> + 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4> + 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS + 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0> + 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS + 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS + 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS + 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6> + 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7> + 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2> + 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7> + 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7> + 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7> + 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS + 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS + 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS + 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS + 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS + 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS + 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS + 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0> + 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0> + 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS + 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS + 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, 
LHS + 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS + 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1> + 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS + 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS + 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2> + 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS + 835584U, // <u,u,2,3>: Cost 0 copy LHS + 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS + 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 835584U, // <u,u,2,u>: Cost 0 copy LHS + 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS + 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 120371557U, // <u,u,3,2>: Cost 1 vrev LHS + 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS + 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS + 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS + 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS + 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS + 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4> + 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4> + 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS + 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS + 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS + 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS + 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5> + 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS + 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS + 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS + 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS + 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS + 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS + 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2> + 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS + 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6> + 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS + 27705344U, // <u,u,6,7>: Cost 0 copy RHS + 27705344U, // <u,u,6,u>: Cost 0 copy RHS + 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS + 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7> + 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7> + 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS + 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS + 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 120699277U, // <u,u,7,6>: Cost 1 vrev RHS + 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS + 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS + 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS + 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS + 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS + 835584U, // <u,u,u,3>: Cost 0 copy LHS + 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS + 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS + 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS + 27705344U, // 
<u,u,u,7>: Cost 0 copy RHS + 835584U, // <u,u,u,u>: Cost 0 copy LHS + 0 +}; + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp new file mode 100644 index 0000000..e6e8cdf --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp @@ -0,0 +1,19 @@ +//===-- ARMRegisterInfo.cpp - ARM Register Information --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARMRegisterInfo.h" +using namespace llvm; + +void ARMRegisterInfo::anchor() { } + +ARMRegisterInfo::ARMRegisterInfo() : ARMBaseRegisterInfo() {} diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h new file mode 100644 index 0000000..e2e650e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h @@ -0,0 +1,31 @@ +//===-- ARMRegisterInfo.h - ARM Register Information Impl -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMREGISTERINFO_H +#define LLVM_LIB_TARGET_ARM_ARMREGISTERINFO_H + +#include "ARMBaseRegisterInfo.h" + +namespace llvm { + +class ARMSubtarget; + +struct ARMRegisterInfo : public ARMBaseRegisterInfo { + virtual void anchor(); +public: + ARMRegisterInfo(); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td new file mode 100644 index 0000000..02cbfb1 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -0,0 +1,430 @@ +//===-- ARMRegisterInfo.td - ARM Register defs -------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the ARM register file +//===----------------------------------------------------------------------===// + +// Registers are identified with 4-bit ID numbers. +class ARMReg<bits<16> Enc, string n, list<Register> subregs = []> : Register<n> { + let HWEncoding = Enc; + let Namespace = "ARM"; + let SubRegs = subregs; + // All bits of ARM registers with sub-registers are covered by sub-registers. + let CoveredBySubRegs = 1; +} + +class ARMFReg<bits<16> Enc, string n> : Register<n> { + let HWEncoding = Enc; + let Namespace = "ARM"; +} + +// Subregister indices. +let Namespace = "ARM" in { +def qqsub_0 : SubRegIndex<256>; +def qqsub_1 : SubRegIndex<256, 256>; + +// Note: Code depends on these having consecutive numbers. 
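+// To make the composed indices below concrete (an explanatory sketch based
+// on the definitions that follow): in a 512-bit tuple such as Q0_Q1_Q2_Q3,
+// qsub_2 composes qqsub_1 with qsub_0 and therefore selects Q2, while
+// dsub_5 composes qsub_2 with dsub_1 and selects D5.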
+def qsub_0 : SubRegIndex<128>; +def qsub_1 : SubRegIndex<128, 128>; +def qsub_2 : ComposedSubRegIndex<qqsub_1, qsub_0>; +def qsub_3 : ComposedSubRegIndex<qqsub_1, qsub_1>; + +def dsub_0 : SubRegIndex<64>; +def dsub_1 : SubRegIndex<64, 64>; +def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>; +def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>; +def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>; +def dsub_5 : ComposedSubRegIndex<qsub_2, dsub_1>; +def dsub_6 : ComposedSubRegIndex<qsub_3, dsub_0>; +def dsub_7 : ComposedSubRegIndex<qsub_3, dsub_1>; + +def ssub_0 : SubRegIndex<32>; +def ssub_1 : SubRegIndex<32, 32>; +def ssub_2 : ComposedSubRegIndex<dsub_1, ssub_0>; +def ssub_3 : ComposedSubRegIndex<dsub_1, ssub_1>; + +def gsub_0 : SubRegIndex<32>; +def gsub_1 : SubRegIndex<32, 32>; +// Let TableGen synthesize the remaining 12 ssub_* indices. +// We don't need to name them. +} + +// Integer registers +def R0 : ARMReg< 0, "r0">, DwarfRegNum<[0]>; +def R1 : ARMReg< 1, "r1">, DwarfRegNum<[1]>; +def R2 : ARMReg< 2, "r2">, DwarfRegNum<[2]>; +def R3 : ARMReg< 3, "r3">, DwarfRegNum<[3]>; +def R4 : ARMReg< 4, "r4">, DwarfRegNum<[4]>; +def R5 : ARMReg< 5, "r5">, DwarfRegNum<[5]>; +def R6 : ARMReg< 6, "r6">, DwarfRegNum<[6]>; +def R7 : ARMReg< 7, "r7">, DwarfRegNum<[7]>; +// These require 32-bit instructions. +let CostPerUse = 1 in { +def R8 : ARMReg< 8, "r8">, DwarfRegNum<[8]>; +def R9 : ARMReg< 9, "r9">, DwarfRegNum<[9]>; +def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>; +def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>; +def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>; +def SP : ARMReg<13, "sp">, DwarfRegNum<[13]>; +def LR : ARMReg<14, "lr">, DwarfRegNum<[14]>; +def PC : ARMReg<15, "pc">, DwarfRegNum<[15]>; +} + +// Float registers +def S0 : ARMFReg< 0, "s0">; def S1 : ARMFReg< 1, "s1">; +def S2 : ARMFReg< 2, "s2">; def S3 : ARMFReg< 3, "s3">; +def S4 : ARMFReg< 4, "s4">; def S5 : ARMFReg< 5, "s5">; +def S6 : ARMFReg< 6, "s6">; def S7 : ARMFReg< 7, "s7">; +def S8 : ARMFReg< 8, "s8">; def S9 : ARMFReg< 9, "s9">; +def S10 : ARMFReg<10, "s10">; def S11 : ARMFReg<11, "s11">; +def S12 : ARMFReg<12, "s12">; def S13 : ARMFReg<13, "s13">; +def S14 : ARMFReg<14, "s14">; def S15 : ARMFReg<15, "s15">; +def S16 : ARMFReg<16, "s16">; def S17 : ARMFReg<17, "s17">; +def S18 : ARMFReg<18, "s18">; def S19 : ARMFReg<19, "s19">; +def S20 : ARMFReg<20, "s20">; def S21 : ARMFReg<21, "s21">; +def S22 : ARMFReg<22, "s22">; def S23 : ARMFReg<23, "s23">; +def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">; +def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">; +def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">; +def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">; + +// Aliases of the F* registers used to hold 64-bit fp values (doubles) +let SubRegIndices = [ssub_0, ssub_1] in { +def D0 : ARMReg< 0, "d0", [S0, S1]>, DwarfRegNum<[256]>; +def D1 : ARMReg< 1, "d1", [S2, S3]>, DwarfRegNum<[257]>; +def D2 : ARMReg< 2, "d2", [S4, S5]>, DwarfRegNum<[258]>; +def D3 : ARMReg< 3, "d3", [S6, S7]>, DwarfRegNum<[259]>; +def D4 : ARMReg< 4, "d4", [S8, S9]>, DwarfRegNum<[260]>; +def D5 : ARMReg< 5, "d5", [S10, S11]>, DwarfRegNum<[261]>; +def D6 : ARMReg< 6, "d6", [S12, S13]>, DwarfRegNum<[262]>; +def D7 : ARMReg< 7, "d7", [S14, S15]>, DwarfRegNum<[263]>; +def D8 : ARMReg< 8, "d8", [S16, S17]>, DwarfRegNum<[264]>; +def D9 : ARMReg< 9, "d9", [S18, S19]>, DwarfRegNum<[265]>; +def D10 : ARMReg<10, "d10", [S20, S21]>, DwarfRegNum<[266]>; +def D11 : ARMReg<11, "d11", [S22, S23]>, DwarfRegNum<[267]>; +def D12 : ARMReg<12, 
"d12", [S24, S25]>, DwarfRegNum<[268]>; +def D13 : ARMReg<13, "d13", [S26, S27]>, DwarfRegNum<[269]>; +def D14 : ARMReg<14, "d14", [S28, S29]>, DwarfRegNum<[270]>; +def D15 : ARMReg<15, "d15", [S30, S31]>, DwarfRegNum<[271]>; +} + +// VFP3 defines 16 additional double registers +def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>; +def D17 : ARMFReg<17, "d17">, DwarfRegNum<[273]>; +def D18 : ARMFReg<18, "d18">, DwarfRegNum<[274]>; +def D19 : ARMFReg<19, "d19">, DwarfRegNum<[275]>; +def D20 : ARMFReg<20, "d20">, DwarfRegNum<[276]>; +def D21 : ARMFReg<21, "d21">, DwarfRegNum<[277]>; +def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>; +def D23 : ARMFReg<23, "d23">, DwarfRegNum<[279]>; +def D24 : ARMFReg<24, "d24">, DwarfRegNum<[280]>; +def D25 : ARMFReg<25, "d25">, DwarfRegNum<[281]>; +def D26 : ARMFReg<26, "d26">, DwarfRegNum<[282]>; +def D27 : ARMFReg<27, "d27">, DwarfRegNum<[283]>; +def D28 : ARMFReg<28, "d28">, DwarfRegNum<[284]>; +def D29 : ARMFReg<29, "d29">, DwarfRegNum<[285]>; +def D30 : ARMFReg<30, "d30">, DwarfRegNum<[286]>; +def D31 : ARMFReg<31, "d31">, DwarfRegNum<[287]>; + +// Advanced SIMD (NEON) defines 16 quad-word aliases +let SubRegIndices = [dsub_0, dsub_1] in { +def Q0 : ARMReg< 0, "q0", [D0, D1]>; +def Q1 : ARMReg< 1, "q1", [D2, D3]>; +def Q2 : ARMReg< 2, "q2", [D4, D5]>; +def Q3 : ARMReg< 3, "q3", [D6, D7]>; +def Q4 : ARMReg< 4, "q4", [D8, D9]>; +def Q5 : ARMReg< 5, "q5", [D10, D11]>; +def Q6 : ARMReg< 6, "q6", [D12, D13]>; +def Q7 : ARMReg< 7, "q7", [D14, D15]>; +} +let SubRegIndices = [dsub_0, dsub_1] in { +def Q8 : ARMReg< 8, "q8", [D16, D17]>; +def Q9 : ARMReg< 9, "q9", [D18, D19]>; +def Q10 : ARMReg<10, "q10", [D20, D21]>; +def Q11 : ARMReg<11, "q11", [D22, D23]>; +def Q12 : ARMReg<12, "q12", [D24, D25]>; +def Q13 : ARMReg<13, "q13", [D26, D27]>; +def Q14 : ARMReg<14, "q14", [D28, D29]>; +def Q15 : ARMReg<15, "q15", [D30, D31]>; +} + +// Current Program Status Register. +// We model fpscr with two registers: FPSCR models the control bits and will be +// reserved. FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV +// models the APSR when it's accessed by some special instructions. In such cases +// it has the same encoding as PC. +def CPSR : ARMReg<0, "cpsr">; +def APSR : ARMReg<1, "apsr">; +def APSR_NZCV : ARMReg<15, "apsr_nzcv">; +def SPSR : ARMReg<2, "spsr">; +def FPSCR : ARMReg<3, "fpscr">; +def FPSCR_NZCV : ARMReg<3, "fpscr_nzcv"> { + let Aliases = [FPSCR]; +} +def ITSTATE : ARMReg<4, "itstate">; + +// Special Registers - only available in privileged mode. +def FPSID : ARMReg<0, "fpsid">; +def MVFR2 : ARMReg<5, "mvfr2">; +def MVFR1 : ARMReg<6, "mvfr1">; +def MVFR0 : ARMReg<7, "mvfr0">; +def FPEXC : ARMReg<8, "fpexc">; +def FPINST : ARMReg<9, "fpinst">; +def FPINST2 : ARMReg<10, "fpinst2">; + +// Register classes. +// +// pc == Program Counter +// lr == Link Register +// sp == Stack Pointer +// r12 == ip (scratch) +// r7 == Frame Pointer (thumb-style backtraces) +// r9 == May be reserved as Thread Register +// r11 == Frame Pointer (arm-style backtraces) +// r10 == Stack Limit +// +def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), + SP, LR, PC)> { + // Allocate LR as the first CSR since it is always saved anyway. + // For Thumb1 mode, we don't want to allocate hi regs at all, as we don't + // know how to spill them. If we make our prologue/epilogue code smarter at + // some point, we can go back to using the above allocation orders for the + // Thumb1 instructions that know how to use hi regs. 
+ let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+}
+
+// GPRs without the PC. Some ARM instructions do not allow the PC in
+// certain operand slots, particularly as the destination. Primarily
+// useful for disassembly.
+def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
+ let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+}
+
+// GPRs without the PC but with APSR. Some instructions allow accessing the
+// APSR, while actually encoding PC in the register field. This is useful
+// for assembly and disassembly only.
+def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), APSR_NZCV)> {
+ let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+}
+
+// GPRsp - Only the SP is legal. Used by Thumb1 instructions that want the
+// implied SP argument list.
+// FIXME: It would be better to not use this at all and refactor the
+// instructions to not have SP as an explicit argument. That makes
+// frame index resolution a bit trickier, though.
+def GPRsp : RegisterClass<"ARM", [i32], 32, (add SP)>;
+
+// Restricted GPR register class. Many Thumb2 instructions allow the full
+// register range for operands, but have undefined behaviour when SP
+// or PC (R13 or R15) is used. The ARM ISA refers to these operands
+// via the BadReg() pseudo-code description.
+def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
+ let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+}
+
+// Thumb registers are R0-R7 normally. Some instructions can still use
+// the general GPR register class above (e.g., MOV).
+def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)>;
+
+// The high registers in Thumb mode, R8-R15.
+def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>;
+
+// For tail calls, we can't use callee-saved registers, as they are restored
+// to the saved value before the tail call, which would clobber a call address.
+// Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of
+// this class and the preceding one(!) This is what we want.
+def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R12)> {
+ let AltOrders = [(and tcGPR, tGPR)];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+}
+
+// Condition code registers.
+def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+ let isAllocatable = 0;
+}
+
+// Scalar single precision floating point register class.
+// FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack
+// to avoid partial-write dependencies on D or Q (depending on platform)
+// registers (S registers are renamed as portions of D/Q registers).
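+// Concretely, with the first alternative order below, allocation tries
+// S0, S2, ..., S30 before any odd register: (add (decimate SPR, 2), SPR)
+// lists the even registers first, and later duplicate entries are ignored.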
+def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
+ let AltOrders = [(add (decimate SPR, 2), SPR),
+ (add (decimate SPR, 4),
+ (decimate SPR, 2),
+ (decimate (rotl SPR, 1), 4),
+ (decimate (rotl SPR, 1), 2))];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+ }];
+}
+
+// Subset of SPR which can be used as a source of NEON scalars for 16-bit
+// operations.
+def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>;
+
+// Scalar double precision floating point / generic 64-bit vector register
+// class.
+// ARM requires only word alignment for double. It performs better when
+// doubles are double-word aligned, though.
+def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+ (sequence "D%u", 0, 31)> {
+ // Allocate non-VFP2 registers D16-D31 first, and prefer even registers on
+ // Darwin platforms.
+ let AltOrders = [(rotl DPR, 16),
+ (add (decimate (rotl DPR, 16), 2), (rotl DPR, 16))];
+ let AltOrderSelect = [{
+ return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+ }];
+}
+
+// Subset of DPR that are accessible with VFP2 (and so that also have
+// 32-bit SPR subregs).
+def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+ (trunc DPR, 16)>;
+
+// Subset of DPR which can be used as a source of NEON scalars for 16-bit
+// operations.
+def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+ (trunc DPR, 8)>;
+
+// Generic 128-bit vector register class.
+def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128,
+ (sequence "Q%u", 0, 15)> {
+ // Allocate non-VFP2 aliases Q8-Q15 first.
+ let AltOrders = [(rotl QPR, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+
+// Subset of QPR that have 32-bit SPR subregs.
+def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (trunc QPR, 8)>;
+
+// Subset of QPR that have DPR_8 and SPR_8 subregs.
+def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (trunc QPR, 4)>;
+
+// Pseudo-registers representing odd-even pairs of D registers. The even-odd
+// pairs are already represented by the Q registers.
+// These are needed by NEON instructions requiring two consecutive D registers.
+// There is no D31_D0 register as that is always an UNPREDICTABLE encoding.
+def TuplesOE2D : RegisterTuples<[dsub_0, dsub_1],
+ [(decimate (shl DPR, 1), 2),
+ (decimate (shl DPR, 2), 2)]>;
+
+// Register class representing a pair of consecutive D registers.
+// Use the Q registers for the even-odd pairs.
+def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (interleave QPR, TuplesOE2D)> {
+ // Allocate starting at non-VFP2 registers D16-D31 first.
+ // Prefer even-odd pairs as they are easier to copy.
+ let AltOrders = [(add (rotl QPR, 8), (rotl DPair, 16))];
+ let AltOrderSelect = [{ return 1; }];
+}
+
+// Pseudo-registers representing even-odd pairs of GPRs, R0_R1 through R12_SP.
+// These are needed by instructions (e.g. ldrexd/strexd) requiring even-odd GPRs.
+def Tuples2R : RegisterTuples<[gsub_0, gsub_1],
+ [(add R0, R2, R4, R6, R8, R10, R12),
+ (add R1, R3, R5, R7, R9, R11, SP)]>;
+
+// Register class representing a pair of even-odd GPRs.
+def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2R)> {
+ let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
+}
+
+// Pseudo-registers representing 3 consecutive D registers.
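+// (Expanded, this yields D0_D1_D2, D1_D2_D3, ..., D29_D30_D31: the three
+// sub-lists are DPR shifted by 0, 1 and 2 registers and zipped element-wise,
+// so the shortest sub-list caps the count at 30 tuples.)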
+def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2],
+ [(shl DPR, 0),
+ (shl DPR, 1),
+ (shl DPR, 2)]>;
+
+// 3 consecutive D registers.
+def DTriple : RegisterClass<"ARM", [untyped], 64, (add Tuples3D)> {
+ let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+}
+
+// Pseudo 256-bit registers to represent pairs of Q registers. These should
+// never be present in the emitted code.
+// These are used for NEON load / store instructions, e.g., vld4, vst3.
+def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], [(shl QPR, 0), (shl QPR, 1)]>;
+
+// Pseudo 256-bit vector register class to model pairs of Q registers
+// (4 consecutive D registers).
+def QQPR : RegisterClass<"ARM", [v4i64], 256, (add Tuples2Q)> {
+ // Allocate non-VFP2 aliases first.
+ let AltOrders = [(rotl QQPR, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+
+// Tuples of 4 D regs that aren't also pairs of Q regs.
+def TuplesOE4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
+ [(decimate (shl DPR, 1), 2),
+ (decimate (shl DPR, 2), 2),
+ (decimate (shl DPR, 3), 2),
+ (decimate (shl DPR, 4), 2)]>;
+
+// 4 consecutive D registers.
+def DQuad : RegisterClass<"ARM", [v4i64], 256,
+ (interleave Tuples2Q, TuplesOE4D)>;
+
+// Pseudo 512-bit registers to represent four consecutive Q registers.
+def Tuples2QQ : RegisterTuples<[qqsub_0, qqsub_1],
+ [(shl QQPR, 0), (shl QQPR, 2)]>;
+
+// Pseudo 512-bit vector register class to model 4 consecutive Q registers
+// (8 consecutive D registers).
+def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (add Tuples2QQ)> {
+ // Allocate non-VFP2 aliases first.
+ let AltOrders = [(rotl QQQQPR, 8)];
+ let AltOrderSelect = [{ return 1; }];
+}
+
+
+// Pseudo-registers representing 2-spaced consecutive D registers.
+def Tuples2DSpc : RegisterTuples<[dsub_0, dsub_2],
+ [(shl DPR, 0),
+ (shl DPR, 2)]>;
+
+// Spaced pairs of D registers.
+def DPairSpc : RegisterClass<"ARM", [v2i64], 64, (add Tuples2DSpc)>;
+
+def Tuples3DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4],
+ [(shl DPR, 0),
+ (shl DPR, 2),
+ (shl DPR, 4)]>;
+
+// Spaced triples of D registers.
+def DTripleSpc : RegisterClass<"ARM", [untyped], 64, (add Tuples3DSpc)> {
+ let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+}
+
+def Tuples4DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4, dsub_6],
+ [(shl DPR, 0),
+ (shl DPR, 2),
+ (shl DPR, 4),
+ (shl DPR, 6)]>;
+
+// Spaced quads of D registers.
+def DQuadSpc : RegisterClass<"ARM", [v4i64], 64, (add Tuples3DSpc)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMSchedule.td b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
new file mode 100644
index 0000000..528c4ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
@@ -0,0 +1,354 @@
+//===-- ARMSchedule.td - ARM Scheduling Definitions --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Instruction scheduling annotations for out-of-order CPUs.
+// These annotations are independent of the itinerary class defined below.
+// Here we define the subtarget independent read/write per-operand resources.
+// The subtarget schedule definitions will then map these to the subtarget's
+// resource usages.
+// For example:
+// The instruction cycle timings table might contain an entry for an operation
+// like the following:
+// Rd <- ADD Rn, Rm, <shift> Rs
+// Uops | Latency from register | Uops - resource requirements - latency
+// 2 | Rn: 1 Rm: 4 Rs: 4 | uop T0, Rm, Rs - P01 - 3
+// | | uopc Rd, Rn, T0 - P01 - 1
+// This tells us that the result will be available in destination register
+// Rd a minimum of three cycles after the results in Rm and Rs are available
+// and one cycle after the result in Rn is available. The micro-ops can execute
+// on resource P01.
+// To model this, we need to express that we need to dispatch two micro-ops,
+// that the resource P01 is needed and that the latency to Rn is different from
+// the latency to Rm and Rs. The scheduler can decrease Rn's producer latency by
+// two.
+// We will do this by assigning (abstract) resources to register defs/uses.
+// ARMSchedule.td:
+// def WriteALUsr : SchedWrite;
+// def ReadAdvanceALUsr : SchedRead;
+//
+// ARMInstrInfo.td:
+// def ADDrs : I<>, Sched<[WriteALUsr, ReadAdvanceALUsr, ReadDefault,
+// ReadDefault]> { ...}
+// ReadAdvance read resources allow us to define "pipeline by-passes" or
+// shorter latencies to certain registers as needed in the example above.
+// The "ReadDefault" can be omitted.
+// Next, the subtarget td file assigns resources to the abstract resources
+// defined here.
+// ARMScheduleSubtarget.td:
+// // Resources.
+// def P01 : ProcResource<3>; // ALU unit (3 of them).
+// ...
+// // Resource usages.
+// def : WriteRes<WriteALUsr, [P01, P01]> {
+// Latency = 4; // Latency of 4.
+// NumMicroOps = 2; // Dispatch 2 micro-ops.
+// // The two instances of resource P01 are occupied for one cycle. It is one
+// // cycle because these resources happen to be pipelined.
+// ResourceCycles = [1, 1];
+// }
+// def : ReadAdvance<ReadAdvanceALUsr, 3>;
+
+// Basic ALU operation.
+def WriteALU : SchedWrite;
+def ReadALU : SchedRead;
+
+// Basic ALU with shifts.
+def WriteALUsi : SchedWrite; // Shift by immediate.
+def WriteALUsr : SchedWrite; // Shift by register.
+def WriteALUSsr : SchedWrite; // Shift by register (flag setting).
+def ReadALUsr : SchedRead; // Some operands are read later.
+
+// Compares.
+def WriteCMP : SchedWrite;
+def WriteCMPsi : SchedWrite;
+def WriteCMPsr : SchedWrite;
+
+// Division.
+def WriteDiv : SchedWrite;
+
+// Loads.
+def WriteLd : SchedWrite;
+def WritePreLd : SchedWrite;
+
+// Branches.
+def WriteBr : SchedWrite;
+def WriteBrL : SchedWrite;
+def WriteBrTbl : SchedWrite;
+
+// Fixed-point conversions.
+def WriteCvtFP : SchedWrite;
+
+// Noop.
+def WriteNoop : SchedWrite;
+
+// Define TII for use in SchedVariant Predicates.
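+// The prolog below makes TII available inside generated predicate code so
+// that subtarget models can test instruction properties. As an illustrative
+// sketch (the write names here are hypothetical, not defined in this file),
+// a subtarget could then select between writes with:
+// def A9WriteALUVar : SchedWriteVariant<[
+//   SchedVar<IsPredicatedPred, [A9WritePredALU]>,
+//   SchedVar<NoSchedPred,      [A9WriteALU]>]>;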
+def : PredicateProlog<[{ + const ARMBaseInstrInfo *TII = + static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo()); + (void)TII; +}]>; + +def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(MI)}]>; + +//===----------------------------------------------------------------------===// +// Instruction Itinerary classes used for ARM +// +def IIC_iALUx : InstrItinClass; +def IIC_iALUi : InstrItinClass; +def IIC_iALUr : InstrItinClass; +def IIC_iALUsi : InstrItinClass; +def IIC_iALUsir : InstrItinClass; +def IIC_iALUsr : InstrItinClass; +def IIC_iBITi : InstrItinClass; +def IIC_iBITr : InstrItinClass; +def IIC_iBITsi : InstrItinClass; +def IIC_iBITsr : InstrItinClass; +def IIC_iUNAr : InstrItinClass; +def IIC_iUNAsi : InstrItinClass; +def IIC_iEXTr : InstrItinClass; +def IIC_iEXTAr : InstrItinClass; +def IIC_iEXTAsr : InstrItinClass; +def IIC_iCMPi : InstrItinClass; +def IIC_iCMPr : InstrItinClass; +def IIC_iCMPsi : InstrItinClass; +def IIC_iCMPsr : InstrItinClass; +def IIC_iTSTi : InstrItinClass; +def IIC_iTSTr : InstrItinClass; +def IIC_iTSTsi : InstrItinClass; +def IIC_iTSTsr : InstrItinClass; +def IIC_iMOVi : InstrItinClass; +def IIC_iMOVr : InstrItinClass; +def IIC_iMOVsi : InstrItinClass; +def IIC_iMOVsr : InstrItinClass; +def IIC_iMOVix2 : InstrItinClass; +def IIC_iMOVix2addpc : InstrItinClass; +def IIC_iMOVix2ld : InstrItinClass; +def IIC_iMVNi : InstrItinClass; +def IIC_iMVNr : InstrItinClass; +def IIC_iMVNsi : InstrItinClass; +def IIC_iMVNsr : InstrItinClass; +def IIC_iCMOVi : InstrItinClass; +def IIC_iCMOVr : InstrItinClass; +def IIC_iCMOVsi : InstrItinClass; +def IIC_iCMOVsr : InstrItinClass; +def IIC_iCMOVix2 : InstrItinClass; +def IIC_iMUL16 : InstrItinClass; +def IIC_iMAC16 : InstrItinClass; +def IIC_iMUL32 : InstrItinClass; +def IIC_iMAC32 : InstrItinClass; +def IIC_iMUL64 : InstrItinClass; +def IIC_iMAC64 : InstrItinClass; +def IIC_iDIV : InstrItinClass; +def IIC_iLoad_i : InstrItinClass; +def IIC_iLoad_r : InstrItinClass; +def IIC_iLoad_si : InstrItinClass; +def IIC_iLoad_iu : InstrItinClass; +def IIC_iLoad_ru : InstrItinClass; +def IIC_iLoad_siu : InstrItinClass; +def IIC_iLoad_bh_i : InstrItinClass; +def IIC_iLoad_bh_r : InstrItinClass; +def IIC_iLoad_bh_si : InstrItinClass; +def IIC_iLoad_bh_iu : InstrItinClass; +def IIC_iLoad_bh_ru : InstrItinClass; +def IIC_iLoad_bh_siu : InstrItinClass; +def IIC_iLoad_d_i : InstrItinClass; +def IIC_iLoad_d_r : InstrItinClass; +def IIC_iLoad_d_ru : InstrItinClass; +def IIC_iLoad_m : InstrItinClass; +def IIC_iLoad_mu : InstrItinClass; +def IIC_iLoad_mBr : InstrItinClass; +def IIC_iPop : InstrItinClass; +def IIC_iPop_Br : InstrItinClass; +def IIC_iLoadiALU : InstrItinClass; +def IIC_iStore_i : InstrItinClass; +def IIC_iStore_r : InstrItinClass; +def IIC_iStore_si : InstrItinClass; +def IIC_iStore_iu : InstrItinClass; +def IIC_iStore_ru : InstrItinClass; +def IIC_iStore_siu : InstrItinClass; +def IIC_iStore_bh_i : InstrItinClass; +def IIC_iStore_bh_r : InstrItinClass; +def IIC_iStore_bh_si : InstrItinClass; +def IIC_iStore_bh_iu : InstrItinClass; +def IIC_iStore_bh_ru : InstrItinClass; +def IIC_iStore_bh_siu : InstrItinClass; +def IIC_iStore_d_i : InstrItinClass; +def IIC_iStore_d_r : InstrItinClass; +def IIC_iStore_d_ru : InstrItinClass; +def IIC_iStore_m : InstrItinClass; +def IIC_iStore_mu : InstrItinClass; +def IIC_Preload : InstrItinClass; +def IIC_Br : InstrItinClass; +def IIC_fpSTAT : InstrItinClass; +def IIC_fpUNA32 : InstrItinClass; +def IIC_fpUNA64 : InstrItinClass; +def IIC_fpCMP32 : InstrItinClass; +def IIC_fpCMP64 : 
InstrItinClass; +def IIC_fpCVTSD : InstrItinClass; +def IIC_fpCVTDS : InstrItinClass; +def IIC_fpCVTSH : InstrItinClass; +def IIC_fpCVTHS : InstrItinClass; +def IIC_fpCVTIS : InstrItinClass; +def IIC_fpCVTID : InstrItinClass; +def IIC_fpCVTSI : InstrItinClass; +def IIC_fpCVTDI : InstrItinClass; +def IIC_fpMOVIS : InstrItinClass; +def IIC_fpMOVID : InstrItinClass; +def IIC_fpMOVSI : InstrItinClass; +def IIC_fpMOVDI : InstrItinClass; +def IIC_fpALU32 : InstrItinClass; +def IIC_fpALU64 : InstrItinClass; +def IIC_fpMUL32 : InstrItinClass; +def IIC_fpMUL64 : InstrItinClass; +def IIC_fpMAC32 : InstrItinClass; +def IIC_fpMAC64 : InstrItinClass; +def IIC_fpFMAC32 : InstrItinClass; +def IIC_fpFMAC64 : InstrItinClass; +def IIC_fpDIV32 : InstrItinClass; +def IIC_fpDIV64 : InstrItinClass; +def IIC_fpSQRT32 : InstrItinClass; +def IIC_fpSQRT64 : InstrItinClass; +def IIC_fpLoad32 : InstrItinClass; +def IIC_fpLoad64 : InstrItinClass; +def IIC_fpLoad_m : InstrItinClass; +def IIC_fpLoad_mu : InstrItinClass; +def IIC_fpStore32 : InstrItinClass; +def IIC_fpStore64 : InstrItinClass; +def IIC_fpStore_m : InstrItinClass; +def IIC_fpStore_mu : InstrItinClass; +def IIC_VLD1 : InstrItinClass; +def IIC_VLD1x2 : InstrItinClass; +def IIC_VLD1x3 : InstrItinClass; +def IIC_VLD1x4 : InstrItinClass; +def IIC_VLD1u : InstrItinClass; +def IIC_VLD1x2u : InstrItinClass; +def IIC_VLD1x3u : InstrItinClass; +def IIC_VLD1x4u : InstrItinClass; +def IIC_VLD1ln : InstrItinClass; +def IIC_VLD1lnu : InstrItinClass; +def IIC_VLD1dup : InstrItinClass; +def IIC_VLD1dupu : InstrItinClass; +def IIC_VLD2 : InstrItinClass; +def IIC_VLD2x2 : InstrItinClass; +def IIC_VLD2u : InstrItinClass; +def IIC_VLD2x2u : InstrItinClass; +def IIC_VLD2ln : InstrItinClass; +def IIC_VLD2lnu : InstrItinClass; +def IIC_VLD2dup : InstrItinClass; +def IIC_VLD2dupu : InstrItinClass; +def IIC_VLD3 : InstrItinClass; +def IIC_VLD3ln : InstrItinClass; +def IIC_VLD3u : InstrItinClass; +def IIC_VLD3lnu : InstrItinClass; +def IIC_VLD3dup : InstrItinClass; +def IIC_VLD3dupu : InstrItinClass; +def IIC_VLD4 : InstrItinClass; +def IIC_VLD4ln : InstrItinClass; +def IIC_VLD4u : InstrItinClass; +def IIC_VLD4lnu : InstrItinClass; +def IIC_VLD4dup : InstrItinClass; +def IIC_VLD4dupu : InstrItinClass; +def IIC_VST1 : InstrItinClass; +def IIC_VST1x2 : InstrItinClass; +def IIC_VST1x3 : InstrItinClass; +def IIC_VST1x4 : InstrItinClass; +def IIC_VST1u : InstrItinClass; +def IIC_VST1x2u : InstrItinClass; +def IIC_VST1x3u : InstrItinClass; +def IIC_VST1x4u : InstrItinClass; +def IIC_VST1ln : InstrItinClass; +def IIC_VST1lnu : InstrItinClass; +def IIC_VST2 : InstrItinClass; +def IIC_VST2x2 : InstrItinClass; +def IIC_VST2u : InstrItinClass; +def IIC_VST2x2u : InstrItinClass; +def IIC_VST2ln : InstrItinClass; +def IIC_VST2lnu : InstrItinClass; +def IIC_VST3 : InstrItinClass; +def IIC_VST3u : InstrItinClass; +def IIC_VST3ln : InstrItinClass; +def IIC_VST3lnu : InstrItinClass; +def IIC_VST4 : InstrItinClass; +def IIC_VST4u : InstrItinClass; +def IIC_VST4ln : InstrItinClass; +def IIC_VST4lnu : InstrItinClass; +def IIC_VUNAD : InstrItinClass; +def IIC_VUNAQ : InstrItinClass; +def IIC_VBIND : InstrItinClass; +def IIC_VBINQ : InstrItinClass; +def IIC_VPBIND : InstrItinClass; +def IIC_VFMULD : InstrItinClass; +def IIC_VFMULQ : InstrItinClass; +def IIC_VMOV : InstrItinClass; +def IIC_VMOVImm : InstrItinClass; +def IIC_VMOVD : InstrItinClass; +def IIC_VMOVQ : InstrItinClass; +def IIC_VMOVIS : InstrItinClass; +def IIC_VMOVID : InstrItinClass; +def IIC_VMOVISL : InstrItinClass; +def IIC_VMOVSI : 
InstrItinClass; +def IIC_VMOVDI : InstrItinClass; +def IIC_VMOVN : InstrItinClass; +def IIC_VPERMD : InstrItinClass; +def IIC_VPERMQ : InstrItinClass; +def IIC_VPERMQ3 : InstrItinClass; +def IIC_VMACD : InstrItinClass; +def IIC_VMACQ : InstrItinClass; +def IIC_VFMACD : InstrItinClass; +def IIC_VFMACQ : InstrItinClass; +def IIC_VRECSD : InstrItinClass; +def IIC_VRECSQ : InstrItinClass; +def IIC_VCNTiD : InstrItinClass; +def IIC_VCNTiQ : InstrItinClass; +def IIC_VUNAiD : InstrItinClass; +def IIC_VUNAiQ : InstrItinClass; +def IIC_VQUNAiD : InstrItinClass; +def IIC_VQUNAiQ : InstrItinClass; +def IIC_VBINiD : InstrItinClass; +def IIC_VBINiQ : InstrItinClass; +def IIC_VSUBiD : InstrItinClass; +def IIC_VSUBiQ : InstrItinClass; +def IIC_VBINi4D : InstrItinClass; +def IIC_VBINi4Q : InstrItinClass; +def IIC_VSUBi4D : InstrItinClass; +def IIC_VSUBi4Q : InstrItinClass; +def IIC_VABAD : InstrItinClass; +def IIC_VABAQ : InstrItinClass; +def IIC_VSHLiD : InstrItinClass; +def IIC_VSHLiQ : InstrItinClass; +def IIC_VSHLi4D : InstrItinClass; +def IIC_VSHLi4Q : InstrItinClass; +def IIC_VPALiD : InstrItinClass; +def IIC_VPALiQ : InstrItinClass; +def IIC_VMULi16D : InstrItinClass; +def IIC_VMULi32D : InstrItinClass; +def IIC_VMULi16Q : InstrItinClass; +def IIC_VMULi32Q : InstrItinClass; +def IIC_VMACi16D : InstrItinClass; +def IIC_VMACi32D : InstrItinClass; +def IIC_VMACi16Q : InstrItinClass; +def IIC_VMACi32Q : InstrItinClass; +def IIC_VEXTD : InstrItinClass; +def IIC_VEXTQ : InstrItinClass; +def IIC_VTB1 : InstrItinClass; +def IIC_VTB2 : InstrItinClass; +def IIC_VTB3 : InstrItinClass; +def IIC_VTB4 : InstrItinClass; +def IIC_VTBX1 : InstrItinClass; +def IIC_VTBX2 : InstrItinClass; +def IIC_VTBX3 : InstrItinClass; +def IIC_VTBX4 : InstrItinClass; + +//===----------------------------------------------------------------------===// +// Processor instruction itineraries. + +include "ARMScheduleV6.td" +include "ARMScheduleA8.td" +include "ARMScheduleA9.td" +include "ARMScheduleSwift.td" diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td new file mode 100644 index 0000000..2c63825 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td @@ -0,0 +1,1075 @@ +//=- ARMScheduleA8.td - ARM Cortex-A8 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM Cortex A8 processors. +// +//===----------------------------------------------------------------------===// + +// +// Scheduling information derived from "Cortex-A8 Technical Reference Manual". +// Functional Units. 
+def A8_Pipe0 : FuncUnit; // pipeline 0 +def A8_Pipe1 : FuncUnit; // pipeline 1 +def A8_LSPipe : FuncUnit; // Load / store pipeline +def A8_NPipe : FuncUnit; // NEON ALU/MUL pipe +def A8_NLSPipe : FuncUnit; // NEON LS pipe +// +// Dual issue pipeline represented by A8_Pipe0 | A8_Pipe1 +// +def CortexA8Itineraries : ProcessorItineraries< + [A8_Pipe0, A8_Pipe1, A8_LSPipe, A8_NPipe, A8_NLSPipe], + [], [ + // Two fully-pipelined integer ALU pipelines + // + // No operand cycles + InstrItinData<IIC_iALUx , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>, + // + // Binary Instructions that produce a result + InstrItinData<IIC_iALUi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iALUr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>, + InstrItinData<IIC_iALUsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iALUsir,[InstrStage<1,[A8_Pipe0, A8_Pipe1]>], [2, 1, 2]>, + InstrItinData<IIC_iALUsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>, + // + // Bitwise Instructions that produce a result + InstrItinData<IIC_iBITi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iBITr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>, + InstrItinData<IIC_iBITsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iBITsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>, + // + // Unary Instructions that produce a result + InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iUNAsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + // + // Zero and sign extension instructions + InstrItinData<IIC_iEXTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iEXTAr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>],[2, 2, 1, 1]>, + // + // Compare instructions + InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iCMPsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMPsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + // + // Test instructions + InstrItinData<IIC_iTSTi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iTSTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iTSTsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iTSTsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + // + // Move instructions, unconditional + InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>, + InstrItinData<IIC_iMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3]>, + InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LSPipe]>], [5]>, + // + // Move instructions, conditional + InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + 
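  // An illustrative reading of these entries (generic itinerary semantics,
+  // not data from the TRM): in the IIC_iALUsr entry above,
+  //   InstrItinData<IIC_iALUsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>
+  // the InstrStage occupies either of the two ALU pipes for one cycle; in the
+  // operand-cycle list, the first entry (2) is the cycle at which the result
+  // becomes available, and the remaining entries (2, 1, 1) are the cycles at
+  // which the source operands are read.
+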
InstrItinData<IIC_iCMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + InstrItinData<IIC_iCMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3, 1]>, + // + // MVN instructions + InstrItinData<IIC_iMVNi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>, + InstrItinData<IIC_iMVNr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMVNsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMVNsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>, + + // Integer multiply pipeline + // Result written in E5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + // + InstrItinData<IIC_iMUL16 , [InstrStage<1, [A8_Pipe0]>], [5, 1, 1]>, + InstrItinData<IIC_iMAC16 , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, + InstrItinData<IIC_iMUL32 , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, + InstrItinData<IIC_iMUL64 , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, + + // Integer load pipeline + // + // Immediate offset + InstrItinData<IIC_iLoad_i , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + InstrItinData<IIC_iLoad_d_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + // + // Register offset + InstrItinData<IIC_iLoad_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + // + // Scaled register offset, issues over 2 cycles + // FIXME: lsl by 2 takes 1 cycle. + InstrItinData<IIC_iLoad_si , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>, + InstrItinData<IIC_iLoad_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iLoad_iu , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>, + InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>, + // + // Register offset with update + InstrItinData<IIC_iLoad_ru , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>, + InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>, + InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iLoad_siu , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>, + InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>, + // + // Load multiple, def is the 5th operand. Pipeline 0 only. + // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers. + InstrItinData<IIC_iLoad_m , [InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1, 1, 3], [], -1>, // dynamic uops + // + // Load multiple + update, defs are the 1st and 5th operands. 
+  InstrItinData<IIC_iLoad_mu , [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>],
+                               [2, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+                               [1, 2, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>],
+                               [1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+                               [1, 1, 3], [], -1>, // dynamic uops
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                InstrStage<1, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [4, 1]>,
+
+
+  // Integer store pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iStore_i ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iStore_si ,  [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>,
+  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iStore_siu,   [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                   InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                   InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>,
+  //
+  // Store multiple. Pipeline 0 only.
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+ InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_LSPipe]>], + [], [], -1>, // dynamic uops + // + // Store multiple + update + InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_LSPipe]>], + [2], [], -1>, // dynamic uops + // + // Preload + InstrItinData<IIC_Preload, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + + // Branch + // + // no delay slots, so the latency of a branch is unimportant + InstrItinData<IIC_Br , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>, + + // VFP + // Issue through integer pipeline, and execute in NEON unit. We assume + // RunFast mode so that NFP pipeline is used for single-precision when + // possible. + // + // FP Special Register to Integer Register File Move + InstrItinData<IIC_fpSTAT , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [20]>, + // + // Single-precision FP Unary + InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 1]>, + // + // Double-precision FP Unary + InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NPipe], 0>, + InstrStage<4, [A8_NLSPipe]>], [4, 1]>, + // + // Single-precision FP Compare + InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [1, 1]>, + // + // Double-precision FP Compare + InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NPipe], 0>, + InstrStage<4, [A8_NLSPipe]>], [4, 1]>, + // + // Single to Double FP Convert + InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<7, [A8_NPipe], 0>, + InstrStage<7, [A8_NLSPipe]>], [7, 1]>, + // + // Double to Single FP Convert + InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NPipe], 0>, + InstrStage<5, [A8_NLSPipe]>], [5, 1]>, + // + // Single-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 1]>, + // + // Double-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<8, [A8_NPipe], 0>, + InstrStage<8, [A8_NLSPipe]>], [8, 1]>, + // + // Integer to Single-Precision FP Convert + InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 1]>, + // + // Integer to Double-Precision FP Convert + InstrItinData<IIC_fpCVTID , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<8, [A8_NPipe], 0>, + InstrStage<8, [A8_NLSPipe]>], [8, 1]>, + // + // Single-precision FP ALU + InstrItinData<IIC_fpALU32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 1, 1]>, + // + // Double-precision FP ALU + InstrItinData<IIC_fpALU64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<9, [A8_NPipe], 0>, + InstrStage<9, [A8_NLSPipe]>], [9, 1, 1]>, + // + // Single-precision FP Multiply + InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 1, 1]>, + // + // Double-precision FP Multiply + InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<11, [A8_NPipe], 0>, + InstrStage<11, [A8_NLSPipe]>], [11, 1, 1]>, + // + // Single-precision FP MAC + InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>, + // + // Double-precision FP MAC + InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<19, [A8_NPipe], 0>, + 
InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>, + // + // Single-precision Fused FP MAC + InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>, + // + // Double-precision Fused FP MAC + InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<19, [A8_NPipe], 0>, + InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>, + // + // Single-precision FP DIV + InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<20, [A8_NPipe], 0>, + InstrStage<20, [A8_NLSPipe]>], [20, 1, 1]>, + // + // Double-precision FP DIV + InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<29, [A8_NPipe], 0>, + InstrStage<29, [A8_NLSPipe]>], [29, 1, 1]>, + // + // Single-precision FP SQRT + InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<19, [A8_NPipe], 0>, + InstrStage<19, [A8_NLSPipe]>], [19, 1]>, + // + // Double-precision FP SQRT + InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<29, [A8_NPipe], 0>, + InstrStage<29, [A8_NLSPipe]>], [29, 1]>, + + // + // Integer to Single-precision Move + InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], + [2, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_fpMOVID, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], + [2, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], + [20, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], + [20, 20, 1]>, + + // + // Single-precision FP Load + InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [2, 1]>, + // + // Double-precision FP Load + InstrItinData<IIC_fpLoad64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [2, 1]>, + // + // FP Load Multiple + // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers. 
+ InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [1, 1, 1, 2], [], -1>, // dynamic uops + // + // FP Load Multiple + update + InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [2, 1, 1, 1, 2], [], -1>, // dynamic uops + // + // Single-precision FP Store + InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [1, 1]>, + // + // Double-precision FP Store + InstrItinData<IIC_fpStore64,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [1, 1]>, + // + // FP Store Multiple + InstrItinData<IIC_fpStore_m,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [1, 1, 1, 1], [], -1>, // dynamic uops + // + // FP Store Multiple + update + InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [2, 1, 1, 1, 1], [], -1>, // dynamic uops + // NEON + // Issue through integer pipeline, and execute in NEON unit. + // + // VLD1 + InstrItinData<IIC_VLD1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1]>, + // VLD1x2 + InstrItinData<IIC_VLD1x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1]>, + // + // VLD1x3 + InstrItinData<IIC_VLD1x3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 1]>, + // + // VLD1x4 + InstrItinData<IIC_VLD1x4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 1]>, + // + // VLD1u + InstrItinData<IIC_VLD1u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1]>, + // + // VLD1x2u + InstrItinData<IIC_VLD1x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 2, 1]>, + // + // VLD1x3u + InstrItinData<IIC_VLD1x3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 2, 1]>, + // + // VLD1x4u + InstrItinData<IIC_VLD1x4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 2, 1]>, + // + // VLD1ln + InstrItinData<IIC_VLD1ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [3, 1, 1, 1]>, + // + // VLD1lnu + InstrItinData<IIC_VLD1lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [3, 2, 1, 1, 1, 1]>, + // + // VLD1dup + InstrItinData<IIC_VLD1dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1]>, + // + // VLD1dupu + InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1, 1]>, + // + // VLD2 + 
InstrItinData<IIC_VLD2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1]>, + // + // VLD2x2 + InstrItinData<IIC_VLD2x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 1]>, + // + // VLD2ln + InstrItinData<IIC_VLD2ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [3, 3, 1, 1, 1, 1]>, + // + // VLD2u + InstrItinData<IIC_VLD2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 2, 1, 1, 1]>, + // + // VLD2x2u + InstrItinData<IIC_VLD2x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 2, 1]>, + // + // VLD2lnu + InstrItinData<IIC_VLD2lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [3, 3, 2, 1, 1, 1, 1, 1]>, + // + // VLD2dup + InstrItinData<IIC_VLD2dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1]>, + // + // VLD2dupu + InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 2, 1, 1]>, + // + // VLD3 + InstrItinData<IIC_VLD3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 1]>, + // + // VLD3ln + InstrItinData<IIC_VLD3ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NLSPipe], 0>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 1, 1, 1, 1, 2]>, + // + // VLD3u + InstrItinData<IIC_VLD3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 2, 1]>, + // + // VLD3lnu + InstrItinData<IIC_VLD3lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NLSPipe], 0>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 2, 1, 1, 1, 1, 1, 2]>, + // + // VLD3dup + InstrItinData<IIC_VLD3dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 1]>, + // + // VLD3dupu + InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 2, 1, 1]>, + // + // VLD4 + InstrItinData<IIC_VLD4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 4, 1]>, + // + // VLD4ln + InstrItinData<IIC_VLD4ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NLSPipe], 0>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>, + // + // VLD4u + InstrItinData<IIC_VLD4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 4, 2, 1]>, + // + // VLD4lnu + InstrItinData<IIC_VLD4lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NLSPipe], 0>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VLD4dup + InstrItinData<IIC_VLD4dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 1]>, + // + // VLD4dupu + InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 2, 1, 1]>, + // + // VST1 + InstrItinData<IIC_VST1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + 
InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1]>, + // + // VST1x2 + InstrItinData<IIC_VST1x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1, 1]>, + // + // VST1x3 + InstrItinData<IIC_VST1x3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 1, 1, 2]>, + // + // VST1x4 + InstrItinData<IIC_VST1x4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST1u + InstrItinData<IIC_VST1u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1]>, + // + // VST1x2u + InstrItinData<IIC_VST1x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST1x3u + InstrItinData<IIC_VST1x3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST1x4u + InstrItinData<IIC_VST1x4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST1ln + InstrItinData<IIC_VST1ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1]>, + // + // VST1lnu + InstrItinData<IIC_VST1lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1]>, + // + // VST2 + InstrItinData<IIC_VST2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1, 1]>, + // + // VST2x2 + InstrItinData<IIC_VST2x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST2u + InstrItinData<IIC_VST2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST2x2u + InstrItinData<IIC_VST2x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST2ln + InstrItinData<IIC_VST2ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1, 1]>, + // + // VST2lnu + InstrItinData<IIC_VST2lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST3 + InstrItinData<IIC_VST3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 1, 1, 2]>, + // + // VST3u + InstrItinData<IIC_VST3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST3ln + InstrItinData<IIC_VST3ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 1, 1, 2]>, + // + // VST3lnu + InstrItinData<IIC_VST3lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST4 + InstrItinData<IIC_VST4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4u + 
InstrItinData<IIC_VST4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST4ln + InstrItinData<IIC_VST4ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4lnu + InstrItinData<IIC_VST4lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // Double-register FP Unary + InstrItinData<IIC_VUNAD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [5, 2]>, + // + // Quad-register FP Unary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + InstrItinData<IIC_VUNAQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [6, 2]>, + // + // Double-register FP Binary + InstrItinData<IIC_VBIND, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [5, 2, 2]>, + // + // VPADD, etc. + InstrItinData<IIC_VPBIND, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [5, 2, 2]>, + // + // Double-register FP VMUL + InstrItinData<IIC_VFMULD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [5, 2, 1]>, + + // + // Quad-register FP Binary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + InstrItinData<IIC_VBINQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [6, 2, 2]>, + // + // Quad-register FP VMUL + InstrItinData<IIC_VFMULQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [6, 2, 1]>, + // + // Move + InstrItinData<IIC_VMOV, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [1, 1]>, + // + // Move Immediate + InstrItinData<IIC_VMOVImm, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [3]>, + // + // Double-register Permute Move + InstrItinData<IIC_VMOVD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [2, 1]>, + // + // Quad-register Permute Move + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 3 for those cases + InstrItinData<IIC_VMOVQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe]>], [3, 1]>, + // + // Integer to Single-precision Move + InstrItinData<IIC_VMOVIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [2, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_VMOVID , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_VMOVSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [20, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_VMOVDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [20, 20, 1]>, + // + // Integer to Lane Move + InstrItinData<IIC_VMOVISL , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>, + // + // Vector narrow move + InstrItinData<IIC_VMOVN , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [2, 1]>, + // + // Double-register Permute + InstrItinData<IIC_VPERMD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [2, 2, 1, 1]>, + // + // Quad-register Permute + // Result written in N2, but that is relative to the last cycle 
of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 3, 1, 1]>,
+  //
+  // Quad-register Permute (3 cycle issue)
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [4, 4, 1, 1]>,
+  //
+  // Double-register FP Multiple-Accumulate
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+  //
+  // Quad-register FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+  //
+  // Double-register Fused FP Multiple-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+  //
+  // Quad-register Fused FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+  //
+  // Double-register Reciprocal Step
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 2, 2]>,
+  //
+  // Quad-register Reciprocal Step
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 2, 2]>,
+  //
+  // Double-register Integer Count
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+  //
+  // Quad-register Integer Count
+  // Result written in N3, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [4, 2, 2]>,
+  //
+  // Double-register Integer Unary
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2]>,
+  //
+  // Quad-register Integer Unary
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2]>,
+  //
+  // Double-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 1]>,
+  //
+  // Quad-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 1]>,
+  //
+  // Double-register Integer Binary
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+  //
+  // Quad-register Integer Binary
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+  //
+  // Double-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+  //
+  // Quad-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 1]>,
+  //
+  // Double-register Integer Subtract (4 cycle)
+  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+  //
+  // Quad-register Integer Subtract (4 cycle)
+  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+  //
+  // Double-register Integer Shift
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 1, 1]>,
+  //
+  // Quad-register Integer Shift
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [4, 1, 1]>,
+  //
+  // Double-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 1, 1]>,
+  //
+  // Quad-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [5, 1, 1]>,
+  //
+  // Double-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 3, 1]>,
+  //
+  // Quad-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 3, 1]>,
+  //
+  // Double-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 3, 2, 1]>,
+  //
+  // Quad-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [6, 3, 2, 1]>,
+
+  //
+  // Double-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 2, 2]>,
+  //
+  // Double-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 2, 1]>,
+  //
+  // Quad-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 2, 2]>,
+  //
+  // Quad-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_NPipe]>], [9, 2, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 3, 2, 2]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 3, 2, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 3, 2, 2]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_NPipe]>], [9, 3, 2, 1]>,
+  //
+  // Double-register VEXT
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>,
+  //
+  // Quad-register VEXT
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>,
+  //
+  // VTB
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 2, 2, 1]>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>],[4, 2, 2, 3, 3, 1]>,
+  //
+  // VTBX
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 1]>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 2, 1]>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>],[4, 1, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simple machine model which
+// will replace itineraries.
+
+// Cortex-A8 machine model for scheduling and other instruction cost heuristics.
+def CortexA8Model : SchedMachineModel {
+  let IssueWidth = 2;  // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 2; // Optimistic load latency assuming bypass.
+                       // This is overridden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 13; // Based on estimate of pipeline depth.
+
+  let Itineraries = CortexA8Itineraries;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
new file mode 100644
index 0000000..9a1d222
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
@@ -0,0 +1,2529 @@
+//=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A9 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
+//
+// Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical
+// Reference Manual".
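+//
+// A sketch of how these definitions are consumed (illustrative; the actual
+// processor definitions live in ARM.td, not in this file): a CPU is bound to
+// a SchedMachineModel, and through it to the itineraries, roughly as
+//   def : ProcessorModel<"cortex-a9", CortexA9Model, [ProcA9, ...]>;
+// where CortexA9Model, defined later in this file, sets
+// "let Itineraries = CortexA9Itineraries;" just as CortexA8Model does above.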
+// +// Functional units +def A9_Issue0 : FuncUnit; // Issue 0 +def A9_Issue1 : FuncUnit; // Issue 1 +def A9_Branch : FuncUnit; // Branch +def A9_ALU0 : FuncUnit; // ALU / MUL pipeline 0 +def A9_ALU1 : FuncUnit; // ALU pipeline 1 +def A9_AGU : FuncUnit; // Address generation unit for ld / st +def A9_NPipe : FuncUnit; // NEON pipeline +def A9_MUX0 : FuncUnit; // AGU + NEON/FPU multiplexer +def A9_LSUnit : FuncUnit; // L/S Unit +def A9_DRegsVFP: FuncUnit; // FP register set, VFP side +def A9_DRegsN : FuncUnit; // FP register set, NEON side + +// Bypasses +def A9_LdBypass : Bypass; + +def CortexA9Itineraries : ProcessorItineraries< + [A9_Issue0, A9_Issue1, A9_Branch, A9_ALU0, A9_ALU1, A9_AGU, A9_NPipe, A9_MUX0, + A9_LSUnit, A9_DRegsVFP, A9_DRegsN], + [A9_LdBypass], [ + // Two fully-pipelined integer ALU pipelines + + // + // Move instructions, unconditional + InstrItinData<IIC_iMOVi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>, + InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>, + InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>, + InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [5]>, + // + // MVN instructions + InstrItinData<IIC_iMVNi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1]>, + InstrItinData<IIC_iMVNr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1], [NoBypass, A9_LdBypass]>, + InstrItinData<IIC_iMVNsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [2, 1]>, + InstrItinData<IIC_iMVNsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], + [3, 1, 1]>, + // + // No operand cycles + InstrItinData<IIC_iALUx , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>]>, + // + // Binary Instructions that produce a result + InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1], [NoBypass, A9_LdBypass]>, + InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>, + InstrItinData<IIC_iALUsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>, + InstrItinData<IIC_iALUsir,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>, + InstrItinData<IIC_iALUsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], + [3, 1, 1, 1], + [NoBypass, A9_LdBypass, NoBypass, NoBypass]>, + // + // Bitwise Instructions that produce a result + InstrItinData<IIC_iBITi , [InstrStage<1, 
[A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>, + InstrItinData<IIC_iBITsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>, + InstrItinData<IIC_iBITsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>, + // + // Unary Instructions that produce a result + + // CLZ, RBIT, etc. + InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + + // BFC, BFI, UBFX, SBFX + InstrItinData<IIC_iUNAsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1]>, + + // + // Zero and sign extension instructions + InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [2, 1]>, + InstrItinData<IIC_iEXTAr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [3, 1, 1]>, + InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>, + // + // Compare instructions + InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1], [A9_LdBypass]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1], [A9_LdBypass, A9_LdBypass]>, + InstrItinData<IIC_iCMPsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [1, 1], [A9_LdBypass, NoBypass]>, + InstrItinData<IIC_iCMPsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], + [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>, + // + // Test instructions + InstrItinData<IIC_iTSTi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>, + InstrItinData<IIC_iTSTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iTSTsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iTSTsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>, + // + // Move instructions, conditional + // FIXME: Correctly model the extra input dep on the destination. 
+ InstrItinData<IIC_iCMOVi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>, + InstrItinData<IIC_iCMOVix2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>, + + // Integer multiply pipeline + // + InstrItinData<IIC_iMUL16 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0]>], [3, 1, 1]>, + InstrItinData<IIC_iMAC16 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0]>], + [3, 1, 1, 1]>, + InstrItinData<IIC_iMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0]>], [4, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0]>], + [4, 1, 1, 1]>, + InstrItinData<IIC_iMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0]>], [4, 5, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0]>], + [4, 5, 1, 1]>, + // Integer load pipeline + // FIXME: The timings are some rough approximations + // + // Immediate offset + InstrItinData<IIC_iLoad_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 1], [A9_LdBypass]>, + // FIXME: If address is 64-bit aligned, AGU cycles is 1. 
+ InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 3, 1], [A9_LdBypass]>, + // + // Register offset + InstrItinData<IIC_iLoad_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 3, 1, 1], [A9_LdBypass]>, + // + // Scaled register offset + InstrItinData<IIC_iLoad_si , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit], 0>], + [4, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [5, 1, 1], [A9_LdBypass]>, + // + // Immediate offset with update + InstrItinData<IIC_iLoad_iu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 2, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 3, 1], [A9_LdBypass]>, + // + // Register offset with update + InstrItinData<IIC_iLoad_ru , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 2, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 3, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 3, 1, 1], [A9_LdBypass]>, + // + // Scaled register offset with update + InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 3, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [5, 4, 1, 1], [A9_LdBypass]>, + // + // Load multiple, def is the 5th operand. + // FIXME: This assumes 3 to 4 registers. + InstrItinData<IIC_iLoad_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops + // + // Load multiple + update, defs are the 1st and 5th operands. 
+ InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops + // + // Load multiple plus branch + InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>, + InstrStage<1, [A9_Branch]>], + [1, 2, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops + // + // Pop, def is the 3rd operand. + InstrItinData<IIC_iPop , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 3], + [NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops + // + // Pop + branch, def is the 3rd operand. + InstrItinData<IIC_iPop_Br, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>, + InstrStage<1, [A9_Branch]>], + [1, 1, 3], + [NoBypass, NoBypass, A9_LdBypass], + -1>, // dynamic uops + // + // iLoadi + iALUr for t2LDRpci_pic. + InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [2, 1]>, + + // Integer store pipeline + /// + // Immediate offset + InstrItinData<IIC_iStore_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1]>, + InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1]>, + // FIXME: If address is 64-bit aligned, AGU cycles is 1. 
+  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  //
+  // Scaled register offset
+  InstrItinData<IIC_iStore_si ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>], [3, 1, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [2, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [3, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [3, 1, 1, 1]>,
+  //
+  // Scaled register offset with update
+  InstrItinData<IIC_iStore_siu,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                    InstrStage<1, [A9_MUX0], 0>,
+                                    InstrStage<1, [A9_AGU], 0>,
+                                    InstrStage<1, [A9_LSUnit]>],
+                                   [2, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                    InstrStage<1, [A9_MUX0], 0>,
+                                    InstrStage<2, [A9_AGU], 1>,
+                                    InstrStage<1, [A9_LSUnit]>],
+                                   [3, 1, 1, 1]>,
+  //
+  // Store multiple
+  InstrItinData<IIC_iStore_m ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<2, [A9_LSUnit]>],
+                                [], [], -1>, // dynamic uops
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<2, [A9_LSUnit]>],
+                                [2], [], -1>, // dynamic uops
+  //
+  // Preload
+  InstrItinData<IIC_Preload,    [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>,
+
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br , [InstrStage<1, [A9_Issue0], 0>,
+                          InstrStage<1, [A9_Issue1], 0>,
+                          InstrStage<1, [A9_Branch]>]>,
+
+  // VFP and NEON share the same register file. This means that every VFP
+  // instruction should wait for full completion of the consecutive NEON
+  // instruction and vice-versa. We model this behavior with two artificial FUs:
+  // DRegsVFP and DRegsN.
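+  // (A concrete instance of this pattern appears in the IIC_fpSTAT entry
+  // below: InstrStage<1, [A9_DRegsVFP], 0, Required> acquires the VFP side
+  // for one cycle, while InstrStage<2, [A9_DRegsN], 0, Reserved> keeps the
+  // NEON side busy until writeback, so a NEON instruction issued in that
+  // window stalls. This reading of the Required/Reserved stage kinds is
+  // illustrative commentary, not TRM data.)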
+ // + // Every VFP instruction: + // - Acquires DRegsVFP resource for 1 cycle + // - Reserves DRegsN resource for the whole duration (including time to + // register file writeback!). + // Every NEON instruction does the same but with FUs swapped. + // + // Since the reserved FU cannot be acquired, this models precisely + // "cross-domain" stalls. + + // VFP + // Issue through integer pipeline, and execute in NEON unit. + + // FP Special Register to Integer Register File Move + InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1]>, + // + // Single-precision FP Unary + InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra latency cycles since wbck is 2 cycles + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, + // + // Double-precision FP Unary + InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra latency cycles since wbck is 2 cycles + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, + + // + // Single-precision FP Compare + InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra latency cycles since wbck is 4 cycles + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, + // + // Double-precision FP Compare + InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra latency cycles since wbck is 4 cycles + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, + // + // Single to Double FP Convert + InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Double to Single FP Convert + InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + + // + // Single to Half FP Convert + InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Half to Single FP Convert + InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, + + // + // Single-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Double-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + 
InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Integer to Single-Precision FP Convert + InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Integer to Double-Precision FP Convert + InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Single-precision FP ALU + InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, + // + // Double-precision FP ALU + InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, + // + // Single-precision FP Multiply + InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<6, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [5, 1, 1]>, + // + // Double-precision FP Multiply + InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<7, [A9_DRegsN], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 1, 1]>, + // + // Single-precision FP MAC + InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<9, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [8, 1, 1, 1]>, + // + // Double-precision FP MAC + InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<10, [A9_DRegsN], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [9, 1, 1, 1]>, + // + // Single-precision Fused FP MAC + InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<9, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [8, 1, 1, 1]>, + // + // Double-precision Fused FP MAC + InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<10, [A9_DRegsN], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [9, 1, 1, 1]>, + // + // Single-precision FP DIV + InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<16, [A9_DRegsN], 0, Reserved>, + InstrStage<10, [A9_NPipe]>], + [15, 1, 1]>, + // + // Double-precision FP DIV + InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<26, [A9_DRegsN], 0, Reserved>, + InstrStage<20, [A9_NPipe]>], + [25, 1, 1]>, + // + // Single-precision FP SQRT + InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, 
[A9_DRegsVFP], 0, Required>, + InstrStage<18, [A9_DRegsN], 0, Reserved>, + InstrStage<13, [A9_NPipe]>], + [17, 1]>, + // + // Double-precision FP SQRT + InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<33, [A9_DRegsN], 0, Reserved>, + InstrStage<28, [A9_NPipe]>], + [32, 1]>, + + // + // Integer to Single-precision Move + InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra 1 latency cycle since wbck is 2 cycles + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_fpMOVID, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra 1 latency cycle since wbck is 2 cycles + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1, 1]>, + // + // Single-precision to Integer Move + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. + InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>], + [2, 1]>, + // + // Double-precision to Integer Move + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. + InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>], + [2, 1, 1]>, + // + // Single-precision FP Load + InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, + // + // Double-precision FP Load + // FIXME: Result latency is 1 if address is 64-bit aligned. + InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1]>, + // + // FP Load Multiple + // FIXME: assumes 2 doubles which requires 2 LS cycles. + InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1], [], -1>, // dynamic uops + // + // FP Load Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. 
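+  // (For example, a VLDM moving four D registers would presumably occupy
+  //  A9_LSUnit for about four cycles rather than the two modeled here; the
+  //  trailing -1 micro-op counts on these load/store-multiple entries
+  //  similarly defer the uop count to the instruction's register list at
+  //  schedule time.)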
+ InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1], [], -1>, // dynamic uops + // + // Single-precision FP Store + InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, + // + // Double-precision FP Store + InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, + // + // FP Store Multiple + // FIXME: assumes 2 doubles which requires 2 LS cycles. + InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1], [], -1>, // dynamic uops + // + // FP Store Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. + InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1], [], -1>, // dynamic uops + // NEON + // VLD1 + InstrItinData<IIC_VLD1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, + // VLD1x2 + InstrItinData<IIC_VLD1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, + // VLD1x3 + InstrItinData<IIC_VLD1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 1]>, + // VLD1x4 + InstrItinData<IIC_VLD1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, + // VLD1u + InstrItinData<IIC_VLD1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 2, 1]>, + // VLD1x2u + InstrItinData<IIC_VLD1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 2, 1]>, + // VLD1x3u + InstrItinData<IIC_VLD1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 
0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, + // VLD1x4u + InstrItinData<IIC_VLD1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 2, 1]>, + // + // VLD1ln + InstrItinData<IIC_VLD1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 1, 1, 1]>, + // + // VLD1lnu + InstrItinData<IIC_VLD1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 2, 1, 1, 1, 1]>, + // + // VLD1dup + InstrItinData<IIC_VLD1dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1]>, + // + // VLD1dupu + InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1, 1]>, + // + // VLD2 + InstrItinData<IIC_VLD2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, + // + // VLD2x2 + InstrItinData<IIC_VLD2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 1]>, + // + // VLD2ln + InstrItinData<IIC_VLD2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 1, 1, 1, 1]>, + // + // VLD2u + InstrItinData<IIC_VLD2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1, 1]>, + // + // VLD2x2u + InstrItinData<IIC_VLD2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 2, 1]>, + // + // VLD2lnu + InstrItinData<IIC_VLD2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 2, 1, 1, 1, 1, 1]>, + // + // VLD2dup + InstrItinData<IIC_VLD2dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, 
+ InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, + // + // VLD2dupu + InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1]>, + // + // VLD3 + InstrItinData<IIC_VLD3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, + // + // VLD3ln + InstrItinData<IIC_VLD3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<11,[A9_DRegsVFP], 0, Reserved>, + InstrStage<5, [A9_NPipe], 0>, + InstrStage<5, [A9_LSUnit]>], + [5, 5, 6, 1, 1, 1, 1, 2]>, + // + // VLD3u + InstrItinData<IIC_VLD3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1]>, + // + // VLD3lnu + InstrItinData<IIC_VLD3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<11,[A9_DRegsVFP], 0, Reserved>, + InstrStage<5, [A9_NPipe], 0>, + InstrStage<5, [A9_LSUnit]>], + [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>, + // + // VLD3dup + InstrItinData<IIC_VLD3dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, + // + // VLD3dupu + InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1, 1]>, + // + // VLD4 + InstrItinData<IIC_VLD4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 1]>, + // + // VLD4ln + InstrItinData<IIC_VLD4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>, + // + // VLD4u + InstrItinData<IIC_VLD4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 2, 1]>, + // + // VLD4lnu + InstrItinData<IIC_VLD4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VLD4dup + InstrItinData<IIC_VLD4dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, 
[A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 1]>, + // + // VLD4dupu + InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 2, 1, 1]>, + // + // VST1 + InstrItinData<IIC_VST1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, + // + // VST1x2 + InstrItinData<IIC_VST1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1, 1]>, + // + // VST1x3 + InstrItinData<IIC_VST1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2]>, + // + // VST1x4 + InstrItinData<IIC_VST1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST1u + InstrItinData<IIC_VST1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1]>, + // + // VST1x2u + InstrItinData<IIC_VST1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST1x3u + InstrItinData<IIC_VST1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST1x4u + InstrItinData<IIC_VST1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST1ln + InstrItinData<IIC_VST1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, + // + // VST1lnu + InstrItinData<IIC_VST1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1]>, + // + // VST2 + InstrItinData<IIC_VST2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, 
+ InstrStage<1, [A9_LSUnit]>], + [1, 1, 1, 1]>, + // + // VST2x2 + InstrItinData<IIC_VST2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST2u + InstrItinData<IIC_VST2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST2x2u + InstrItinData<IIC_VST2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST2ln + InstrItinData<IIC_VST2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1, 1]>, + // + // VST2lnu + InstrItinData<IIC_VST2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST3 + InstrItinData<IIC_VST3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2]>, + // + // VST3u + InstrItinData<IIC_VST3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST3ln + InstrItinData<IIC_VST3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2]>, + // + // VST3lnu + InstrItinData<IIC_VST3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST4 + InstrItinData<IIC_VST4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4u + InstrItinData<IIC_VST4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST4ln + InstrItinData<IIC_VST4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 
2, 2]>, + // + // VST4lnu + InstrItinData<IIC_VST4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + + // + // Double-register Integer Unary + InstrItinData<IIC_VUNAiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 2]>, + // + // Quad-register Integer Unary + InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 2]>, + // + // Double-register Integer Q-Unary + InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Quad-register Integer CountQ-Unary + InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Double-register Integer Binary + InstrItinData<IIC_VBINiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 2]>, + // + // Quad-register Integer Binary + InstrItinData<IIC_VBINiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 2]>, + // + // Double-register Integer Subtract + InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 1]>, + // + // Quad-register Integer Subtract + InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 1]>, + // + // Double-register Integer Shift + InstrItinData<IIC_VSHLiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 1, 1]>, + // + // Quad-register Integer Shift + InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 1, 1]>, + // + // 
Double-register Integer Shift (4 cycle) + InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, + // + // Quad-register Integer Shift (4 cycle) + InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, + // + // Double-register Integer Binary (4 cycle) + InstrItinData<IIC_VBINi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 2]>, + // + // Quad-register Integer Binary (4 cycle) + InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 2]>, + // + // Double-register Integer Subtract (4 cycle) + InstrItinData<IIC_VSUBi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 1]>, + // + // Quad-register Integer Subtract (4 cycle) + InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 1]>, + + // + // Double-register Integer Count + InstrItinData<IIC_VCNTiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 2]>, + // + // Quad-register Integer Count + // Result written in N3, but that is relative to the last cycle of multicycle, + // so we use 4 for those cases + InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [4, 2, 2]>, + // + // Double-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [6, 3, 2, 1]>, + // + // Quad-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 2, 1]>, + // + // Double-register Integer Pair Add Long + InstrItinData<IIC_VPALiD, [InstrStage<1, [A9_Issue0, 
A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [6, 3, 1]>, + // + // Quad-register Integer Pair Add Long + InstrItinData<IIC_VPALiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 1]>, + + // + // Double-register Integer Multiply (.8, .16) + InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [6, 2, 2]>, + // + // Quad-register Integer Multiply (.8, .16) + InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [7, 2, 2]>, + + // + // Double-register Integer Multiply (.32) + InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [7, 2, 1]>, + // + // Quad-register Integer Multiply (.32) + InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe]>], + [9, 2, 1]>, + // + // Double-register Integer Multiply-Accumulate (.8, .16) + InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [6, 3, 2, 2]>, + // + // Double-register Integer Multiply-Accumulate (.32) + InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [7, 3, 2, 1]>, + // + // Quad-register Integer Multiply-Accumulate (.8, .16) + InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [7, 3, 2, 2]>, + // + // Quad-register Integer Multiply-Accumulate (.32) + InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe]>], + [9, 3, 2, 1]>, + + // + // Move + InstrItinData<IIC_VMOV, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1,1]>, + // + // 
Move Immediate + InstrItinData<IIC_VMOVImm, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3]>, + // + // Double-register Permute Move + InstrItinData<IIC_VMOVD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, + // + // Quad-register Permute Move + InstrItinData<IIC_VMOVQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, + // + // Integer to Single-precision Move + InstrItinData<IIC_VMOVIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_VMOVID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_VMOVSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_VMOVDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 2, 1]>, + // + // Integer to Lane Move + InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<4, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [3, 1, 1]>, + + // + // Vector narrow move + InstrItinData<IIC_VMOVN, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 1]>, + // + // Double-register FP Unary + InstrItinData<IIC_VUNAD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [5, 2]>, + // + // Quad-register FP Unary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + InstrItinData<IIC_VUNAQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 2]>, + // + // Double-register FP Binary + // FIXME: We're using this itin for many instructions and [2, 2] here is too + // optimistic. 
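+  // (A note on reading the operand-cycle lists: the first entry is the cycle
+  //  at which the def is written, and later entries are the cycles at which
+  //  the uses are read. The [5, 2, 2] below therefore says the result is
+  //  available at cycle 5 with both sources read at cycle 2 -- the "[2, 2]"
+  //  the FIXME above calls too optimistic.)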
+  InstrItinData<IIC_VBIND, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                            InstrStage<1, [A9_MUX0], 0>,
+                            InstrStage<1, [A9_DRegsN],   0, Required>,
+                            // Extra latency cycles since wbck is 6 cycles
+                            InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                            InstrStage<1, [A9_NPipe]>],
+                           [5, 2, 2]>,
+
+  //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_MUX0], 0>,
+                             InstrStage<1, [A9_DRegsN],   0, Required>,
+                             // Extra latency cycles since wbck is 6 cycles
+                             InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                             InstrStage<1, [A9_NPipe]>],
+                            [5, 1, 1]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_MUX0], 0>,
+                             InstrStage<1, [A9_DRegsN],   0, Required>,
+                             // Extra latency cycles since wbck is 6 cycles
+                             InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                             InstrStage<1, [A9_NPipe]>],
+                            [5, 2, 1]>,
+  //
+  // Quad-register FP Binary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  // FIXME: We're using this itin for many instructions and [2, 2] here is too
+  // optimistic.
+  InstrItinData<IIC_VBINQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                            InstrStage<1, [A9_MUX0], 0>,
+                            InstrStage<1, [A9_DRegsN],   0, Required>,
+                            // Extra latency cycles since wbck is 7 cycles
+                            InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                            InstrStage<2, [A9_NPipe]>],
+                           [6, 2, 2]>,
+  //
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_MUX0], 0>,
+                             InstrStage<1, [A9_DRegsN],   0, Required>,
+                             // Extra latency cycles since wbck is 7 cycles
+                             InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                             InstrStage<1, [A9_NPipe]>],
+                            [6, 2, 1]>,
+  //
+  // Double-register FP Multiply-Accumulate
+  InstrItinData<IIC_VMACD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                            InstrStage<1, [A9_MUX0], 0>,
+                            InstrStage<1, [A9_DRegsN],   0, Required>,
+                            // Extra latency cycles since wbck is 7 cycles
+                            InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                            InstrStage<2, [A9_NPipe]>],
+                           [6, 3, 2, 1]>,
+  //
+  // Quad-register FP Multiply-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VMACQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                            InstrStage<1, [A9_MUX0], 0>,
+                            InstrStage<1, [A9_DRegsN],   0, Required>,
+                            // Extra latency cycles since wbck is 9 cycles
+                            InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                            InstrStage<4, [A9_NPipe]>],
+                           [8, 4, 2, 1]>,
+  //
+  // Double-register Fused FP Multiply-Accumulate
+  InstrItinData<IIC_VFMACD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_MUX0], 0>,
+                             InstrStage<1, [A9_DRegsN],   0, Required>,
+                             // Extra latency cycles since wbck is 7 cycles
+                             InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                             InstrStage<2, [A9_NPipe]>],
+                            [6, 3, 2, 1]>,
+  //
+  // Quad-register Fused FP Multiply-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_MUX0], 0>,
+                             InstrStage<1, [A9_DRegsN],   0, Required>,
+                             // Extra latency cycles since wbck is 9 cycles
+                             InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                             InstrStage<4, [A9_NPipe]>],
+                            [8, 4, 2, 1]>,
+  //
+  // Double-register Reciprocal Step
+  InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_MUX0], 0>,
+                             InstrStage<1, [A9_DRegsN],   0, Required>,
+                             // Extra latency cycles since wbck is 10 cycles
+                             InstrStage<11, [A9_DRegsVFP], 0, Reserved>,
+                             InstrStage<1, [A9_NPipe]>],
+                            [9, 2, 2]>,
+  //
+  // Quad-register Reciprocal Step
+  InstrItinData<IIC_VRECSQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_MUX0], 0>,
+                             InstrStage<1, [A9_DRegsN],   0, Required>,
+                             // Extra latency cycles since wbck is 11 cycles
+                             InstrStage<12, [A9_DRegsVFP], 0, Reserved>,
+                             InstrStage<2, [A9_NPipe]>],
+                            [10, 2, 2]>,
+  //
+  // Double-register Permute
+  InstrItinData<IIC_VPERMD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_MUX0], 0>,
+                             InstrStage<1, [A9_DRegsN],   0, Required>,
+                             // Extra latency cycles since wbck is 6 cycles
+                             InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                             InstrStage<1, [A9_NPipe]>],
+                            [2, 2, 1, 1]>,
+  //
+  // Quad-register Permute
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VPERMQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_MUX0], 0>,
+                             InstrStage<1, [A9_DRegsN],   0, Required>,
+                             // Extra latency cycles since wbck is 7 cycles
+                             InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                             InstrStage<2, [A9_NPipe]>],
+                            [3, 3, 1, 1]>,
+  //
+  // Quad-register Permute (3 cycle issue)
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                              InstrStage<1, [A9_MUX0], 0>,
+                              InstrStage<1, [A9_DRegsN],   0, Required>,
+                              // Extra latency cycles since wbck is 8 cycles
+                              InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                              InstrStage<3, [A9_NPipe]>],
+                             [4, 4, 1, 1]>,
+
+  //
+  // Double-register VEXT
+  InstrItinData<IIC_VEXTD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                            InstrStage<1, [A9_MUX0], 0>,
+                            InstrStage<1, [A9_DRegsN],   0, Required>,
+                            // Extra latency cycles since wbck is 6 cycles
+                            InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                            InstrStage<1, [A9_NPipe]>],
+                           [2, 1, 1]>,
+  //
+  // Quad-register VEXT
+  InstrItinData<IIC_VEXTQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                            InstrStage<1, [A9_MUX0], 0>,
+                            InstrStage<1, [A9_DRegsN],   0, Required>,
+                            // Extra latency cycles since wbck is 7 cycles
+                            InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                            InstrStage<2, [A9_NPipe]>],
+                           [3, 1, 2]>,
+  //
+  // VTB
+  InstrItinData<IIC_VTB1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                           InstrStage<1, [A9_MUX0], 0>,
+                           InstrStage<1, [A9_DRegsN],   0, Required>,
+                           // Extra latency cycles since wbck is 7 cycles
+                           InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                           InstrStage<2, [A9_NPipe]>],
+                          [3, 2, 1]>,
+  InstrItinData<IIC_VTB2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                           InstrStage<1, [A9_MUX0], 0>,
+                           InstrStage<2, [A9_DRegsN],   0, Required>,
+                           // Extra latency cycles since wbck is 7 cycles
+                           InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                           InstrStage<2, [A9_NPipe]>],
+                          [3, 2, 2, 1]>,
+  InstrItinData<IIC_VTB3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                           InstrStage<1, [A9_MUX0], 0>,
+                           InstrStage<2, [A9_DRegsN],   0, Required>,
+                           // Extra latency cycles since wbck is 8 cycles
+                           InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                           InstrStage<3, [A9_NPipe]>],
+                          [4, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTB4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                           InstrStage<1, [A9_MUX0], 0>,
+                           InstrStage<1, [A9_DRegsN],   0, Required>,
+                           // Extra latency cycles since wbck is 8 cycles
+                           InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                           InstrStage<3, [A9_NPipe]>],
+                          [4, 2, 2, 3, 3, 1]>,
+  //
+  // VTBX
+  InstrItinData<IIC_VTBX1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                            InstrStage<1, [A9_MUX0], 0>,
+                            InstrStage<1, [A9_DRegsN],   0, Required>,
+                            // Extra latency cycles since wbck is 7 cycles
+                            InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                            InstrStage<2, [A9_NPipe]>],
+                           [3, 1, 2, 1]>,
+  InstrItinData<IIC_VTBX2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                            InstrStage<1, [A9_MUX0], 0>,
+                            InstrStage<1, [A9_DRegsN],   0, Required>,
+                            // Extra latency cycles since wbck is 7 cycles
+                            InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                            InstrStage<2, [A9_NPipe]>],
+                           [3, 1, 2, 2, 1]>,
+  InstrItinData<IIC_VTBX3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                            InstrStage<1, [A9_MUX0], 0>,
+                            InstrStage<1, [A9_DRegsN],   0, Required>,
+                            // Extra latency cycles since wbck is 8 cycles
+                            InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                            InstrStage<3, [A9_NPipe]>],
+                           [4, 1, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTBX4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                            InstrStage<1, [A9_MUX0], 0>,
+                            InstrStage<1, [A9_DRegsN],   0, Required>,
+                            // Extra latency cycles since wbck is 8 cycles
+                            InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                            InstrStage<2, [A9_NPipe]>],
+                           [4, 1, 2, 2, 3, 3, 1]>
+]>;
+
+//===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler and will eventually replace itineraries.
+
+class A9WriteLMOpsListType<list<WriteSequence> writes> {
+  list<WriteSequence> Writes = writes;
+  SchedMachineModel SchedModel = ?;
+}
+
+// Cortex-A9 machine model for scheduling and other instruction cost heuristics.
+def CortexA9Model : SchedMachineModel {
+  let IssueWidth = 2;          // 2 micro-ops are dispatched per cycle.
+  let MicroOpBufferSize = 56;  // Based on available renamed registers.
+  let LoadLatency = 2;         // Optimistic load latency assuming bypass.
+                               // This is overridden by OperandCycles if the
+                               // Itineraries are queried instead.
+  let MispredictPenalty = 8;   // Based on estimate of pipeline depth.
+
+  let Itineraries = CortexA9Itineraries;
+
+  // FIXME: Many vector operations were never given an itinerary. We
+  // haven't mapped these to the new model either.
+  let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+//
+// The AGU unit has BufferSize=1 so that the latency between operations
+// that use it is considered to stall other operations.
+//
+// The FP unit has BufferSize=0 so that it is a hard dispatch
+// hazard. No instruction may be dispatched while the unit is reserved.
+
+let SchedModel = CortexA9Model in {
+
+def A9UnitALU : ProcResource<2>;
+def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
+def A9UnitAGU : ProcResource<1> { let BufferSize = 1; }
+def A9UnitLS : ProcResource<1>;
+def A9UnitFP : ProcResource<1> { let BufferSize = 0; }
+def A9UnitB : ProcResource<1>;
+
+//===----------------------------------------------------------------------===//
+// Define scheduler read/write types with their resources and latency on A9.
+
+// Consume an issue slot, but no processor resources. This is useful when all
+// other writes associated with the operand have NumMicroOps = 0.
+def A9WriteIssue : SchedWriteRes<[]> { let Latency = 0; }
+
+// Write an integer register.
+def A9WriteI : SchedWriteRes<[A9UnitALU]>;
+// Write an integer register (shifted-by-register variant).
+def A9WriteIsr : SchedWriteRes<[A9UnitALU]> { let Latency = 2; }
+
+// Basic ALU.
+def A9WriteALU : SchedWriteRes<[A9UnitALU]>;
+// ALU with operand shifted by immediate.
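+// (For context: the named A9Write* records here are SchedWriteRes defs that
+// the A9 instruction mappings reference directly, while the anonymous
+// "def : WriteRes<...>" form on the next line binds a target-independent
+// SchedWrite -- here WriteALUsi -- to A9 resources and latency. A minimal
+// sketch of the named form, using ExampleWriteShImm as a made-up name:
+//   def ExampleWriteShImm : SchedWriteRes<[A9UnitALU]> { let Latency = 2; })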
+def : WriteRes<WriteALUsi, [A9UnitALU]> { let Latency = 2; } +// ALU with operand shifted by register. +def A9WriteALUsr : SchedWriteRes<[A9UnitALU]> { let Latency = 3; } + +// Multiplication +def A9WriteM : SchedWriteRes<[A9UnitMul, A9UnitMul]> { let Latency = 4; } +def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5; + let NumMicroOps = 0; } +def A9WriteM16 : SchedWriteRes<[A9UnitMul]> { let Latency = 3; } +def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4; + let NumMicroOps = 0; } + +// Floating-point +// Only one FP or AGU instruction may issue per cycle. We model this +// by having FP instructions consume the AGU resource. +def A9WriteF : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; } +def A9WriteFMov : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; } +def A9WriteFMulS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; } +def A9WriteFMulD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; } +def A9WriteFMAS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; } +def A9WriteFMAD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; } +def A9WriteFDivS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; } +def A9WriteFDivD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; } +def A9WriteFSqrtS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 17; } +def A9WriteFSqrtD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 32; } + +// NEON has an odd mix of latencies. Simply name the write types by latency. +def A9WriteV1 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; } +def A9WriteV2 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 2; } +def A9WriteV3 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 3; } +def A9WriteV4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; } +def A9WriteV5 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; } +def A9WriteV6 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; } +def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; } +def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; } +def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; } + +// Reserve A9UnitFP for 2 consecutive cycles. +def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { + let Latency = 4; + let ResourceCycles = [2]; +} +def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { + let Latency = 7; + let ResourceCycles = [2]; +} +def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { + let Latency = 9; + let ResourceCycles = [2]; +} + +// Branches don't have a def operand but still consume resources. +def A9WriteB : SchedWriteRes<[A9UnitB]>; + +// Address generation. +def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; } + +// Load Integer. +def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; } +// Load the upper 32-bits using the same micro-op. +def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3; + let NumMicroOps = 0; } +// Offset shifted by register. +def A9WriteLsi : SchedWriteRes<[A9UnitLS]> { let Latency = 4; } +// Load (and zero extend) a byte. +def A9WriteLb : SchedWriteRes<[A9UnitLS]> { let Latency = 4; } +def A9WriteLbsi : SchedWriteRes<[A9UnitLS]> { let Latency = 5; } + +// Load or Store Float, aligned. +def A9WriteLSfp : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 1; } + +// Store Integer. 
+def A9WriteS : SchedWriteRes<[A9UnitLS]>; + +//===----------------------------------------------------------------------===// +// Define resources dynamically for load multiple variants. + +// Define helpers for extra latency without consuming resources. +def A9WriteCycle1 : SchedWriteRes<[]> { let Latency = 1; let NumMicroOps = 0; } +foreach NumCycles = 2-8 in { +def A9WriteCycle#NumCycles : WriteSequence<[A9WriteCycle1], NumCycles>; +} // foreach NumCycles + +// Define address generation sequences and predicates for 8 flavors of LDMs. +foreach NumAddr = 1-8 in { + +// Define A9WriteAdr1-8 as a sequence of A9WriteAdr with additive +// latency for instructions that generate multiple loads or stores. +def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>; + +// Define a predicate to select the LDM based on number of memory addresses. +def A9LMAdr#NumAddr#Pred : + SchedPredicate<"(TII->getNumLDMAddresses(MI)+1)/2 == "#NumAddr>; + +} // foreach NumAddr + +// Fall-back for unknown LDMs. +def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(MI) == 0">; + +// LDM/VLDM/VLDn address generation latency & resources. +// Dynamically select the A9WriteAdrN sequence using a predicate. +def A9WriteLMAdr : SchedWriteVariant<[ + SchedVar<A9LMAdr1Pred, [A9WriteAdr1]>, + SchedVar<A9LMAdr2Pred, [A9WriteAdr2]>, + SchedVar<A9LMAdr3Pred, [A9WriteAdr3]>, + SchedVar<A9LMAdr4Pred, [A9WriteAdr4]>, + SchedVar<A9LMAdr5Pred, [A9WriteAdr5]>, + SchedVar<A9LMAdr6Pred, [A9WriteAdr6]>, + SchedVar<A9LMAdr7Pred, [A9WriteAdr7]>, + SchedVar<A9LMAdr8Pred, [A9WriteAdr8]>, + // For unknown LDM/VLDM/VSTM, assume 2 32-bit registers. + SchedVar<A9LMUnknownPred, [A9WriteAdr2]>]>; + +// Define LDM Resources. +// These take no issue resource, so they can be combined with other +// writes like WriteB. +// A9WriteLMLo takes a single LS resource and 2 cycles. +def A9WriteLMLo : SchedWriteRes<[A9UnitLS]> { let Latency = 2; + let NumMicroOps = 0; } +// Assuming aligned access, the upper half of each pair is free with +// the same latency. +def A9WriteLMHi : SchedWriteRes<[]> { let Latency = 2; + let NumMicroOps = 0; } +// Each A9WriteL#N variant adds N cycles of latency without consuming +// additional resources. +foreach NumAddr = 1-8 in { +def A9WriteL#NumAddr : WriteSequence< + [A9WriteLMLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>; +def A9WriteL#NumAddr#Hi : WriteSequence< + [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>; +} + +//===----------------------------------------------------------------------===// +// LDM: Load multiple into 32-bit integer registers. + +def A9WriteLMOpsList : A9WriteLMOpsListType< + [A9WriteL1, A9WriteL1Hi, + A9WriteL2, A9WriteL2Hi, + A9WriteL3, A9WriteL3Hi, + A9WriteL4, A9WriteL4Hi, + A9WriteL5, A9WriteL5Hi, + A9WriteL6, A9WriteL6Hi, + A9WriteL7, A9WriteL7Hi, + A9WriteL8, A9WriteL8Hi]>; + +// A9WriteLM variants expand into a pair of writes for each 64-bit +// value loaded. When the number of registers is odd, the last +// A9WriteLnHi is naturally ignored because the instruction has no +// following def operands. These variants take no issue resource, so +// they may need to be part of a WriteSequence that includes A9WriteIssue. 
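+// (Worked example: an LDM that loads five registers has
+// getNumLDMAddresses(MI) == 5, so (5+1)/2 == 3 matches A9LMAdr3Pred and
+// selects Writes[0-5] below -- A9WriteL1, A9WriteL1Hi, A9WriteL2, A9WriteL2Hi,
+// A9WriteL3, A9WriteL3Hi -- of which the five def operands consume the first
+// five writes and the trailing A9WriteL3Hi is ignored.)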
+def A9WriteLM : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, A9WriteLMOpsList.Writes[0-1]>,
+  SchedVar<A9LMAdr2Pred, A9WriteLMOpsList.Writes[0-3]>,
+  SchedVar<A9LMAdr3Pred, A9WriteLMOpsList.Writes[0-5]>,
+  SchedVar<A9LMAdr4Pred, A9WriteLMOpsList.Writes[0-7]>,
+  SchedVar<A9LMAdr5Pred, A9WriteLMOpsList.Writes[0-9]>,
+  SchedVar<A9LMAdr6Pred, A9WriteLMOpsList.Writes[0-11]>,
+  SchedVar<A9LMAdr7Pred, A9WriteLMOpsList.Writes[0-13]>,
+  SchedVar<A9LMAdr8Pred, A9WriteLMOpsList.Writes[0-15]>,
+  // For unknown LDMs, define the maximum number of writes, but only
+  // make the first two consume resources.
+  SchedVar<A9LMUnknownPred, [A9WriteL1, A9WriteL1Hi,
+                             A9WriteL2, A9WriteL2Hi,
+                             A9WriteL3Hi, A9WriteL3Hi,
+                             A9WriteL4Hi, A9WriteL4Hi,
+                             A9WriteL5Hi, A9WriteL5Hi,
+                             A9WriteL6Hi, A9WriteL6Hi,
+                             A9WriteL7Hi, A9WriteL7Hi,
+                             A9WriteL8Hi, A9WriteL8Hi]>]> {
+  let Variadic = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// VFP Load/Store Multiple Variants, and NEON VLDn/VSTn support.
+
+// A9WriteLfpOp is the same as A9WriteLSfp but takes no issue resources,
+// so it can be used in WriteSequences for single-issue instructions that
+// encapsulate multiple loads.
+def A9WriteLfpOp : SchedWriteRes<[A9UnitLS, A9UnitFP]> {
+  let Latency = 1;
+  let NumMicroOps = 0;
+}
+
+foreach NumAddr = 1-8 in {
+
+// Helper for A9WriteLfp1-8: A sequence of fp loads with no micro-ops.
+def A9WriteLfp#NumAddr#Seq : WriteSequence<[A9WriteLfpOp], NumAddr>;
+
+// A9WriteLfp1-8 definitions are statically expanded into a sequence of
+// A9WriteLfpOps with additive latency that takes a single issue slot.
+// Used directly to describe NEON VLDn.
+def A9WriteLfp#NumAddr : WriteSequence<
+  [A9WriteIssue, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+// A9WriteLfp1-8Mov adds a cycle of latency and FP resource for
+// permuting loaded values.
+def A9WriteLfp#NumAddr#Mov : WriteSequence<
+  [A9WriteF, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+} // foreach NumAddr
+
+// Define VLDM/VSTM PreRA resources.
+// A9WriteLMfpPreRA is dynamically expanded into the correct
+// A9WriteLfp1-8 sequence based on a predicate. This supports the
+// preRA VLDM variants in which all 64-bit loads are written to the
+// same tuple of either single or double precision registers.
+def A9WriteLMfpPreRA : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, [A9WriteLfp1]>,
+  SchedVar<A9LMAdr2Pred, [A9WriteLfp2]>,
+  SchedVar<A9LMAdr3Pred, [A9WriteLfp3]>,
+  SchedVar<A9LMAdr4Pred, [A9WriteLfp4]>,
+  SchedVar<A9LMAdr5Pred, [A9WriteLfp5]>,
+  SchedVar<A9LMAdr6Pred, [A9WriteLfp6]>,
+  SchedVar<A9LMAdr7Pred, [A9WriteLfp7]>,
+  SchedVar<A9LMAdr8Pred, [A9WriteLfp8]>,
+  // For unknown VLDM/VSTM PreRA, assume 2xS registers.
+  SchedVar<A9LMUnknownPred, [A9WriteLfp2]>]>;
+
+// Define VLDM/VSTM PostRA Resources.
+// A9WriteLMfpLo takes an LS and FP resource and one issue slot but no latency.
+def A9WriteLMfpLo : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 0; }
+
+foreach NumAddr = 1-8 in {
+
+// Each A9WriteL#N variant adds N cycles of latency without consuming
+// additional resources.
+def A9WriteLMfp#NumAddr : WriteSequence<
+  [A9WriteLMfpLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency.
+def A9WriteLMfp#NumAddr#Hi : WriteSequence<
+  [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+} // foreach NumAddr
+
+// VLDM PostRA Variants.
+// These variants expand A9WriteLMfpPostRA into a pair of writes for
+// each 64-bit value loaded. When the number of registers is odd, the
+// last WriteLMfpnHi is naturally ignored because the instruction has
+// no following def operands.
+
+def A9WriteLMfpPostRAOpsList : A9WriteLMOpsListType<
+  [A9WriteLMfp1, A9WriteLMfp2,     // 0-1
+   A9WriteLMfp3, A9WriteLMfp4,     // 2-3
+   A9WriteLMfp5, A9WriteLMfp6,     // 4-5
+   A9WriteLMfp7, A9WriteLMfp8,     // 6-7
+   A9WriteLMfp1Hi,                 // 8-8
+   A9WriteLMfp2Hi, A9WriteLMfp2Hi, // 9-10
+   A9WriteLMfp3Hi, A9WriteLMfp3Hi, // 11-12
+   A9WriteLMfp4Hi, A9WriteLMfp4Hi, // 13-14
+   A9WriteLMfp5Hi, A9WriteLMfp5Hi, // 15-16
+   A9WriteLMfp6Hi, A9WriteLMfp6Hi, // 17-18
+   A9WriteLMfp7Hi, A9WriteLMfp7Hi, // 19-20
+   A9WriteLMfp8Hi, A9WriteLMfp8Hi]>; // 21-22
+
+def A9WriteLMfpPostRA : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, A9WriteLMfpPostRAOpsList.Writes[0-0, 8-8]>,
+  SchedVar<A9LMAdr2Pred, A9WriteLMfpPostRAOpsList.Writes[0-1, 9-10]>,
+  SchedVar<A9LMAdr3Pred, A9WriteLMfpPostRAOpsList.Writes[0-2, 10-12]>,
+  SchedVar<A9LMAdr4Pred, A9WriteLMfpPostRAOpsList.Writes[0-3, 11-14]>,
+  SchedVar<A9LMAdr5Pred, A9WriteLMfpPostRAOpsList.Writes[0-4, 12-16]>,
+  SchedVar<A9LMAdr6Pred, A9WriteLMfpPostRAOpsList.Writes[0-5, 13-18]>,
+  SchedVar<A9LMAdr7Pred, A9WriteLMfpPostRAOpsList.Writes[0-6, 14-20]>,
+  SchedVar<A9LMAdr8Pred, A9WriteLMfpPostRAOpsList.Writes[0-7, 15-22]>,
+  // For unknown LDMs, define the maximum number of writes, but only
+  // make the first two consume resources. We are optimizing for the case
+  // where the operands are DPRs, and this determines the first eight
+  // types. The remaining eight types are filled to cover the case
+  // where the operands are SPRs.
+  SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp2,
+                             A9WriteLMfp3Hi, A9WriteLMfp4Hi,
+                             A9WriteLMfp5Hi, A9WriteLMfp6Hi,
+                             A9WriteLMfp7Hi, A9WriteLMfp8Hi,
+                             A9WriteLMfp5Hi, A9WriteLMfp5Hi,
+                             A9WriteLMfp6Hi, A9WriteLMfp6Hi,
+                             A9WriteLMfp7Hi, A9WriteLMfp7Hi,
+                             A9WriteLMfp8Hi, A9WriteLMfp8Hi]>]> {
+  let Variadic = 1;
+}
+
+// Distinguish between our multiple MI-level forms of the same
+// VLDM/VSTM instructions.
+def A9PreRA : SchedPredicate<
+  "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">;
+def A9PostRA : SchedPredicate<
+  "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">;
+
+// VLDM represents all destination registers as a single register
+// tuple, unlike LDM. So the number of write operands is not variadic.
+def A9WriteLMfp : SchedWriteVariant<[
+  SchedVar<A9PreRA, [A9WriteLMfpPreRA]>,
+  SchedVar<A9PostRA, [A9WriteLMfpPostRA]>]>;
+
+//===----------------------------------------------------------------------===//
+// Resources for other (non-LDM/VLDM) Variants.
+
+// These mov immediate writers are unconditionally expanded with
+// additive latency.
+def A9WriteI2 : WriteSequence<[A9WriteI, A9WriteI]>;
+def A9WriteI2pc : WriteSequence<[A9WriteI, A9WriteI, WriteALU]>;
+def A9WriteI2ld : WriteSequence<[A9WriteI, A9WriteI, A9WriteL]>;
+
+// Some ALU operations can read loaded integer values one cycle early.
+def A9ReadALU : SchedReadAdvance<1,
+  [A9WriteL, A9WriteLHi, A9WriteLsi, A9WriteLb, A9WriteLbsi,
+   A9WriteL1, A9WriteL2, A9WriteL3, A9WriteL4,
+   A9WriteL5, A9WriteL6, A9WriteL7, A9WriteL8,
+   A9WriteL1Hi, A9WriteL2Hi, A9WriteL3Hi, A9WriteL4Hi,
+   A9WriteL5Hi, A9WriteL6Hi, A9WriteL7Hi, A9WriteL8Hi]>;
+
+// Read types for operands that are unconditionally read in cycle N
+// after the instruction issues; these decrease producer latency by N-1.
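+// For example (illustrative): an operand read via A9Read3
+// (SchedReadAdvance<2>) turns a 5-cycle producer into an effective
+// 5 - 2 = 3 cycle dependency for the scheduler.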
+def A9Read2 : SchedReadAdvance<1>;
+def A9Read3 : SchedReadAdvance<2>;
+def A9Read4 : SchedReadAdvance<3>;
+
+//===----------------------------------------------------------------------===//
+// Map itinerary classes to scheduler read/write resources per operand.
+//
+// For ARM, we piggyback scheduler resources on the Itinerary classes
+// to avoid perturbing the existing instruction definitions.
+
+// This table follows the ARM Cortex-A9 Technical Reference Manuals,
+// mostly in order.
+
+def :ItinRW<[WriteALU], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi,
+                         IIC_iMVNi,IIC_iMVNsi,
+                         IIC_iCMOVi,IIC_iCMOVr,IIC_iCMOVsi]>;
+def :ItinRW<[WriteALU, A9ReadALU],[IIC_iMVNr]>;
+def :ItinRW<[A9WriteIsr], [IIC_iMOVsr,IIC_iMVNsr,IIC_iCMOVsr]>;
+
+def :ItinRW<[A9WriteI2], [IIC_iMOVix2,IIC_iCMOVix2]>;
+def :ItinRW<[A9WriteI2pc], [IIC_iMOVix2addpc]>;
+def :ItinRW<[A9WriteI2ld], [IIC_iMOVix2ld]>;
+
+def :ItinRW<[WriteALU], [IIC_iBITi,IIC_iBITr,IIC_iUNAr,IIC_iTSTi,IIC_iTSTr]>;
+def :ItinRW<[WriteALU, A9ReadALU], [IIC_iALUi, IIC_iCMPi, IIC_iCMPsi]>;
+def :ItinRW<[WriteALU, A9ReadALU, A9ReadALU],[IIC_iALUr,IIC_iCMPr]>;
+def :ItinRW<[WriteALUsi], [IIC_iBITsi,IIC_iUNAsi,IIC_iEXTr,IIC_iTSTsi]>;
+def :ItinRW<[WriteALUsi, A9ReadALU], [IIC_iALUsi]>;
+def :ItinRW<[WriteALUsi, ReadDefault, A9ReadALU], [IIC_iALUsir]>; // RSB
+def :ItinRW<[A9WriteALUsr], [IIC_iBITsr,IIC_iTSTsr,IIC_iEXTAr,IIC_iEXTAsr]>;
+def :ItinRW<[A9WriteALUsr, A9ReadALU], [IIC_iALUsr,IIC_iCMPsr]>;
+
+// A9WriteMHi is ignored for MUL32.
+def :ItinRW<[A9WriteM, A9WriteMHi], [IIC_iMUL32,IIC_iMAC32,
+                                     IIC_iMUL64,IIC_iMAC64]>;
+// FIXME: SMLALxx needs itin classes
+def :ItinRW<[A9WriteM16, A9WriteM16Hi], [IIC_iMUL16,IIC_iMAC16]>;
+
+// TODO: For floating-point ops, we model the pipeline forwarding
+// latencies here. WAW latencies are sometimes longer.
+
+def :ItinRW<[A9WriteFMov], [IIC_fpSTAT, IIC_fpMOVIS, IIC_fpMOVID, IIC_fpMOVSI,
+                            IIC_fpUNA32, IIC_fpUNA64,
+                            IIC_fpCMP32, IIC_fpCMP64]>;
+def :ItinRW<[A9WriteFMov, A9WriteFMov], [IIC_fpMOVDI]>;
+def :ItinRW<[A9WriteF], [IIC_fpCVTSD, IIC_fpCVTDS, IIC_fpCVTSH, IIC_fpCVTHS,
+                         IIC_fpCVTIS, IIC_fpCVTID, IIC_fpCVTSI, IIC_fpCVTDI,
+                         IIC_fpALU32, IIC_fpALU64]>;
+def :ItinRW<[A9WriteFMulS], [IIC_fpMUL32]>;
+def :ItinRW<[A9WriteFMulD], [IIC_fpMUL64]>;
+def :ItinRW<[A9WriteFMAS], [IIC_fpMAC32]>;
+def :ItinRW<[A9WriteFMAD], [IIC_fpMAC64]>;
+def :ItinRW<[A9WriteFDivS], [IIC_fpDIV32]>;
+def :ItinRW<[A9WriteFDivD], [IIC_fpDIV64]>;
+def :ItinRW<[A9WriteFSqrtS], [IIC_fpSQRT32]>;
+def :ItinRW<[A9WriteFSqrtD], [IIC_fpSQRT64]>;
+
+def :ItinRW<[A9WriteB], [IIC_Br]>;
+
+// A9 PLD is processed in a dedicated unit.
+def :ItinRW<[], [IIC_Preload]>;
+
+// Note: We must assume that loads are aligned, since the machine
+// model cannot know this statically and A9 ignores alignment hints.
+
+// A9WriteAdr consumes the AGU regardless of address writeback, but its
+// latency is only relevant for users of an updated address.
+def :ItinRW<[A9WriteL, A9WriteAdr], [IIC_iLoad_i,IIC_iLoad_r,
+                                     IIC_iLoad_iu,IIC_iLoad_ru]>;
+def :ItinRW<[A9WriteLsi, A9WriteAdr], [IIC_iLoad_si,IIC_iLoad_siu]>;
+def :ItinRW<[A9WriteLb, A9WriteAdr2], [IIC_iLoad_bh_i,IIC_iLoad_bh_r,
+                                       IIC_iLoad_bh_iu,IIC_iLoad_bh_ru]>;
+def :ItinRW<[A9WriteLbsi, A9WriteAdr2], [IIC_iLoad_bh_si,IIC_iLoad_bh_siu]>;
+def :ItinRW<[A9WriteL, A9WriteLHi, A9WriteAdr], [IIC_iLoad_d_i,IIC_iLoad_d_r,
+                                                 IIC_iLoad_d_ru]>;
+// Store either has no def operands, or the one def for address writeback.
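+// The write lists below bind to def operands in order, which is why the
+// writeback forms list A9WriteAdr first: its latency describes the
+// updated base-address register.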
+def :ItinRW<[A9WriteAdr, A9WriteS], [IIC_iStore_i, IIC_iStore_r,
+                                     IIC_iStore_iu, IIC_iStore_ru,
+                                     IIC_iStore_d_i, IIC_iStore_d_r,
+                                     IIC_iStore_d_ru]>;
+def :ItinRW<[A9WriteAdr2, A9WriteS], [IIC_iStore_si, IIC_iStore_siu,
+                                      IIC_iStore_bh_i, IIC_iStore_bh_r,
+                                      IIC_iStore_bh_iu, IIC_iStore_bh_ru]>;
+def :ItinRW<[A9WriteAdr3, A9WriteS], [IIC_iStore_bh_si, IIC_iStore_bh_siu]>;
+
+// A9WriteLM will be expanded into a separate write for each def
+// operand. Address generation consumes resources, but A9WriteLMAdr
+// is listed after all def operands, so it has no effective latency.
+//
+// Note: A9WriteLM expands into an even number of def operands. The
+// actual number of def operands may be less by one.
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteIssue], [IIC_iLoad_m, IIC_iPop]>;
+
+// Load multiple with address writeback has an extra def operand in
+// front of the loaded registers.
+//
+// Reuse the load-multiple variants for store-multiple because the
+// resources are identical. For stores, only the address writeback
+// has a def operand, so the WriteL latencies are unused.
+def :ItinRW<[A9WriteLMAdr, A9WriteLM, A9WriteIssue], [IIC_iLoad_mu,
+                                                      IIC_iStore_m,
+                                                      IIC_iStore_mu]>;
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteB], [IIC_iLoad_mBr, IIC_iPop_Br]>;
+def :ItinRW<[A9WriteL, A9WriteAdr, WriteALU], [IIC_iLoadiALU]>;
+
+def :ItinRW<[A9WriteLSfp, A9WriteAdr], [IIC_fpLoad32, IIC_fpLoad64]>;
+
+def :ItinRW<[A9WriteLMfp, A9WriteLMAdr], [IIC_fpLoad_m]>;
+def :ItinRW<[A9WriteLMAdr, A9WriteLMfp], [IIC_fpLoad_mu]>;
+def :ItinRW<[A9WriteAdr, A9WriteLSfp], [IIC_fpStore32, IIC_fpStore64,
+                                        IIC_fpStore_m, IIC_fpStore_mu]>;
+
+// Note: Unlike VLDM, VLD1 expects the writeback operand after the
+// normal writes.
+def :ItinRW<[A9WriteLfp1, A9WriteAdr1], [IIC_VLD1, IIC_VLD1u,
+                                         IIC_VLD1x2, IIC_VLD1x2u]>;
+def :ItinRW<[A9WriteLfp2, A9WriteAdr2], [IIC_VLD1x3, IIC_VLD1x3u,
+                                         IIC_VLD1x4, IIC_VLD1x4u,
+                                         IIC_VLD4dup, IIC_VLD4dupu]>;
+def :ItinRW<[A9WriteLfp1Mov, A9WriteAdr1], [IIC_VLD1dup, IIC_VLD1dupu,
+                                            IIC_VLD2, IIC_VLD2u,
+                                            IIC_VLD2dup, IIC_VLD2dupu]>;
+def :ItinRW<[A9WriteLfp2Mov, A9WriteAdr1], [IIC_VLD1ln, IIC_VLD1lnu,
+                                            IIC_VLD2x2, IIC_VLD2x2u,
+                                            IIC_VLD2ln, IIC_VLD2lnu]>;
+def :ItinRW<[A9WriteLfp3Mov, A9WriteAdr3], [IIC_VLD3, IIC_VLD3u,
+                                            IIC_VLD3dup, IIC_VLD3dupu]>;
+def :ItinRW<[A9WriteLfp4Mov, A9WriteAdr4], [IIC_VLD4, IIC_VLD4u,
+                                            IIC_VLD4ln, IIC_VLD4lnu]>;
+def :ItinRW<[A9WriteLfp5Mov, A9WriteAdr5], [IIC_VLD3ln, IIC_VLD3lnu]>;
+
+// Vector stores use similar resources to vector loads, so use the
+// same write types. The address write must be first for stores with
+// address writeback.
+def :ItinRW<[A9WriteAdr1, A9WriteLfp1], [IIC_VST1, IIC_VST1u,
+                                         IIC_VST1x2, IIC_VST1x2u,
+                                         IIC_VST1ln, IIC_VST1lnu,
+                                         IIC_VST2, IIC_VST2u,
+                                         IIC_VST2x2, IIC_VST2x2u,
+                                         IIC_VST2ln, IIC_VST2lnu]>;
+def :ItinRW<[A9WriteAdr2, A9WriteLfp2], [IIC_VST1x3, IIC_VST1x3u,
+                                         IIC_VST1x4, IIC_VST1x4u,
+                                         IIC_VST3, IIC_VST3u,
+                                         IIC_VST3ln, IIC_VST3lnu,
+                                         IIC_VST4, IIC_VST4u,
+                                         IIC_VST4ln, IIC_VST4lnu]>;
+
+// NEON moves.
+def :ItinRW<[A9WriteV2], [IIC_VMOVSI, IIC_VMOVDI, IIC_VMOVD, IIC_VMOVQ]>;
+def :ItinRW<[A9WriteV1], [IIC_VMOV, IIC_VMOVIS, IIC_VMOVID]>;
+def :ItinRW<[A9WriteV3], [IIC_VMOVISL, IIC_VMOVN]>;
+
+// NEON integer arithmetic
+//
+// VADD/VAND/VORR/VEOR/VBIC/VORN/VBIT/VBIF/VBSL
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VBINiD, IIC_VBINiQ]>;
+// VSUB/VMVN/VCLSD/VCLZD/VCNTD
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VSUBiD, IIC_VSUBiQ, IIC_VCNTiD]>;
+// VADDL/VSUBL/VNEG are mapped later under IIC_SHLi.
+// ...
+// VHADD/VRHADD/VQADD/VTST/VADH/VRADH +def :ItinRW<[A9WriteV4, A9Read2, A9Read2], [IIC_VBINi4D, IIC_VBINi4Q]>; + +// VSBH/VRSBH/VHSUB/VQSUB/VABD/VCEQ/VCGE/VCGT/VMAX/VMIN/VPMAX/VPMIN/VABDL +def :ItinRW<[A9WriteV4, A9Read2], [IIC_VSUBi4D, IIC_VSUBi4Q]>; +// VQNEG/VQABS +def :ItinRW<[A9WriteV4], [IIC_VQUNAiD, IIC_VQUNAiQ]>; +// VABS +def :ItinRW<[A9WriteV4, A9Read2], [IIC_VUNAiD, IIC_VUNAiQ]>; +// VPADD/VPADDL are mapped later under IIC_SHLi. +// ... +// VCLSQ/VCLZQ/VCNTQ, takes two cycles. +def :ItinRW<[A9Write2V4, A9Read3], [IIC_VCNTiQ]>; +// VMOVimm/VMVNimm/VORRimm/VBICimm +def :ItinRW<[A9WriteV3], [IIC_VMOVImm]>; +def :ItinRW<[A9WriteV6, A9Read3, A9Read2], [IIC_VABAD, IIC_VABAQ]>; +def :ItinRW<[A9WriteV6, A9Read3], [IIC_VPALiD, IIC_VPALiQ]>; + +// NEON integer multiply +// +// Note: these don't quite match the timing docs, but they do match +// the original A9 itinerary. +def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VMULi16D]>; +def :ItinRW<[A9WriteV7, A9Read2, A9Read2], [IIC_VMULi16Q]>; +def :ItinRW<[A9Write2V7, A9Read2], [IIC_VMULi32D]>; +def :ItinRW<[A9Write2V9, A9Read2], [IIC_VMULi32Q]>; +def :ItinRW<[A9WriteV6, A9Read3, A9Read2, A9Read2], [IIC_VMACi16D]>; +def :ItinRW<[A9WriteV7, A9Read3, A9Read2, A9Read2], [IIC_VMACi16Q]>; +def :ItinRW<[A9Write2V7, A9Read3, A9Read2], [IIC_VMACi32D]>; +def :ItinRW<[A9Write2V9, A9Read3, A9Read2], [IIC_VMACi32Q]>; + +// NEON integer shift +// TODO: Q,Q,Q shifts should actually reserve FP for 2 cycles. +def :ItinRW<[A9WriteV3], [IIC_VSHLiD, IIC_VSHLiQ]>; +def :ItinRW<[A9WriteV4], [IIC_VSHLi4D, IIC_VSHLi4Q]>; + +// NEON permute +def :ItinRW<[A9WriteV2, A9WriteV2], [IIC_VPERMD, IIC_VPERMQ, IIC_VEXTD]>; +def :ItinRW<[A9WriteV3, A9WriteV4, ReadDefault, A9Read2], + [IIC_VPERMQ3, IIC_VEXTQ]>; +def :ItinRW<[A9WriteV3, A9Read2], [IIC_VTB1]>; +def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VTB2]>; +def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3], [IIC_VTB3]>; +def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3, A9Read3], [IIC_VTB4]>; +def :ItinRW<[A9WriteV3, ReadDefault, A9Read2], [IIC_VTBX1]>; +def :ItinRW<[A9WriteV3, ReadDefault, A9Read2, A9Read2], [IIC_VTBX2]>; +def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3], [IIC_VTBX3]>; +def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3, A9Read3], + [IIC_VTBX4]>; + +// NEON floating-point +def :ItinRW<[A9WriteV5, A9Read2, A9Read2], [IIC_VBIND]>; +def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VBINQ]>; +def :ItinRW<[A9WriteV5, A9Read2], [IIC_VUNAD, IIC_VFMULD]>; +def :ItinRW<[A9WriteV6, A9Read2], [IIC_VUNAQ, IIC_VFMULQ]>; +def :ItinRW<[A9WriteV9, A9Read3, A9Read2], [IIC_VMACD, IIC_VFMACD]>; +def :ItinRW<[A9WriteV10, A9Read3, A9Read2], [IIC_VMACQ, IIC_VFMACQ]>; +def :ItinRW<[A9WriteV9, A9Read2, A9Read2], [IIC_VRECSD]>; +def :ItinRW<[A9WriteV10, A9Read2, A9Read2], [IIC_VRECSQ]>; + +// Map SchedRWs that are identical for cortexa9 to existing resources. 
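+// Within this model, a SchedAlias resolves the generic SchedWrite to the
+// A9-specific one; e.g. after SchedAlias<WriteALUsr, A9WriteALUsr> below,
+// every instruction defined with WriteALUsr gets A9WriteALUsr's ALU
+// resource and 3-cycle latency.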
+def : SchedAlias<WriteALU, A9WriteALU>;
+def : SchedAlias<WriteALUsr, A9WriteALUsr>;
+def : SchedAlias<WriteALUSsr, A9WriteALUsr>;
+def : SchedAlias<ReadALU, A9ReadALU>;
+def : SchedAlias<ReadALUsr, A9ReadALU>;
+def : InstRW< [WriteALU],
+      (instregex "ANDri", "ORRri", "EORri", "BICri", "ANDrr", "ORRrr", "EORrr",
+       "BICrr")>;
+def : InstRW< [WriteALUsi], (instregex "ANDrsi", "ORRrsi", "EORrsi", "BICrsi")>;
+def : InstRW< [WriteALUsr], (instregex "ANDrsr", "ORRrsr", "EORrsr", "BICrsr")>;
+
+
+def : SchedAlias<WriteCMP, A9WriteALU>;
+def : SchedAlias<WriteCMPsi, A9WriteALU>;
+def : SchedAlias<WriteCMPsr, A9WriteALU>;
+
+def : InstRW< [A9WriteIsr], (instregex "MOVsr", "MOVsi", "MVNsr", "MOVCCsi",
+                                       "MOVCCsr")>;
+def : InstRW< [WriteALU, A9ReadALU], (instregex "MVNr")>;
+def : InstRW< [A9WriteI2], (instregex "MOVCCi32imm", "MOVi32imm",
+                                      "MOV_ga_dyn")>;
+def : InstRW< [A9WriteI2pc], (instregex "MOV_ga_pcrel")>;
+def : InstRW< [A9WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
+
+def : InstRW< [WriteALU], (instregex "SEL")>;
+
+def : InstRW< [WriteALUsi], (instregex "BFC", "BFI", "UBFX", "SBFX")>;
+
+def : InstRW< [A9WriteM],
+      (instregex "MUL", "MULv5", "SMMUL", "SMMULR", "MLA", "MLAv5", "MLS",
+       "SMMLA", "SMMLAR", "SMMLS", "SMMLSR")>;
+def : InstRW< [A9WriteM, A9WriteMHi],
+      (instregex "SMULL", "SMULLv5", "UMULL", "UMULLv5", "SMLAL$", "UMLAL",
+       "UMAAL", "SMLALv5", "UMLALv5", "UMAALv5", "SMLALBB", "SMLALBT", "SMLALTB",
+       "SMLALTT")>;
+// FIXME: These instructions used to have NoItinerary. Just copied the one from above.
+def : InstRW< [A9WriteM, A9WriteMHi],
+      (instregex "SMLAD", "SMLADX", "SMLALD", "SMLALDX", "SMLSD", "SMLSDX",
+       "SMLSLD", "SMLSLDX", "SMUAD", "SMUADX", "SMUSD", "SMUSDX")>;
+
+def : InstRW<[A9WriteM16, A9WriteM16Hi],
+      (instregex "SMULBB", "SMULBT", "SMULTB", "SMULTT", "SMULWB", "SMULWT")>;
+def : InstRW<[A9WriteM16, A9WriteM16Hi],
+      (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLAWB", "SMLAWT")>;
+
+def : InstRW<[A9WriteL], (instregex "LDRi12", "PICLDR$")>;
+def : InstRW<[A9WriteLsi], (instregex "LDRrs")>;
+def : InstRW<[A9WriteLb],
+      (instregex "LDRBi12", "PICLDRH", "PICLDRB", "PICLDRSH", "PICLDRSB",
+       "LDRH", "LDRSH", "LDRSB")>;
+def : InstRW<[A9WriteLbsi], (instregex "LDRBrs")>;
+
+def : WriteRes<WriteDiv, []> { let Latency = 0; }
+
+def : WriteRes<WriteBr, [A9UnitB]>;
+def : WriteRes<WriteBrL, [A9UnitB]>;
+def : WriteRes<WriteBrTbl, [A9UnitB]>;
+def : WriteRes<WritePreLd, []>;
+def : SchedAlias<WriteCvtFP, A9WriteF>;
+def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+} // SchedModel = CortexA9Model
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
new file mode 100644
index 0000000..3ad7730
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
@@ -0,0 +1,1046 @@
+//=- ARMScheduleSwift.td - Swift Scheduling Definitions -*- tablegen -*----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Swift processor.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
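+//
+// (The FuncUnit and itinerary definitions below serve only those legacy
+// schedulers; the SwiftModel machine model defined later in this file is
+// what MachineScheduler uses.)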
+ +def SW_DIS0 : FuncUnit; +def SW_DIS1 : FuncUnit; +def SW_DIS2 : FuncUnit; + +def SW_ALU0 : FuncUnit; +def SW_ALU1 : FuncUnit; +def SW_LS : FuncUnit; +def SW_IDIV : FuncUnit; +def SW_FDIV : FuncUnit; + +// FIXME: Need bypasses. +// FIXME: Model the multiple stages of IIC_iMOVix2, IIC_iMOVix2addpc, and +// IIC_iMOVix2ld better. +// FIXME: Model the special immediate shifts that are not microcoded. +// FIXME: Do we need to model the fact that uses of r15 in a micro-op force it +// to issue on pipe 1? +// FIXME: Model the pipelined behavior of CMP / TST instructions. +// FIXME: Better model the microcode stages of multiply instructions, especially +// conditional variants. +// FIXME: Add preload instruction when it is documented. +// FIXME: Model non-pipelined nature of FP div / sqrt unit. + +// Swift machine model for scheduling and other instruction cost heuristics. +def SwiftModel : SchedMachineModel { + let IssueWidth = 3; // 3 micro-ops are dispatched per cycle. + let MicroOpBufferSize = 45; // Based on NEON renamed registers. + let LoadLatency = 3; + let MispredictPenalty = 14; // A branch direction mispredict. + let CompleteModel = 0; // FIXME: Remove if all instructions are covered. +} + +// Swift predicates. +def IsFastImmShiftSwiftPred : SchedPredicate<[{TII->isSwiftFastImmShift(MI)}]>; + +// Swift resource mapping. +let SchedModel = SwiftModel in { + // Processor resources. + def SwiftUnitP01 : ProcResource<2>; // ALU unit. + def SwiftUnitP0 : ProcResource<1> { let Super = SwiftUnitP01; } // Mul unit. + def SwiftUnitP1 : ProcResource<1> { let Super = SwiftUnitP01; } // Br unit. + def SwiftUnitP2 : ProcResource<1>; // LS unit. + def SwiftUnitDiv : ProcResource<1>; + + // Generic resource requirements. + def SwiftWriteP0OneCycle : SchedWriteRes<[SwiftUnitP0]>; + def SwiftWriteP0TwoCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 2; } + def SwiftWriteP0FourCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 4; } + def SwiftWriteP0SixCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 6; } + def SwiftWriteP0P1FourCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP1]> { + let Latency = 4; + } + def SwiftWriteP0P1SixCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP1]> { + let Latency = 6; + } + def SwiftWriteP01OneCycle : SchedWriteRes<[SwiftUnitP01]>; + def SwiftWriteP1TwoCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 2; } + def SwiftWriteP1FourCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 4; } + def SwiftWriteP1SixCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 6; } + def SwiftWriteP1EightCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 8; } + def SwiftWriteP1TwelveCyc : SchedWriteRes<[SwiftUnitP1]> { let Latency = 12; } + def SwiftWriteP01OneCycle2x : WriteSequence<[SwiftWriteP01OneCycle], 2>; + def SwiftWriteP01OneCycle3x : WriteSequence<[SwiftWriteP01OneCycle], 3>; + def SwiftWriteP01TwoCycle : SchedWriteRes<[SwiftUnitP01]> { let Latency = 2; } + def SwiftWriteP01ThreeCycleTwoUops : SchedWriteRes<[SwiftUnitP01, + SwiftUnitP01]> { + let Latency = 3; + let NumMicroOps = 2; + } + def SwiftWriteP0ThreeCycleThreeUops : SchedWriteRes<[SwiftUnitP0]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; + } + // Plain load without writeback. + def SwiftWriteP2ThreeCycle : SchedWriteRes<[SwiftUnitP2]> { + let Latency = 3; + } + def SwiftWriteP2FourCycle : SchedWriteRes<[SwiftUnitP2]> { + let Latency = 4; + } + // A store does not write to a register. 
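+  // SwiftWriteP2 therefore has zero latency; it exists only to occupy
+  // the P2 load/store unit for one cycle.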
+ def SwiftWriteP2 : SchedWriteRes<[SwiftUnitP2]> { + let Latency = 0; + } + foreach Num = 1-4 in { + def SwiftWrite#Num#xP2 : WriteSequence<[SwiftWriteP2], Num>; + } + def SwiftWriteP01OneCycle2x_load : WriteSequence<[SwiftWriteP01OneCycle, + SwiftWriteP01OneCycle, + SwiftWriteP2ThreeCycle]>; + // 4.2.4 Arithmetic and Logical. + // ALU operation register shifted by immediate variant. + def SwiftWriteALUsi : SchedWriteVariant<[ + // lsl #2, lsl #1, or lsr #1. + SchedVar<IsFastImmShiftSwiftPred, [SwiftWriteP01TwoCycle]>, + SchedVar<NoSchedPred, [WriteALU]> + ]>; + def SwiftWriteALUsr : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [SwiftWriteP01ThreeCycleTwoUops]>, + SchedVar<NoSchedPred, [SwiftWriteP01TwoCycle]> + ]>; + def SwiftWriteALUSsr : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [SwiftWriteP0ThreeCycleThreeUops]>, + SchedVar<NoSchedPred, [SwiftWriteP01TwoCycle]> + ]>; + def SwiftReadAdvanceALUsr : SchedReadVariant<[ + SchedVar<IsPredicatedPred, [SchedReadAdvance<2>]>, + SchedVar<NoSchedPred, [NoReadAdvance]> + ]>; + // ADC,ADD,NEG,RSB,RSC,SBC,SUB,ADR + // AND,BIC,EOR,ORN,ORR + // CLZ,RBIT,REV,REV16,REVSH,PKH + def : WriteRes<WriteALU, [SwiftUnitP01]>; + def : SchedAlias<WriteALUsi, SwiftWriteALUsi>; + def : SchedAlias<WriteALUsr, SwiftWriteALUsr>; + def : SchedAlias<WriteALUSsr, SwiftWriteALUSsr>; + def : ReadAdvance<ReadALU, 0>; + def : SchedAlias<ReadALUsr, SwiftReadAdvanceALUsr>; + + + def SwiftChooseShiftKindP01OneOrTwoCycle : SchedWriteVariant<[ + SchedVar<IsFastImmShiftSwiftPred, [SwiftWriteP01OneCycle]>, + SchedVar<NoSchedPred, [SwiftWriteP01TwoCycle]> + ]>; + + // 4.2.5 Integer comparison + def : WriteRes<WriteCMP, [SwiftUnitP01]>; + def : SchedAlias<WriteCMPsi, SwiftChooseShiftKindP01OneOrTwoCycle>; + def : SchedAlias<WriteCMPsr, SwiftWriteP01TwoCycle>; + + // 4.2.6 Shift, Move + // Shift + // ASR,LSL,ROR,RRX + // MOV(register-shiftedregister) MVN(register-shiftedregister) + // Move + // MOV,MVN + // MOVT + // Sign/Zero extension + def : InstRW<[SwiftWriteP01OneCycle], + (instregex "SXTB", "SXTH", "SXTB16", "UXTB", "UXTH", "UXTB16", + "t2SXTB", "t2SXTH", "t2SXTB16", "t2UXTB", "t2UXTH", + "t2UXTB16")>; + // Pseudo instructions. 
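+  // (For example, MOVi32imm is expanded to a MOVW/MOVT pair, which is
+  // why it is modeled below as two one-cycle P01 micro-ops.)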
+ def : InstRW<[SwiftWriteP01OneCycle2x], + (instregex "MOVCCi32imm", "MOVi32imm", "MOV_ga_dyn", "t2MOVCCi32imm", + "t2MOVi32imm", "t2MOV_ga_dyn")>; + def : InstRW<[SwiftWriteP01OneCycle3x], + (instregex "MOV_ga_pcrel", "t2MOV_ga_pcrel", "t2MOVi16_ga_pcrel")>; + def : InstRW<[SwiftWriteP01OneCycle2x_load], + (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>; + + def SwiftWriteP0TwoCyleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>; + + def SwiftPredP0OneOrTwoCycle : SchedWriteVariant<[ + SchedVar<IsPredicatedPred, [ SwiftWriteP0TwoCyleTwoUops ]>, + SchedVar<NoSchedPred, [ SwiftWriteP0OneCycle ]> + ]>; + + // 4.2.7 Select + // SEL + def : InstRW<[SwiftPredP0OneOrTwoCycle], (instregex "SEL", "t2SEL")>; + + // 4.2.8 Bitfield + // BFI,BFC, SBFX,UBFX + def : InstRW< [SwiftWriteP01TwoCycle], + (instregex "BFC", "BFI", "UBFX", "SBFX", "(t|t2)BFC", "(t|t2)BFI", + "(t|t2)UBFX", "(t|t2)SBFX")>; + + // 4.2.9 Saturating arithmetic + def : InstRW< [SwiftWriteP01TwoCycle], + (instregex "QADD", "QSUB", "QDADD", "QDSUB", "SSAT", "SSAT16", "USAT", + "USAT16", "QADD8", "QADD16", "QSUB8", "QSUB16", "QASX", "QSAX", + "UQADD8", "UQADD16","UQSUB8","UQSUB16","UQASX","UQSAX", "t2QADD", + "t2QSUB", "t2QDADD", "t2QDSUB", "t2SSAT", "t2SSAT16", "t2USAT", + "t2QADD8", "t2QADD16", "t2QSUB8", "t2QSUB16", "t2QASX", "t2QSAX", + "t2UQADD8", "t2UQADD16","t2UQSUB8","t2UQSUB16","t2UQASX","t2UQSAX")>; + + // 4.2.10 Parallel Arithmetic + // Not flag setting. + def : InstRW< [SwiftWriteALUsr], + (instregex "SADD8", "SADD16", "SSUB8", "SSUB16", "SASX", "SSAX", + "UADD8", "UADD16", "USUB8", "USUB16", "UASX", "USAX", "t2SADD8", + "t2SADD16", "t2SSUB8", "t2SSUB16", "t2SASX", "t2SSAX", "t2UADD8", + "t2UADD16", "t2USUB8", "t2USUB16", "t2UASX", "t2USAX")>; + // Flag setting. + def : InstRW< [SwiftWriteP01TwoCycle], + (instregex "SHADD8", "SHADD16", "SHSUB8", "SHSUB16", "SHASX", "SHSAX", + "SXTAB", "SXTAB16", "SXTAH", "UHADD8", "UHADD16", "UHSUB8", "UHSUB16", + "UHASX", "UHSAX", "UXTAB", "UXTAB16", "UXTAH", "t2SHADD8", "t2SHADD16", + "t2SHSUB8", "t2SHSUB16", "t2SHASX", "t2SHSAX", "t2SXTAB", "t2SXTAB16", + "t2SXTAH", "t2UHADD8", "t2UHADD16", "t2UHSUB8", "t2UHSUB16", "t2UHASX", + "t2UHSAX", "t2UXTAB", "t2UXTAB16", "t2UXTAH")>; + + // 4.2.11 Sum of Absolute Difference + def : InstRW< [SwiftWriteP0P1FourCycle], (instregex "USAD8") >; + def : InstRW<[SwiftWriteP0P1FourCycle, ReadALU, ReadALU, SchedReadAdvance<2>], + (instregex "USADA8")>; + + // 4.2.12 Integer Multiply (32-bit result) + // Two sources. 
+  def : InstRW< [SwiftWriteP0FourCycle],
+      (instregex "MULS", "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
+       "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDXi", "t2MUL",
+       "t2SMMUL", "t2SMMULR", "t2SMULBB", "t2SMULBT", "t2SMULTB", "t2SMULTT",
+       "t2SMULWB", "t2SMULWT", "t2SMUSD")>;
+
+  def SwiftWriteP0P01FiveCycleTwoUops :
+    SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+    let Latency = 5;
+  }
+
+  def SwiftPredP0P01FourFiveCycle : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [ SwiftWriteP0P01FiveCycleTwoUops ]>,
+    SchedVar<NoSchedPred, [ SwiftWriteP0FourCycle ]>
+  ]>;
+
+  def SwiftReadAdvanceFourCyclesPred : SchedReadVariant<[
+    SchedVar<IsPredicatedPred, [SchedReadAdvance<4>]>,
+    SchedVar<NoSchedPred, [ReadALU]>
+  ]>;
+
+  // Multiply accumulate, three sources
+  def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
+                 SwiftReadAdvanceFourCyclesPred],
+      (instregex "MLAS", "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
+       "t2MLA", "t2MLS", "t2MLAS", "t2SMMLA", "t2SMMLAR", "t2SMMLS",
+       "t2SMMLSR")>;
+
+  // 4.2.13 Integer Multiply (32-bit result, Q flag)
+  def : InstRW< [SwiftWriteP0FourCycle],
+      (instregex "SMUAD", "SMUADX", "t2SMUAD", "t2SMUADX")>;
+  def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
+                 SwiftReadAdvanceFourCyclesPred],
+      (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLSD", "SMLSDX",
+       "SMLAWB", "SMLAWT", "t2SMLABB", "t2SMLABT", "t2SMLATB", "t2SMLATT",
+       "t2SMLSD", "t2SMLSDX", "t2SMLAWB", "t2SMLAWT")>;
+  def : InstRW< [SwiftPredP0P01FourFiveCycle],
+      (instregex "SMLAD", "SMLADX", "t2SMLAD", "t2SMLADX")>;
+
+  def SwiftP0P0P01FiveCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+    let Latency = 5;
+    let NumMicroOps = 3;
+    let ResourceCycles = [2, 1];
+  }
+  def SwiftWrite1Cycle : SchedWriteRes<[]> {
+    let Latency = 1;
+    let NumMicroOps = 0;
+  }
+  def SwiftWrite5Cycle : SchedWriteRes<[]> {
+    let Latency = 5;
+    let NumMicroOps = 0;
+  }
+  def SwiftWrite6Cycle : SchedWriteRes<[]> {
+    let Latency = 6;
+    let NumMicroOps = 0;
+  }
+
+  // 4.2.14 Integer Multiply, Long
+  def : InstRW< [SwiftP0P0P01FiveCycle, SwiftWrite5Cycle],
+      (instregex "SMULL$", "UMULL$", "t2SMULL$", "t2UMULL$")>;
+
+  def Swift2P03P01FiveCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+    let Latency = 7;
+    let NumMicroOps = 5;
+    let ResourceCycles = [2, 3];
+  }
+
+  // 4.2.15 Integer Multiply Accumulate, Long
+  // 4.2.16 Integer Multiply Accumulate, Dual
+  // 4.2.17 Integer Multiply Accumulate Accumulate, Long
+  // We are being a bit inaccurate here.
+  def : InstRW< [SwiftWrite5Cycle, Swift2P03P01FiveCycle, ReadALU, ReadALU,
+                 SchedReadAdvance<4>, SchedReadAdvance<3>],
+      (instregex "SMLALS", "UMLALS", "SMLAL", "UMLAL", "SMLALBB", "SMLALBT",
+       "SMLALTB", "SMLALTT", "SMLALD", "SMLALDX", "SMLSLD", "SMLSLDX",
+       "UMAAL", "t2SMLALS", "t2UMLALS", "t2SMLAL", "t2UMLAL", "t2SMLALBB", "t2SMLALBT",
+       "t2SMLALTB", "t2SMLALTT", "t2SMLALD", "t2SMLALDX", "t2SMLSLD", "t2SMLSLDX",
+       "t2UMAAL")>;
+
+  def SwiftDiv : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+    let NumMicroOps = 1;
+    let Latency = 14;
+    let ResourceCycles = [1, 14];
+  }
+  // 4.2.18 Integer Divide
+  def : WriteRes<WriteDiv, [SwiftUnitDiv]>; // Workaround.
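+  // SwiftDiv issues as a single micro-op on P0 but occupies the
+  // non-pipelined divider for 14 cycles (ResourceCycles), so
+  // back-to-back divides serialize on SwiftUnitDiv.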
+ def : InstRW <[SwiftDiv], + (instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>; + + // 4.2.19 Integer Load Single Element + // 4.2.20 Integer Load Signextended + def SwiftWriteP2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> { + let Latency = 3; + let NumMicroOps = 2; + } + def SwiftWriteP2P01FourCyle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> { + let Latency = 4; + let NumMicroOps = 2; + } + def SwiftWriteP2P01P01FourCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01, + SwiftUnitP01]> { + let Latency = 4; + let NumMicroOps = 3; + } + def SwiftWriteP2P2ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2]> { + let Latency = 3; + let NumMicroOps = 2; + } + def SwiftWriteP2P2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2, + SwiftUnitP01]> { + let Latency = 3; + let NumMicroOps = 3; + } + def SwiftWrBackOne : SchedWriteRes<[]> { + let Latency = 1; + let NumMicroOps = 0; + } + def SwiftWriteLdFour : SchedWriteRes<[]> { + let Latency = 4; + let NumMicroOps = 0; + } + // Not accurate. + def : InstRW<[SwiftWriteP2ThreeCycle], + (instregex "LDR(i12|rs)$", "LDRB(i12|rs)$", "t2LDR(i8|i12|s|pci)", + "t2LDR(H|B)(i8|i12|s|pci)", "LDREX", "tLDR[BH](r|i|spi|pci|pciASM)", + "tLDR(r|i|spi|pci|pciASM)")>; + def : InstRW<[SwiftWriteP2ThreeCycle], + (instregex "LDRH$", "PICLDR$", "PICLDR(H|B)$", "LDRcp$")>; + def : InstRW<[SwiftWriteP2P01FourCyle], + (instregex "PICLDRS(H|B)$", "t2LDRS(H|B)(i|r|p|s)", "LDRS(H|B)$", + "t2LDRpci_pic", "tLDRS(B|H)")>; + def : InstRW<[SwiftWriteP2P01ThreeCycle, SwiftWrBackOne], + (instregex "LD(RB|R)(_|T_)(POST|PRE)_(IMM|REG)", "LDRH(_PRE|_POST)", + "LDR(T|BT)_POST_(REG|IMM)", "LDRHT(i|r)", + "t2LD(R|RB|RH)_(PRE|POST)", "t2LD(R|RB|RH)T")>; + def : InstRW<[SwiftWriteP2P01P01FourCycle, SwiftWrBackOne], + (instregex "LDR(SH|SB)(_POST|_PRE)", "t2LDR(SH|SB)(_POST|_PRE)", + "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)", "t2LDRS(B|H)T")>; + + // 4.2.21 Integer Dual Load + // Not accurate. + def : InstRW<[SwiftWriteP2P2ThreeCycle, SwiftWriteLdFour], + (instregex "t2LDRDi8", "LDRD$")>; + def : InstRW<[SwiftWriteP2P2P01ThreeCycle, SwiftWriteLdFour, SwiftWrBackOne], + (instregex "LDRD_(POST|PRE)", "t2LDRD_(POST|PRE)")>; + + // 4.2.22 Integer Load, Multiple + // NumReg = 1 .. 16 + foreach Lat = 3-25 in { + def SwiftWriteLM#Lat#Cy : SchedWriteRes<[SwiftUnitP2]> { + let Latency = Lat; + } + def SwiftWriteLM#Lat#CyNo : SchedWriteRes<[]> { + let Latency = Lat; + let NumMicroOps = 0; + } + } + // Predicate. 
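+  // One predicate per possible register-list length;
+  // getNumLDMAddresses(MI) roughly corresponds to the number of 32-bit
+  // values transferred (for an integer LDM, the number of registers).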
+  foreach NumAddr = 1-16 in {
+    def SwiftLMAddr#NumAddr#Pred : SchedPredicate<"TII->getNumLDMAddresses(MI) == "#NumAddr>;
+  }
+  def SwiftWriteLDMAddrNoWB : SchedWriteRes<[SwiftUnitP01]> { let Latency = 0; }
+  def SwiftWriteLDMAddrWB : SchedWriteRes<[SwiftUnitP01, SwiftUnitP01]>;
+  def SwiftWriteLM : SchedWriteVariant<[
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy]>,
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy]>,
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy]>,
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy]>,
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy]>,
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy]>,
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy]>,
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy]>,
+    SchedVar<SwiftLMAddr10Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy]>,
+    SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy]>,
+    SchedVar<SwiftLMAddr12Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy]>,
+    SchedVar<SwiftLMAddr13Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy]>,
+    SchedVar<SwiftLMAddr14Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM16Cy]>,
+    SchedVar<SwiftLMAddr15Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM16Cy,
+                                SwiftWriteLM17Cy]>,
+    SchedVar<SwiftLMAddr16Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM16Cy,
+                                SwiftWriteLM17Cy, SwiftWriteLM18Cy]>,
+    // Unknown number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                           SwiftWriteLM5CyNo, SwiftWriteLM6CyNo,
+                           SwiftWriteLM7CyNo, SwiftWriteLM8CyNo,
+                           SwiftWriteLM9CyNo, SwiftWriteLM10CyNo,
+                           SwiftWriteLM11CyNo, SwiftWriteLM12CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM14CyNo,
+                           SwiftWriteLM15CyNo, SwiftWriteLM16CyNo,
+                           SwiftWriteLM17CyNo, SwiftWriteLM18CyNo]>
+
+  ]> { let Variadic=1; }
+
+  def : InstRW<[SwiftWriteLM, SwiftWriteLDMAddrNoWB],
+      (instregex "LDM(IA|DA|DB|IB)$", "t2LDM(IA|DA|DB|IB)$",
+       "(t|sys)LDM(IA|DA|DB|IB)$")>;
+  def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM],
+      (instregex /*"t2LDMIA_RET", "tLDMIA_RET", "LDMIA_RET",*/
+       "LDM(IA|DA|DB|IB)_UPD", "(t2|sys|t)LDM(IA|DA|DB|IB)_UPD")>;
+  def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM, SwiftWriteP1TwoCycle],
+      (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "POP", "tPOP")>;
+  // 4.2.23 Integer Store, Single Element
+  def : InstRW<[SwiftWriteP2],
+      (instregex "PICSTR", "STR(i12|rs)", "STRB(i12|rs)", "STRH$", "STREX",
+       "t2STR(i12|i8|s)$", "t2STR[BH](i12|i8|s)$", "tSTR[BH](i|r)", "tSTR(i|r)", "tSTRspi")>;
+
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteP2],
+      (instregex "STR(B_|_|BT_|T_)(PRE_IMM|PRE_REG|POST_REG|POST_IMM)",
+       "STR(i|r)_preidx", "STRB(i|r)_preidx", "STRH_preidx", "STR(H_|HT_)(PRE|POST)",
+       "STR(BT|HT|T)", "t2STR_(PRE|POST)", "t2STR[BH]_(PRE|POST)",
+       "t2STR_preidx", "t2STR[BH]_preidx", "t2ST(RB|RH|R)T")>;
+
+  // 4.2.24 Integer Store, Dual
+  def : InstRW<[SwiftWriteP2, SwiftWriteP2, SwiftWriteP01OneCycle],
+      (instregex "STRD$", "t2STRDi8")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteP2, SwiftWriteP2,
+                SwiftWriteP01OneCycle],
+      (instregex "(t2|t)STRD_(POST|PRE)", "STRD_(POST|PRE)")>;
+
+  // 4.2.25 Integer Store, Multiple
+  def SwiftWriteStIncAddr : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+    let Latency = 0;
+    let NumMicroOps = 2;
+  }
+  foreach NumAddr = 1-16 in {
+    def SwiftWriteSTM#NumAddr : WriteSequence<[SwiftWriteStIncAddr], NumAddr>;
+  }
+  def SwiftWriteSTM : SchedWriteVariant<[
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteSTM2]>,
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteSTM3]>,
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteSTM4]>,
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteSTM5]>,
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteSTM6]>,
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteSTM7]>,
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteSTM8]>,
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteSTM9]>,
+    SchedVar<SwiftLMAddr10Pred,[SwiftWriteSTM10]>,
+    SchedVar<SwiftLMAddr11Pred,[SwiftWriteSTM11]>,
+    SchedVar<SwiftLMAddr12Pred,[SwiftWriteSTM12]>,
+    SchedVar<SwiftLMAddr13Pred,[SwiftWriteSTM13]>,
+    SchedVar<SwiftLMAddr14Pred,[SwiftWriteSTM14]>,
+    SchedVar<SwiftLMAddr15Pred,[SwiftWriteSTM15]>,
+    SchedVar<SwiftLMAddr16Pred,[SwiftWriteSTM16]>,
+    // Unknown number of registers, just use resources for two registers.
+ SchedVar<NoSchedPred, [SwiftWriteSTM2]> + ]>; + def : InstRW<[SwiftWriteSTM], + (instregex "STM(IB|IA|DB|DA)$", "(t2|sys|t)STM(IB|IA|DB|DA)$")>; + def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteSTM], + (instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD", + "PUSH", "tPUSH")>; + + // LDRLIT pseudo instructions, they expand to LDR + PICADD + def : InstRW<[SwiftWriteP2ThreeCycle, WriteALU], + (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel")>; + // LDRLIT_ga_pcrel_ldr expands to LDR + PICLDR + def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2ThreeCycle], + (instregex "LDRLIT_ga_pcrel_ldr")>; + + // 4.2.26 Branch + def : WriteRes<WriteBr, [SwiftUnitP1]> { let Latency = 0; } + def : WriteRes<WriteBrL, [SwiftUnitP1]> { let Latency = 2; } + def : WriteRes<WriteBrTbl, [SwiftUnitP1, SwiftUnitP2]> { let Latency = 0; } + + // 4.2.27 Not issued + def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; } + def : InstRW<[WriteNoop], (instregex "t2IT", "IT", "NOP")>; + + // 4.2.28 Advanced SIMD, Integer, 2 cycle + def : InstRW<[SwiftWriteP0TwoCycle], + (instregex "VADDv", "VSUBv", "VNEG(s|f|v)", "VADDL", "VSUBL", + "VADDW", "VSUBW", "VHADD", "VHSUB", "VRHADD", "VPADDi", + "VPADDL", "VAND", "VBIC", "VEOR", "VORN", "VORR", "VTST", + "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL", "VQSHLU", "VBIF", + "VBIT", "VBSL", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>; + + def : InstRW<[SwiftWriteP1TwoCycle], + (instregex "VEXT", "VREV16", "VREV32", "VREV64")>; + + // 4.2.29 Advanced SIMD, Integer, 4 cycle + // 4.2.30 Advanced SIMD, Integer with Accumulate + def : InstRW<[SwiftWriteP0FourCycle], + (instregex "VABA", "VABAL", "VPADAL", "VRSRA", "VSRA", "VACGE", "VACGT", + "VACLE", "VACLT", "VCEQ", "VCGE", "VCGT", "VCLE", "VCLT", "VRSHL", + "VQRSHL", "VRSHR(u|s)", "VABS(f|v)", "VQABS", "VQNEG", "VQADD", + "VQSUB")>; + def : InstRW<[SwiftWriteP1FourCycle], + (instregex "VRECPE", "VRSQRTE")>; + + // 4.2.31 Advanced SIMD, Add and Shift with Narrow + def : InstRW<[SwiftWriteP0P1FourCycle], + (instregex "VADDHN", "VSUBHN", "VSHRN")>; + def : InstRW<[SwiftWriteP0P1SixCycle], + (instregex "VRADDHN", "VRSUBHN", "VRSHRN", "VQSHRN", "VQSHRUN", + "VQRSHRN", "VQRSHRUN")>; + + // 4.2.32 Advanced SIMD, Vector Table Lookup + foreach Num = 1-4 in { + def SwiftWrite#Num#xP1TwoCycle : WriteSequence<[SwiftWriteP1TwoCycle], Num>; + } + def : InstRW<[SwiftWrite1xP1TwoCycle], + (instregex "VTB(L|X)1")>; + def : InstRW<[SwiftWrite2xP1TwoCycle], + (instregex "VTB(L|X)2")>; + def : InstRW<[SwiftWrite3xP1TwoCycle], + (instregex "VTB(L|X)3")>; + def : InstRW<[SwiftWrite4xP1TwoCycle], + (instregex "VTB(L|X)4")>; + + // 4.2.33 Advanced SIMD, Transpose + def : InstRW<[SwiftWriteP1FourCycle, SwiftWriteP1FourCycle, + SwiftWriteP1TwoCycle/*RsrcOnly*/, SchedReadAdvance<2>], + (instregex "VSWP", "VTRN", "VUZP", "VZIP")>; + + // 4.2.34 Advanced SIMD and VFP, Floating Point + def : InstRW<[SwiftWriteP0TwoCycle], (instregex "VABS(S|D)$", "VNEG(S|D)$")>; + def : InstRW<[SwiftWriteP0FourCycle], + (instregex "VCMP(D|S|ZD|ZS)$", "VCMPE(D|S|ZD|ZS)")>; + def : InstRW<[SwiftWriteP0FourCycle], + (instregex "VADD(S|f)", "VSUB(S|f)", "VABD", "VPADDf", "VMAX", "VMIN", "VPMAX", + "VPMIN")>; + def : InstRW<[SwiftWriteP0SixCycle], (instregex "VADDD$", "VSUBD$")>; + def : InstRW<[SwiftWriteP1EightCycle], (instregex "VRECPS", "VRSQRTS")>; + + // 4.2.35 Advanced SIMD and VFP, Multiply + def : InstRW<[SwiftWriteP1FourCycle], + (instregex "VMUL(S|v|p|f|s)", "VNMULS", "VQDMULH", "VQRDMULH", + "VMULL", "VQDMULL")>; + def : InstRW<[SwiftWriteP1SixCycle], + 
(instregex "VMULD", "VNMULD")>; + def : InstRW<[SwiftWriteP1FourCycle], + (instregex "VMLA", "VMLS", "VNMLA", "VNMLS", "VFMA(S|D)", "VFMS(S|D)", + "VFNMA", "VFNMS", "VMLAL", "VMLSL","VQDMLAL", "VQDMLSL")>; + def : InstRW<[SwiftWriteP1EightCycle], (instregex "VFMAfd", "VFMSfd")>; + def : InstRW<[SwiftWriteP1TwelveCyc], (instregex "VFMAfq", "VFMSfq")>; + + // 4.2.36 Advanced SIMD and VFP, Convert + def : InstRW<[SwiftWriteP1FourCycle], (instregex "VCVT", "V(S|U)IT", "VTO(S|U)")>; + // Fixpoint conversions. + def : WriteRes<WriteCvtFP, [SwiftUnitP1]> { let Latency = 4; } + + // 4.2.37 Advanced SIMD and VFP, Move + def : InstRW<[SwiftWriteP0TwoCycle], + (instregex "VMOVv", "VMOV(S|D)$", "VMOV(S|D)cc", + "VMVNv", "VMVN(d|q)", "VMVN(S|D)cc", + "FCONST(D|S)")>; + def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VMOVN", "VMOVL")>; + def : InstRW<[WriteSequence<[SwiftWriteP0FourCycle, SwiftWriteP1TwoCycle]>], + (instregex "VQMOVN")>; + def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VDUPLN", "VDUPf")>; + def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>], + (instregex "VDUP(8|16|32)")>; + def : InstRW<[SwiftWriteP2ThreeCycle], (instregex "VMOVRS$")>; + def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP0TwoCycle]>], + (instregex "VMOVSR$", "VSETLN")>; + def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2FourCycle], + (instregex "VMOVRR(D|S)$")>; + def : InstRW<[SwiftWriteP2FourCycle], (instregex "VMOVDRR$")>; + def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>, + WriteSequence<[SwiftWrite1Cycle, SwiftWriteP2FourCycle, + SwiftWriteP1TwoCycle]>], + (instregex "VMOVSRR$")>; + def : InstRW<[WriteSequence<[SwiftWriteP1TwoCycle, SwiftWriteP2ThreeCycle]>], + (instregex "VGETLN(u|i)")>; + def : InstRW<[WriteSequence<[SwiftWriteP1TwoCycle, SwiftWriteP2ThreeCycle, + SwiftWriteP01OneCycle]>], + (instregex "VGETLNs")>; + + // 4.2.38 Advanced SIMD and VFP, Move FPSCR + // Serializing instructions. + def SwiftWaitP0For15Cy : SchedWriteRes<[SwiftUnitP0]> { + let Latency = 15; + let ResourceCycles = [15]; + } + def SwiftWaitP1For15Cy : SchedWriteRes<[SwiftUnitP1]> { + let Latency = 15; + let ResourceCycles = [15]; + } + def SwiftWaitP2For15Cy : SchedWriteRes<[SwiftUnitP2]> { + let Latency = 15; + let ResourceCycles = [15]; + } + def : InstRW<[SwiftWaitP0For15Cy, SwiftWaitP1For15Cy, SwiftWaitP2For15Cy], + (instregex "VMRS")>; + def : InstRW<[SwiftWaitP0For15Cy, SwiftWaitP1For15Cy, SwiftWaitP2For15Cy], + (instregex "VMSR")>; + // Not serializing. + def : InstRW<[SwiftWriteP0TwoCycle], (instregex "FMSTAT")>; + + // 4.2.39 Advanced SIMD and VFP, Load Single Element + def : InstRW<[SwiftWriteLM4Cy], (instregex "VLDRD$", "VLDRS$")>; + + // 4.2.40 Advanced SIMD and VFP, Store Single Element + def : InstRW<[SwiftWriteLM4Cy], (instregex "VSTRD$", "VSTRS$")>; + + // 4.2.41 Advanced SIMD and VFP, Load Multiple + // 4.2.42 Advanced SIMD and VFP, Store Multiple + + // Resource requirement for permuting, just reserves the resources. + foreach Num = 1-28 in { + def SwiftVLDMPerm#Num : SchedWriteRes<[SwiftUnitP1]> { + let Latency = 0; + let NumMicroOps = Num; + let ResourceCycles = [Num]; + } + } + + // Pre RA pseudos - load/store to a Q register as a D register pair. + def : InstRW<[SwiftWriteLM4Cy], (instregex "VLDMQIA$", "VSTMQIA$")>; + + // Post RA not modelled accurately. We assume that register use of width 64 + // bit maps to a D register, 128 maps to a Q register. Not all different kinds + // are accurately represented. 
+  def SwiftWriteVLDM : SchedWriteVariant<[
+    // Load of one S register.
+    SchedVar<SwiftLMAddr1Pred, [SwiftWriteLM4Cy]>,
+    // Load of one D register.
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo]>,
+    // Load of 3 S registers.
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm3]>,
+    // Load of a Q register (not necessarily true). We should not be mapping to
+    // 4 S registers, either.
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo,
+                                SwiftWriteLM4CyNo, SwiftWriteLM4CyNo]>,
+    // Load of 5 S registers.
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm5]>,
+    // Load of 3 D registers. (Must also be able to handle an S-register
+    // list, though this is not accurate.)
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM10Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
+    // Load of 7 S registers.
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm7]>,
+    // Load of two Q registers.
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm2]>,
+    // Load of 9 S registers.
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of 5 D registers.
+    SchedVar<SwiftLMAddr10Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM10Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
+    // Inaccurate: reuse description from 9 S registers.
+    SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of three Q registers.
+    SchedVar<SwiftLMAddr12Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm3]>,
+    // Inaccurate: reuse description from 9 S registers.
+    SchedVar<SwiftLMAddr13Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                 SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                 SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                 SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                 SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                 SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                 SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                 SwiftVLDMPerm9]>,
+    // Load of 7 D registers (inaccurate).
+    SchedVar<SwiftLMAddr14Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM10Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM14Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm7]>,
+    SchedVar<SwiftLMAddr15Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM17Cy, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of 4 Q registers.
+    SchedVar<SwiftLMAddr16Pred,[SwiftWriteLM7Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM18CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm4]>,
+    // Unknown number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                           SwiftWriteLM13Cy, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                           SwiftWriteP01OneCycle, SwiftVLDMPerm2]>
+  ]> { let Variadic = 1; }
+
+  def : InstRW<[SwiftWriteVLDM], (instregex "VLDM[SD](IA|DB)$")>;
+
+  def : InstRW<[SwiftWriteP01OneCycle2x, SwiftWriteVLDM],
+      (instregex "VLDM[SD](IA|DB)_UPD$")>;
+
+  def SwiftWriteVSTM : SchedWriteVariant<[
+    // One S register.
+    SchedVar<SwiftLMAddr1Pred, [SwiftWriteSTM1]>,
+    // One D register.
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteSTM1]>,
+    // Three S registers.
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteSTM4]>,
+    // Assume one Q register.
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteSTM1]>,
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteSTM6]>,
+    // Assume three D registers.
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteSTM4]>,
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteSTM8]>,
+    // Assume two Q registers.
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteSTM3]>,
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteSTM10]>,
+    // Assume 5 D registers.
+    SchedVar<SwiftLMAddr10Pred, [SwiftWriteSTM6]>,
+    SchedVar<SwiftLMAddr11Pred, [SwiftWriteSTM12]>,
+    // Assume three Q registers.
+    SchedVar<SwiftLMAddr12Pred, [SwiftWriteSTM4]>,
+    SchedVar<SwiftLMAddr13Pred, [SwiftWriteSTM14]>,
+    // Assume 7 D registers.
+    SchedVar<SwiftLMAddr14Pred, [SwiftWriteSTM8]>,
+    SchedVar<SwiftLMAddr15Pred, [SwiftWriteSTM16]>,
+    // Assume four Q registers.
+    SchedVar<SwiftLMAddr16Pred, [SwiftWriteSTM5]>,
+    // Assume two Q registers.
+ SchedVar<NoSchedPred, [SwiftWriteSTM3]> + ]> { let Variadic = 1; } + + def : InstRW<[SwiftWriteVSTM], (instregex "VSTM[SD](IA|DB)$")>; + + def : InstRW<[SwiftWriteP01OneCycle2x, SwiftWriteVSTM], + (instregex "VSTM[SD](IA|DB)_UPD")>; + + // 4.2.43 Advanced SIMD, Element or Structure Load and Store + def SwiftWrite2xP2FourCy : SchedWriteRes<[SwiftUnitP2]> { + let Latency = 4; + let ResourceCycles = [2]; + } + def SwiftWrite3xP2FourCy : SchedWriteRes<[SwiftUnitP2]> { + let Latency = 4; + let ResourceCycles = [3]; + } + foreach Num = 1-2 in { + def SwiftExt#Num#xP0 : SchedWriteRes<[SwiftUnitP0]> { + let Latency = 0; + let NumMicroOps = Num; + let ResourceCycles = [Num]; + } + } + // VLDx + // Multiple structures. + // Single element structure loads. + // We assume aligned. + // Single/two register. + def : InstRW<[SwiftWriteLM4Cy], (instregex "VLD1(d|q)(8|16|32|64)$")>; + def : InstRW<[SwiftWriteLM4Cy, SwiftWriteP01OneCycle], + (instregex "VLD1(d|q)(8|16|32|64)wb")>; + // Three register. + def : InstRW<[SwiftWrite3xP2FourCy], + (instregex "VLD1(d|q)(8|16|32|64)T$", "VLD1d64TPseudo")>; + def : InstRW<[SwiftWrite3xP2FourCy, SwiftWriteP01OneCycle], + (instregex "VLD1(d|q)(8|16|32|64)Twb")>; + /// Four Register. + def : InstRW<[SwiftWrite2xP2FourCy], + (instregex "VLD1(d|q)(8|16|32|64)Q$", "VLD1d64QPseudo")>; + def : InstRW<[SwiftWrite2xP2FourCy, SwiftWriteP01OneCycle], + (instregex "VLD1(d|q)(8|16|32|64)Qwb")>; + // Two element structure loads. + // Two/four register. + def : InstRW<[SwiftWriteLM9Cy, SwiftExt2xP0, SwiftVLDMPerm2], + (instregex "VLD2(d|q|b)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>; + def : InstRW<[SwiftWriteLM9Cy, SwiftWriteP01OneCycle, SwiftExt2xP0, + SwiftVLDMPerm2], + (instregex "VLD2(d|q|b)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>; + // Three element structure. + def : InstRW<[SwiftWriteLM9Cy, SwiftWriteLM9CyNo, SwiftWriteLM9CyNo, + SwiftVLDMPerm3, SwiftWrite3xP2FourCy], + (instregex "VLD3(d|q)(8|16|32)$")>; + def : InstRW<[SwiftWriteLM9Cy, SwiftVLDMPerm3, SwiftWrite3xP2FourCy], + (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>; + + def : InstRW<[SwiftWriteLM9Cy, SwiftWriteLM9CyNo, SwiftWriteLM9CyNo, + SwiftWriteP01OneCycle, SwiftVLDMPerm3, SwiftWrite3xP2FourCy], + (instregex "VLD3(d|q)(8|16|32)_UPD$")>; + def : InstRW<[SwiftWriteLM9Cy, SwiftWriteP01OneCycle, SwiftVLDMPerm3, + SwiftWrite3xP2FourCy], + (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>; + // Four element structure loads. + def : InstRW<[SwiftWriteLM11Cy, SwiftWriteLM11Cy, SwiftWriteLM11Cy, + SwiftWriteLM11Cy, SwiftExt2xP0, SwiftVLDMPerm4, + SwiftWrite3xP2FourCy], + (instregex "VLD4(d|q)(8|16|32)$")>; + def : InstRW<[SwiftWriteLM11Cy, SwiftExt2xP0, SwiftVLDMPerm4, + SwiftWrite3xP2FourCy], + (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>; + def : InstRW<[SwiftWriteLM11Cy, SwiftWriteLM11Cy, SwiftWriteLM11Cy, + SwiftWriteLM11Cy, SwiftWriteP01OneCycle, SwiftExt2xP0, + SwiftVLDMPerm4, SwiftWrite3xP2FourCy], + (instregex "VLD4(d|q)(8|16|32)_UPD")>; + def : InstRW<[SwiftWriteLM11Cy, SwiftWriteP01OneCycle, SwiftExt2xP0, + SwiftVLDMPerm4, SwiftWrite3xP2FourCy], + (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>; + + // Single all/lane loads. + // One element structure. + def : InstRW<[SwiftWriteLM6Cy, SwiftVLDMPerm2], + (instregex "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>; + def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftVLDMPerm2], + (instregex "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)", + "VLD1LNq(8|16|32)Pseudo_UPD")>; + // Two element structure. 
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteLM6Cy, SwiftExt1xP0, SwiftVLDMPerm2],
+                (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
+                "VLD2LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteLM6Cy, SwiftWriteP01OneCycle,
+                SwiftExt1xP0, SwiftVLDMPerm2],
+                (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftWriteLM6Cy,
+                SwiftExt1xP0, SwiftVLDMPerm2],
+                (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftWriteLM6Cy,
+                SwiftExt1xP0, SwiftVLDMPerm2],
+                (instregex "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
+  // Three element structure.
+  def : InstRW<[SwiftWriteLM7Cy, SwiftWriteLM8Cy, SwiftWriteLM8Cy, SwiftExt1xP0,
+                SwiftVLDMPerm3],
+                (instregex "VLD3(DUP|LN)(d|q)(8|16|32)$",
+                "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM7Cy, SwiftWriteLM8Cy, SwiftWriteLM8Cy,
+                SwiftWriteP01OneCycle, SwiftExt1xP0, SwiftVLDMPerm3],
+                (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>;
+  def : InstRW<[SwiftWriteLM7Cy, SwiftWriteP01OneCycle, SwiftWriteLM8Cy,
+                SwiftWriteLM8Cy, SwiftExt1xP0, SwiftVLDMPerm3],
+                (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
+  // Four element structure.
+  def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
+                SwiftWriteLM10CyNo, SwiftExt1xP0, SwiftVLDMPerm5],
+                (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$",
+                "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
+                SwiftWriteLM10CyNo, SwiftWriteP01OneCycle, SwiftExt1xP0,
+                SwiftVLDMPerm5],
+                (instregex "VLD4(DUP|LN)(d|q)(8|16|32)_UPD")>;
+  def : InstRW<[SwiftWriteLM8Cy, SwiftWriteP01OneCycle, SwiftWriteLM9Cy,
+                SwiftWriteLM10CyNo, SwiftWriteLM10CyNo, SwiftExt1xP0,
+                SwiftVLDMPerm5],
+                (instregex "VLD4(DUP|LN)(d|q)(8|16|32)Pseudo_UPD")>;
+  // VSTx
+  // Multiple structures.
+  // Single element structure store.
+  def : InstRW<[SwiftWrite1xP2], (instregex "VST1d(8|16|32|64)$")>;
+  def : InstRW<[SwiftWrite2xP2], (instregex "VST1q(8|16|32|64)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2],
+                (instregex "VST1d(8|16|32|64)wb")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite2xP2],
+                (instregex "VST1q(8|16|32|64)wb")>;
+  def : InstRW<[SwiftWrite3xP2],
+                (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite3xP2],
+                (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
+  def : InstRW<[SwiftWrite4xP2],
+                (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2],
+                (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
+  // Two element structure store.
+  def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm1],
+                (instregex "VST2(d|b)(8|16|32)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm1],
+                (instregex "VST2(b|d)(8|16|32)wb")>;
+  def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+                (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+                (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
+  // Three element structure store.
+  def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+                (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm2],
+                (instregex "VST3(d|q)(8|16|32)_UPD",
+                "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+  // Four element structure store.
+ def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2], + (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>; + def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm4], + (instregex "VST4(d|q)(8|16|32)_UPD", + "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>; + // Single/all lane store. + // One element structure. + def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm1], + (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>; + def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm1], + (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>; + // Two element structure. + def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm2], + (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>; + def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm2], + (instregex "VST2LN(d|q)(8|16|32)_UPD", + "VST2LN(d|q)(8|16|32)Pseudo_UPD")>; + // Three element structure. + def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2], + (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>; + def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm2], + (instregex "VST3LN(d|q)(8|16|32)_UPD", + "VST3LN(d|q)(8|16|32)Pseudo_UPD")>; + // Four element structure. + def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2], + (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>; + def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite2xP2, SwiftVLDMPerm2], + (instregex "VST4LN(d|q)(8|16|32)_UPD", + "VST4LN(d|q)(8|16|32)Pseudo_UPD")>; + + // 4.2.44 VFP, Divide and Square Root + def SwiftDiv17 : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> { + let NumMicroOps = 1; + let Latency = 17; + let ResourceCycles = [1, 15]; + } + def SwiftDiv32 : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> { + let NumMicroOps = 1; + let Latency = 32; + let ResourceCycles = [1, 30]; + } + def : InstRW<[SwiftDiv17], (instregex "VDIVS", "VSQRTS")>; + def : InstRW<[SwiftDiv32], (instregex "VDIVD", "VSQRTD")>; + + // Not specified. + def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>; + // Preload. + def : WriteRes<WritePreLd, [SwiftUnitP2]> { let Latency = 0; + let ResourceCycles = [0]; + } + +} diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td new file mode 100644 index 0000000..57d0bfb --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td @@ -0,0 +1,300 @@ +//===-- ARMScheduleV6.td - ARM v6 Scheduling Definitions ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM v6 processors. 
+// +//===----------------------------------------------------------------------===// + +// Model based on ARM1176 +// +// Functional Units +def V6_Pipe : FuncUnit; // pipeline + +// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual" +// +def ARMV6Itineraries : ProcessorItineraries< + [V6_Pipe], [], [ + // + // No operand cycles + InstrItinData<IIC_iALUx , [InstrStage<1, [V6_Pipe]>]>, + // + // Binary Instructions that produce a result + InstrItinData<IIC_iALUi , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iALUr , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, + InstrItinData<IIC_iALUsi , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iALUsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>, + // + // Bitwise Instructions that produce a result + InstrItinData<IIC_iBITi , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iBITr , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, + InstrItinData<IIC_iBITsi , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iBITsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>, + // + // Unary Instructions that produce a result + InstrItinData<IIC_iUNAr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iUNAsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + // + // Zero and sign extension instructions + InstrItinData<IIC_iEXTr , [InstrStage<1, [V6_Pipe]>], [1, 1]>, + InstrItinData<IIC_iEXTAr , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iEXTAsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>, + // + // Compare instructions + InstrItinData<IIC_iCMPi , [InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iCMPsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iCMPsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + // + // Test instructions + InstrItinData<IIC_iTSTi , [InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iTSTr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iTSTsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iTSTsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + // + // Move instructions, unconditional + InstrItinData<IIC_iMOVi , [InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iMOVsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iMOVsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + InstrItinData<IIC_iMOVix2 , [InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [3]>, + InstrItinData<IIC_iMOVix2ld , [InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [5]>, + // + // Move instructions, conditional + InstrItinData<IIC_iCMOVi , [InstrStage<1, [V6_Pipe]>], [3]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [V6_Pipe]>], [3, 2]>, + InstrItinData<IIC_iCMOVsi , [InstrStage<1, [V6_Pipe]>], [3, 1]>, + InstrItinData<IIC_iCMOVsr , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>, + InstrItinData<IIC_iCMOVix2 , [InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [4]>, + // + // MVN instructions + InstrItinData<IIC_iMVNi , [InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iMVNr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iMVNsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iMVNsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + + // Integer multiply pipeline + // + InstrItinData<IIC_iMUL16 , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + InstrItinData<IIC_iMAC16 , 
[InstrStage<1, [V6_Pipe]>], [4, 1, 1, 2]>, + InstrItinData<IIC_iMUL32 , [InstrStage<2, [V6_Pipe]>], [5, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<2, [V6_Pipe]>], [5, 1, 1, 2]>, + InstrItinData<IIC_iMUL64 , [InstrStage<3, [V6_Pipe]>], [6, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<3, [V6_Pipe]>], [6, 1, 1, 2]>, + + // Integer load pipeline + // + // Immediate offset + InstrItinData<IIC_iLoad_i , [InstrStage<1, [V6_Pipe]>], [4, 1]>, + InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [V6_Pipe]>], [4, 1]>, + InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [V6_Pipe]>], [4, 1]>, + // + // Register offset + InstrItinData<IIC_iLoad_r , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + // + // Scaled register offset, issues over 2 cycles + InstrItinData<IIC_iLoad_si , [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>, + InstrItinData<IIC_iLoad_bh_si, [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iLoad_iu , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>, + InstrItinData<IIC_iLoad_bh_iu, [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>, + // + // Register offset with update + InstrItinData<IIC_iLoad_ru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>, + InstrItinData<IIC_iLoad_bh_ru, [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>, + InstrItinData<IIC_iLoad_d_ru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iLoad_siu, [InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>, + InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>, + + // + // Load multiple, def is the 5th operand. + InstrItinData<IIC_iLoad_m , [InstrStage<3, [V6_Pipe]>], [1, 1, 1, 1, 4]>, + // + // Load multiple + update, defs are the 1st and 5th operands. + InstrItinData<IIC_iLoad_mu , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 1, 4]>, + // + // Load multiple plus branch + InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [1, 2, 1, 1, 4]>, + + // + // iLoadi + iALUr for t2LDRpci_pic. + InstrItinData<IIC_iLoadiALU, [InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [3, 1]>, + + // + // Pop, def is the 3rd operand. + InstrItinData<IIC_iPop , [InstrStage<3, [V6_Pipe]>], [1, 1, 4]>, + // + // Pop + branch, def is the 3rd operand. 
+ InstrItinData<IIC_iPop_Br, [InstrStage<3, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [1, 2, 4]>, + + // Integer store pipeline + // + // Immediate offset + InstrItinData<IIC_iStore_i , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iStore_bh_i, [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iStore_d_i , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + // + // Register offset + InstrItinData<IIC_iStore_r , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>, + InstrItinData<IIC_iStore_bh_r, [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>, + InstrItinData<IIC_iStore_d_r , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>, + // + // Scaled register offset, issues over 2 cycles + InstrItinData<IIC_iStore_si , [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iStore_bh_si, [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iStore_iu , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iStore_bh_iu, [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + // + // Register offset with update + InstrItinData<IIC_iStore_ru, [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>, + InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>, + InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iStore_siu, [InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>, + InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>, + // + // Store multiple + InstrItinData<IIC_iStore_m , [InstrStage<3, [V6_Pipe]>]>, + // + // Store multiple + update + InstrItinData<IIC_iStore_mu , [InstrStage<3, [V6_Pipe]>], [2]>, + + // Branch + // + // no delay slots, so the latency of a branch is unimportant + InstrItinData<IIC_Br , [InstrStage<1, [V6_Pipe]>]>, + + // VFP + // Issue through integer pipeline, and execute in NEON unit. We assume + // RunFast mode so that NFP pipeline is used for single-precision when + // possible. 
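+  //
+  // A worked reading of the itinerary encoding used throughout this file
+  // (illustrative): the IIC_iLoad_m entry above,
+  // [InstrStage<3, [V6_Pipe]>], [1, 1, 1, 1, 4], occupies V6_Pipe for three
+  // cycles; its first four operands are read in cycle 1 and the loaded def
+  // (the 5th operand) becomes available after 4 cycles.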
+ // + // FP Special Register to Integer Register File Move + InstrItinData<IIC_fpSTAT , [InstrStage<1, [V6_Pipe]>], [3]>, + // + // Single-precision FP Unary + InstrItinData<IIC_fpUNA32 , [InstrStage<1, [V6_Pipe]>], [5, 2]>, + // + // Double-precision FP Unary + InstrItinData<IIC_fpUNA64 , [InstrStage<1, [V6_Pipe]>], [5, 2]>, + // + // Single-precision FP Compare + InstrItinData<IIC_fpCMP32 , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + // + // Double-precision FP Compare + InstrItinData<IIC_fpCMP64 , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + // + // Single to Double FP Convert + InstrItinData<IIC_fpCVTSD , [InstrStage<1, [V6_Pipe]>], [5, 2]>, + // + // Double to Single FP Convert + InstrItinData<IIC_fpCVTDS , [InstrStage<1, [V6_Pipe]>], [5, 2]>, + // + // Single-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTSI , [InstrStage<1, [V6_Pipe]>], [9, 2]>, + // + // Double-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTDI , [InstrStage<1, [V6_Pipe]>], [9, 2]>, + // + // Integer to Single-Precision FP Convert + InstrItinData<IIC_fpCVTIS , [InstrStage<1, [V6_Pipe]>], [9, 2]>, + // + // Integer to Double-Precision FP Convert + InstrItinData<IIC_fpCVTID , [InstrStage<1, [V6_Pipe]>], [9, 2]>, + // + // Single-precision FP ALU + InstrItinData<IIC_fpALU32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>, + // + // Double-precision FP ALU + InstrItinData<IIC_fpALU64 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>, + // + // Single-precision FP Multiply + InstrItinData<IIC_fpMUL32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>, + // + // Double-precision FP Multiply + InstrItinData<IIC_fpMUL64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2]>, + // + // Single-precision FP MAC + InstrItinData<IIC_fpMAC32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>, + // + // Double-precision FP MAC + InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>, + // + // Single-precision Fused FP MAC + InstrItinData<IIC_fpFMAC32, [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>, + // + // Double-precision Fused FP MAC + InstrItinData<IIC_fpFMAC64, [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>, + // + // Single-precision FP DIV + InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>, + // + // Double-precision FP DIV + InstrItinData<IIC_fpDIV64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>, + // + // Single-precision FP SQRT + InstrItinData<IIC_fpSQRT32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>, + // + // Double-precision FP SQRT + InstrItinData<IIC_fpSQRT64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>, + // + // Integer to Single-precision Move + InstrItinData<IIC_fpMOVIS, [InstrStage<1, [V6_Pipe]>], [10, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_fpMOVID, [InstrStage<1, [V6_Pipe]>], [10, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_fpMOVSI, [InstrStage<1, [V6_Pipe]>], [10, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_fpMOVDI, [InstrStage<1, [V6_Pipe]>], [10, 10, 1]>, + // + // Single-precision FP Load + InstrItinData<IIC_fpLoad32 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>, + // + // Double-precision FP Load + InstrItinData<IIC_fpLoad64 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>, + // + // FP Load Multiple + InstrItinData<IIC_fpLoad_m , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 5]>, + // + // FP Load Multiple + update + InstrItinData<IIC_fpLoad_mu, [InstrStage<3, [V6_Pipe]>], [3, 2, 1, 1, 5]>, + // + // Single-precision FP Store + InstrItinData<IIC_fpStore32 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, + // + // Double-precision FP Store + // use FU_Issue to enforce 
the 1 load/store per cycle limit + InstrItinData<IIC_fpStore64 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, + // + // FP Store Multiple + InstrItinData<IIC_fpStore_m, [InstrStage<3, [V6_Pipe]>], [2, 2, 2, 2]>, + // + // FP Store Multiple + update + InstrItinData<IIC_fpStore_mu,[InstrStage<3, [V6_Pipe]>], [3, 2, 2, 2, 2]> +]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp new file mode 100644 index 0000000..6fded9c --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -0,0 +1,271 @@ +//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARMSelectionDAGInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARMTargetMachine.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/DerivedTypes.h" +using namespace llvm; + +#define DEBUG_TYPE "arm-selectiondag-info" + +// Emit, if possible, a specialized version of the given Libcall. Typically this +// means selecting the appropriately aligned version, but we also convert memset +// of 0 into memclr. +SDValue ARMSelectionDAGInfo:: +EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + RTLIB::Libcall LC) const { + const ARMSubtarget &Subtarget = + DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); + const ARMTargetLowering *TLI = Subtarget.getTargetLowering(); + + // Only use a specialized AEABI function if the default version of this + // Libcall is an AEABI function. + if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0) + return SDValue(); + + // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be + // able to translate memset to memclr and use the value to index the function + // name array. + enum { + AEABI_MEMCPY = 0, + AEABI_MEMMOVE, + AEABI_MEMSET, + AEABI_MEMCLR + } AEABILibcall; + switch (LC) { + case RTLIB::MEMCPY: + AEABILibcall = AEABI_MEMCPY; + break; + case RTLIB::MEMMOVE: + AEABILibcall = AEABI_MEMMOVE; + break; + case RTLIB::MEMSET: + AEABILibcall = AEABI_MEMSET; + if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src)) + if (ConstantSrc->getZExtValue() == 0) + AEABILibcall = AEABI_MEMCLR; + break; + default: + return SDValue(); + } + + // Choose the most-aligned libcall variant that we can + enum { + ALIGN1 = 0, + ALIGN4, + ALIGN8 + } AlignVariant; + if ((Align & 7) == 0) + AlignVariant = ALIGN8; + else if ((Align & 3) == 0) + AlignVariant = ALIGN4; + else + AlignVariant = ALIGN1; + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Entry.Node = Dst; + Args.push_back(Entry); + if (AEABILibcall == AEABI_MEMCLR) { + Entry.Node = Size; + Args.push_back(Entry); + } else if (AEABILibcall == AEABI_MEMSET) { + // Adjust parameters for memset, EABI uses format (ptr, size, value), + // GNU library uses (ptr, value, size) + // See RTABI section 4.3.4 + Entry.Node = Size; + Args.push_back(Entry); + + // Extend or truncate the argument to be an i32 value for the call. 
+ if (Src.getValueType().bitsGT(MVT::i32)) + Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); + else if (Src.getValueType().bitsLT(MVT::i32)) + Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src); + + Entry.Node = Src; + Entry.Ty = Type::getInt32Ty(*DAG.getContext()); + Entry.isSExt = false; + Args.push_back(Entry); + } else { + Entry.Node = Src; + Args.push_back(Entry); + + Entry.Node = Size; + Args.push_back(Entry); + } + + char const *FunctionNames[4][3] = { + { "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" }, + { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" }, + { "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" }, + { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" } + }; + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setCallee( + TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant], + TLI->getPointerTy(DAG.getDataLayout())), + std::move(Args), 0) + .setDiscardResult(); + std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI); + + return CallResult.second; +} + +SDValue +ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const { + const ARMSubtarget &Subtarget = + DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); + // Do repeated 4-byte loads and stores. To be improved. + // This requires 4-byte alignment. + if ((Align & 3) != 0) + return SDValue(); + // This requires the copy size to be a constant, preferably + // within a subtarget-specific limit. + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + if (!ConstantSize) + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align, + RTLIB::MEMCPY); + uint64_t SizeVal = ConstantSize->getZExtValue(); + if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align, + RTLIB::MEMCPY); + + unsigned BytesLeft = SizeVal & 3; + unsigned NumMemOps = SizeVal >> 2; + unsigned EmittedNumMemOps = 0; + EVT VT = MVT::i32; + unsigned VTSize = 4; + unsigned i = 0; + // Emit a maximum of 4 loads in Thumb1 since we have fewer registers + const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6; + SDValue TFOps[6]; + SDValue Loads[6]; + uint64_t SrcOff = 0, DstOff = 0; + + // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to + // VLDM/VSTM and make this code emit it when appropriate. This would reduce + // pressure on the general purpose registers. However this seems harder to map + // onto the register allocator's view of the world. + + // The number of MEMCPY pseudo-instructions to emit. We use up to + // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm + // later on. This is a lower bound on the number of MEMCPY operations we must + // emit. + unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM; + + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue); + + for (unsigned I = 0; I != NumMEMCPYs; ++I) { + // Evenly distribute registers among MEMCPY operations to reduce register + // pressure. 
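+    // For example (illustrative numbers): a 43-byte inline copy has
+    // NumMemOps = 10 and BytesLeft = 3; with MaxLoadsInLDM = 6 on ARM this
+    // gives NumMEMCPYs = (10 + 5) / 6 = 2, and the rounding below splits the
+    // word operations 5 + 5 rather than 6 + 4.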
+ unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs; + unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps; + + Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src, + DAG.getConstant(NumRegs, dl, MVT::i32)); + Src = Dst.getValue(1); + Chain = Dst.getValue(2); + + DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize); + SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize); + + EmittedNumMemOps = NextEmittedNumMemOps; + } + + if (BytesLeft == 0) + return Chain; + + // Issue loads / stores for the trailing (1 - 3) bytes. + unsigned BytesLeftSave = BytesLeft; + i = 0; + while (BytesLeft) { + if (BytesLeft >= 2) { + VT = MVT::i16; + VTSize = 2; + } else { + VT = MVT::i8; + VTSize = 1; + } + + Loads[i] = DAG.getLoad(VT, dl, Chain, + DAG.getNode(ISD::ADD, dl, MVT::i32, Src, + DAG.getConstant(SrcOff, dl, MVT::i32)), + SrcPtrInfo.getWithOffset(SrcOff), + false, false, false, 0); + TFOps[i] = Loads[i].getValue(1); + ++i; + SrcOff += VTSize; + BytesLeft -= VTSize; + } + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + makeArrayRef(TFOps, i)); + + i = 0; + BytesLeft = BytesLeftSave; + while (BytesLeft) { + if (BytesLeft >= 2) { + VT = MVT::i16; + VTSize = 2; + } else { + VT = MVT::i8; + VTSize = 1; + } + + TFOps[i] = DAG.getStore(Chain, dl, Loads[i], + DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, + DAG.getConstant(DstOff, dl, MVT::i32)), + DstPtrInfo.getWithOffset(DstOff), false, false, 0); + ++i; + DstOff += VTSize; + BytesLeft -= VTSize; + } + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + makeArrayRef(TFOps, i)); +} + + +SDValue ARMSelectionDAGInfo:: +EmitTargetCodeForMemmove(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const { + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align, + RTLIB::MEMMOVE); +} + + +SDValue ARMSelectionDAGInfo:: +EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, SDValue Dst, + SDValue Src, SDValue Size, + unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align, + RTLIB::MEMSET); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h new file mode 100644 index 0000000..289879e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -0,0 +1,73 @@ +//===-- ARMSelectionDAGInfo.h - ARM SelectionDAG Info -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the ARM subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMSELECTIONDAGINFO_H +#define LLVM_LIB_TARGET_ARM_ARMSELECTIONDAGINFO_H + +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +namespace ARM_AM { + static inline ShiftOpc getShiftOpcForNode(unsigned Opcode) { + switch (Opcode) { + default: return ARM_AM::no_shift; + case ISD::SHL: return ARM_AM::lsl; + case ISD::SRL: return ARM_AM::lsr; + case ISD::SRA: return ARM_AM::asr; + case ISD::ROTR: return ARM_AM::ror; + //case ISD::ROTL: // Only if imm -> turn into ROTR. 
+ // Can't handle RRX here, because it would require folding a flag into + // the addressing mode. :( This causes us to miss certain things. + //case ARMISD::RRX: return ARM_AM::rrx; + } + } +} // end namespace ARM_AM + +class ARMSelectionDAGInfo : public TargetSelectionDAGInfo { +public: + + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const override; + + SDValue EmitTargetCodeForMemmove(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const override; + + // Adjust parameters for memset, see RTABI section 4.3.4 + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, + SDValue Op1, SDValue Op2, + SDValue Op3, unsigned Align, + bool isVolatile, + MachinePointerInfo DstPtrInfo) const override; + + SDValue EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + RTLIB::Libcall LC) const; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp new file mode 100644 index 0000000..bb6ae28 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -0,0 +1,366 @@ +//===-- ARMSubtarget.cpp - ARM Subtarget Information ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARM specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#include "ARMSubtarget.h" +#include "ARMFrameLowering.h" +#include "ARMISelLowering.h" +#include "ARMInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMSelectionDAGInfo.h" +#include "ARMSubtarget.h" +#include "ARMTargetMachine.h" +#include "Thumb1FrameLowering.h" +#include "Thumb1InstrInfo.h" +#include "Thumb2InstrInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-subtarget" + +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "ARMGenSubtargetInfo.inc" + +static cl::opt<bool> +UseFusedMulOps("arm-use-mulops", + cl::init(true), cl::Hidden); + +enum ITMode { + DefaultIT, + RestrictedIT, + NoRestrictedIT +}; + +static cl::opt<ITMode> +IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), + cl::ZeroOrMore, + cl::values(clEnumValN(DefaultIT, "arm-default-it", + "Generate IT block based on arch"), + clEnumValN(RestrictedIT, "arm-restrict-it", + "Disallow deprecated IT based on ARMv8"), + clEnumValN(NoRestrictedIT, "arm-no-restrict-it", + "Allow IT blocks based on ARMv7"), + clEnumValEnd)); + +/// ForceFastISel - Use the fast-isel, even for subtargets where it is not +/// currently supported (for testing only). 
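+/// (Illustrative usage: passing -arm-force-fast-isel to llc exercises
+/// FastISel even on subtargets that useFastISel() would otherwise reject.)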
+static cl::opt<bool> +ForceFastISel("arm-force-fast-isel", + cl::init(false), cl::Hidden); + +/// initializeSubtargetDependencies - Initializes using a CPU and feature string +/// so that we can use initializer lists for subtarget initialization. +ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + initializeEnvironment(); + initSubtargetFeatures(CPU, FS); + return *this; +} + +ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU, + StringRef FS) { + ARMSubtarget &STI = initializeSubtargetDependencies(CPU, FS); + if (STI.isThumb1Only()) + return (ARMFrameLowering *)new Thumb1FrameLowering(STI); + + return new ARMFrameLowering(STI); +} + +ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, + const std::string &FS, + const ARMBaseTargetMachine &TM, bool IsLittle) + : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), + ARMProcClass(None), ARMArch(ARMv4t), stackAlignment(4), CPUString(CPU), + IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM), + FrameLowering(initializeFrameLowering(CPU, FS)), + // At this point initializeSubtargetDependencies has been called so + // we can query directly. + InstrInfo(isThumb1Only() + ? (ARMBaseInstrInfo *)new Thumb1InstrInfo(*this) + : !isThumb() + ? (ARMBaseInstrInfo *)new ARMInstrInfo(*this) + : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)), + TLInfo(TM, *this) {} + +void ARMSubtarget::initializeEnvironment() { + HasV4TOps = false; + HasV5TOps = false; + HasV5TEOps = false; + HasV6Ops = false; + HasV6MOps = false; + HasV6KOps = false; + HasV6T2Ops = false; + HasV7Ops = false; + HasV8Ops = false; + HasV8_1aOps = false; + HasV8_2aOps = false; + HasVFPv2 = false; + HasVFPv3 = false; + HasVFPv4 = false; + HasFPARMv8 = false; + HasNEON = false; + UseNEONForSinglePrecisionFP = false; + UseMulOps = UseFusedMulOps; + SlowFPVMLx = false; + HasVMLxForwarding = false; + SlowFPBrcc = false; + InThumbMode = false; + UseSoftFloat = false; + HasThumb2 = false; + NoARM = false; + ReserveR9 = false; + NoMovt = false; + SupportsTailCall = false; + HasFP16 = false; + HasFullFP16 = false; + HasD16 = false; + HasHardwareDivide = false; + HasHardwareDivideInARM = false; + HasT2ExtractPack = false; + HasDataBarrier = false; + Pref32BitThumb = false; + AvoidCPSRPartialUpdate = false; + AvoidMOVsShifterOperand = false; + HasRAS = false; + HasMPExtension = false; + HasVirtualization = false; + FPOnlySP = false; + HasPerfMon = false; + HasTrustZone = false; + HasCrypto = false; + HasCRC = false; + HasZeroCycleZeroing = false; + StrictAlign = false; + HasDSP = false; + UseNaClTrap = false; + GenLongCalls = false; + UnsafeFPMath = false; + + // MCAsmInfo isn't always present (e.g. in opt) so we can't initialize this + // directly from it, but we can try to make sure they're consistent when both + // available. + UseSjLjEH = isTargetDarwin() && !isTargetWatchOS(); + assert((!TM.getMCAsmInfo() || + (TM.getMCAsmInfo()->getExceptionHandlingType() == + ExceptionHandling::SjLj) == UseSjLjEH) && + "inconsistent sjlj choice between CodeGen and MC"); +} + +void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { + if (CPUString.empty()) { + CPUString = "generic"; + + if (isTargetDarwin()) { + StringRef ArchName = TargetTriple.getArchName(); + if (ArchName.endswith("v7s")) + // Default to the Swift CPU when targeting armv7s/thumbv7s. + CPUString = "swift"; + else if (ArchName.endswith("v7k")) + // Default to the Cortex-a7 CPU when targeting armv7k/thumbv7k. 
+        // ARMv7k does not use SjLj exception handling.
+        CPUString = "cortex-a7";
+    }
+  }
+
+  // Insert the architecture feature derived from the target triple into the
+  // feature string. This is important for setting features that are implied
+  // based on the architecture version.
+  std::string ArchFS = ARM_MC::ParseARMTriple(TargetTriple, CPUString);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = (Twine(ArchFS) + "," + FS).str();
+    else
+      ArchFS = FS;
+  }
+  ParseSubtargetFeatures(CPUString, ArchFS);
+
+  // FIXME: This used to enable V6T2 support implicitly for Thumb2 mode.
+  // Assert this for now to make the change obvious.
+  assert(hasV6T2Ops() || !hasThumb2());
+
+  // Keep a pointer to static instruction cost data for the specified CPU.
+  SchedModel = getSchedModelForCPU(CPUString);
+
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUString);
+
+  // FIXME: this is invalid for WindowsCE
+  if (isTargetWindows())
+    NoARM = true;
+
+  if (isAAPCS_ABI())
+    stackAlignment = 8;
+  if (isTargetNaCl() || isAAPCS16_ABI())
+    stackAlignment = 16;
+
+  // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
+  // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
+  // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
+  // support in the assembler and linker to be used. This would need to be
+  // fixed to fully support tail calls in Thumb1.
+  //
+  // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
+  // LR. This means if we need to reload LR, it takes an extra instruction,
+  // which outweighs the value of the tail call; but here we don't know yet
+  // whether LR is going to be used. Probably the right approach is to
+  // generate the tail call here and turn it back into CALL/RET in
+  // emitEpilogue if LR is used.
+
+  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
+  // but we need to make sure there are enough registers; the only valid
+  // registers are the 4 used for parameters. We don't currently handle this
+  // case.
+
+  SupportsTailCall = !isThumb1Only();
+
+  if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0))
+    SupportsTailCall = false;
+
+  switch (IT) {
+  case DefaultIT:
+    RestrictIT = hasV8Ops();
+    break;
+  case RestrictedIT:
+    RestrictIT = true;
+    break;
+  case NoRestrictedIT:
+    RestrictIT = false;
+    break;
+  }
+
+  // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
+  const FeatureBitset &Bits = getFeatureBits();
+  if ((Bits[ARM::ProcA5] || Bits[ARM::ProcA8]) && // Where this matters
+      (Options.UnsafeFPMath || isTargetDarwin()))
+    UseNEONForSinglePrecisionFP = true;
+}
+
+bool ARMSubtarget::isAPCS_ABI() const {
+  assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
+  return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_APCS;
+}
+bool ARMSubtarget::isAAPCS_ABI() const {
+  assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
+  return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS ||
+         TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+}
+bool ARMSubtarget::isAAPCS16_ABI() const {
+  assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
+  return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+}
+
+
+/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol.
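+/// (Illustrative: under PIC on a Mach-O target, a reference to an external,
+/// non-hidden global is loaded through a $non_lazy_ptr stub, while a strong
+/// definition in the same image is accessed directly.)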
+bool
+ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
+                                 Reloc::Model RelocM) const {
+  if (RelocM == Reloc::Static)
+    return false;
+
+  bool isDef = GV->isStrongDefinitionForLinker();
+
+  if (!isTargetMachO()) {
+    // An extra load is needed for all externally visible symbols.
+    if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+      return false;
+    return true;
+  } else {
+    // If this is a strong reference to a definition, it is definitely not
+    // through a stub.
+    if (isDef)
+      return false;
+
+    // Unless we have a symbol with hidden visibility, we have to go through a
+    // normal $non_lazy_ptr stub because this symbol might be resolved late.
+    if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+      return true;
+
+    if (RelocM == Reloc::PIC_) {
+      // If symbol visibility is hidden, we have a stub for common symbol
+      // references and external declarations.
+      if (GV->isDeclarationForLinker() || GV->hasCommonLinkage())
+        // Hidden $non_lazy_ptr reference.
+        return true;
+    }
+  }
+
+  return false;
+}
+
+unsigned ARMSubtarget::getMispredictionPenalty() const {
+  return SchedModel.MispredictPenalty;
+}
+
+bool ARMSubtarget::hasSinCos() const {
+  return isTargetWatchOS() ||
+    (isTargetIOS() && !getTargetTriple().isOSVersionLT(7, 0));
+}
+
+bool ARMSubtarget::enableMachineScheduler() const {
+  // Enable the MachineScheduler before register allocation for out-of-order
+  // architectures where we do not use the PostRA scheduler anymore (for now
+  // restricted to swift).
+  return getSchedModel().isOutOfOrder() && isSwift();
+}
+
+// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
+bool ARMSubtarget::enablePostRAScheduler() const {
+  // No need for PostRA scheduling on out of order CPUs (for now restricted to
+  // swift).
+  if (getSchedModel().isOutOfOrder() && isSwift())
+    return false;
+  return (!isThumb() || hasThumb2());
+}
+
+bool ARMSubtarget::enableAtomicExpand() const {
+  return hasAnyDataBarrier() && !isThumb1Only();
+}
+
+bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const {
+  // For general targets, the prologue can grow when VFPs are allocated with
+  // stride 4 (more vpush instructions). But WatchOS uses a compact unwind
+  // format, which is more important to get right.
+  return isTargetWatchOS() || (isSwift() && !MF.getFunction()->optForMinSize());
+}
+
+bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
+  // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit
+  // immediates as it is inherently position independent, and may be out of
+  // range otherwise.
+  return !NoMovt && hasV6T2Ops() &&
+         (isTargetWindows() || !MF.getFunction()->optForMinSize());
+}
+
+bool ARMSubtarget::useFastISel() const {
+  // Enable fast-isel for any target, for testing only.
+  if (ForceFastISel)
+    return true;
+
+  // Limit fast-isel to the targets that are or have been tested.
+  if (!hasV6Ops())
+    return false;
+
+  // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl.
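+  // (Illustrative: a thumbv7-apple-ios target satisfies the Mach-O clause
+  // below, while armv7-linux-gnueabi qualifies only when compiling in ARM
+  // mode.)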
+  return TM.Options.EnableFastISel &&
+         ((isTargetMachO() && !isThumb1Only()) ||
+          (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb()));
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
new file mode 100644
index 0000000..4d54e57
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -0,0 +1,488 @@
+//===-- ARMSubtarget.h - Define Subtarget for the ARM ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
+#define LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
+
+
+#include "ARMFrameLowering.h"
+#include "ARMISelLowering.h"
+#include "ARMInstrInfo.h"
+#include "ARMSelectionDAGInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "Thumb1FrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "ARMGenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class TargetOptions;
+class ARMBaseTargetMachine;
+
+class ARMSubtarget : public ARMGenSubtargetInfo {
+protected:
+  enum ARMProcFamilyEnum {
+    Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
+    CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA35, CortexA53,
+    CortexA57, CortexA72, Krait, Swift, ExynosM1
+  };
+  enum ARMProcClassEnum {
+    None, AClass, RClass, MClass
+  };
+  enum ARMArchEnum {
+    ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te,
+    ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r,
+    ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a
+  };
+
+  /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
+  ARMProcFamilyEnum ARMProcFamily;
+
+  /// ARMProcClass - ARM processor class: None, AClass, RClass or MClass.
+  ARMProcClassEnum ARMProcClass;
+
+  /// ARMArch - ARM architecture
+  ARMArchEnum ARMArch;
+
+  /// HasV4TOps, HasV5TOps, HasV5TEOps,
+  /// HasV6Ops, HasV6MOps, HasV6KOps, HasV6T2Ops, HasV7Ops, HasV8Ops -
+  /// Specify whether the target supports specific ARM ISA variants.
+  bool HasV4TOps;
+  bool HasV5TOps;
+  bool HasV5TEOps;
+  bool HasV6Ops;
+  bool HasV6MOps;
+  bool HasV6KOps;
+  bool HasV6T2Ops;
+  bool HasV7Ops;
+  bool HasV8Ops;
+  bool HasV8_1aOps;
+  bool HasV8_2aOps;
+
+  /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
+  /// floating point ISAs are supported.
+  bool HasVFPv2;
+  bool HasVFPv3;
+  bool HasVFPv4;
+  bool HasFPARMv8;
+  bool HasNEON;
+
+  /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
+  /// specified. Use the method useNEONForSinglePrecisionFP() to
+  /// determine if NEON should actually be used.
+  bool UseNEONForSinglePrecisionFP;
+
+  /// UseMulOps - True if non-microcoded fused integer multiply-add and
+  /// multiply-subtract instructions should be used.
+  bool UseMulOps;
+
+  /// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
+  /// whether the FP VML[AS] instructions are slow (if so, don't use them).
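+  /// (Illustrative: when set, useFPVMLx() below returns false and codegen
+  /// prefers separate multiply and add instructions over vmla/vmls.)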
+  bool SlowFPVMLx;
+
+  /// HasVMLxForwarding - If true, NEON has special multiplier accumulator
+  /// forwarding to allow mul + mla being issued back to back.
+  bool HasVMLxForwarding;
+
+  /// SlowFPBrcc - True if floating point compare + branch is slow.
+  bool SlowFPBrcc;
+
+  /// InThumbMode - True if compiling for Thumb, false for ARM.
+  bool InThumbMode;
+
+  /// UseSoftFloat - True if we're using software floating point features.
+  bool UseSoftFloat;
+
+  /// HasThumb2 - True if Thumb2 instructions are supported.
+  bool HasThumb2;
+
+  /// NoARM - True if subtarget does not support ARM mode execution.
+  bool NoARM;
+
+  /// ReserveR9 - True if R9 is not available as a general purpose register.
+  bool ReserveR9;
+
+  /// NoMovt - True if MOVT / MOVW pairs are not used for materialization of
+  /// 32-bit imms (including global addresses).
+  bool NoMovt;
+
+  /// SupportsTailCall - True if the OS supports tail call. The dynamic linker
+  /// must be able to synthesize call stubs for interworking between ARM and
+  /// Thumb.
+  bool SupportsTailCall;
+
+  /// HasFP16 - True if subtarget supports half-precision FP conversions
+  bool HasFP16;
+
+  /// HasFullFP16 - True if subtarget supports half-precision FP operations
+  bool HasFullFP16;
+
+  /// HasD16 - True if subtarget is limited to 16 double precision
+  /// FP registers for VFPv3.
+  bool HasD16;
+
+  /// HasHardwareDivide - True if subtarget supports [su]div
+  bool HasHardwareDivide;
+
+  /// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode
+  bool HasHardwareDivideInARM;
+
+  /// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack
+  /// instructions.
+  bool HasT2ExtractPack;
+
+  /// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier
+  /// instructions.
+  bool HasDataBarrier;
+
+  /// Pref32BitThumb - If true, codegen would prefer 32-bit Thumb instructions
+  /// over 16-bit ones.
+  bool Pref32BitThumb;
+
+  /// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions
+  /// that partially update CPSR and add a false dependency on the previous
+  /// CPSR-setting instruction.
+  bool AvoidCPSRPartialUpdate;
+
+  /// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting
+  /// movs with shifter operand (i.e. asr, lsl, lsr).
+  bool AvoidMOVsShifterOperand;
+
+  /// HasRAS - Some processors perform return stack prediction. CodeGen should
+  /// avoid issuing "normal" call instructions to callees which do not return.
+  bool HasRAS;
+
+  /// HasMPExtension - True if the subtarget supports Multiprocessing
+  /// extension (ARMv7 only).
+  bool HasMPExtension;
+
+  /// HasVirtualization - True if the subtarget supports the Virtualization
+  /// extension.
+  bool HasVirtualization;
+
+  /// FPOnlySP - If true, the floating point unit only supports single
+  /// precision.
+  bool FPOnlySP;
+
+  /// If true, the processor supports the Performance Monitor Extensions. These
+  /// include a generic cycle-counter as well as more fine-grained (often
+  /// implementation-specific) events.
+  bool HasPerfMon;
+
+  /// HasTrustZone - if true, processor supports TrustZone security extensions
+  bool HasTrustZone;
+
+  /// HasCrypto - if true, processor supports Cryptography extensions
+  bool HasCrypto;
+
+  /// HasCRC - if true, processor supports CRC instructions
+  bool HasCRC;
+
+  /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are
+  /// particularly effective at zeroing a VFP register.
+ bool HasZeroCycleZeroing; + + /// StrictAlign - If true, the subtarget disallows unaligned memory + /// accesses for some types. For details, see + /// ARMTargetLowering::allowsMisalignedMemoryAccesses(). + bool StrictAlign; + + /// RestrictIT - If true, the subtarget disallows generation of deprecated IT + /// blocks to conform to ARMv8 rule. + bool RestrictIT; + + /// HasDSP - If true, the subtarget supports the DSP (saturating arith + /// and such) instructions. + bool HasDSP; + + /// NaCl TRAP instruction is generated instead of the regular TRAP. + bool UseNaClTrap; + + /// Generate calls via indirect call instructions. + bool GenLongCalls; + + /// Target machine allowed unsafe FP math (such as use of NEON fp) + bool UnsafeFPMath; + + /// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS). + bool UseSjLjEH; + + /// stackAlignment - The minimum alignment known to hold of the stack frame on + /// entry to the function and which must be maintained by every function. + unsigned stackAlignment; + + /// CPUString - String name of used CPU. + std::string CPUString; + + /// IsLittle - The target is Little Endian + bool IsLittle; + + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + + /// SchedModel - Processor specific instruction costs. + MCSchedModel SchedModel; + + /// Selected instruction itineraries (one entry per itinerary class.) + InstrItineraryData InstrItins; + + /// Options passed via command line that could influence the target + const TargetOptions &Options; + + const ARMBaseTargetMachine &TM; + +public: + /// This constructor initializes the data members to match that + /// of the specified triple. + /// + ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, + const ARMBaseTargetMachine &TM, bool IsLittle); + + /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size + /// that still makes it profitable to inline the call. + unsigned getMaxInlineSizeThreshold() const { + return 64; + } + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + /// initializeSubtargetDependencies - Initializes using a CPU and feature string + /// so that we can use initializer lists for subtarget initialization. + ARMSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); + + const ARMSelectionDAGInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + const ARMBaseInstrInfo *getInstrInfo() const override { + return InstrInfo.get(); + } + const ARMTargetLowering *getTargetLowering() const override { + return &TLInfo; + } + const ARMFrameLowering *getFrameLowering() const override { + return FrameLowering.get(); + } + const ARMBaseRegisterInfo *getRegisterInfo() const override { + return &InstrInfo->getRegisterInfo(); + } + +private: + ARMSelectionDAGInfo TSInfo; + // Either Thumb1FrameLowering or ARMFrameLowering. + std::unique_ptr<ARMFrameLowering> FrameLowering; + // Either Thumb1InstrInfo or Thumb2InstrInfo. 
+ std::unique_ptr<ARMBaseInstrInfo> InstrInfo; + ARMTargetLowering TLInfo; + + void initializeEnvironment(); + void initSubtargetFeatures(StringRef CPU, StringRef FS); + ARMFrameLowering *initializeFrameLowering(StringRef CPU, StringRef FS); + +public: + void computeIssueWidth(); + + bool hasV4TOps() const { return HasV4TOps; } + bool hasV5TOps() const { return HasV5TOps; } + bool hasV5TEOps() const { return HasV5TEOps; } + bool hasV6Ops() const { return HasV6Ops; } + bool hasV6MOps() const { return HasV6MOps; } + bool hasV6KOps() const { return HasV6KOps; } + bool hasV6T2Ops() const { return HasV6T2Ops; } + bool hasV7Ops() const { return HasV7Ops; } + bool hasV8Ops() const { return HasV8Ops; } + bool hasV8_1aOps() const { return HasV8_1aOps; } + bool hasV8_2aOps() const { return HasV8_2aOps; } + + bool isCortexA5() const { return ARMProcFamily == CortexA5; } + bool isCortexA7() const { return ARMProcFamily == CortexA7; } + bool isCortexA8() const { return ARMProcFamily == CortexA8; } + bool isCortexA9() const { return ARMProcFamily == CortexA9; } + bool isCortexA15() const { return ARMProcFamily == CortexA15; } + bool isSwift() const { return ARMProcFamily == Swift; } + bool isCortexM3() const { return CPUString == "cortex-m3"; } + bool isLikeA9() const { return isCortexA9() || isCortexA15() || isKrait(); } + bool isCortexR5() const { return ARMProcFamily == CortexR5; } + bool isKrait() const { return ARMProcFamily == Krait; } + + bool hasARMOps() const { return !NoARM; } + + bool hasVFP2() const { return HasVFPv2; } + bool hasVFP3() const { return HasVFPv3; } + bool hasVFP4() const { return HasVFPv4; } + bool hasFPARMv8() const { return HasFPARMv8; } + bool hasNEON() const { return HasNEON; } + bool hasCrypto() const { return HasCrypto; } + bool hasCRC() const { return HasCRC; } + bool hasVirtualization() const { return HasVirtualization; } + bool useNEONForSinglePrecisionFP() const { + return hasNEON() && UseNEONForSinglePrecisionFP; + } + + bool hasDivide() const { return HasHardwareDivide; } + bool hasDivideInARMMode() const { return HasHardwareDivideInARM; } + bool hasT2ExtractPack() const { return HasT2ExtractPack; } + bool hasDataBarrier() const { return HasDataBarrier; } + bool hasAnyDataBarrier() const { + return HasDataBarrier || (hasV6Ops() && !isThumb()); + } + bool useMulOps() const { return UseMulOps; } + bool useFPVMLx() const { return !SlowFPVMLx; } + bool hasVMLxForwarding() const { return HasVMLxForwarding; } + bool isFPBrccSlow() const { return SlowFPBrcc; } + bool isFPOnlySP() const { return FPOnlySP; } + bool hasPerfMon() const { return HasPerfMon; } + bool hasTrustZone() const { return HasTrustZone; } + bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } + bool prefers32BitThumb() const { return Pref32BitThumb; } + bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; } + bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; } + bool hasRAS() const { return HasRAS; } + bool hasMPExtension() const { return HasMPExtension; } + bool hasDSP() const { return HasDSP; } + bool useNaClTrap() const { return UseNaClTrap; } + bool useSjLjEH() const { return UseSjLjEH; } + bool genLongCalls() const { return GenLongCalls; } + + bool hasFP16() const { return HasFP16; } + bool hasD16() const { return HasD16; } + bool hasFullFP16() const { return HasFullFP16; } + + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetIOS() const { return 
TargetTriple.isiOS(); } + bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); } + bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } + bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); } + bool isTargetWindows() const { return TargetTriple.isOSWindows(); } + + bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } + bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + + // ARM EABI is the bare-metal EABI described in ARM ABI documents and + // can be accessed via -target arm-none-eabi. This is NOT GNUEABI. + // FIXME: Add a flag for bare-metal for that target and set Triple::EABI + // even for GNUEABI, so we can make a distinction here and still conform to + // the EABI on GNU (and Android) mode. This requires change in Clang, too. + // FIXME: The Darwin exception is temporary, while we move users to + // "*-*-*-macho" triples as quickly as possible. + bool isTargetAEABI() const { + return (TargetTriple.getEnvironment() == Triple::EABI || + TargetTriple.getEnvironment() == Triple::EABIHF) && + !isTargetDarwin() && !isTargetWindows(); + } + bool isTargetGNUAEABI() const { + return (TargetTriple.getEnvironment() == Triple::GNUEABI || + TargetTriple.getEnvironment() == Triple::GNUEABIHF) && + !isTargetDarwin() && !isTargetWindows(); + } + + // ARM Targets that support EHABI exception handling standard + // Darwin uses SjLj. Other targets might need more checks. + bool isTargetEHABICompatible() const { + return (TargetTriple.getEnvironment() == Triple::EABI || + TargetTriple.getEnvironment() == Triple::GNUEABI || + TargetTriple.getEnvironment() == Triple::EABIHF || + TargetTriple.getEnvironment() == Triple::GNUEABIHF || + isTargetAndroid()) && + !isTargetDarwin() && !isTargetWindows(); + } + + bool isTargetHardFloat() const { + // FIXME: this is invalid for WindowsCE + return TargetTriple.getEnvironment() == Triple::GNUEABIHF || + TargetTriple.getEnvironment() == Triple::EABIHF || + isTargetWindows() || isAAPCS16_ABI(); + } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } + + bool isAPCS_ABI() const; + bool isAAPCS_ABI() const; + bool isAAPCS16_ABI() const; + + bool useSoftFloat() const { return UseSoftFloat; } + bool isThumb() const { return InThumbMode; } + bool isThumb1Only() const { return InThumbMode && !HasThumb2; } + bool isThumb2() const { return InThumbMode && HasThumb2; } + bool hasThumb2() const { return HasThumb2; } + bool isMClass() const { return ARMProcClass == MClass; } + bool isRClass() const { return ARMProcClass == RClass; } + bool isAClass() const { return ARMProcClass == AClass; } + + bool isR9Reserved() const { + return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9; + } + + bool useStride4VFPs(const MachineFunction &MF) const; + + bool useMovt(const MachineFunction &MF) const; + + bool supportsTailCall() const { return SupportsTailCall; } + + bool allowsUnalignedMem() const { return !StrictAlign; } + + bool restrictIT() const { return RestrictIT; } + + const std::string & getCPUString() const { return CPUString; } + + bool isLittle() const { return IsLittle; } + + unsigned getMispredictionPenalty() const; + + /// This function returns true if the target has sincos() routine in its + /// compiler runtime or math libraries. + bool hasSinCos() const; + + /// Returns true if machine scheduler should be enabled. 
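+  /// (Per the definition in ARMSubtarget.cpp above, this returns
+  /// getSchedModel().isOutOfOrder() && isSwift().)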
+ bool enableMachineScheduler() const override; + + /// True for some subtargets at > -O0. + bool enablePostRAScheduler() const override; + + // enableAtomicExpand - True if we need to expand our atomics. + bool enableAtomicExpand() const override; + + /// getInstrItins - Return the instruction itineraries based on subtarget + /// selection. + const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } + + /// getStackAlignment - Returns the minimum alignment known to hold for the + /// stack frame on entry to the function, which must be maintained by every + /// function for this subtarget. + unsigned getStackAlignment() const { return stackAlignment; } + + /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect + /// symbol. + bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const; + + /// True if fast-isel is used. + bool useFastISel() const; +}; +} // End llvm namespace + +#endif // ARMSUBTARGET_H diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp new file mode 100644 index 0000000..fca1901 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -0,0 +1,431 @@ +//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMFrameLowering.h" +#include "ARMTargetMachine.h" +#include "ARMTargetObjectFile.h" +#include "ARMTargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Scalar.h" +using namespace llvm; + +static cl::opt<bool> +DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden, + cl::desc("Inhibit optimization of S->D register accesses on A15"), + cl::init(false)); + +static cl::opt<bool> +EnableAtomicTidy("arm-atomic-cfg-tidy", cl::Hidden, + cl::desc("Run SimplifyCFG after expanding atomic operations" + " to make use of cmpxchg flow-based information"), + cl::init(true)); + +static cl::opt<bool> +EnableARMLoadStoreOpt("arm-load-store-opt", cl::Hidden, + cl::desc("Enable ARM load/store optimization pass"), + cl::init(true)); + +// FIXME: Unify control over GlobalMerge. +static cl::opt<cl::boolOrDefault> +EnableGlobalMerge("arm-global-merge", cl::Hidden, + cl::desc("Enable the global merge pass")); + +extern "C" void LLVMInitializeARMTarget() { + // Register the target.
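+ // The four variants below cover little- and big-endian ARM and Thumb; + // registering them lets the TargetRegistry map e.g. arm-, armeb-, thumb- + // and thumbeb- triples to the right TargetMachine subclass.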
+ RegisterTargetMachine<ARMLETargetMachine> X(TheARMLETarget); + RegisterTargetMachine<ARMBETargetMachine> Y(TheARMBETarget); + RegisterTargetMachine<ThumbLETargetMachine> A(TheThumbLETarget); + RegisterTargetMachine<ThumbBETargetMachine> B(TheThumbBETarget); +} + +static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) + return make_unique<TargetLoweringObjectFileMachO>(); + if (TT.isOSWindows()) + return make_unique<TargetLoweringObjectFileCOFF>(); + return make_unique<ARMElfTargetObjectFile>(); +} + +static ARMBaseTargetMachine::ARMABI +computeTargetABI(const Triple &TT, StringRef CPU, + const TargetOptions &Options) { + if (Options.MCOptions.getABIName() == "aapcs16") + return ARMBaseTargetMachine::ARM_ABI_AAPCS16; + else if (Options.MCOptions.getABIName().startswith("aapcs")) + return ARMBaseTargetMachine::ARM_ABI_AAPCS; + else if (Options.MCOptions.getABIName().startswith("apcs")) + return ARMBaseTargetMachine::ARM_ABI_APCS; + + assert(Options.MCOptions.getABIName().empty() && + "Unknown target-abi option!"); + + ARMBaseTargetMachine::ARMABI TargetABI = + ARMBaseTargetMachine::ARM_ABI_UNKNOWN; + + // FIXME: This is duplicated code from the front end and should be unified. + if (TT.isOSBinFormatMachO()) { + if (TT.getEnvironment() == llvm::Triple::EABI || + (TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) || + CPU.startswith("cortex-m")) { + TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; + } else if (TT.isWatchOS()) { + TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16; + } else { + TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; + } + } else if (TT.isOSWindows()) { + // FIXME: this is invalid for WindowsCE + TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; + } else { + // Select the default based on the platform. + switch (TT.getEnvironment()) { + case llvm::Triple::Android: + case llvm::Triple::GNUEABI: + case llvm::Triple::GNUEABIHF: + case llvm::Triple::EABIHF: + case llvm::Triple::EABI: + TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; + break; + case llvm::Triple::GNU: + TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; + break; + default: + if (TT.isOSNetBSD()) + TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; + else + TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; + break; + } + } + + return TargetABI; +} + +static std::string computeDataLayout(const Triple &TT, StringRef CPU, + const TargetOptions &Options, + bool isLittle) { + auto ABI = computeTargetABI(TT, CPU, Options); + std::string Ret = ""; + + if (isLittle) + // Little endian. + Ret += "e"; + else + // Big endian. + Ret += "E"; + + Ret += DataLayout::getManglingComponent(TT); + + // Pointers are 32 bits and aligned to 32 bits. + Ret += "-p:32:32"; + + // ABIs other than APCS have 64-bit integers with natural alignment. + if (ABI != ARMBaseTargetMachine::ARM_ABI_APCS) + Ret += "-i64:64"; + + // We have 64-bit floats. The APCS ABI requires them to be aligned to 32 + // bits, others to 64 bits. We always try to align to 64 bits. + if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS) + Ret += "-f64:32:64"; + + // We have 128- and 64-bit vectors. The APCS ABI aligns them to 32 bits, others + // to 64. We always try to give them natural alignment. + if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS) + Ret += "-v64:32:64-v128:32:128"; + else if (ABI != ARMBaseTargetMachine::ARM_ABI_AAPCS16) + Ret += "-v128:64:128"; + + // Try to align aggregates to 32 bits (the default is 64 bits, which has no + // particular hardware support on 32-bit ARM).
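+ // As a rough illustration, a little-endian AAPCS ELF target accumulates a + // layout string along the lines of + // "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + // once the remaining components below have been appended.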
+ Ret += "-a:0:32"; + + // Integer registers are 32 bits. + Ret += "-n32"; + + // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit + // aligned everywhere else. + if (TT.isOSNaCl() || ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16) + Ret += "-S128"; + else if (ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS) + Ret += "-S64"; + else + Ret += "-S32"; + + return Ret; +} + +/// TargetMachine ctor - Create an ARM architecture model. +/// +ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool isLittle) + : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT, + CPU, FS, Options, RM, CM, OL), + TargetABI(computeTargetABI(TT, CPU, Options)), + TLOF(createTLOF(getTargetTriple())), + Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) { + + // Default to triple-appropriate float ABI + if (Options.FloatABIType == FloatABI::Default) + this->Options.FloatABIType = + Subtarget.isTargetHardFloat() ? FloatABI::Hard : FloatABI::Soft; + + // Default to triple-appropriate EABI + if (Options.EABIVersion == EABI::Default || + Options.EABIVersion == EABI::Unknown) { + if (Subtarget.isTargetGNUAEABI()) + this->Options.EABIVersion = EABI::GNU; + else + this->Options.EABIVersion = EABI::EABI5; + } +} + +ARMBaseTargetMachine::~ARMBaseTargetMachine() {} + +const ARMSubtarget * +ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { + Attribute CPUAttr = F.getFnAttribute("target-cpu"); + Attribute FSAttr = F.getFnAttribute("target-features"); + + std::string CPU = !CPUAttr.hasAttribute(Attribute::None) + ? CPUAttr.getValueAsString().str() + : TargetCPU; + std::string FS = !FSAttr.hasAttribute(Attribute::None) + ? FSAttr.getValueAsString().str() + : TargetFS; + + // FIXME: This is related to the code below to reset the target options, + // we need to know whether or not the soft float flag is set on the + // function before we can generate a subtarget. We also need to use + // it as a key for the subtarget since that can be the only difference + // between two functions. + bool SoftFloat = + F.hasFnAttribute("use-soft-float") && + F.getFnAttribute("use-soft-float").getValueAsString() == "true"; + // If the soft float attribute is set on the function turn on the soft float + // subtarget feature. + if (SoftFloat) + FS += FS.empty() ? "+soft-float" : ",+soft-float"; + + auto &I = SubtargetMap[CPU + FS]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. 
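+ // Note that the cache key above is simply the concatenation CPU + FS, so + // e.g. a "use-soft-float"="true" function on a cortex-a9 target is looked + // up under "cortex-a9+soft-float" and gets a subtarget distinct from an + // otherwise identical hard-float function.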
+ resetTargetOptions(F); + I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle); + } + return I.get(); +} + +TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(ARMTTIImpl(this, F)); + }); +} + +void ARMTargetMachine::anchor() {} + +ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool isLittle) + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) { + initAsmInfo(); + if (!Subtarget.hasARMOps()) + report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " + "support ARM mode execution!"); +} + +void ARMLETargetMachine::anchor() {} + +ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} + +void ARMBETargetMachine::anchor() {} + +ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} + +void ThumbTargetMachine::anchor() {} + +ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool isLittle) + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) { + initAsmInfo(); +} + +void ThumbLETargetMachine::anchor() {} + +ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} + +void ThumbBETargetMachine::anchor() {} + +ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} + +namespace { +/// ARM Code Generator Pass Configuration Options. +class ARMPassConfig : public TargetPassConfig { +public: + ARMPassConfig(ARMBaseTargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + ARMBaseTargetMachine &getARMTargetMachine() const { + return getTM<ARMBaseTargetMachine>(); + } + + void addIRPasses() override; + bool addPreISel() override; + bool addInstSelector() override; + void addPreRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; +} // namespace + +TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { + return new ARMPassConfig(this, PM); +} + +void ARMPassConfig::addIRPasses() { + if (TM->Options.ThreadModel == ThreadModel::Single) + addPass(createLowerAtomicPass()); + else + addPass(createAtomicExpandPass(TM)); + + // Cmpxchg instructions are often used with a subsequent comparison to + // determine whether it succeeded. We can exploit existing control-flow in + // ldrex/strex loops to simplify this, but it needs tidying up. 
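+ // Sketch of the pattern this targets: IR such as + // %res = cmpxchg i32* %p, i32 %old, i32 %new seq_cst seq_cst + // %ok = extractvalue { i32, i1 } %res, 1 + // br i1 %ok, ... + // lowers to an ldrex/strex loop whose exit condition already encodes + // success, so the extra compare-and-branch can often be folded into it.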
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) + addPass(createCFGSimplificationPass(-1, [this](const Function &F) { + const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F); + return ST.hasAnyDataBarrier() && !ST.isThumb1Only(); + })); + + TargetPassConfig::addIRPasses(); + + // Match interleaved memory accesses to ldN/stN intrinsics. + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createInterleavedAccessPass(TM)); +} + +bool ARMPassConfig::addPreISel() { + if ((TM->getOptLevel() != CodeGenOpt::None && + EnableGlobalMerge == cl::BOU_UNSET) || + EnableGlobalMerge == cl::BOU_TRUE) { + // FIXME: This is using the Thumb1-only constant value for + // maximal global offset for merging globals. We may want + // to look into using the old value for non-Thumb1 code of + // 4095 based on the TargetMachine, but this starts to become + // tricky when doing code gen per function. + bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) && + (EnableGlobalMerge == cl::BOU_UNSET); + // Merging of extern globals is enabled by default on non-Mach-O as we + // expect it to be generally either beneficial or harmless. On Mach-O it + // is disabled as we emit the .subsections_via_symbols directive which + // means that merging extern globals is not safe. + bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO(); + addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize, + MergeExternalByDefault)); + } + + return false; +} + +bool ARMPassConfig::addInstSelector() { + addPass(createARMISelDag(getARMTargetMachine(), getOptLevel())); + return false; +} + +void ARMPassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOpt::None) { + addPass(createMLxExpansionPass()); + + if (EnableARMLoadStoreOpt) + addPass(createARMLoadStoreOptimizationPass(/* pre-register alloc */ true)); + + if (!DisableA15SDOptimization) + addPass(createA15SDOptimizerPass()); + } +} + +void ARMPassConfig::addPreSched2() { + if (getOptLevel() != CodeGenOpt::None) { + if (EnableARMLoadStoreOpt) + addPass(createARMLoadStoreOptimizationPass()); + + addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass)); + } + + // Expand some pseudo instructions into multiple instructions to allow + // proper scheduling. + addPass(createARMExpandPseudoPass()); + + if (getOptLevel() != CodeGenOpt::None) { + // In v8, IfConversion depends on Thumb instruction widths. + addPass(createThumb2SizeReductionPass([this](const Function &F) { + return this->TM->getSubtarget<ARMSubtarget>(F).restrictIT(); + })); + + addPass(createIfConverter([this](const Function &F) { + return !this->TM->getSubtarget<ARMSubtarget>(F).isThumb1Only(); + })); + } + addPass(createThumb2ITBlockPass()); +} + +void ARMPassConfig::addPreEmitPass() { + addPass(createThumb2SizeReductionPass()); + + // The constant island pass works on unbundled instructions. + addPass(createUnpackMachineBundles([this](const Function &F) { + return this->TM->getSubtarget<ARMSubtarget>(F).isThumb2(); + })); + + // Don't optimize barriers at -O0.
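+ // (ARMOptimizeBarriers only removes DMBs it can prove redundant, so + // skipping it at -O0 affects code quality, not correctness.)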
+ if (getOptLevel() != CodeGenOpt::None) + addPass(createARMOptimizeBarriersPass()); + + addPass(createARMConstantIslandPass()); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h new file mode 100644 index 0000000..8ad1f3d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -0,0 +1,130 @@ +//===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the ARM specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H +#define LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H + +#include "ARMInstrInfo.h" +#include "ARMSubtarget.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class ARMBaseTargetMachine : public LLVMTargetMachine { +public: + enum ARMABI { + ARM_ABI_UNKNOWN, + ARM_ABI_APCS, + ARM_ABI_AAPCS, // ARM EABI + ARM_ABI_AAPCS16 + } TargetABI; + +protected: + std::unique_ptr<TargetLoweringObjectFile> TLOF; + ARMSubtarget Subtarget; + bool isLittle; + mutable StringMap<std::unique_ptr<ARMSubtarget>> SubtargetMap; + +public: + ARMBaseTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool isLittle); + ~ARMBaseTargetMachine() override; + + const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; } + const ARMSubtarget *getSubtargetImpl(const Function &F) const override; + bool isLittleEndian() const { return isLittle; } + + /// \brief Get the TargetIRAnalysis for this target. + TargetIRAnalysis getTargetIRAnalysis() override; + + // Pass Pipeline Configuration + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + + TargetLoweringObjectFile *getObjFileLowering() const override { + return TLOF.get(); + } +}; + +/// ARMTargetMachine - ARM target machine. +/// +class ARMTargetMachine : public ARMBaseTargetMachine { + virtual void anchor(); + public: + ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle); +}; + +/// ARMLETargetMachine - ARM little endian target machine. +/// +class ARMLETargetMachine : public ARMTargetMachine { + void anchor() override; +public: + ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +/// ARMBETargetMachine - ARM big endian target machine. +/// +class ARMBETargetMachine : public ARMTargetMachine { + void anchor() override; +public: + ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +/// ThumbTargetMachine - Thumb target machine. +/// Due to the way architectures are handled, this represents both +/// Thumb-1 and Thumb-2. 
+/// +class ThumbTargetMachine : public ARMBaseTargetMachine { + virtual void anchor(); +public: + ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, + bool isLittle); +}; + +/// ThumbLETargetMachine - Thumb little endian target machine. +/// +class ThumbLETargetMachine : public ThumbTargetMachine { + void anchor() override; +public: + ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +/// ThumbBETargetMachine - Thumb big endian target machine. +/// +class ThumbBETargetMachine : public ThumbTargetMachine { + void anchor() override; +public: + ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp new file mode 100644 index 0000000..eaed5cc --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -0,0 +1,61 @@ +//===-- llvm/Target/ARMTargetObjectFile.cpp - ARM Object Info Impl --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARMTargetObjectFile.h" +#include "ARMTargetMachine.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Support/ELF.h" +#include "llvm/Target/TargetLowering.h" +using namespace llvm; +using namespace dwarf; + +//===----------------------------------------------------------------------===// +// ELF Target +//===----------------------------------------------------------------------===// + +void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + bool isAAPCS_ABI = static_cast<const ARMTargetMachine &>(TM).TargetABI == + ARMTargetMachine::ARMABI::ARM_ABI_AAPCS; + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(isAAPCS_ABI); + + if (isAAPCS_ABI) { + LSDASection = nullptr; + } + + AttributesSection = + getContext().getELFSection(".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0); +} + +const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference( + const GlobalValue *GV, unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, MachineModuleInfo *MMI, + MCStreamer &Streamer) const { + if (TM.getMCAsmInfo()->getExceptionHandlingType() != ExceptionHandling::ARM) + return TargetLoweringObjectFileELF::getTTypeGlobalReference( + GV, Encoding, Mang, TM, MMI, Streamer); + + assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only"); + + return MCSymbolRefExpr::create(TM.getSymbol(GV, Mang), + MCSymbolRefExpr::VK_ARM_TARGET2, getContext()); +} + +const MCExpr *ARMElfTargetObjectFile:: +getDebugThreadLocalSymbol(const MCSymbol *Sym) const { + return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_ARM_TLSLDO, + getContext()); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h new file mode 
100644 index 0000000..98e8763 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h @@ -0,0 +1,43 @@ +//===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_ARM_ARMTARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" + +namespace llvm { + +class MCContext; +class TargetMachine; + +class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF { +protected: + const MCSection *AttributesSection; +public: + ARMElfTargetObjectFile() : + TargetLoweringObjectFileELF(), + AttributesSection(nullptr) + {} + + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + + const MCExpr * + getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding, + Mangler &Mang, const TargetMachine &TM, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; + + /// \brief Describe a TLS variable address within debug info. + const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp new file mode 100644 index 0000000..c152011 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -0,0 +1,495 @@ +//===-- ARMTargetTransformInfo.cpp - ARM specific TTI ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARMTargetTransformInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" +using namespace llvm; + +#define DEBUG_TYPE "armtti" + +int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { + assert(Ty->isIntegerTy()); + + unsigned Bits = Ty->getPrimitiveSizeInBits(); + if (Bits == 0 || Bits > 32) + return 4; + + int32_t SImmVal = Imm.getSExtValue(); + uint32_t ZImmVal = Imm.getZExtValue(); + if (!ST->isThumb()) { + if ((SImmVal >= 0 && SImmVal < 65536) || + (ARM_AM::getSOImmVal(ZImmVal) != -1) || + (ARM_AM::getSOImmVal(~ZImmVal) != -1)) + return 1; + return ST->hasV6T2Ops() ? 2 : 3; + } + if (ST->isThumb2()) { + if ((SImmVal >= 0 && SImmVal < 65536) || + (ARM_AM::getT2SOImmVal(ZImmVal) != -1) || + (ARM_AM::getT2SOImmVal(~ZImmVal) != -1)) + return 1; + return ST->hasV6T2Ops() ? 2 : 3; + } + // Thumb1. + if (SImmVal >= 0 && SImmVal < 256) + return 1; + if ((~ZImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal)) + return 2; + // Load from constantpool. + return 3; +} + +int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + // Single to/from double precision conversions. + static const CostTblEntry NEONFltDblTbl[] = { + // Vector fptrunc/fpext conversions. 
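+ // e.g. rounding v2f64 to v2f32 is modelled as two scalar VCVT operations + // (NEON itself has no double-precision arithmetic), hence the cost of 2 + // on the first entry below.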
+ { ISD::FP_ROUND, MVT::v2f64, 2 }, + { ISD::FP_EXTEND, MVT::v2f32, 2 }, + { ISD::FP_EXTEND, MVT::v4f32, 4 } + }; + + if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND || + ISD == ISD::FP_EXTEND)) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + EVT SrcTy = TLI->getValueType(DL, Src); + EVT DstTy = TLI->getValueType(DL, Dst); + + if (!SrcTy.isSimple() || !DstTy.isSimple()) + return BaseT::getCastInstrCost(Opcode, Dst, Src); + + // Some arithmetic, load and store operations have specific instructions + // to cast up/down their types automatically at no extra cost. + // TODO: Get these tables to know at least what the related operations are. + static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + + // The number of vmovl instructions for the extension. + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + + // Operations that we legalize using splitting. + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + + // Vector float <-> i32 conversions. + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, + + { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 }, + { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 }, + { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, + { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, + + // Vector double <-> i32 conversions. 
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, + { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 }, + { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 }, + { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 }, + { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 } + }; + + if (SrcTy.isVector() && ST->hasNEON()) { + if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + // Scalar float to integer conversions. + static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = { + { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 }, + { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 }, + { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 }, + { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 }, + { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 }, + { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 }, + { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 }, + { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 }, + { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 }, + { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 }, + { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 }, + { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 }, + { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 }, + { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 }, + { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 }, + { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 }, + { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 }, + { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 } + }; + if (SrcTy.isFloatingPoint() && ST->hasNEON()) { + if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + // Scalar integer to float conversions. + static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = { + { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 }, + { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 }, + { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 }, + { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 }, + { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 }, + { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 }, + { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 }, + { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 }, + { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 }, + { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 }, + { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 }, + { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 }, + { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 }, + { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 }, + { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 }, + { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 }, + { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 }, + { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 }, + { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 }, + { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 } + }; + + if (SrcTy.isInteger() && ST->hasNEON()) { + if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl, + ISD, DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + // Scalar integer conversion costs. + static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { + // i16 -> i64 requires two dependent operations. 
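+ // (roughly a sign extend to produce the low word plus an arithmetic + // shift right by 31 to materialize the high word).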
+ { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 }, + + // Truncates on i64 are assumed to be free. + { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 }, + { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 }, + { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 }, + { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 } + }; + + if (SrcTy.isInteger()) { + if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + return BaseT::getCastInstrCost(Opcode, Dst, Src); +} + +int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + // Penalize inserting into a D-subregister. We end up with a three times + // lower estimated throughput on Swift. + if (ST->isSwift() && + Opcode == Instruction::InsertElement && + ValTy->isVectorTy() && + ValTy->getScalarSizeInBits() <= 32) + return 3; + + if ((Opcode == Instruction::InsertElement || + Opcode == Instruction::ExtractElement)) { + // Cross-class copies are expensive on many microarchitectures, + // so assume they are expensive by default. + if (ValTy->getVectorElementType()->isIntegerTy()) + return 3; + + // Even if it's not a cross-class copy, this likely leads to mixing + // of NEON and VFP code and should therefore be penalized. + if (ValTy->isVectorTy() && + ValTy->getScalarSizeInBits() <= 32) + return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U); + } + + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); +} + +int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + // On NEON a vector select gets lowered to vbsl. + if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) { + // Lowering of some vector selects is currently far from perfect. + static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = { + { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 }, + { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 }, + { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 } + }; + + EVT SelCondTy = TLI->getValueType(DL, CondTy); + EVT SelValTy = TLI->getValueType(DL, ValTy); + if (SelCondTy.isSimple() && SelValTy.isSimple()) { + if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, + SelCondTy.getSimpleVT(), + SelValTy.getSimpleVT())) + return Entry->Cost; + } + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + return LT.first; + } + + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); +} + +int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { + // Address computations in vectorized code with non-consecutive addresses will + // likely result in more instructions compared to scalar code where the + // computation can more often be merged into the index mode. The resulting + // extra micro-ops can significantly decrease throughput. + unsigned NumVectorInstToHideOverhead = 10; + + if (Ty->isVectorTy() && IsComplex) + return NumVectorInstToHideOverhead; + + // In many cases the address computation is not merged into the instruction + // addressing mode. + return 1; +} + +int ARMTTIImpl::getFPOpCost(Type *Ty) { + // Use logic similar to that in ARMISelLowering: + // Any ARM CPU with VFP2 has floating point, but Thumb1 doesn't have access + // to VFP. + + if (ST->hasVFP2() && !ST->isThumb1Only()) { + if (Ty->isFloatTy()) { + return TargetTransformInfo::TCC_Basic; + } + + if (Ty->isDoubleTy()) { + return ST->isFPOnlySP() ?
TargetTransformInfo::TCC_Expensive : + TargetTransformInfo::TCC_Basic; + } + } + + return TargetTransformInfo::TCC_Expensive; +} + +int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + // We only handle costs of reverse and alternate shuffles for now. + if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + + if (Kind == TTI::SK_Reverse) { + static const CostTblEntry NEONShuffleTbl[] = { + // A reverse shuffle costs one instruction if we are shuffling within a + // double word (vrev) or two if we shuffle a quad word (vrev, vext). + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost; + + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + } + if (Kind == TTI::SK_Alternate) { + static const CostTblEntry NEONAltShuffleTbl[] = { + // Alt shuffle cost table for ARM. Cost is the number of instructions + // required to create the shuffled vector. + + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, + + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, + + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + if (const auto *Entry = CostTableLookup(NEONAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + } + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +} + +int ARMTTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, + TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo) { + + int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + + const unsigned FunctionCallDivCost = 20; + const unsigned ReciprocalDivCost = 10; + static const CostTblEntry CostTbl[] = { + // Division. + // These costs are somewhat random. Choose a cost of 20 to indicate that + // vectorizing division (added function call) is going to be very expensive. + // Double register types.
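+ // e.g. an SDIV of v2i32 is assumed to scalarize into two library calls + // (2 * FunctionCallDivCost), while v4i16 and v8i8 divides can instead use + // a cheaper NEON reciprocal-estimate sequence (ReciprocalDivCost).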
+ { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v4i16, ReciprocalDivCost}, + { ISD::UDIV, MVT::v4i16, ReciprocalDivCost}, + { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost}, + { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v8i8, ReciprocalDivCost}, + { ISD::UDIV, MVT::v8i8, ReciprocalDivCost}, + { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost}, + { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost}, + // Quad register types. + { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost}, + { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost}, + { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost}, + // Multiplication. + }; + + if (ST->hasNEON()) + if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) + return LT.first * Entry->Cost; + + int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo); + + // This is somewhat of a hack. The problem that we are facing is that SROA + // creates sequences of shift/and/or instructions to construct values. + // These sequences are recognized by the ISel and have zero cost. Not so for + // the vectorized code. Because we have support for v2i64 but not i64, those + // sequences look particularly beneficial to vectorize. + // To work around this we increase the cost of v2i64 operations to make them + // seem less beneficial. + if (LT.second == MVT::v2i64 && + Op2Info == TargetTransformInfo::OK_UniformConstantValue) + Cost += 4; + + return Cost; +} + +int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + + if (Src->isVectorTy() && Alignment != 16 && + Src->getVectorElementType()->isDoubleTy()) { + // Unaligned loads/stores are extremely inefficient. + // We need 4 uops for vst.1/vld.1 vs. 1 uop for vldr/vstr. + return LT.first * 4; + } + return LT.first; +} + +int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { + assert(Factor >= 2 && "Invalid interleave factor"); + assert(isa<VectorType>(VecTy) && "Expect a vector type"); + + // vldN/vstN don't support vector types with i64/f64 elements.
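+ // For example, a factor-2 interleaved load of <8 x i16> splits into two + // 64-bit <4 x i16> subvectors, which a single vld2 can handle, so the + // cost returned below is just the factor (2).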
+ bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; + + if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { + unsigned NumElts = VecTy->getVectorNumElements(); + Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + + // vldN/vstN only support legal vector types of size 64 or 128 bits. + if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) + return Factor; + } + + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h new file mode 100644 index 0000000..7d8d238 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -0,0 +1,127 @@ +//===-- ARMTargetTransformInfo.h - ARM specific TTI -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines a TargetTransformInfo::Concept conforming object specific +/// to the ARM target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target-independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H + +#include "ARM.h" +#include "ARMTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> { + typedef BasicTTIImplBase<ARMTTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const ARMSubtarget *ST; + const ARMTargetLowering *TLI; + + /// Estimate the overhead of scalarizing an instruction. Insert and Extract + /// are set if the result needs to be inserted and/or extracted from vectors. + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); + + const ARMSubtarget *getST() const { return ST; } + const ARMTargetLowering *getTLI() const { return TLI; } + +public: + explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out.
+ ARMTTIImpl(const ARMTTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + ARMTTIImpl(ARMTTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + + bool enableInterleavedAccessVectorization() { return true; } + + /// \name Scalar TTI Implementations + /// @{ + + using BaseT::getIntImmCost; + int getIntImmCost(const APInt &Imm, Type *Ty); + + /// @} + + /// \name Vector TTI Implementations + /// @{ + + unsigned getNumberOfRegisters(bool Vector) { + if (Vector) { + if (ST->hasNEON()) + return 16; + return 0; + } + + if (ST->isThumb1Only()) + return 8; + return 13; + } + + unsigned getRegisterBitWidth(bool Vector) { + if (Vector) { + if (ST->hasNEON()) + return 128; + return 0; + } + + return 32; + } + + unsigned getMaxInterleaveFactor(unsigned VF) { + // These are out of order CPUs: + if (ST->isCortexA15() || ST->isSwift()) + return 2; + return 1; + } + + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + + int getAddressComputationCost(Type *Val, bool IsComplex); + + int getFPOpCost(Type *Ty); + + int getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef<unsigned> Indices, unsigned Alignment, + unsigned AddressSpace); + /// @} +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp new file mode 100644 index 0000000..c69a741 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -0,0 +1,10034 @@ +//===-- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "ARMFeatures.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMMCExpr.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCAsmParserUtils.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/ARMBuildAttributes.h" +#include "llvm/Support/ARMEHABI.h" +#include "llvm/Support/TargetParser.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +class ARMOperand; + +enum VectorLaneTy { NoLanes, AllLanes, IndexedLane }; + +class UnwindContext { + MCAsmParser &Parser; + + typedef SmallVector<SMLoc, 4> Locs; + + Locs FnStartLocs; + Locs CantUnwindLocs; + Locs PersonalityLocs; + Locs PersonalityIndexLocs; + Locs HandlerDataLocs; + int FPReg; + +public: + UnwindContext(MCAsmParser &P) : Parser(P), FPReg(ARM::SP) {} + + bool hasFnStart() const { return !FnStartLocs.empty(); } + bool cantUnwind() const { return !CantUnwindLocs.empty(); } + bool hasHandlerData() const { return !HandlerDataLocs.empty(); } + bool hasPersonality() const { + return !(PersonalityLocs.empty() && PersonalityIndexLocs.empty()); + } + + void recordFnStart(SMLoc L) { FnStartLocs.push_back(L); } + void recordCantUnwind(SMLoc L) { CantUnwindLocs.push_back(L); } + void recordPersonality(SMLoc L) { PersonalityLocs.push_back(L); } + void recordHandlerData(SMLoc L) { HandlerDataLocs.push_back(L); } + void recordPersonalityIndex(SMLoc L) { PersonalityIndexLocs.push_back(L); } + + void saveFPReg(int Reg) { FPReg = Reg; } + int getFPReg() const { return FPReg; } + + void emitFnStartLocNotes() const { + for (Locs::const_iterator FI = FnStartLocs.begin(), FE = FnStartLocs.end(); + FI != FE; ++FI) + Parser.Note(*FI, ".fnstart was specified here"); + } + void emitCantUnwindLocNotes() const { + for (Locs::const_iterator UI = CantUnwindLocs.begin(), + UE = CantUnwindLocs.end(); UI != UE; ++UI) + Parser.Note(*UI, ".cantunwind was specified here"); + } + void emitHandlerDataLocNotes() const { + for (Locs::const_iterator HI = HandlerDataLocs.begin(), + HE = HandlerDataLocs.end(); HI != HE; ++HI) + Parser.Note(*HI, ".handlerdata was specified here"); + } + void emitPersonalityLocNotes() const { + for (Locs::const_iterator PI = PersonalityLocs.begin(), + PE = PersonalityLocs.end(), + PII = PersonalityIndexLocs.begin(), + PIE = PersonalityIndexLocs.end(); + PI != PE || PII != PIE;) { + if (PI != PE && (PII == PIE || PI->getPointer() < 
PII->getPointer())) + Parser.Note(*PI++, ".personality was specified here"); + else if (PII != PIE && (PI == PE || PII->getPointer() < PI->getPointer())) + Parser.Note(*PII++, ".personalityindex was specified here"); + else + llvm_unreachable(".personality and .personalityindex cannot be " + "at the same location"); + } + } + + void reset() { + FnStartLocs = Locs(); + CantUnwindLocs = Locs(); + PersonalityLocs = Locs(); + HandlerDataLocs = Locs(); + PersonalityIndexLocs = Locs(); + FPReg = ARM::SP; + } +}; + +class ARMAsmParser : public MCTargetAsmParser { + const MCInstrInfo &MII; + const MCRegisterInfo *MRI; + UnwindContext UC; + + ARMTargetStreamer &getTargetStreamer() { + assert(getParser().getStreamer().getTargetStreamer() && + "do not have a target streamer"); + MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); + return static_cast<ARMTargetStreamer &>(TS); + } + + // Map of register aliases defined via the .req directive. + StringMap<unsigned> RegisterReqs; + + bool NextSymbolIsThumb; + + struct { + ARMCC::CondCodes Cond; // Condition for IT block. + unsigned Mask:4; // Condition mask for instructions, + // starting at the first 1 (from the lsb): + // '1' - condition as indicated in IT, + // '0' - inverse of the condition (else). + // The count of instructions in the IT + // block is 4 - trailingzeroes(Mask). + + bool FirstCond; // Explicit flag for when we're parsing the + // first instruction in the IT block. It's + // implied in the mask, so needs special + // handling. + + unsigned CurPosition; // Current position in parsing of IT + // block. In range [0,3]. Initialized + // according to count of instructions in block. + // ~0U if no active IT block. + } ITState; + bool inITBlock() { return ITState.CurPosition != ~0U; } + bool lastInITBlock() { + return ITState.CurPosition == 4 - countTrailingZeros(ITState.Mask); + } + void forwardITPosition() { + if (!inITBlock()) return; + // Move to the next instruction in the IT block, if there is one. If not, + // mark the block as done. + unsigned TZ = countTrailingZeros(ITState.Mask); + if (++ITState.CurPosition == 5 - TZ) + ITState.CurPosition = ~0U; // Done with the IT block after this.
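+ // e.g. a lone "IT EQ" is encoded as Mask == 0b1000, giving + // 4 - countTrailingZeros(0b1000) == 1 predicated instruction in the block.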
+ } + + void Note(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges = None) { + return getParser().Note(L, Msg, Ranges); + } + bool Warning(SMLoc L, const Twine &Msg, + ArrayRef<SMRange> Ranges = None) { + return getParser().Warning(L, Msg, Ranges); + } + bool Error(SMLoc L, const Twine &Msg, + ArrayRef<SMRange> Ranges = None) { + return getParser().Error(L, Msg, Ranges); + } + + bool validatetLDMRegList(const MCInst &Inst, const OperandVector &Operands, + unsigned ListNo, bool IsARPop = false); + bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands, + unsigned ListNo); + + int tryParseRegister(); + bool tryParseRegisterWithWriteBack(OperandVector &); + int tryParseShiftRegister(OperandVector &); + bool parseRegisterList(OperandVector &); + bool parseMemory(OperandVector &); + bool parseOperand(OperandVector &, StringRef Mnemonic); + bool parsePrefix(ARMMCExpr::VariantKind &RefKind); + bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType, + unsigned &ShiftAmount); + bool parseLiteralValues(unsigned Size, SMLoc L); + bool parseDirectiveThumb(SMLoc L); + bool parseDirectiveARM(SMLoc L); + bool parseDirectiveThumbFunc(SMLoc L); + bool parseDirectiveCode(SMLoc L); + bool parseDirectiveSyntax(SMLoc L); + bool parseDirectiveReq(StringRef Name, SMLoc L); + bool parseDirectiveUnreq(SMLoc L); + bool parseDirectiveArch(SMLoc L); + bool parseDirectiveEabiAttr(SMLoc L); + bool parseDirectiveCPU(SMLoc L); + bool parseDirectiveFPU(SMLoc L); + bool parseDirectiveFnStart(SMLoc L); + bool parseDirectiveFnEnd(SMLoc L); + bool parseDirectiveCantUnwind(SMLoc L); + bool parseDirectivePersonality(SMLoc L); + bool parseDirectiveHandlerData(SMLoc L); + bool parseDirectiveSetFP(SMLoc L); + bool parseDirectivePad(SMLoc L); + bool parseDirectiveRegSave(SMLoc L, bool IsVector); + bool parseDirectiveInst(SMLoc L, char Suffix = '\0'); + bool parseDirectiveLtorg(SMLoc L); + bool parseDirectiveEven(SMLoc L); + bool parseDirectivePersonalityIndex(SMLoc L); + bool parseDirectiveUnwindRaw(SMLoc L); + bool parseDirectiveTLSDescSeq(SMLoc L); + bool parseDirectiveMovSP(SMLoc L); + bool parseDirectiveObjectArch(SMLoc L); + bool parseDirectiveArchExtension(SMLoc L); + bool parseDirectiveAlign(SMLoc L); + bool parseDirectiveThumbSet(SMLoc L); + + StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode, + bool &CarrySetting, unsigned &ProcessorIMod, + StringRef &ITMask); + void getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst, + bool &CanAcceptCarrySet, + bool &CanAcceptPredicationCode); + + void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting, + OperandVector &Operands); + bool isThumb() const { + // FIXME: Can tablegen auto-generate this? 
+ return getSTI().getFeatureBits()[ARM::ModeThumb]; + } + bool isThumbOne() const { + return isThumb() && !getSTI().getFeatureBits()[ARM::FeatureThumb2]; + } + bool isThumbTwo() const { + return isThumb() && getSTI().getFeatureBits()[ARM::FeatureThumb2]; + } + bool hasThumb() const { + return getSTI().getFeatureBits()[ARM::HasV4TOps]; + } + bool hasV6Ops() const { + return getSTI().getFeatureBits()[ARM::HasV6Ops]; + } + bool hasV6MOps() const { + return getSTI().getFeatureBits()[ARM::HasV6MOps]; + } + bool hasV7Ops() const { + return getSTI().getFeatureBits()[ARM::HasV7Ops]; + } + bool hasV8Ops() const { + return getSTI().getFeatureBits()[ARM::HasV8Ops]; + } + bool hasARM() const { + return !getSTI().getFeatureBits()[ARM::FeatureNoARM]; + } + bool hasDSP() const { + return getSTI().getFeatureBits()[ARM::FeatureDSP]; + } + bool hasD16() const { + return getSTI().getFeatureBits()[ARM::FeatureD16]; + } + bool hasV8_1aOps() const { + return getSTI().getFeatureBits()[ARM::HasV8_1aOps]; + } + + void SwitchMode() { + MCSubtargetInfo &STI = copySTI(); + uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb)); + setAvailableFeatures(FB); + } + bool isMClass() const { + return getSTI().getFeatureBits()[ARM::FeatureMClass]; + } + + /// @name Auto-generated Match Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "ARMGenAsmMatcher.inc" + + /// } + + OperandMatchResultTy parseITCondCode(OperandVector &); + OperandMatchResultTy parseCoprocNumOperand(OperandVector &); + OperandMatchResultTy parseCoprocRegOperand(OperandVector &); + OperandMatchResultTy parseCoprocOptionOperand(OperandVector &); + OperandMatchResultTy parseMemBarrierOptOperand(OperandVector &); + OperandMatchResultTy parseInstSyncBarrierOptOperand(OperandVector &); + OperandMatchResultTy parseProcIFlagsOperand(OperandVector &); + OperandMatchResultTy parseMSRMaskOperand(OperandVector &); + OperandMatchResultTy parseBankedRegOperand(OperandVector &); + OperandMatchResultTy parsePKHImm(OperandVector &O, StringRef Op, int Low, + int High); + OperandMatchResultTy parsePKHLSLImm(OperandVector &O) { + return parsePKHImm(O, "lsl", 0, 31); + } + OperandMatchResultTy parsePKHASRImm(OperandVector &O) { + return parsePKHImm(O, "asr", 1, 32); + } + OperandMatchResultTy parseSetEndImm(OperandVector &); + OperandMatchResultTy parseShifterImm(OperandVector &); + OperandMatchResultTy parseRotImm(OperandVector &); + OperandMatchResultTy parseModImm(OperandVector &); + OperandMatchResultTy parseBitfield(OperandVector &); + OperandMatchResultTy parsePostIdxReg(OperandVector &); + OperandMatchResultTy parseAM3Offset(OperandVector &); + OperandMatchResultTy parseFPImm(OperandVector &); + OperandMatchResultTy parseVectorList(OperandVector &); + OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, + SMLoc &EndLoc); + + // Asm Match Converter Methods + void cvtThumbMultiply(MCInst &Inst, const OperandVector &); + void cvtThumbBranches(MCInst &Inst, const OperandVector &); + + bool validateInstruction(MCInst &Inst, const OperandVector &Ops); + bool processInstruction(MCInst &Inst, const OperandVector &Ops, MCStreamer &Out); + bool shouldOmitCCOutOperand(StringRef Mnemonic, OperandVector &Operands); + bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands); + +public: + enum ARMMatchResultTy { + Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY, + Match_RequiresNotITBlock, + Match_RequiresV6, + Match_RequiresThumb2, + Match_RequiresV8, +#define GET_OPERAND_DIAGNOSTIC_TYPES +#include 
"ARMGenAsmMatcher.inc" + + }; + + ARMAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, STI), MII(MII), UC(Parser) { + MCAsmParserExtension::Initialize(Parser); + + // Cache the MCRegisterInfo. + MRI = getContext().getRegisterInfo(); + + // Initialize the set of available features. + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + + // Not in an ITBlock to start with. + ITState.CurPosition = ~0U; + + NextSymbolIsThumb = false; + } + + // Implementation of the MCTargetAsmParser interface: + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + bool ParseDirective(AsmToken DirectiveID) override; + + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, + unsigned Kind) override; + unsigned checkTargetMatchPredicate(MCInst &Inst) override; + + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) override; + void onLabelParsed(MCSymbol *Symbol) override; +}; +} // end anonymous namespace + +namespace { + +/// ARMOperand - Instances of this class represent a parsed ARM machine +/// operand. +class ARMOperand : public MCParsedAsmOperand { + enum KindTy { + k_CondCode, + k_CCOut, + k_ITCondMask, + k_CoprocNum, + k_CoprocReg, + k_CoprocOption, + k_Immediate, + k_MemBarrierOpt, + k_InstSyncBarrierOpt, + k_Memory, + k_PostIndexRegister, + k_MSRMask, + k_BankedReg, + k_ProcIFlags, + k_VectorIndex, + k_Register, + k_RegisterList, + k_DPRRegisterList, + k_SPRRegisterList, + k_VectorList, + k_VectorListAllLanes, + k_VectorListIndexed, + k_ShiftedRegister, + k_ShiftedImmediate, + k_ShifterImmediate, + k_RotateImmediate, + k_ModifiedImmediate, + k_BitfieldDescriptor, + k_Token + } Kind; + + SMLoc StartLoc, EndLoc, AlignmentLoc; + SmallVector<unsigned, 8> Registers; + + struct CCOp { + ARMCC::CondCodes Val; + }; + + struct CopOp { + unsigned Val; + }; + + struct CoprocOptionOp { + unsigned Val; + }; + + struct ITMaskOp { + unsigned Mask:4; + }; + + struct MBOptOp { + ARM_MB::MemBOpt Val; + }; + + struct ISBOptOp { + ARM_ISB::InstSyncBOpt Val; + }; + + struct IFlagsOp { + ARM_PROC::IFlags Val; + }; + + struct MMaskOp { + unsigned Val; + }; + + struct BankedRegOp { + unsigned Val; + }; + + struct TokOp { + const char *Data; + unsigned Length; + }; + + struct RegOp { + unsigned RegNum; + }; + + // A vector register list is a sequential list of 1 to 4 registers. + struct VectorListOp { + unsigned RegNum; + unsigned Count; + unsigned LaneIndex; + bool isDoubleSpaced; + }; + + struct VectorIndexOp { + unsigned Val; + }; + + struct ImmOp { + const MCExpr *Val; + }; + + /// Combined record for all forms of ARM address expressions. + struct MemoryOp { + unsigned BaseRegNum; + // Offset is in OffsetReg or OffsetImm. If both are zero, no offset + // was specified. + const MCConstantExpr *OffsetImm; // Offset immediate value + unsigned OffsetRegNum; // Offset register num, when OffsetImm == NULL + ARM_AM::ShiftOpc ShiftType; // Shift type for OffsetReg + unsigned ShiftImm; // shift for OffsetReg. + unsigned Alignment; // 0 = no alignment specified + // n = alignment in bytes (2, 4, 8, 16, or 32) + unsigned isNegative : 1; // Negated OffsetReg? 
(~'U' bit) + }; + + struct PostIdxRegOp { + unsigned RegNum; + bool isAdd; + ARM_AM::ShiftOpc ShiftTy; + unsigned ShiftImm; + }; + + struct ShifterImmOp { + bool isASR; + unsigned Imm; + }; + + struct RegShiftedRegOp { + ARM_AM::ShiftOpc ShiftTy; + unsigned SrcReg; + unsigned ShiftReg; + unsigned ShiftImm; + }; + + struct RegShiftedImmOp { + ARM_AM::ShiftOpc ShiftTy; + unsigned SrcReg; + unsigned ShiftImm; + }; + + struct RotImmOp { + unsigned Imm; + }; + + struct ModImmOp { + unsigned Bits; + unsigned Rot; + }; + + struct BitfieldOp { + unsigned LSB; + unsigned Width; + }; + + union { + struct CCOp CC; + struct CopOp Cop; + struct CoprocOptionOp CoprocOption; + struct MBOptOp MBOpt; + struct ISBOptOp ISBOpt; + struct ITMaskOp ITMask; + struct IFlagsOp IFlags; + struct MMaskOp MMask; + struct BankedRegOp BankedReg; + struct TokOp Tok; + struct RegOp Reg; + struct VectorListOp VectorList; + struct VectorIndexOp VectorIndex; + struct ImmOp Imm; + struct MemoryOp Memory; + struct PostIdxRegOp PostIdxReg; + struct ShifterImmOp ShifterImm; + struct RegShiftedRegOp RegShiftedReg; + struct RegShiftedImmOp RegShiftedImm; + struct RotImmOp RotImm; + struct ModImmOp ModImm; + struct BitfieldOp Bitfield; + }; + +public: + ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const override { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. + SMLoc getEndLoc() const override { return EndLoc; } + /// getLocRange - Get the range between the first and last token of this + /// operand. + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + + /// getAlignmentLoc - Get the location of the Alignment token of this operand. 
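+  /// For example, in "vld1.8 {d0}, [r0:64]" this would be the location of
+  /// the ":64" token, which alignment diagnostics point at. (Illustrative
+  /// syntax; the alignment suffix is optional.)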
+  SMLoc getAlignmentLoc() const {
+    assert(Kind == k_Memory && "Invalid access!");
+    return AlignmentLoc;
+  }
+
+  ARMCC::CondCodes getCondCode() const {
+    assert(Kind == k_CondCode && "Invalid access!");
+    return CC.Val;
+  }
+
+  unsigned getCoproc() const {
+    assert((Kind == k_CoprocNum || Kind == k_CoprocReg) && "Invalid access!");
+    return Cop.Val;
+  }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getReg() const override {
+    assert((Kind == k_Register || Kind == k_CCOut) && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const SmallVectorImpl<unsigned> &getRegList() const {
+    assert((Kind == k_RegisterList || Kind == k_DPRRegisterList ||
+            Kind == k_SPRRegisterList) && "Invalid access!");
+    return Registers;
+  }
+
+  const MCExpr *getImm() const {
+    assert(isImm() && "Invalid access!");
+    return Imm.Val;
+  }
+
+  unsigned getVectorIndex() const {
+    assert(Kind == k_VectorIndex && "Invalid access!");
+    return VectorIndex.Val;
+  }
+
+  ARM_MB::MemBOpt getMemBarrierOpt() const {
+    assert(Kind == k_MemBarrierOpt && "Invalid access!");
+    return MBOpt.Val;
+  }
+
+  ARM_ISB::InstSyncBOpt getInstSyncBarrierOpt() const {
+    assert(Kind == k_InstSyncBarrierOpt && "Invalid access!");
+    return ISBOpt.Val;
+  }
+
+  ARM_PROC::IFlags getProcIFlags() const {
+    assert(Kind == k_ProcIFlags && "Invalid access!");
+    return IFlags.Val;
+  }
+
+  unsigned getMSRMask() const {
+    assert(Kind == k_MSRMask && "Invalid access!");
+    return MMask.Val;
+  }
+
+  unsigned getBankedReg() const {
+    assert(Kind == k_BankedReg && "Invalid access!");
+    return BankedReg.Val;
+  }
+
+  bool isCoprocNum() const { return Kind == k_CoprocNum; }
+  bool isCoprocReg() const { return Kind == k_CoprocReg; }
+  bool isCoprocOption() const { return Kind == k_CoprocOption; }
+  bool isCondCode() const { return Kind == k_CondCode; }
+  bool isCCOut() const { return Kind == k_CCOut; }
+  bool isITMask() const { return Kind == k_ITCondMask; }
+  bool isITCondCode() const { return Kind == k_CondCode; }
+  bool isImm() const override { return Kind == k_Immediate; }
+  // Checks whether this operand is an unsigned offset which fits in a field
+  // of specified width and is scaled by a specific number of bits.
+  template<unsigned width, unsigned scale>
+  bool isUnsignedOffset() const {
+    if (!isImm()) return false;
+    if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Align = 1LL << scale;
+      int64_t Max = Align * ((1LL << width) - 1);
+      return ((Val % Align) == 0) && (Val >= 0) && (Val <= Max);
+    }
+    return false;
+  }
+  // Checks whether this operand is a signed offset which fits in a field
+  // of specified width and is scaled by a specific number of bits.
+  template<unsigned width, unsigned scale>
+  bool isSignedOffset() const {
+    if (!isImm()) return false;
+    if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Align = 1LL << scale;
+      int64_t Max = Align * ((1LL << (width-1)) - 1);
+      int64_t Min = -Align * (1LL << (width-1));
+      return ((Val % Align) == 0) && (Val >= Min) && (Val <= Max);
+    }
+    return false;
+  }
+
+  // Checks whether this operand is a memory operand computed as an offset
+  // applied to PC. The offset may have 8 bits of magnitude and is represented
+  // with two bits of shift. Textually it may be either [pc, #imm], #imm, or
+  // a relocatable expression...
+  bool isThumbMemPC() const {
+    int64_t Val = 0;
+    if (isImm()) {
+      if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val);
+      if (!CE) return false;
+      Val = CE->getValue();
+    }
+    else if (isMem()) {
+      if (!Memory.OffsetImm || Memory.OffsetRegNum) return false;
+      if (Memory.BaseRegNum != ARM::PC) return false;
+      Val = Memory.OffsetImm->getValue();
+    }
+    else return false;
+    return ((Val % 4) == 0) && (Val >= 0) && (Val <= 1020);
+  }
+  bool isFPImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
+    return Val != -1;
+  }
+  bool isFBits16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value <= 16;
+  }
+  bool isFBits32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 1 && Value <= 32;
+  }
+  bool isImm8s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020;
+  }
+  bool isImm0_1020s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= 0 && Value <= 1020;
+  }
+  bool isImm0_508s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= 0 && Value <= 508;
+  }
+  bool isImm0_508s4Neg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = -CE->getValue();
+    // Explicitly exclude zero. We want that to use the normal 0_508 version.
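+    // e.g. #-508 is accepted here (Value becomes 508 after negation);
+    // zero and positive values are left to the plain imm0_508s4 form.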
+ return ((Value & 3) == 0) && Value > 0 && Value <= 508; + } + bool isImm0_239() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 240; + } + bool isImm0_255() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 256; + } + bool isImm0_4095() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 4096; + } + bool isImm0_4095Neg() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = -CE->getValue(); + return Value > 0 && Value < 4096; + } + bool isImm0_1() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 2; + } + bool isImm0_3() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 4; + } + bool isImm0_7() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 8; + } + bool isImm0_15() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 16; + } + bool isImm0_31() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 32; + } + bool isImm0_63() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 64; + } + bool isImm8() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value == 8; + } + bool isImm16() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value == 16; + } + bool isImm32() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value == 32; + } + bool isShrImm8() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value <= 8; + } + bool isShrImm16() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value <= 16; + } + bool isShrImm32() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value <= 32; + } + bool 
isShrImm64() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value <= 64; + } + bool isImm1_7() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value < 8; + } + bool isImm1_15() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value < 16; + } + bool isImm1_31() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value < 32; + } + bool isImm1_16() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value < 17; + } + bool isImm1_32() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value < 33; + } + bool isImm0_32() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 33; + } + bool isImm0_65535() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 65536; + } + bool isImm256_65535Expr() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + // If it's not a constant expression, it'll generate a fixup and be + // handled later. + if (!CE) return true; + int64_t Value = CE->getValue(); + return Value >= 256 && Value < 65536; + } + bool isImm0_65535Expr() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + // If it's not a constant expression, it'll generate a fixup and be + // handled later. + if (!CE) return true; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 65536; + } + bool isImm24bit() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value <= 0xffffff; + } + bool isImmThumbSR() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value < 33; + } + bool isPKHLSLImm() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 32; + } + bool isPKHASRImm() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value <= 32; + } + bool isAdrLabel() const { + // If we have an immediate that's not a constant, treat it as a label + // reference needing a fixup. 
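+    // e.g. "adr r0, label" with a not-yet-resolved label stays a symbol
+    // expression here and is resolved by a fixup later.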
+ if (isImm() && !isa<MCConstantExpr>(getImm())) + return true; + + // If it is a constant, it must fit into a modified immediate encoding. + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return (ARM_AM::getSOImmVal(Value) != -1 || + ARM_AM::getSOImmVal(-Value) != -1); + } + bool isT2SOImm() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return ARM_AM::getT2SOImmVal(Value) != -1; + } + bool isT2SOImmNot() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return ARM_AM::getT2SOImmVal(Value) == -1 && + ARM_AM::getT2SOImmVal(~Value) != -1; + } + bool isT2SOImmNeg() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + // Only use this when not representable as a plain so_imm. + return ARM_AM::getT2SOImmVal(Value) == -1 && + ARM_AM::getT2SOImmVal(-Value) != -1; + } + bool isSetEndImm() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value == 1 || Value == 0; + } + bool isReg() const override { return Kind == k_Register; } + bool isRegList() const { return Kind == k_RegisterList; } + bool isDPRRegList() const { return Kind == k_DPRRegisterList; } + bool isSPRRegList() const { return Kind == k_SPRRegisterList; } + bool isToken() const override { return Kind == k_Token; } + bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; } + bool isInstSyncBarrierOpt() const { return Kind == k_InstSyncBarrierOpt; } + bool isMem() const override { return Kind == k_Memory; } + bool isShifterImm() const { return Kind == k_ShifterImmediate; } + bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; } + bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; } + bool isRotImm() const { return Kind == k_RotateImmediate; } + bool isModImm() const { return Kind == k_ModifiedImmediate; } + bool isModImmNot() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return ARM_AM::getSOImmVal(~Value) != -1; + } + bool isModImmNeg() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return ARM_AM::getSOImmVal(Value) == -1 && + ARM_AM::getSOImmVal(-Value) != -1; + } + bool isBitfield() const { return Kind == k_BitfieldDescriptor; } + bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; } + bool isPostIdxReg() const { + return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy ==ARM_AM::no_shift; + } + bool isMemNoOffset(bool alignOK = false, unsigned Alignment = 0) const { + if (!isMem()) + return false; + // No offset of any kind. + return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr && + (alignOK || Memory.Alignment == Alignment); + } + bool isMemPCRelImm12() const { + if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Base register must be PC. + if (Memory.BaseRegNum != ARM::PC) + return false; + // Immediate offset in range [-4095, 4095]. 
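+    // e.g. "ldr r0, [pc, #-4]" qualifies; note that #-0 is represented as
+    // INT32_MIN, which the check below accepts explicitly.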
+ if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return (Val > -4096 && Val < 4096) || (Val == INT32_MIN); + } + bool isAlignedMemory() const { + return isMemNoOffset(true); + } + bool isAlignedMemoryNone() const { + return isMemNoOffset(false, 0); + } + bool isDupAlignedMemoryNone() const { + return isMemNoOffset(false, 0); + } + bool isAlignedMemory16() const { + if (isMemNoOffset(false, 2)) // alignment in bytes for 16-bits is 2. + return true; + return isMemNoOffset(false, 0); + } + bool isDupAlignedMemory16() const { + if (isMemNoOffset(false, 2)) // alignment in bytes for 16-bits is 2. + return true; + return isMemNoOffset(false, 0); + } + bool isAlignedMemory32() const { + if (isMemNoOffset(false, 4)) // alignment in bytes for 32-bits is 4. + return true; + return isMemNoOffset(false, 0); + } + bool isDupAlignedMemory32() const { + if (isMemNoOffset(false, 4)) // alignment in bytes for 32-bits is 4. + return true; + return isMemNoOffset(false, 0); + } + bool isAlignedMemory64() const { + if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8. + return true; + return isMemNoOffset(false, 0); + } + bool isDupAlignedMemory64() const { + if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8. + return true; + return isMemNoOffset(false, 0); + } + bool isAlignedMemory64or128() const { + if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8. + return true; + if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16. + return true; + return isMemNoOffset(false, 0); + } + bool isDupAlignedMemory64or128() const { + if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8. + return true; + if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16. + return true; + return isMemNoOffset(false, 0); + } + bool isAlignedMemory64or128or256() const { + if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8. + return true; + if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16. + return true; + if (isMemNoOffset(false, 32)) // alignment in bytes for 256-bits is 32. + return true; + return isMemNoOffset(false, 0); + } + bool isAddrMode2() const { + if (!isMem() || Memory.Alignment != 0) return false; + // Check for register offset. + if (Memory.OffsetRegNum) return true; + // Immediate offset in range [-4095, 4095]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val > -4096 && Val < 4096; + } + bool isAM2OffsetImm() const { + if (!isImm()) return false; + // Immediate offset in range [-4095, 4095]. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Val = CE->getValue(); + return (Val == INT32_MIN) || (Val > -4096 && Val < 4096); + } + bool isAddrMode3() const { + // If we have an immediate that's not a constant, treat it as a label + // reference needing a fixup. If it is a constant, it's something else + // and we reject it. + if (isImm() && !isa<MCConstantExpr>(getImm())) + return true; + if (!isMem() || Memory.Alignment != 0) return false; + // No shifts are legal for AM3. + if (Memory.ShiftType != ARM_AM::no_shift) return false; + // Check for register offset. + if (Memory.OffsetRegNum) return true; + // Immediate offset in range [-255, 255]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + // The #-0 offset is encoded as INT32_MIN, and we have to check + // for this too. 
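+    // e.g. "ldrd r0, r1, [r2, #-0]" carries INT32_MIN rather than 0 as its
+    // offset, so the plain range test alone would wrongly reject it.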
+    return (Val > -256 && Val < 256) || Val == INT32_MIN;
+  }
+  bool isAM3Offset() const {
+    if (Kind != k_Immediate && Kind != k_PostIndexRegister)
+      return false;
+    if (Kind == k_PostIndexRegister)
+      return PostIdxReg.ShiftTy == ARM_AM::no_shift;
+    // Immediate offset in range [-255, 255].
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    // Special case, #-0 is INT32_MIN.
+    return (Val > -256 && Val < 256) || Val == INT32_MIN;
+  }
+  bool isAddrMode5() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+    if (!isMem() || Memory.Alignment != 0) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return false;
+    // Immediate offset in range [-1020, 1020] and a multiple of 4.
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val >= -1020 && Val <= 1020 && ((Val & 3) == 0)) ||
+           Val == INT32_MIN;
+  }
+  bool isMemTBB() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+      return false;
+    return true;
+  }
+  bool isMemTBH() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm != 1 ||
+        Memory.Alignment != 0)
+      return false;
+    return true;
+  }
+  bool isMemRegOffset() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.Alignment != 0)
+      return false;
+    return true;
+  }
+  bool isT2MemRegOffset() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.Alignment != 0)
+      return false;
+    // Only lsl #{0, 1, 2, 3} allowed.
+    if (Memory.ShiftType == ARM_AM::no_shift)
+      return true;
+    if (Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm > 3)
+      return false;
+    return true;
+  }
+  bool isMemThumbRR() const {
+    // Thumb reg+reg addressing is simple. Just two registers, a base and
+    // an offset. No shifts, negations or any other complicating factors.
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+      return false;
+    return isARMLowRegister(Memory.BaseRegNum) &&
+           (!Memory.OffsetRegNum || isARMLowRegister(Memory.OffsetRegNum));
+  }
+  bool isMemThumbRIs4() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 124].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 124 && (Val % 4) == 0;
+  }
+  bool isMemThumbRIs2() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 2 in range [0, 62].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 62 && (Val % 2) == 0;
+  }
+  bool isMemThumbRIs1() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 31].
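+    // e.g. Thumb "ldrb r0, [r1, #31]"; byte accesses scale by one, so the
+    // raw offset range really is just [0, 31].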
+ if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val >= 0 && Val <= 31; + } + bool isMemThumbSPI() const { + if (!isMem() || Memory.OffsetRegNum != 0 || + Memory.BaseRegNum != ARM::SP || Memory.Alignment != 0) + return false; + // Immediate offset, multiple of 4 in range [0, 1020]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val >= 0 && Val <= 1020 && (Val % 4) == 0; + } + bool isMemImm8s4Offset() const { + // If we have an immediate that's not a constant, treat it as a label + // reference needing a fixup. If it is a constant, it's something else + // and we reject it. + if (isImm() && !isa<MCConstantExpr>(getImm())) + return true; + if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Immediate offset a multiple of 4 in range [-1020, 1020]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + // Special case, #-0 is INT32_MIN. + return (Val >= -1020 && Val <= 1020 && (Val & 3) == 0) || Val == INT32_MIN; + } + bool isMemImm0_1020s4Offset() const { + if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Immediate offset a multiple of 4 in range [0, 1020]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val >= 0 && Val <= 1020 && (Val & 3) == 0; + } + bool isMemImm8Offset() const { + if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Base reg of PC isn't allowed for these encodings. + if (Memory.BaseRegNum == ARM::PC) return false; + // Immediate offset in range [-255, 255]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return (Val == INT32_MIN) || (Val > -256 && Val < 256); + } + bool isMemPosImm8Offset() const { + if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Immediate offset in range [0, 255]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val >= 0 && Val < 256; + } + bool isMemNegImm8Offset() const { + if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Base reg of PC isn't allowed for these encodings. + if (Memory.BaseRegNum == ARM::PC) return false; + // Immediate offset in range [-255, -1]. + if (!Memory.OffsetImm) return false; + int64_t Val = Memory.OffsetImm->getValue(); + return (Val == INT32_MIN) || (Val > -256 && Val < 0); + } + bool isMemUImm12Offset() const { + if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Immediate offset in range [0, 4095]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return (Val >= 0 && Val < 4096); + } + bool isMemImm12Offset() const { + // If we have an immediate that's not a constant, treat it as a label + // reference needing a fixup. If it is a constant, it's something else + // and we reject it. + if (isImm() && !isa<MCConstantExpr>(getImm())) + return true; + + if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Immediate offset in range [-4095, 4095]. 
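+    // e.g. ARM-mode "ldr r0, [r1, #-4095]" is representable here; as with
+    // the other negative-capable forms, #-0 arrives as INT32_MIN.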
+ if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return (Val > -4096 && Val < 4096) || (Val == INT32_MIN); + } + bool isPostIdxImm8() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Val = CE->getValue(); + return (Val > -256 && Val < 256) || (Val == INT32_MIN); + } + bool isPostIdxImm8s4() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Val = CE->getValue(); + return ((Val & 3) == 0 && Val >= -1020 && Val <= 1020) || + (Val == INT32_MIN); + } + + bool isMSRMask() const { return Kind == k_MSRMask; } + bool isBankedReg() const { return Kind == k_BankedReg; } + bool isProcIFlags() const { return Kind == k_ProcIFlags; } + + // NEON operands. + bool isSingleSpacedVectorList() const { + return Kind == k_VectorList && !VectorList.isDoubleSpaced; + } + bool isDoubleSpacedVectorList() const { + return Kind == k_VectorList && VectorList.isDoubleSpaced; + } + bool isVecListOneD() const { + if (!isSingleSpacedVectorList()) return false; + return VectorList.Count == 1; + } + + bool isVecListDPair() const { + if (!isSingleSpacedVectorList()) return false; + return (ARMMCRegisterClasses[ARM::DPairRegClassID] + .contains(VectorList.RegNum)); + } + + bool isVecListThreeD() const { + if (!isSingleSpacedVectorList()) return false; + return VectorList.Count == 3; + } + + bool isVecListFourD() const { + if (!isSingleSpacedVectorList()) return false; + return VectorList.Count == 4; + } + + bool isVecListDPairSpaced() const { + if (Kind != k_VectorList) return false; + if (isSingleSpacedVectorList()) return false; + return (ARMMCRegisterClasses[ARM::DPairSpcRegClassID] + .contains(VectorList.RegNum)); + } + + bool isVecListThreeQ() const { + if (!isDoubleSpacedVectorList()) return false; + return VectorList.Count == 3; + } + + bool isVecListFourQ() const { + if (!isDoubleSpacedVectorList()) return false; + return VectorList.Count == 4; + } + + bool isSingleSpacedVectorAllLanes() const { + return Kind == k_VectorListAllLanes && !VectorList.isDoubleSpaced; + } + bool isDoubleSpacedVectorAllLanes() const { + return Kind == k_VectorListAllLanes && VectorList.isDoubleSpaced; + } + bool isVecListOneDAllLanes() const { + if (!isSingleSpacedVectorAllLanes()) return false; + return VectorList.Count == 1; + } + + bool isVecListDPairAllLanes() const { + if (!isSingleSpacedVectorAllLanes()) return false; + return (ARMMCRegisterClasses[ARM::DPairRegClassID] + .contains(VectorList.RegNum)); + } + + bool isVecListDPairSpacedAllLanes() const { + if (!isDoubleSpacedVectorAllLanes()) return false; + return VectorList.Count == 2; + } + + bool isVecListThreeDAllLanes() const { + if (!isSingleSpacedVectorAllLanes()) return false; + return VectorList.Count == 3; + } + + bool isVecListThreeQAllLanes() const { + if (!isDoubleSpacedVectorAllLanes()) return false; + return VectorList.Count == 3; + } + + bool isVecListFourDAllLanes() const { + if (!isSingleSpacedVectorAllLanes()) return false; + return VectorList.Count == 4; + } + + bool isVecListFourQAllLanes() const { + if (!isDoubleSpacedVectorAllLanes()) return false; + return VectorList.Count == 4; + } + + bool isSingleSpacedVectorIndexed() const { + return Kind == k_VectorListIndexed && !VectorList.isDoubleSpaced; + } + bool isDoubleSpacedVectorIndexed() const { + return Kind == k_VectorListIndexed && VectorList.isDoubleSpaced; + } + bool isVecListOneDByteIndexed() 
const { + if (!isSingleSpacedVectorIndexed()) return false; + return VectorList.Count == 1 && VectorList.LaneIndex <= 7; + } + + bool isVecListOneDHWordIndexed() const { + if (!isSingleSpacedVectorIndexed()) return false; + return VectorList.Count == 1 && VectorList.LaneIndex <= 3; + } + + bool isVecListOneDWordIndexed() const { + if (!isSingleSpacedVectorIndexed()) return false; + return VectorList.Count == 1 && VectorList.LaneIndex <= 1; + } + + bool isVecListTwoDByteIndexed() const { + if (!isSingleSpacedVectorIndexed()) return false; + return VectorList.Count == 2 && VectorList.LaneIndex <= 7; + } + + bool isVecListTwoDHWordIndexed() const { + if (!isSingleSpacedVectorIndexed()) return false; + return VectorList.Count == 2 && VectorList.LaneIndex <= 3; + } + + bool isVecListTwoQWordIndexed() const { + if (!isDoubleSpacedVectorIndexed()) return false; + return VectorList.Count == 2 && VectorList.LaneIndex <= 1; + } + + bool isVecListTwoQHWordIndexed() const { + if (!isDoubleSpacedVectorIndexed()) return false; + return VectorList.Count == 2 && VectorList.LaneIndex <= 3; + } + + bool isVecListTwoDWordIndexed() const { + if (!isSingleSpacedVectorIndexed()) return false; + return VectorList.Count == 2 && VectorList.LaneIndex <= 1; + } + + bool isVecListThreeDByteIndexed() const { + if (!isSingleSpacedVectorIndexed()) return false; + return VectorList.Count == 3 && VectorList.LaneIndex <= 7; + } + + bool isVecListThreeDHWordIndexed() const { + if (!isSingleSpacedVectorIndexed()) return false; + return VectorList.Count == 3 && VectorList.LaneIndex <= 3; + } + + bool isVecListThreeQWordIndexed() const { + if (!isDoubleSpacedVectorIndexed()) return false; + return VectorList.Count == 3 && VectorList.LaneIndex <= 1; + } + + bool isVecListThreeQHWordIndexed() const { + if (!isDoubleSpacedVectorIndexed()) return false; + return VectorList.Count == 3 && VectorList.LaneIndex <= 3; + } + + bool isVecListThreeDWordIndexed() const { + if (!isSingleSpacedVectorIndexed()) return false; + return VectorList.Count == 3 && VectorList.LaneIndex <= 1; + } + + bool isVecListFourDByteIndexed() const { + if (!isSingleSpacedVectorIndexed()) return false; + return VectorList.Count == 4 && VectorList.LaneIndex <= 7; + } + + bool isVecListFourDHWordIndexed() const { + if (!isSingleSpacedVectorIndexed()) return false; + return VectorList.Count == 4 && VectorList.LaneIndex <= 3; + } + + bool isVecListFourQWordIndexed() const { + if (!isDoubleSpacedVectorIndexed()) return false; + return VectorList.Count == 4 && VectorList.LaneIndex <= 1; + } + + bool isVecListFourQHWordIndexed() const { + if (!isDoubleSpacedVectorIndexed()) return false; + return VectorList.Count == 4 && VectorList.LaneIndex <= 3; + } + + bool isVecListFourDWordIndexed() const { + if (!isSingleSpacedVectorIndexed()) return false; + return VectorList.Count == 4 && VectorList.LaneIndex <= 1; + } + + bool isVectorIndex8() const { + if (Kind != k_VectorIndex) return false; + return VectorIndex.Val < 8; + } + bool isVectorIndex16() const { + if (Kind != k_VectorIndex) return false; + return VectorIndex.Val < 4; + } + bool isVectorIndex32() const { + if (Kind != k_VectorIndex) return false; + return VectorIndex.Val < 2; + } + + bool isNEONi8splat() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + // Must be a constant. + if (!CE) return false; + int64_t Value = CE->getValue(); + // i8 value splatted across 8 bytes. The immediate is just the 8 byte + // value. 
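+    // e.g. "vmov.i8 d0, #0x42" replicates the byte 0x42 into all eight
+    // lanes; the operand checked here is just that single byte value.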
+    return Value >= 0 && Value < 256;
+  }
+
+  bool isNEONi16splat() const {
+    if (isNEONByteReplicate(2))
+      return false; // Leave this for byte replication and forbid it by default.
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    unsigned Value = CE->getValue();
+    return ARM_AM::isNEONi16splat(Value);
+  }
+
+  bool isNEONi16splatNot() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    unsigned Value = CE->getValue();
+    return ARM_AM::isNEONi16splat(~Value & 0xffff);
+  }
+
+  bool isNEONi32splat() const {
+    if (isNEONByteReplicate(4))
+      return false; // Leave this for byte replication and forbid it by default.
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    unsigned Value = CE->getValue();
+    return ARM_AM::isNEONi32splat(Value);
+  }
+
+  bool isNEONi32splatNot() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    unsigned Value = CE->getValue();
+    return ARM_AM::isNEONi32splat(~Value);
+  }
+
+  bool isNEONByteReplicate(unsigned NumBytes) const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE)
+      return false;
+    int64_t Value = CE->getValue();
+    if (!Value)
+      return false; // Don't bother with zero.
+
+    unsigned char B = Value & 0xff;
+    for (unsigned i = 1; i < NumBytes; ++i) {
+      Value >>= 8;
+      if ((Value & 0xff) != B)
+        return false;
+    }
+    return true;
+  }
+  bool isNEONi16ByteReplicate() const { return isNEONByteReplicate(2); }
+  bool isNEONi32ByteReplicate() const { return isNEONByteReplicate(4); }
+  bool isNEONi32vmov() const {
+    if (isNEONByteReplicate(4))
+      return false; // Let it be classified as the byte-replicate case.
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE)
+      return false;
+    int64_t Value = CE->getValue();
+    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
+    // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+    // FIXME: This is probably wrong and a copy and paste from previous example
+    return (Value >= 0 && Value < 256) ||
+           (Value >= 0x0100 && Value <= 0xff00) ||
+           (Value >= 0x010000 && Value <= 0xff0000) ||
+           (Value >= 0x01000000 && Value <= 0xff000000) ||
+           (Value >= 0x01ff && Value <= 0xffff && (Value & 0xff) == 0xff) ||
+           (Value >= 0x01ffff && Value <= 0xffffff && (Value & 0xffff) == 0xffff);
+  }
+  bool isNEONi32vmovNeg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    int64_t Value = ~CE->getValue();
+    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
+    // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
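+    // In intent: exactly one non-zero byte (e.g. 0x00ab0000), or one byte
+    // followed only by 0xff bytes (e.g. 0xabff). As the FIXME below notes,
+    // the interval checks are looser than that intent.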
+ // FIXME: This is probably wrong and a copy and paste from previous example + return (Value >= 0 && Value < 256) || + (Value >= 0x0100 && Value <= 0xff00) || + (Value >= 0x010000 && Value <= 0xff0000) || + (Value >= 0x01000000 && Value <= 0xff000000) || + (Value >= 0x01ff && Value <= 0xffff && (Value & 0xff) == 0xff) || + (Value >= 0x01ffff && Value <= 0xffffff && (Value & 0xffff) == 0xffff); + } + + bool isNEONi64splat() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + // Must be a constant. + if (!CE) return false; + uint64_t Value = CE->getValue(); + // i64 value with each byte being either 0 or 0xff. + for (unsigned i = 0; i < 8; ++i) + if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff) return false; + return true; + } + + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. Null MCExpr = 0. + if (!Expr) + Inst.addOperand(MCOperand::createImm(0)); + else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) + Inst.addOperand(MCOperand::createImm(CE->getValue())); + else + Inst.addOperand(MCOperand::createExpr(Expr)); + } + + void addCondCodeOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(unsigned(getCondCode()))); + unsigned RegNum = getCondCode() == ARMCC::AL ? 0: ARM::CPSR; + Inst.addOperand(MCOperand::createReg(RegNum)); + } + + void addCoprocNumOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getCoproc())); + } + + void addCoprocRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getCoproc())); + } + + void addCoprocOptionOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(CoprocOption.Val)); + } + + void addITMaskOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(ITMask.Mask)); + } + + void addITCondCodeOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(unsigned(getCondCode()))); + } + + void addCCOutOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getReg())); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getReg())); + } + + void addRegShiftedRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands!"); + assert(isRegShiftedReg() && + "addRegShiftedRegOperands() on non-RegShiftedReg!"); + Inst.addOperand(MCOperand::createReg(RegShiftedReg.SrcReg)); + Inst.addOperand(MCOperand::createReg(RegShiftedReg.ShiftReg)); + Inst.addOperand(MCOperand::createImm( + ARM_AM::getSORegOpc(RegShiftedReg.ShiftTy, RegShiftedReg.ShiftImm))); + } + + void addRegShiftedImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + assert(isRegShiftedImm() && + "addRegShiftedImmOperands() on non-RegShiftedImm!"); + Inst.addOperand(MCOperand::createReg(RegShiftedImm.SrcReg)); + // Shift of #32 is encoded as 0 where permitted + unsigned Imm = (RegShiftedImm.ShiftImm == 32 ? 
0 : RegShiftedImm.ShiftImm);
+    Inst.addOperand(MCOperand::createImm(
+      ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, Imm)));
+  }
+
+  void addShifterImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm((ShifterImm.isASR << 5) |
+                                         ShifterImm.Imm));
+  }
+
+  void addRegListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const SmallVectorImpl<unsigned> &RegList = getRegList();
+    for (SmallVectorImpl<unsigned>::const_iterator
+           I = RegList.begin(), E = RegList.end(); I != E; ++I)
+      Inst.addOperand(MCOperand::createReg(*I));
+  }
+
+  void addDPRRegListOperands(MCInst &Inst, unsigned N) const {
+    addRegListOperands(Inst, N);
+  }
+
+  void addSPRRegListOperands(MCInst &Inst, unsigned N) const {
+    addRegListOperands(Inst, N);
+  }
+
+  void addRotImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // Encoded as val>>3. The printer handles display as 8, 16, 24.
+    Inst.addOperand(MCOperand::createImm(RotImm.Imm >> 3));
+  }
+
+  void addModImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    // Support for fixups (MCFixup)
+    if (isImm())
+      return addImmOperands(Inst, N);
+
+    Inst.addOperand(MCOperand::createImm(ModImm.Bits | (ModImm.Rot << 7)));
+  }
+
+  void addModImmNotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    uint32_t Enc = ARM_AM::getSOImmVal(~CE->getValue());
+    Inst.addOperand(MCOperand::createImm(Enc));
+  }
+
+  void addModImmNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    uint32_t Enc = ARM_AM::getSOImmVal(-CE->getValue());
+    Inst.addOperand(MCOperand::createImm(Enc));
+  }
+
+  void addBitfieldOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // Munge the lsb/width into a bitfield mask.
+    unsigned lsb = Bitfield.LSB;
+    unsigned width = Bitfield.Width;
+    // Make a 32-bit mask w/ the referenced bits clear and all other bits set.
+    uint32_t Mask = ~(((uint32_t)0xffffffff >> lsb) << (32 - width) >>
+                      (32 - (lsb + width)));
+    Inst.addOperand(MCOperand::createImm(Mask));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addFBits16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(16 - CE->getValue()));
+  }
+
+  void addFBits32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(32 - CE->getValue()));
+  }
+
+  void addFPImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addImm8s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // FIXME: We really want to scale the value here, but the LDRD/STRD
+    // instructions don't encode operands that way yet.
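+    // e.g. an offset of #-1020 is stored as -1020 in the MCInst rather
+    // than being scaled to -255; the 0_1020s4 forms below store value/4.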
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue())); + } + + void addImm0_1020s4Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate is scaled by four in the encoding and is stored + // in the MCInst as such. Lop off the low two bits here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue() / 4)); + } + + void addImm0_508s4NegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate is scaled by four in the encoding and is stored + // in the MCInst as such. Lop off the low two bits here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(-(CE->getValue() / 4))); + } + + void addImm0_508s4Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate is scaled by four in the encoding and is stored + // in the MCInst as such. Lop off the low two bits here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue() / 4)); + } + + void addImm1_16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The constant encodes as the immediate-1, and we store in the instruction + // the bits as encoded, so subtract off one here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue() - 1)); + } + + void addImm1_32Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The constant encodes as the immediate-1, and we store in the instruction + // the bits as encoded, so subtract off one here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue() - 1)); + } + + void addImmThumbSROperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The constant encodes as the immediate, except for 32, which encodes as + // zero. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + unsigned Imm = CE->getValue(); + Inst.addOperand(MCOperand::createImm((Imm == 32 ? 0 : Imm))); + } + + void addPKHASRImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // An ASR value of 32 encodes as 0, so that's how we want to add it to + // the instruction as well. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + int Val = CE->getValue(); + Inst.addOperand(MCOperand::createImm(Val == 32 ? 0 : Val)); + } + + void addT2SOImmNotOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The operand is actually a t2_so_imm, but we have its bitwise + // negation in the assembly source, so twiddle it here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(~CE->getValue())); + } + + void addT2SOImmNegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The operand is actually a t2_so_imm, but we have its + // negation in the assembly source, so twiddle it here. 
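+    // e.g. a source operand of #-10 is emitted as the positive value 10;
+    // this form is only matched when the negated value is encodable.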
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(-CE->getValue())); + } + + void addImm0_4095NegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The operand is actually an imm0_4095, but we have its + // negation in the assembly source, so twiddle it here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(-CE->getValue())); + } + + void addUnsignedOffset_b8s2Operands(MCInst &Inst, unsigned N) const { + if(const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm())) { + Inst.addOperand(MCOperand::createImm(CE->getValue() >> 2)); + return; + } + + const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val); + assert(SR && "Unknown value type!"); + Inst.addOperand(MCOperand::createExpr(SR)); + } + + void addThumbMemPCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + if (isImm()) { + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (CE) { + Inst.addOperand(MCOperand::createImm(CE->getValue())); + return; + } + + const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val); + assert(SR && "Unknown value type!"); + Inst.addOperand(MCOperand::createExpr(SR)); + return; + } + + assert(isMem() && "Unknown value type!"); + assert(isa<MCConstantExpr>(Memory.OffsetImm) && "Unknown value type!"); + Inst.addOperand(MCOperand::createImm(Memory.OffsetImm->getValue())); + } + + void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(unsigned(getMemBarrierOpt()))); + } + + void addInstSyncBarrierOptOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(unsigned(getInstSyncBarrierOpt()))); + } + + void addMemNoOffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + } + + void addMemPCRelImm12Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + int32_t Imm = Memory.OffsetImm->getValue(); + Inst.addOperand(MCOperand::createImm(Imm)); + } + + void addAdrLabelOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + assert(isImm() && "Not an immediate!"); + + // If we have an immediate that's not a constant, treat it as a label + // reference needing a fixup. 
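+    // e.g. for "adr r0, label" the symbol expression is added unchanged
+    // and the address is filled in by a later fixup; a constant immediate
+    // takes the path below.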
+ if (!isa<MCConstantExpr>(getImm())) { + Inst.addOperand(MCOperand::createExpr(getImm())); + return; + } + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + int Val = CE->getValue(); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addAlignedMemoryOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createImm(Memory.Alignment)); + } + + void addDupAlignedMemoryNoneOperands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAlignedMemoryNoneOperands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAlignedMemory16Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addDupAlignedMemory16Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAlignedMemory32Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addDupAlignedMemory32Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAlignedMemory64Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addDupAlignedMemory64Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAlignedMemory64or128Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addDupAlignedMemory64or128Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAlignedMemory64or128or256Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAddrMode2Operands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands!"); + int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; + if (!Memory.OffsetRegNum) { + ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add; + // Special case for #-0 + if (Val == INT32_MIN) Val = 0; + if (Val < 0) Val = -Val; + Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift); + } else { + // For register offset, we encode the shift type and negation flag + // here. + Val = ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, + Memory.ShiftImm, Memory.ShiftType); + } + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addAM2OffsetImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + assert(CE && "non-constant AM2OffsetImm operand!"); + int32_t Val = CE->getValue(); + ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add; + // Special case for #-0 + if (Val == INT32_MIN) Val = 0; + if (Val < 0) Val = -Val; + Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift); + Inst.addOperand(MCOperand::createReg(0)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addAddrMode3Operands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands!"); + // If we have an immediate that's not a constant, treat it as a label + // reference needing a fixup. If it is a constant, it's something else + // and we reject it. 
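+    // (isAddrMode3() above never classifies a bare constant as AM3, so if
+    // isImm() holds here the operand can only be a label-style expression.)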
+ if (isImm()) { + Inst.addOperand(MCOperand::createExpr(getImm())); + Inst.addOperand(MCOperand::createReg(0)); + Inst.addOperand(MCOperand::createImm(0)); + return; + } + + int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; + if (!Memory.OffsetRegNum) { + ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add; + // Special case for #-0 + if (Val == INT32_MIN) Val = 0; + if (Val < 0) Val = -Val; + Val = ARM_AM::getAM3Opc(AddSub, Val); + } else { + // For register offset, we encode the shift type and negation flag + // here. + Val = ARM_AM::getAM3Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, 0); + } + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addAM3OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + if (Kind == k_PostIndexRegister) { + int32_t Val = + ARM_AM::getAM3Opc(PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub, 0); + Inst.addOperand(MCOperand::createReg(PostIdxReg.RegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + return; + } + + // Constant offset. + const MCConstantExpr *CE = static_cast<const MCConstantExpr*>(getImm()); + int32_t Val = CE->getValue(); + ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add; + // Special case for #-0 + if (Val == INT32_MIN) Val = 0; + if (Val < 0) Val = -Val; + Val = ARM_AM::getAM3Opc(AddSub, Val); + Inst.addOperand(MCOperand::createReg(0)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addAddrMode5Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + // If we have an immediate that's not a constant, treat it as a label + // reference needing a fixup. If it is a constant, it's something else + // and we reject it. + if (isImm()) { + Inst.addOperand(MCOperand::createExpr(getImm())); + Inst.addOperand(MCOperand::createImm(0)); + return; + } + + // The lower two bits are always zero and as such are not encoded. + int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0; + ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add; + // Special case for #-0 + if (Val == INT32_MIN) Val = 0; + if (Val < 0) Val = -Val; + Val = ARM_AM::getAM5Opc(AddSub, Val); + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addMemImm8s4OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + // If we have an immediate that's not a constant, treat it as a label + // reference needing a fixup. If it is a constant, it's something else + // and we reject it. + if (isImm()) { + Inst.addOperand(MCOperand::createExpr(getImm())); + Inst.addOperand(MCOperand::createImm(0)); + return; + } + + int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addMemImm0_1020s4OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + // The lower two bits are always zero and as such are not encoded. + int32_t Val = Memory.OffsetImm ? 
Memory.OffsetImm->getValue() / 4 : 0; + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addMemImm8OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addMemPosImm8OffsetOperands(MCInst &Inst, unsigned N) const { + addMemImm8OffsetOperands(Inst, N); + } + + void addMemNegImm8OffsetOperands(MCInst &Inst, unsigned N) const { + addMemImm8OffsetOperands(Inst, N); + } + + void addMemUImm12OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + // If this is an immediate, it's a label reference. + if (isImm()) { + addExpr(Inst, getImm()); + Inst.addOperand(MCOperand::createImm(0)); + return; + } + + // Otherwise, it's a normal memory reg+offset. + int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addMemImm12OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + // If this is an immediate, it's a label reference. + if (isImm()) { + addExpr(Inst, getImm()); + Inst.addOperand(MCOperand::createImm(0)); + return; + } + + // Otherwise, it's a normal memory reg+offset. + int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addMemTBBOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum)); + } + + void addMemTBHOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum)); + } + + void addMemRegOffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands!"); + unsigned Val = + ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, + Memory.ShiftImm, Memory.ShiftType); + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addT2MemRegOffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum)); + Inst.addOperand(MCOperand::createImm(Memory.ShiftImm)); + } + + void addMemThumbRROperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum)); + } + + void addMemThumbRIs4Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + int64_t Val = Memory.OffsetImm ? 
(Memory.OffsetImm->getValue() / 4) : 0; + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addMemThumbRIs2Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 2) : 0; + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addMemThumbRIs1Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue()) : 0; + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addMemThumbSPIOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0; + Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::createImm(Val)); + } + + void addPostIdxImm8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + assert(CE && "non-constant post-idx-imm8 operand!"); + int Imm = CE->getValue(); + bool isAdd = Imm >= 0; + if (Imm == INT32_MIN) Imm = 0; + Imm = (Imm < 0 ? -Imm : Imm) | (int)isAdd << 8; + Inst.addOperand(MCOperand::createImm(Imm)); + } + + void addPostIdxImm8s4Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + assert(CE && "non-constant post-idx-imm8s4 operand!"); + int Imm = CE->getValue(); + bool isAdd = Imm >= 0; + if (Imm == INT32_MIN) Imm = 0; + // Immediate is scaled by 4. + Imm = ((Imm < 0 ? -Imm : Imm) / 4) | (int)isAdd << 8; + Inst.addOperand(MCOperand::createImm(Imm)); + } + + void addPostIdxRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(PostIdxReg.RegNum)); + Inst.addOperand(MCOperand::createImm(PostIdxReg.isAdd)); + } + + void addPostIdxRegShiftedOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(PostIdxReg.RegNum)); + // The sign, shift type, and shift amount are encoded in a single operand + // using the AM2 encoding helpers. + ARM_AM::AddrOpc opc = PostIdxReg.isAdd ? 
ARM_AM::add : ARM_AM::sub; + unsigned Imm = ARM_AM::getAM2Opc(opc, PostIdxReg.ShiftImm, + PostIdxReg.ShiftTy); + Inst.addOperand(MCOperand::createImm(Imm)); + } + + void addMSRMaskOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(unsigned(getMSRMask()))); + } + + void addBankedRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(unsigned(getBankedReg()))); + } + + void addProcIFlagsOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(unsigned(getProcIFlags()))); + } + + void addVecListOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(VectorList.RegNum)); + } + + void addVecListIndexedOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(VectorList.RegNum)); + Inst.addOperand(MCOperand::createImm(VectorList.LaneIndex)); + } + + void addVectorIndex8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getVectorIndex())); + } + + void addVectorIndex16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getVectorIndex())); + } + + void addVectorIndex32Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getVectorIndex())); + } + + void addNEONi8splatOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate encodes the type of constant as well as the value. + // Mask in that this is an i8 splat. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue() | 0xe00)); + } + + void addNEONi16splatOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate encodes the type of constant as well as the value. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + unsigned Value = CE->getValue(); + Value = ARM_AM::encodeNEONi16splat(Value); + Inst.addOperand(MCOperand::createImm(Value)); + } + + void addNEONi16splatNotOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate encodes the type of constant as well as the value. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + unsigned Value = CE->getValue(); + Value = ARM_AM::encodeNEONi16splat(~Value & 0xffff); + Inst.addOperand(MCOperand::createImm(Value)); + } + + void addNEONi32splatOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate encodes the type of constant as well as the value. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + unsigned Value = CE->getValue(); + Value = ARM_AM::encodeNEONi32splat(Value); + Inst.addOperand(MCOperand::createImm(Value)); + } + + void addNEONi32splatNotOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate encodes the type of constant as well as the value. 
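+    // The complement is what gets encoded: e.g. 0xffffff00 is not itself an
+    // encodable i32 splat, but its bitwise-NOT 0x000000ff is.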
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    Value = ARM_AM::encodeNEONi32splat(~Value);
+    Inst.addOperand(MCOperand::createImm(Value));
+  }
+
+  void addNEONinvByteReplicateOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
+            Inst.getOpcode() == ARM::VMOVv16i8) &&
+           "All vmvn instructions that want to replicate a non-zero byte "
+           "must be replaced with VMOVv8i8 or VMOVv16i8.");
+    unsigned B = ((~Value) & 0xff);
+    B |= 0xe00; // cmode = 0b1110
+    Inst.addOperand(MCOperand::createImm(B));
+  }
+
+  void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    if (Value >= 256 && Value <= 0xffff)
+      Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
+    else if (Value > 0xffff && Value <= 0xffffff)
+      Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
+    else if (Value > 0xffffff)
+      Value = (Value >> 24) | 0x600;
+    Inst.addOperand(MCOperand::createImm(Value));
+  }
+
+  void addNEONvmovByteReplicateOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
+            Inst.getOpcode() == ARM::VMOVv16i8) &&
+           "All instructions that want to replicate a non-zero byte "
+           "must be replaced with VMOVv8i8 or VMOVv16i8.");
+    unsigned B = Value & 0xff;
+    B |= 0xe00; // cmode = 0b1110
+    Inst.addOperand(MCOperand::createImm(B));
+  }
+
+  void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = ~CE->getValue();
+    if (Value >= 256 && Value <= 0xffff)
+      Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
+    else if (Value > 0xffff && Value <= 0xffffff)
+      Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
+    else if (Value > 0xffffff)
+      Value = (Value >> 24) | 0x600;
+    Inst.addOperand(MCOperand::createImm(Value));
+  }
+
+  void addNEONi64splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
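+    // An i64 splat constant must have each byte equal to 0x00 or 0xff, so
+    // only one bit per byte needs encoding; e.g. #0xff00ff00ff00ff00 becomes
+    // the 8-bit pattern 0b10101010.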
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + uint64_t Value = CE->getValue(); + unsigned Imm = 0; + for (unsigned i = 0; i < 8; ++i, Value >>= 8) { + Imm |= (Value & 1) << i; + } + Inst.addOperand(MCOperand::createImm(Imm | 0x1e00)); + } + + void print(raw_ostream &OS) const override; + + static std::unique_ptr<ARMOperand> CreateITMask(unsigned Mask, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_ITCondMask); + Op->ITMask.Mask = Mask; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateCondCode(ARMCC::CondCodes CC, + SMLoc S) { + auto Op = make_unique<ARMOperand>(k_CondCode); + Op->CC.Val = CC; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateCoprocNum(unsigned CopVal, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_CoprocNum); + Op->Cop.Val = CopVal; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateCoprocReg(unsigned CopVal, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_CoprocReg); + Op->Cop.Val = CopVal; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateCoprocOption(unsigned Val, SMLoc S, + SMLoc E) { + auto Op = make_unique<ARMOperand>(k_CoprocOption); + Op->Cop.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateCCOut(unsigned RegNum, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_CCOut); + Op->Reg.RegNum = RegNum; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateToken(StringRef Str, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_Token); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateReg(unsigned RegNum, SMLoc S, + SMLoc E) { + auto Op = make_unique<ARMOperand>(k_Register); + Op->Reg.RegNum = RegNum; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> + CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, unsigned SrcReg, + unsigned ShiftReg, unsigned ShiftImm, SMLoc S, + SMLoc E) { + auto Op = make_unique<ARMOperand>(k_ShiftedRegister); + Op->RegShiftedReg.ShiftTy = ShTy; + Op->RegShiftedReg.SrcReg = SrcReg; + Op->RegShiftedReg.ShiftReg = ShiftReg; + Op->RegShiftedReg.ShiftImm = ShiftImm; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> + CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, unsigned SrcReg, + unsigned ShiftImm, SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_ShiftedImmediate); + Op->RegShiftedImm.ShiftTy = ShTy; + Op->RegShiftedImm.SrcReg = SrcReg; + Op->RegShiftedImm.ShiftImm = ShiftImm; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateShifterImm(bool isASR, unsigned Imm, + SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_ShifterImmediate); + Op->ShifterImm.isASR = isASR; + Op->ShifterImm.Imm = Imm; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateRotImm(unsigned Imm, SMLoc S, + SMLoc E) { + auto Op = make_unique<ARMOperand>(k_RotateImmediate); + Op->RotImm.Imm = Imm; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateModImm(unsigned Bits, unsigned Rot, + SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_ModifiedImmediate); + Op->ModImm.Bits = Bits; + Op->ModImm.Rot = Rot; + 
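+    // ModImm describes an ARM modified immediate: an 8-bit value rotated
+    // right by an even amount, e.g. Bits=0x3f with Rot=8 denotes 0x3f000000.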
Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> + CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_BitfieldDescriptor); + Op->Bitfield.LSB = LSB; + Op->Bitfield.Width = Width; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> + CreateRegList(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs, + SMLoc StartLoc, SMLoc EndLoc) { + assert (Regs.size() > 0 && "RegList contains no registers?"); + KindTy Kind = k_RegisterList; + + if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().second)) + Kind = k_DPRRegisterList; + else if (ARMMCRegisterClasses[ARM::SPRRegClassID]. + contains(Regs.front().second)) + Kind = k_SPRRegisterList; + + // Sort based on the register encoding values. + array_pod_sort(Regs.begin(), Regs.end()); + + auto Op = make_unique<ARMOperand>(Kind); + for (SmallVectorImpl<std::pair<unsigned, unsigned> >::const_iterator + I = Regs.begin(), E = Regs.end(); I != E; ++I) + Op->Registers.push_back(I->second); + Op->StartLoc = StartLoc; + Op->EndLoc = EndLoc; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateVectorList(unsigned RegNum, + unsigned Count, + bool isDoubleSpaced, + SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_VectorList); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->VectorList.isDoubleSpaced = isDoubleSpaced; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> + CreateVectorListAllLanes(unsigned RegNum, unsigned Count, bool isDoubleSpaced, + SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_VectorListAllLanes); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->VectorList.isDoubleSpaced = isDoubleSpaced; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> + CreateVectorListIndexed(unsigned RegNum, unsigned Count, unsigned Index, + bool isDoubleSpaced, SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_VectorListIndexed); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->VectorList.LaneIndex = Index; + Op->VectorList.isDoubleSpaced = isDoubleSpaced; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> + CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<ARMOperand>(k_VectorIndex); + Op->VectorIndex.Val = Idx; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateImm(const MCExpr *Val, SMLoc S, + SMLoc E) { + auto Op = make_unique<ARMOperand>(k_Immediate); + Op->Imm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> + CreateMem(unsigned BaseRegNum, const MCConstantExpr *OffsetImm, + unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType, + unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S, + SMLoc E, SMLoc AlignmentLoc = SMLoc()) { + auto Op = make_unique<ARMOperand>(k_Memory); + Op->Memory.BaseRegNum = BaseRegNum; + Op->Memory.OffsetImm = OffsetImm; + Op->Memory.OffsetRegNum = OffsetRegNum; + Op->Memory.ShiftType = ShiftType; + Op->Memory.ShiftImm = ShiftImm; + Op->Memory.Alignment = Alignment; + Op->Memory.isNegative = isNegative; + Op->StartLoc = S; + Op->EndLoc = E; + Op->AlignmentLoc = AlignmentLoc; + return Op; + } + + static std::unique_ptr<ARMOperand> + CreatePostIdxReg(unsigned RegNum, bool isAdd, ARM_AM::ShiftOpc ShiftTy, + 
unsigned ShiftImm, SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_PostIndexRegister); + Op->PostIdxReg.RegNum = RegNum; + Op->PostIdxReg.isAdd = isAdd; + Op->PostIdxReg.ShiftTy = ShiftTy; + Op->PostIdxReg.ShiftImm = ShiftImm; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, + SMLoc S) { + auto Op = make_unique<ARMOperand>(k_MemBarrierOpt); + Op->MBOpt.Val = Opt; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<ARMOperand> + CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_InstSyncBarrierOpt); + Op->ISBOpt.Val = Opt; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateProcIFlags(ARM_PROC::IFlags IFlags, + SMLoc S) { + auto Op = make_unique<ARMOperand>(k_ProcIFlags); + Op->IFlags.Val = IFlags; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateMSRMask(unsigned MMask, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_MSRMask); + Op->MMask.Val = MMask; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<ARMOperand> CreateBankedReg(unsigned Reg, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_BankedReg); + Op->BankedReg.Val = Reg; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } +}; + +} // end anonymous namespace. + +void ARMOperand::print(raw_ostream &OS) const { + switch (Kind) { + case k_CondCode: + OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">"; + break; + case k_CCOut: + OS << "<ccout " << getReg() << ">"; + break; + case k_ITCondMask: { + static const char *const MaskStr[] = { + "()", "(t)", "(e)", "(tt)", "(et)", "(te)", "(ee)", "(ttt)", "(ett)", + "(tet)", "(eet)", "(tte)", "(ete)", "(tee)", "(eee)" + }; + assert((ITMask.Mask & 0xf) == ITMask.Mask); + OS << "<it-mask " << MaskStr[ITMask.Mask] << ">"; + break; + } + case k_CoprocNum: + OS << "<coprocessor number: " << getCoproc() << ">"; + break; + case k_CoprocReg: + OS << "<coprocessor register: " << getCoproc() << ">"; + break; + case k_CoprocOption: + OS << "<coprocessor option: " << CoprocOption.Val << ">"; + break; + case k_MSRMask: + OS << "<mask: " << getMSRMask() << ">"; + break; + case k_BankedReg: + OS << "<banked reg: " << getBankedReg() << ">"; + break; + case k_Immediate: + OS << *getImm(); + break; + case k_MemBarrierOpt: + OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt(), false) << ">"; + break; + case k_InstSyncBarrierOpt: + OS << "<ARM_ISB::" << InstSyncBOptToString(getInstSyncBarrierOpt()) << ">"; + break; + case k_Memory: + OS << "<memory " + << " base:" << Memory.BaseRegNum; + OS << ">"; + break; + case k_PostIndexRegister: + OS << "post-idx register " << (PostIdxReg.isAdd ? "" : "-") + << PostIdxReg.RegNum; + if (PostIdxReg.ShiftTy != ARM_AM::no_shift) + OS << ARM_AM::getShiftOpcStr(PostIdxReg.ShiftTy) << " " + << PostIdxReg.ShiftImm; + OS << ">"; + break; + case k_ProcIFlags: { + OS << "<ARM_PROC::"; + unsigned IFlags = getProcIFlags(); + for (int i=2; i >= 0; --i) + if (IFlags & (1 << i)) + OS << ARM_PROC::IFlagsToString(1 << i); + OS << ">"; + break; + } + case k_Register: + OS << "<register " << getReg() << ">"; + break; + case k_ShifterImmediate: + OS << "<shift " << (ShifterImm.isASR ? 
"asr" : "lsl") + << " #" << ShifterImm.Imm << ">"; + break; + case k_ShiftedRegister: + OS << "<so_reg_reg " + << RegShiftedReg.SrcReg << " " + << ARM_AM::getShiftOpcStr(RegShiftedReg.ShiftTy) + << " " << RegShiftedReg.ShiftReg << ">"; + break; + case k_ShiftedImmediate: + OS << "<so_reg_imm " + << RegShiftedImm.SrcReg << " " + << ARM_AM::getShiftOpcStr(RegShiftedImm.ShiftTy) + << " #" << RegShiftedImm.ShiftImm << ">"; + break; + case k_RotateImmediate: + OS << "<ror " << " #" << (RotImm.Imm * 8) << ">"; + break; + case k_ModifiedImmediate: + OS << "<mod_imm #" << ModImm.Bits << ", #" + << ModImm.Rot << ")>"; + break; + case k_BitfieldDescriptor: + OS << "<bitfield " << "lsb: " << Bitfield.LSB + << ", width: " << Bitfield.Width << ">"; + break; + case k_RegisterList: + case k_DPRRegisterList: + case k_SPRRegisterList: { + OS << "<register_list "; + + const SmallVectorImpl<unsigned> &RegList = getRegList(); + for (SmallVectorImpl<unsigned>::const_iterator + I = RegList.begin(), E = RegList.end(); I != E; ) { + OS << *I; + if (++I < E) OS << ", "; + } + + OS << ">"; + break; + } + case k_VectorList: + OS << "<vector_list " << VectorList.Count << " * " + << VectorList.RegNum << ">"; + break; + case k_VectorListAllLanes: + OS << "<vector_list(all lanes) " << VectorList.Count << " * " + << VectorList.RegNum << ">"; + break; + case k_VectorListIndexed: + OS << "<vector_list(lane " << VectorList.LaneIndex << ") " + << VectorList.Count << " * " << VectorList.RegNum << ">"; + break; + case k_Token: + OS << "'" << getToken() << "'"; + break; + case k_VectorIndex: + OS << "<vectorindex " << getVectorIndex() << ">"; + break; + } +} + +/// @name Auto-generated Match Functions +/// { + +static unsigned MatchRegisterName(StringRef Name); + +/// } + +bool ARMAsmParser::ParseRegister(unsigned &RegNo, + SMLoc &StartLoc, SMLoc &EndLoc) { + const AsmToken &Tok = getParser().getTok(); + StartLoc = Tok.getLoc(); + EndLoc = Tok.getEndLoc(); + RegNo = tryParseRegister(); + + return (RegNo == (unsigned)-1); +} + +/// Try to parse a register name. The token must be an Identifier when called, +/// and if it is a register name the token is eaten and the register number is +/// returned. Otherwise return -1. +/// +int ARMAsmParser::tryParseRegister() { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) return -1; + + std::string lowerCase = Tok.getString().lower(); + unsigned RegNum = MatchRegisterName(lowerCase); + if (!RegNum) { + RegNum = StringSwitch<unsigned>(lowerCase) + .Case("r13", ARM::SP) + .Case("r14", ARM::LR) + .Case("r15", ARM::PC) + .Case("ip", ARM::R12) + // Additional register name aliases for 'gas' compatibility. + .Case("a1", ARM::R0) + .Case("a2", ARM::R1) + .Case("a3", ARM::R2) + .Case("a4", ARM::R3) + .Case("v1", ARM::R4) + .Case("v2", ARM::R5) + .Case("v3", ARM::R6) + .Case("v4", ARM::R7) + .Case("v5", ARM::R8) + .Case("v6", ARM::R9) + .Case("v7", ARM::R10) + .Case("v8", ARM::R11) + .Case("sb", ARM::R9) + .Case("sl", ARM::R10) + .Case("fp", ARM::R11) + .Default(0); + } + if (!RegNum) { + // Check for aliases registered via .req. Canonicalize to lower case. + // That's more consistent since register names are case insensitive, and + // it's how the original entry was passed in from MC/MCParser/AsmParser. + StringMap<unsigned>::const_iterator Entry = RegisterReqs.find(lowerCase); + // If no match, return failure. + if (Entry == RegisterReqs.end()) + return -1; + Parser.Lex(); // Eat identifier token. 
+ return Entry->getValue(); + } + + // Some FPUs only have 16 D registers, so D16-D31 are invalid + if (hasD16() && RegNum >= ARM::D16 && RegNum <= ARM::D31) + return -1; + + Parser.Lex(); // Eat identifier token. + + return RegNum; +} + +// Try to parse a shifter (e.g., "lsl <amt>"). On success, return 0. +// If a recoverable error occurs, return 1. If an irrecoverable error +// occurs, return -1. An irrecoverable error is one where tokens have been +// consumed in the process of trying to parse the shifter (i.e., when it is +// indeed a shifter operand, but malformed). +int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) + return -1; + + std::string lowerCase = Tok.getString().lower(); + ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase) + .Case("asl", ARM_AM::lsl) + .Case("lsl", ARM_AM::lsl) + .Case("lsr", ARM_AM::lsr) + .Case("asr", ARM_AM::asr) + .Case("ror", ARM_AM::ror) + .Case("rrx", ARM_AM::rrx) + .Default(ARM_AM::no_shift); + + if (ShiftTy == ARM_AM::no_shift) + return 1; + + Parser.Lex(); // Eat the operator. + + // The source register for the shift has already been added to the + // operand list, so we need to pop it off and combine it into the shifted + // register operand instead. + std::unique_ptr<ARMOperand> PrevOp( + (ARMOperand *)Operands.pop_back_val().release()); + if (!PrevOp->isReg()) + return Error(PrevOp->getStartLoc(), "shift must be of a register"); + int SrcReg = PrevOp->getReg(); + + SMLoc EndLoc; + int64_t Imm = 0; + int ShiftReg = 0; + if (ShiftTy == ARM_AM::rrx) { + // RRX Doesn't have an explicit shift amount. The encoder expects + // the shift register to be the same as the source register. Seems odd, + // but OK. + ShiftReg = SrcReg; + } else { + // Figure out if this is shifted by a constant or a register (for non-RRX). + if (Parser.getTok().is(AsmToken::Hash) || + Parser.getTok().is(AsmToken::Dollar)) { + Parser.Lex(); // Eat hash. + SMLoc ImmLoc = Parser.getTok().getLoc(); + const MCExpr *ShiftExpr = nullptr; + if (getParser().parseExpression(ShiftExpr, EndLoc)) { + Error(ImmLoc, "invalid immediate shift value"); + return -1; + } + // The expression must be evaluatable as an immediate. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftExpr); + if (!CE) { + Error(ImmLoc, "invalid immediate shift value"); + return -1; + } + // Range check the immediate. + // lsl, ror: 0 <= imm <= 31 + // lsr, asr: 0 <= imm <= 32 + Imm = CE->getValue(); + if (Imm < 0 || + ((ShiftTy == ARM_AM::lsl || ShiftTy == ARM_AM::ror) && Imm > 31) || + ((ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr) && Imm > 32)) { + Error(ImmLoc, "immediate shift value out of range"); + return -1; + } + // shift by zero is a nop. Always send it through as lsl. 
+      // ('as' compatibility)
+      if (Imm == 0)
+        ShiftTy = ARM_AM::lsl;
+    } else if (Parser.getTok().is(AsmToken::Identifier)) {
+      SMLoc L = Parser.getTok().getLoc();
+      EndLoc = Parser.getTok().getEndLoc();
+      ShiftReg = tryParseRegister();
+      if (ShiftReg == -1) {
+        Error(L, "expected immediate or register in shift operand");
+        return -1;
+      }
+    } else {
+      Error(Parser.getTok().getLoc(),
+            "expected immediate or register in shift operand");
+      return -1;
+    }
+  }
+
+  if (ShiftReg && ShiftTy != ARM_AM::rrx)
+    Operands.push_back(ARMOperand::CreateShiftedRegister(ShiftTy, SrcReg,
+                                                         ShiftReg, Imm,
+                                                         S, EndLoc));
+  else
+    Operands.push_back(ARMOperand::CreateShiftedImmediate(ShiftTy, SrcReg, Imm,
+                                                          S, EndLoc));
+
+  return 0;
+}
+
+/// Try to parse a register name. The token must be an Identifier when called.
+/// If it's a register, an AsmOperand is created. Another AsmOperand is created
+/// if there is a "writeback". Returns 'true' if it's not a register.
+///
+/// TODO: this is likely to change to allow different register types and/or to
+/// parse for a specific register type.
+bool ARMAsmParser::tryParseRegisterWithWriteBack(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &RegTok = Parser.getTok();
+  int RegNo = tryParseRegister();
+  if (RegNo == -1)
+    return true;
+
+  Operands.push_back(ARMOperand::CreateReg(RegNo, RegTok.getLoc(),
+                                           RegTok.getEndLoc()));
+
+  const AsmToken &ExclaimTok = Parser.getTok();
+  if (ExclaimTok.is(AsmToken::Exclaim)) {
+    Operands.push_back(ARMOperand::CreateToken(ExclaimTok.getString(),
+                                               ExclaimTok.getLoc()));
+    Parser.Lex(); // Eat exclaim token.
+    return false;
+  }
+
+  // Also check for an index operand. This is only legal for vector registers,
+  // but that'll get caught OK in operand matching, so we don't need to
+  // explicitly filter everything else out here.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    SMLoc SIdx = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat left bracket token.
+
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return true;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE)
+      return TokError("immediate value expected for vector index");
+
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+
+    SMLoc E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(ARMOperand::CreateVectorIndex(MCE->getValue(),
+                                                     SIdx, E,
+                                                     getContext()));
+  }
+
+  return false;
+}
+
+/// MatchCoprocessorOperandName - Try to parse a coprocessor-related
+/// instruction with a symbolic operand name.
+/// We accept "crN" syntax for GAS compatibility.
+/// <operand-name> ::= <prefix><number>
+/// If CoprocOp is 'c', then:
+///   <prefix> ::= c | cr
+/// If CoprocOp is 'p', then:
+///   <prefix> ::= p
+/// <number> ::= integer in range [0, 15]
+static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
+  // Use the same layout as the tablegen'erated register name matcher. Ugly,
+  // but efficient.
+  if (Name.size() < 2 || Name[0] != CoprocOp)
+    return -1;
+  Name = (Name[1] == 'r') ? Name.drop_front(2) : Name.drop_front();
+
+  switch (Name.size()) {
+  default: return -1;
+  case 1:
+    switch (Name[0]) {
+    default:  return -1;
+    case '0': return 0;
+    case '1': return 1;
+    case '2': return 2;
+    case '3': return 3;
+    case '4': return 4;
+    case '5': return 5;
+    case '6': return 6;
+    case '7': return 7;
+    case '8': return 8;
+    case '9': return 9;
+    }
+  case 2:
+    if (Name[0] != '1')
+      return -1;
+    switch (Name[1]) {
+    default:  return -1;
+    // CP10 and CP11 are VFP/NEON, so the vector instructions should be used
+    // instead. However, old cores (v5/v6) did use them that way.
+    case '0': return 10;
+    case '1': return 11;
+    case '2': return 12;
+    case '3': return 13;
+    case '4': return 14;
+    case '5': return 15;
+    }
+  }
+}
+
+/// parseITCondCode - Try to parse a condition code for an IT instruction.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseITCondCode(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  unsigned CC = StringSwitch<unsigned>(Tok.getString().lower())
+    .Case("eq", ARMCC::EQ)
+    .Case("ne", ARMCC::NE)
+    .Case("hs", ARMCC::HS)
+    .Case("cs", ARMCC::HS)
+    .Case("lo", ARMCC::LO)
+    .Case("cc", ARMCC::LO)
+    .Case("mi", ARMCC::MI)
+    .Case("pl", ARMCC::PL)
+    .Case("vs", ARMCC::VS)
+    .Case("vc", ARMCC::VC)
+    .Case("hi", ARMCC::HI)
+    .Case("ls", ARMCC::LS)
+    .Case("ge", ARMCC::GE)
+    .Case("lt", ARMCC::LT)
+    .Case("gt", ARMCC::GT)
+    .Case("le", ARMCC::LE)
+    .Case("al", ARMCC::AL)
+    .Default(~0U);
+  if (CC == ~0U)
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the token.
+
+  Operands.push_back(ARMOperand::CreateCondCode(ARMCC::CondCodes(CC), S));
+
+  return MatchOperand_Success;
+}
+
+/// parseCoprocNumOperand - Try to parse a coprocessor number operand. The
+/// token must be an Identifier when called, and if it is a coprocessor
+/// number, the token is eaten and the operand is added to the operand list.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseCoprocNumOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  int Num = MatchCoprocessorOperandName(Tok.getString(), 'p');
+  if (Num == -1)
+    return MatchOperand_NoMatch;
+  // ARMv7 and v8 don't allow cp10/cp11 due to VFP/NEON specific instructions
+  if ((hasV7Ops() || hasV8Ops()) && (Num == 10 || Num == 11))
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateCoprocNum(Num, S));
+  return MatchOperand_Success;
+}
+
+/// parseCoprocRegOperand - Try to parse a coprocessor register operand. The
+/// token must be an Identifier when called, and if it is a coprocessor
+/// register, the token is eaten and the operand is added to the operand list.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseCoprocRegOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  int Reg = MatchCoprocessorOperandName(Tok.getString(), 'c');
+  if (Reg == -1)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateCoprocReg(Reg, S));
+  return MatchOperand_Success;
+}
+
+/// parseCoprocOptionOperand - Try to parse a coprocessor option operand.
+///   coproc_option : '{' imm0_255 '}'
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseCoprocOptionOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+
+  // If this isn't a '{', this isn't a coprocessor immediate operand.
+  if (Parser.getTok().isNot(AsmToken::LCurly))
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the '{'.
+
+  const MCExpr *Expr;
+  SMLoc Loc = Parser.getTok().getLoc();
+  if (getParser().parseExpression(Expr)) {
+    Error(Loc, "illegal expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+  if (!CE || CE->getValue() < 0 || CE->getValue() > 255) {
+    Error(Loc, "coprocessor option must be an immediate in range [0, 255]");
+    return MatchOperand_ParseFail;
+  }
+  int Val = CE->getValue();
+
+  // Check for and consume the closing '}'.
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return MatchOperand_ParseFail;
+  SMLoc E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat the '}'.
+
+  Operands.push_back(ARMOperand::CreateCoprocOption(Val, S, E));
+  return MatchOperand_Success;
+}
+
+// For register list parsing, we need to map from raw GPR register numbering
+// to the enumeration values. The enumeration values aren't sorted by
+// register number due to our using "sp", "lr" and "pc" as canonical names.
+static unsigned getNextRegister(unsigned Reg) {
+  // If this is a GPR, we need to do it manually, otherwise we can rely
+  // on the sort ordering of the enumeration since the other reg-classes
+  // are sane.
+  if (!ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+    return Reg + 1;
+  switch (Reg) {
+  default: llvm_unreachable("Invalid GPR number!");
+  case ARM::R0:  return ARM::R1;  case ARM::R1:  return ARM::R2;
+  case ARM::R2:  return ARM::R3;  case ARM::R3:  return ARM::R4;
+  case ARM::R4:  return ARM::R5;  case ARM::R5:  return ARM::R6;
+  case ARM::R6:  return ARM::R7;  case ARM::R7:  return ARM::R8;
+  case ARM::R8:  return ARM::R9;  case ARM::R9:  return ARM::R10;
+  case ARM::R10: return ARM::R11; case ARM::R11: return ARM::R12;
+  case ARM::R12: return ARM::SP;  case ARM::SP:  return ARM::LR;
+  case ARM::LR:  return ARM::PC;  case ARM::PC:  return ARM::R0;
+  }
+}
+
+// Return the low-subreg of a given Q register.
+static unsigned getDRegFromQReg(unsigned QReg) {
+  switch (QReg) {
+  default: llvm_unreachable("expected a Q register!");
+  case ARM::Q0:  return ARM::D0;
+  case ARM::Q1:  return ARM::D2;
+  case ARM::Q2:  return ARM::D4;
+  case ARM::Q3:  return ARM::D6;
+  case ARM::Q4:  return ARM::D8;
+  case ARM::Q5:  return ARM::D10;
+  case ARM::Q6:  return ARM::D12;
+  case ARM::Q7:  return ARM::D14;
+  case ARM::Q8:  return ARM::D16;
+  case ARM::Q9:  return ARM::D18;
+  case ARM::Q10: return ARM::D20;
+  case ARM::Q11: return ARM::D22;
+  case ARM::Q12: return ARM::D24;
+  case ARM::Q13: return ARM::D26;
+  case ARM::Q14: return ARM::D28;
+  case ARM::Q15: return ARM::D30;
+  }
+}
+
+/// Parse a register list.
+bool ARMAsmParser::parseRegisterList(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  assert(Parser.getTok().is(AsmToken::LCurly) &&
+         "Token is not a Left Curly Brace");
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat '{' token.
+  SMLoc RegLoc = Parser.getTok().getLoc();
+
+  // Check the first register in the list to see what register class
+  // this is a list of.
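+  // e.g. "{r0, r4-r7}" is a GPR list, "{d0-d3}" a DPR list, and "{s0, s1}"
+  // an SPR list; the class of the first register decides for the whole list.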
+ int Reg = tryParseRegister(); + if (Reg == -1) + return Error(RegLoc, "register expected"); + + // The reglist instructions have at most 16 registers, so reserve + // space for that many. + int EReg = 0; + SmallVector<std::pair<unsigned, unsigned>, 16> Registers; + + // Allow Q regs and just interpret them as the two D sub-registers. + if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { + Reg = getDRegFromQReg(Reg); + EReg = MRI->getEncodingValue(Reg); + Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + ++Reg; + } + const MCRegisterClass *RC; + if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) + RC = &ARMMCRegisterClasses[ARM::GPRRegClassID]; + else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) + RC = &ARMMCRegisterClasses[ARM::DPRRegClassID]; + else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg)) + RC = &ARMMCRegisterClasses[ARM::SPRRegClassID]; + else + return Error(RegLoc, "invalid register in register list"); + + // Store the register. + EReg = MRI->getEncodingValue(Reg); + Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + + // This starts immediately after the first register token in the list, + // so we can see either a comma or a minus (range separator) as a legal + // next token. + while (Parser.getTok().is(AsmToken::Comma) || + Parser.getTok().is(AsmToken::Minus)) { + if (Parser.getTok().is(AsmToken::Minus)) { + Parser.Lex(); // Eat the minus. + SMLoc AfterMinusLoc = Parser.getTok().getLoc(); + int EndReg = tryParseRegister(); + if (EndReg == -1) + return Error(AfterMinusLoc, "register expected"); + // Allow Q regs and just interpret them as the two D sub-registers. + if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg)) + EndReg = getDRegFromQReg(EndReg) + 1; + // If the register is the same as the start reg, there's nothing + // more to do. + if (Reg == EndReg) + continue; + // The register must be in the same register class as the first. + if (!RC->contains(EndReg)) + return Error(AfterMinusLoc, "invalid register in register list"); + // Ranges must go from low to high. + if (MRI->getEncodingValue(Reg) > MRI->getEncodingValue(EndReg)) + return Error(AfterMinusLoc, "bad range in register list"); + + // Add all the registers in the range to the register list. + while (Reg != EndReg) { + Reg = getNextRegister(Reg); + EReg = MRI->getEncodingValue(Reg); + Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg)); + } + continue; + } + Parser.Lex(); // Eat the comma. + RegLoc = Parser.getTok().getLoc(); + int OldReg = Reg; + const AsmToken RegTok = Parser.getTok(); + Reg = tryParseRegister(); + if (Reg == -1) + return Error(RegLoc, "register expected"); + // Allow Q regs and just interpret them as the two D sub-registers. + bool isQReg = false; + if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { + Reg = getDRegFromQReg(Reg); + isQReg = true; + } + // The register must be in the same register class as the first. + if (!RC->contains(Reg)) + return Error(RegLoc, "invalid register in register list"); + // List must be monotonically increasing. 
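+    // e.g. "push {r1, r0}" only draws a warning for GPRs (gas accepts it),
+    // while an out-of-order VFP list such as "vpush {d1, d0}" is an error.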
+    if (MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) {
+      if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+        Warning(RegLoc, "register list not in ascending order");
+      else
+        return Error(RegLoc, "register list not in ascending order");
+    }
+    if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) {
+      Warning(RegLoc, "duplicated register (" + RegTok.getString() +
+              ") in register list");
+      continue;
+    }
+    // VFP register lists must also be contiguous.
+    if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
+        Reg != OldReg + 1)
+      return Error(RegLoc, "non-contiguous register range");
+    EReg = MRI->getEncodingValue(Reg);
+    Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+    if (isQReg) {
+      EReg = MRI->getEncodingValue(++Reg);
+      Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+    }
+  }
+
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return Error(Parser.getTok().getLoc(), "'}' expected");
+  SMLoc E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat '}' token.
+
+  // Push the register list operand.
+  Operands.push_back(ARMOperand::CreateRegList(Registers, S, E));
+
+  // The ARM system instruction variants for LDM/STM have a '^' token here.
+  if (Parser.getTok().is(AsmToken::Caret)) {
+    Operands.push_back(ARMOperand::CreateToken("^", Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat '^' token.
+  }
+
+  return false;
+}
+
+// Helper function to parse the lane index for vector lists.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc) {
+  MCAsmParser &Parser = getParser();
+  Index = 0; // Always return a defined index value.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    Parser.Lex(); // Eat the '['.
+    if (Parser.getTok().is(AsmToken::RBrac)) {
+      // "Dn[]" is the 'all lanes' syntax.
+      LaneKind = AllLanes;
+      EndLoc = Parser.getTok().getEndLoc();
+      Parser.Lex(); // Eat the ']'.
+      return MatchOperand_Success;
+    }
+
+    // There's an optional '#' token here. Normally there wouldn't be, but
+    // inline assembly puts one in, and it's friendly to accept that.
+    if (Parser.getTok().is(AsmToken::Hash))
+      Parser.Lex(); // Eat the optional '#'.
+
+    const MCExpr *LaneIndex;
+    SMLoc Loc = Parser.getTok().getLoc();
+    if (getParser().parseExpression(LaneIndex)) {
+      Error(Loc, "illegal expression");
+      return MatchOperand_ParseFail;
+    }
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LaneIndex);
+    if (!CE) {
+      Error(Loc, "lane index must be empty or an integer");
+      return MatchOperand_ParseFail;
+    }
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(Parser.getTok().getLoc(), "']' expected");
+      return MatchOperand_ParseFail;
+    }
+    EndLoc = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat the ']'.
+    int64_t Val = CE->getValue();
+
+    // FIXME: Make this range check context sensitive for .8, .16, .32.
+    if (Val < 0 || Val > 7) {
+      Error(Parser.getTok().getLoc(), "lane index out of range");
+      return MatchOperand_ParseFail;
+    }
+    Index = Val;
+    LaneKind = IndexedLane;
+    return MatchOperand_Success;
+  }
+  LaneKind = NoLanes;
+  return MatchOperand_Success;
+}
+
+// Parse a vector register list.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseVectorList(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  VectorLaneTy LaneKind;
+  unsigned LaneIndex;
+  SMLoc S = Parser.getTok().getLoc();
+  // As an extension (to match gas), support a plain D register or Q register
+  // (without enclosing curly braces) as a single or double entry list,
+  // respectively.
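+  // e.g. "vld1.8 d0, [r0]" is accepted as "vld1.8 {d0}, [r0]", and a plain
+  // "q1" stands for the pair "{d2, d3}".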
+ if (Parser.getTok().is(AsmToken::Identifier)) { + SMLoc E = Parser.getTok().getEndLoc(); + int Reg = tryParseRegister(); + if (Reg == -1) + return MatchOperand_NoMatch; + if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) { + OperandMatchResultTy Res = parseVectorLane(LaneKind, LaneIndex, E); + if (Res != MatchOperand_Success) + return Res; + switch (LaneKind) { + case NoLanes: + Operands.push_back(ARMOperand::CreateVectorList(Reg, 1, false, S, E)); + break; + case AllLanes: + Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 1, false, + S, E)); + break; + case IndexedLane: + Operands.push_back(ARMOperand::CreateVectorListIndexed(Reg, 1, + LaneIndex, + false, S, E)); + break; + } + return MatchOperand_Success; + } + if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { + Reg = getDRegFromQReg(Reg); + OperandMatchResultTy Res = parseVectorLane(LaneKind, LaneIndex, E); + if (Res != MatchOperand_Success) + return Res; + switch (LaneKind) { + case NoLanes: + Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0, + &ARMMCRegisterClasses[ARM::DPairRegClassID]); + Operands.push_back(ARMOperand::CreateVectorList(Reg, 2, false, S, E)); + break; + case AllLanes: + Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0, + &ARMMCRegisterClasses[ARM::DPairRegClassID]); + Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 2, false, + S, E)); + break; + case IndexedLane: + Operands.push_back(ARMOperand::CreateVectorListIndexed(Reg, 2, + LaneIndex, + false, S, E)); + break; + } + return MatchOperand_Success; + } + Error(S, "vector register expected"); + return MatchOperand_ParseFail; + } + + if (Parser.getTok().isNot(AsmToken::LCurly)) + return MatchOperand_NoMatch; + + Parser.Lex(); // Eat '{' token. + SMLoc RegLoc = Parser.getTok().getLoc(); + + int Reg = tryParseRegister(); + if (Reg == -1) { + Error(RegLoc, "register expected"); + return MatchOperand_ParseFail; + } + unsigned Count = 1; + int Spacing = 0; + unsigned FirstReg = Reg; + // The list is of D registers, but we also allow Q regs and just interpret + // them as the two D sub-registers. + if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { + FirstReg = Reg = getDRegFromQReg(Reg); + Spacing = 1; // double-spacing requires explicit D registers, otherwise + // it's ambiguous with four-register single spaced. + ++Reg; + ++Count; + } + + SMLoc E; + if (parseVectorLane(LaneKind, LaneIndex, E) != MatchOperand_Success) + return MatchOperand_ParseFail; + + while (Parser.getTok().is(AsmToken::Comma) || + Parser.getTok().is(AsmToken::Minus)) { + if (Parser.getTok().is(AsmToken::Minus)) { + if (!Spacing) + Spacing = 1; // Register range implies a single spaced list. + else if (Spacing == 2) { + Error(Parser.getTok().getLoc(), + "sequential registers in double spaced list"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat the minus. + SMLoc AfterMinusLoc = Parser.getTok().getLoc(); + int EndReg = tryParseRegister(); + if (EndReg == -1) { + Error(AfterMinusLoc, "register expected"); + return MatchOperand_ParseFail; + } + // Allow Q regs and just interpret them as the two D sub-registers. + if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg)) + EndReg = getDRegFromQReg(EndReg) + 1; + // If the register is the same as the start reg, there's nothing + // more to do. + if (Reg == EndReg) + continue; + // The register must be in the same register class as the first. 
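+      // (After Q expansion, everything here must be a D register, so a range
+      // ending in another class, e.g. "{d0-s1}", is rejected.)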
+      if (!ARMMCRegisterClasses[ARM::DPRRegClassID].contains(EndReg)) {
+        Error(AfterMinusLoc, "invalid register in register list");
+        return MatchOperand_ParseFail;
+      }
+      // Ranges must go from low to high.
+      if (Reg > EndReg) {
+        Error(AfterMinusLoc, "bad range in register list");
+        return MatchOperand_ParseFail;
+      }
+      // Parse the lane specifier if present.
+      VectorLaneTy NextLaneKind;
+      unsigned NextLaneIndex;
+      if (parseVectorLane(NextLaneKind, NextLaneIndex, E) !=
+          MatchOperand_Success)
+        return MatchOperand_ParseFail;
+      if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+        Error(AfterMinusLoc, "mismatched lane index in register list");
+        return MatchOperand_ParseFail;
+      }
+
+      // Add all the registers in the range to the register list.
+      Count += EndReg - Reg;
+      Reg = EndReg;
+      continue;
+    }
+    Parser.Lex(); // Eat the comma.
+    RegLoc = Parser.getTok().getLoc();
+    int OldReg = Reg;
+    Reg = tryParseRegister();
+    if (Reg == -1) {
+      Error(RegLoc, "register expected");
+      return MatchOperand_ParseFail;
+    }
+    // Vector register lists must be contiguous. It's OK to use the
+    // enumeration values directly here, as the VFP register classes have
+    // the enum sorted properly.
+    //
+    // The list is of D registers, but we also allow Q regs and just interpret
+    // them as the two D sub-registers.
+    if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+      if (!Spacing)
+        Spacing = 1; // Register range implies a single spaced list.
+      else if (Spacing == 2) {
+        Error(RegLoc,
+              "invalid register in double-spaced list (must be 'D' register)");
+        return MatchOperand_ParseFail;
+      }
+      Reg = getDRegFromQReg(Reg);
+      if (Reg != OldReg + 1) {
+        Error(RegLoc, "non-contiguous register range");
+        return MatchOperand_ParseFail;
+      }
+      ++Reg;
+      Count += 2;
+      // Parse the lane specifier if present.
+      VectorLaneTy NextLaneKind;
+      unsigned NextLaneIndex;
+      SMLoc LaneLoc = Parser.getTok().getLoc();
+      if (parseVectorLane(NextLaneKind, NextLaneIndex, E) !=
+          MatchOperand_Success)
+        return MatchOperand_ParseFail;
+      if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+        Error(LaneLoc, "mismatched lane index in register list");
+        return MatchOperand_ParseFail;
+      }
+      continue;
+    }
+    // Normal D register.
+    // Figure out the register spacing (single or double) of the list if
+    // we don't know it already.
+    if (!Spacing)
+      Spacing = 1 + (Reg == OldReg + 2);
+
+    // Just check that it's contiguous and keep going.
+    if (Reg != OldReg + Spacing) {
+      Error(RegLoc, "non-contiguous register range");
+      return MatchOperand_ParseFail;
+    }
+    ++Count;
+    // Parse the lane specifier if present.
+    VectorLaneTy NextLaneKind;
+    unsigned NextLaneIndex;
+    SMLoc EndLoc = Parser.getTok().getLoc();
+    if (parseVectorLane(NextLaneKind, NextLaneIndex, E) != MatchOperand_Success)
+      return MatchOperand_ParseFail;
+    if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+      Error(EndLoc, "mismatched lane index in register list");
+      return MatchOperand_ParseFail;
+    }
+  }
+
+  if (Parser.getTok().isNot(AsmToken::RCurly)) {
+    Error(Parser.getTok().getLoc(), "'}' expected");
+    return MatchOperand_ParseFail;
+  }
+  E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat '}' token.
+
+  switch (LaneKind) {
+  case NoLanes:
+    // Two-register operands have been converted to the
+    // composite register classes.
+    if (Count == 2) {
+      const MCRegisterClass *RC = (Spacing == 1) ?
+ &ARMMCRegisterClasses[ARM::DPairRegClassID] : + &ARMMCRegisterClasses[ARM::DPairSpcRegClassID]; + FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC); + } + + Operands.push_back(ARMOperand::CreateVectorList(FirstReg, Count, + (Spacing == 2), S, E)); + break; + case AllLanes: + // Two-register operands have been converted to the + // composite register classes. + if (Count == 2) { + const MCRegisterClass *RC = (Spacing == 1) ? + &ARMMCRegisterClasses[ARM::DPairRegClassID] : + &ARMMCRegisterClasses[ARM::DPairSpcRegClassID]; + FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC); + } + Operands.push_back(ARMOperand::CreateVectorListAllLanes(FirstReg, Count, + (Spacing == 2), + S, E)); + break; + case IndexedLane: + Operands.push_back(ARMOperand::CreateVectorListIndexed(FirstReg, Count, + LaneIndex, + (Spacing == 2), + S, E)); + break; + } + return MatchOperand_Success; +} + +/// parseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options. +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseMemBarrierOptOperand(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + unsigned Opt; + + if (Tok.is(AsmToken::Identifier)) { + StringRef OptStr = Tok.getString(); + + Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size()).lower()) + .Case("sy", ARM_MB::SY) + .Case("st", ARM_MB::ST) + .Case("ld", ARM_MB::LD) + .Case("sh", ARM_MB::ISH) + .Case("ish", ARM_MB::ISH) + .Case("shst", ARM_MB::ISHST) + .Case("ishst", ARM_MB::ISHST) + .Case("ishld", ARM_MB::ISHLD) + .Case("nsh", ARM_MB::NSH) + .Case("un", ARM_MB::NSH) + .Case("nshst", ARM_MB::NSHST) + .Case("nshld", ARM_MB::NSHLD) + .Case("unst", ARM_MB::NSHST) + .Case("osh", ARM_MB::OSH) + .Case("oshst", ARM_MB::OSHST) + .Case("oshld", ARM_MB::OSHLD) + .Default(~0U); + + // ishld, oshld, nshld and ld are only available from ARMv8. + if (!hasV8Ops() && (Opt == ARM_MB::ISHLD || Opt == ARM_MB::OSHLD || + Opt == ARM_MB::NSHLD || Opt == ARM_MB::LD)) + Opt = ~0U; + + if (Opt == ~0U) + return MatchOperand_NoMatch; + + Parser.Lex(); // Eat identifier token. + } else if (Tok.is(AsmToken::Hash) || + Tok.is(AsmToken::Dollar) || + Tok.is(AsmToken::Integer)) { + if (Parser.getTok().isNot(AsmToken::Integer)) + Parser.Lex(); // Eat '#' or '$'. + SMLoc Loc = Parser.getTok().getLoc(); + + const MCExpr *MemBarrierID; + if (getParser().parseExpression(MemBarrierID)) { + Error(Loc, "illegal expression"); + return MatchOperand_ParseFail; + } + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(MemBarrierID); + if (!CE) { + Error(Loc, "constant expression expected"); + return MatchOperand_ParseFail; + } + + int Val = CE->getValue(); + if (Val & ~0xf) { + Error(Loc, "immediate value out of range"); + return MatchOperand_ParseFail; + } + + Opt = ARM_MB::RESERVED_0 + Val; + } else + return MatchOperand_ParseFail; + + Operands.push_back(ARMOperand::CreateMemBarrierOpt((ARM_MB::MemBOpt)Opt, S)); + return MatchOperand_Success; +} + +/// parseInstSyncBarrierOptOperand - Try to parse ISB inst sync barrier options. 
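+/// e.g. "isb sy", or the equivalent raw form "isb #15"; any immediate in
+/// [0, 15] is accepted and encoded directly in the option field.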
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseInstSyncBarrierOptOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  unsigned Opt;
+
+  if (Tok.is(AsmToken::Identifier)) {
+    StringRef OptStr = Tok.getString();
+
+    if (OptStr.equals_lower("sy"))
+      Opt = ARM_ISB::SY;
+    else
+      return MatchOperand_NoMatch;
+
+    Parser.Lex(); // Eat identifier token.
+  } else if (Tok.is(AsmToken::Hash) ||
+             Tok.is(AsmToken::Dollar) ||
+             Tok.is(AsmToken::Integer)) {
+    if (Parser.getTok().isNot(AsmToken::Integer))
+      Parser.Lex(); // Eat '#' or '$'.
+    SMLoc Loc = Parser.getTok().getLoc();
+
+    const MCExpr *ISBarrierID;
+    if (getParser().parseExpression(ISBarrierID)) {
+      Error(Loc, "illegal expression");
+      return MatchOperand_ParseFail;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ISBarrierID);
+    if (!CE) {
+      Error(Loc, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+
+    int Val = CE->getValue();
+    if (Val & ~0xf) {
+      Error(Loc, "immediate value out of range");
+      return MatchOperand_ParseFail;
+    }
+
+    Opt = ARM_ISB::RESERVED_0 + Val;
+  } else
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(ARMOperand::CreateInstSyncBarrierOpt(
+      (ARM_ISB::InstSyncBOpt)Opt, S));
+  return MatchOperand_Success;
+}
+
+/// parseProcIFlagsOperand - Try to parse iflags from CPS instruction.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseProcIFlagsOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef IFlagsStr = Tok.getString();
+
+  // An iflags string of "none" is interpreted to mean that none of the AIF
+  // bits are set. Not a terribly useful instruction, but a valid encoding.
+  unsigned IFlags = 0;
+  if (IFlagsStr != "none") {
+    for (int i = 0, e = IFlagsStr.size(); i != e; ++i) {
+      unsigned Flag = StringSwitch<unsigned>(IFlagsStr.substr(i, 1))
+        .Case("a", ARM_PROC::A)
+        .Case("i", ARM_PROC::I)
+        .Case("f", ARM_PROC::F)
+        .Default(~0U);
+
+      // If some specific iflag is already set, it means that some letter is
+      // present more than once; this is not acceptable.
+      if (Flag == ~0U || (IFlags & Flag))
+        return MatchOperand_NoMatch;
+
+      IFlags |= Flag;
+    }
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateProcIFlags((ARM_PROC::IFlags)IFlags, S));
+  return MatchOperand_Success;
+}
+
+/// parseMSRMaskOperand - Try to parse mask flags from MSR instruction.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef Mask = Tok.getString();
+
+  if (isMClass()) {
+    // See ARMv6-M 10.1.1
+    std::string Name = Mask.lower();
+    unsigned FlagsVal = StringSwitch<unsigned>(Name)
+      // Note: in the documentation:
+      //  ARM deprecates using MSR APSR without a _<bits> qualifier as an
+      //  alias for MSR APSR_nzcvq.
+      // but we do make it an alias here. This is done to get the "mask
+      // encoding" bits correct on MSR APSR writes.
+      //
+      // FIXME: Note the 0xc00 "mask encoding" bits version of the registers
+      // should really only be allowed when writing a special register. Note
Note + // they get dropped in the MRS instruction reading a special register as + // the SYSm field is only 8 bits. + .Case("apsr", 0x800) + .Case("apsr_nzcvq", 0x800) + .Case("apsr_g", 0x400) + .Case("apsr_nzcvqg", 0xc00) + .Case("iapsr", 0x801) + .Case("iapsr_nzcvq", 0x801) + .Case("iapsr_g", 0x401) + .Case("iapsr_nzcvqg", 0xc01) + .Case("eapsr", 0x802) + .Case("eapsr_nzcvq", 0x802) + .Case("eapsr_g", 0x402) + .Case("eapsr_nzcvqg", 0xc02) + .Case("xpsr", 0x803) + .Case("xpsr_nzcvq", 0x803) + .Case("xpsr_g", 0x403) + .Case("xpsr_nzcvqg", 0xc03) + .Case("ipsr", 0x805) + .Case("epsr", 0x806) + .Case("iepsr", 0x807) + .Case("msp", 0x808) + .Case("psp", 0x809) + .Case("primask", 0x810) + .Case("basepri", 0x811) + .Case("basepri_max", 0x812) + .Case("faultmask", 0x813) + .Case("control", 0x814) + .Default(~0U); + + if (FlagsVal == ~0U) + return MatchOperand_NoMatch; + + if (!hasDSP() && (FlagsVal & 0x400)) + // The _g and _nzcvqg versions are only valid if the DSP extension is + // available. + return MatchOperand_NoMatch; + + if (!hasV7Ops() && FlagsVal >= 0x811 && FlagsVal <= 0x813) + // basepri, basepri_max and faultmask only valid for V7m. + return MatchOperand_NoMatch; + + Parser.Lex(); // Eat identifier token. + Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S)); + return MatchOperand_Success; + } + + // Split spec_reg from flag, example: CPSR_sxf => "CPSR" and "sxf" + size_t Start = 0, Next = Mask.find('_'); + StringRef Flags = ""; + std::string SpecReg = Mask.slice(Start, Next).lower(); + if (Next != StringRef::npos) + Flags = Mask.slice(Next+1, Mask.size()); + + // FlagsVal contains the complete mask: + // 3-0: Mask + // 4: Special Reg (cpsr, apsr => 0; spsr => 1) + unsigned FlagsVal = 0; + + if (SpecReg == "apsr") { + FlagsVal = StringSwitch<unsigned>(Flags) + .Case("nzcvq", 0x8) // same as CPSR_f + .Case("g", 0x4) // same as CPSR_s + .Case("nzcvqg", 0xc) // same as CPSR_fs + .Default(~0U); + + if (FlagsVal == ~0U) { + if (!Flags.empty()) + return MatchOperand_NoMatch; + else + FlagsVal = 8; // No flag + } + } else if (SpecReg == "cpsr" || SpecReg == "spsr") { + // cpsr_all is an alias for cpsr_fc, as is plain cpsr. + if (Flags == "all" || Flags == "") + Flags = "fc"; + for (int i = 0, e = Flags.size(); i != e; ++i) { + unsigned Flag = StringSwitch<unsigned>(Flags.substr(i, 1)) + .Case("c", 1) + .Case("x", 2) + .Case("s", 4) + .Case("f", 8) + .Default(~0U); + + // If some specific flag is already set, it means that some letter is + // present more than once, this is not acceptable. + if (FlagsVal == ~0U || (FlagsVal & Flag)) + return MatchOperand_NoMatch; + FlagsVal |= Flag; + } + } else // No match for special register. + return MatchOperand_NoMatch; + + // Special register without flags is NOT equivalent to "fc" flags. + // NOTE: This is a divergence from gas' behavior. Uncommenting the following + // two lines would enable gas compatibility at the expense of breaking + // round-tripping. + // + // if (!FlagsVal) + // FlagsVal = 0x9; + + // Bit 4: Special Reg (cpsr, apsr => 0; spsr => 1) + if (SpecReg == "spsr") + FlagsVal |= 16; + + Parser.Lex(); // Eat identifier token. + Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S)); + return MatchOperand_Success; +} + +/// parseBankedRegOperand - Try to parse a banked register (e.g. "lr_irq") for +/// use in the MRS/MSR instructions added to support virtualization. 
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseBankedRegOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef RegName = Tok.getString();
+
+  // The values here come from B9.2.3 of the ARM ARM, where bits 4-0 are SysM
+  // and bit 5 is R.
+  unsigned Encoding = StringSwitch<unsigned>(RegName.lower())
+                          .Case("r8_usr", 0x00)
+                          .Case("r9_usr", 0x01)
+                          .Case("r10_usr", 0x02)
+                          .Case("r11_usr", 0x03)
+                          .Case("r12_usr", 0x04)
+                          .Case("sp_usr", 0x05)
+                          .Case("lr_usr", 0x06)
+                          .Case("r8_fiq", 0x08)
+                          .Case("r9_fiq", 0x09)
+                          .Case("r10_fiq", 0x0a)
+                          .Case("r11_fiq", 0x0b)
+                          .Case("r12_fiq", 0x0c)
+                          .Case("sp_fiq", 0x0d)
+                          .Case("lr_fiq", 0x0e)
+                          .Case("lr_irq", 0x10)
+                          .Case("sp_irq", 0x11)
+                          .Case("lr_svc", 0x12)
+                          .Case("sp_svc", 0x13)
+                          .Case("lr_abt", 0x14)
+                          .Case("sp_abt", 0x15)
+                          .Case("lr_und", 0x16)
+                          .Case("sp_und", 0x17)
+                          .Case("lr_mon", 0x1c)
+                          .Case("sp_mon", 0x1d)
+                          .Case("elr_hyp", 0x1e)
+                          .Case("sp_hyp", 0x1f)
+                          .Case("spsr_fiq", 0x2e)
+                          .Case("spsr_irq", 0x30)
+                          .Case("spsr_svc", 0x32)
+                          .Case("spsr_abt", 0x34)
+                          .Case("spsr_und", 0x36)
+                          .Case("spsr_mon", 0x3c)
+                          .Case("spsr_hyp", 0x3e)
+                          .Default(~0U);
+
+  if (Encoding == ~0U)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateBankedReg(Encoding, S));
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parsePKHImm(OperandVector &Operands, StringRef Op, int Low,
+                          int High) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), Op + " operand expected.");
+    return MatchOperand_ParseFail;
+  }
+  StringRef ShiftName = Tok.getString();
+  std::string LowerOp = Op.lower();
+  std::string UpperOp = Op.upper();
+  if (ShiftName != LowerOp && ShiftName != UpperOp) {
+    Error(Parser.getTok().getLoc(), Op + " operand expected.");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat shift type token.
+
+  // There must be a '#' and a shift amount.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *ShiftAmount;
+  SMLoc Loc = Parser.getTok().getLoc();
+  SMLoc EndLoc;
+  if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+    Error(Loc, "illegal expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(Loc, "constant expression expected");
+    return MatchOperand_ParseFail;
+  }
+  int Val = CE->getValue();
+  if (Val < Low || Val > High) {
+    Error(Loc, "immediate value out of range");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateImm(CE, Loc, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseSetEndImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(S, "'be' or 'le' operand expected");
+    return MatchOperand_ParseFail;
+  }
+  int Val = StringSwitch<int>(Tok.getString().lower())
+    .Case("be", 1)
+    .Case("le", 0)
+    .Default(-1);
+  Parser.Lex(); // Eat the token.
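+  // Editor's note (illustrative only, not part of the original patch): the
+  // accepted SETEND spellings use a bare identifier rather than a '#'
+  // immediate, e.g.:
+  //   setend be   @ big-endian data accesses
+  //   setend le   @ little-endian data accesses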
+
+  if (Val == -1) {
+    Error(S, "'be' or 'le' operand expected");
+    return MatchOperand_ParseFail;
+  }
+  Operands.push_back(ARMOperand::CreateImm(MCConstantExpr::create(Val,
+                                                                  getContext()),
+                                           S, Tok.getEndLoc()));
+  return MatchOperand_Success;
+}
+
+/// parseShifterImm - Parse the shifter immediate operand for SSAT/USAT
+/// instructions. Legal values are:
+///     lsl #n  'n' in [0,31]
+///     asr #n  'n' in [1,32]
+///             n == 32 encoded as n == 0.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseShifterImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(S, "shift operator 'asr' or 'lsl' expected");
+    return MatchOperand_ParseFail;
+  }
+  StringRef ShiftName = Tok.getString();
+  bool isASR;
+  if (ShiftName == "lsl" || ShiftName == "LSL")
+    isASR = false;
+  else if (ShiftName == "asr" || ShiftName == "ASR")
+    isASR = true;
+  else {
+    Error(S, "shift operator 'asr' or 'lsl' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat the operator.
+
+  // A '#' and a shift amount.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+  SMLoc ExLoc = Parser.getTok().getLoc();
+
+  const MCExpr *ShiftAmount;
+  SMLoc EndLoc;
+  if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+    Error(ExLoc, "malformed shift expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(ExLoc, "shift amount must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Val = CE->getValue();
+  if (isASR) {
+    // Shift amount must be in [1,32]
+    if (Val < 1 || Val > 32) {
+      Error(ExLoc, "'asr' shift amount must be in range [1,32]");
+      return MatchOperand_ParseFail;
+    }
+    // asr #32 encoded as asr #0, but is not allowed in Thumb2 mode.
+    if (isThumb() && Val == 32) {
+      Error(ExLoc, "'asr #32' shift amount not allowed in Thumb mode");
+      return MatchOperand_ParseFail;
+    }
+    if (Val == 32) Val = 0;
+  } else {
+    // Shift amount must be in [0,31]
+    if (Val < 0 || Val > 31) {
+      Error(ExLoc, "'lsl' shift amount must be in range [0,31]");
+      return MatchOperand_ParseFail;
+    }
+  }
+
+  Operands.push_back(ARMOperand::CreateShifterImm(isASR, Val, S, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+/// parseRotImm - Parse the shifter immediate operand for SXTB/UXTB family
+/// of instructions. Legal values are:
+///     ror #n  'n' in {0, 8, 16, 24}
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseRotImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef ShiftName = Tok.getString();
+  if (ShiftName != "ror" && ShiftName != "ROR")
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the operator.
+
+  // A '#' and a rotate amount.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
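+  // Editor's note (illustrative only, not part of the original patch): the
+  // extend instructions rotate the source before extending, e.g.:
+  //   sxtb r0, r1, ror #16   @ sign-extend bits [23:16] of r1
+  //   uxtb r0, r1, ror #8    @ zero-extend bits [15:8] of r1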
+  SMLoc ExLoc = Parser.getTok().getLoc();
+
+  const MCExpr *ShiftAmount;
+  SMLoc EndLoc;
+  if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+    Error(ExLoc, "malformed rotate expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(ExLoc, "rotate amount must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Val = CE->getValue();
+  // Rotate amount must be in {0, 8, 16, 24} (0 is an undocumented extension).
+  // Normally, zero is represented in asm by omitting the rotate operand
+  // entirely.
+  if (Val != 8 && Val != 16 && Val != 24 && Val != 0) {
+    Error(ExLoc, "'ror' rotate amount must be 8, 16, or 24");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateRotImm(Val, S, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseModImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  MCAsmLexer &Lexer = getLexer();
+  int64_t Imm1, Imm2;
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  // 1) A mod_imm operand can appear in the place of a register name:
+  //   add r0, #mod_imm
+  //   add r0, r0, #mod_imm
+  // to correctly handle the latter, we bail out as soon as we see an
+  // identifier.
+  //
+  // 2) Similarly, we do not want to parse into complex operands:
+  //   mov r0, #mod_imm
+  //   mov r0, :lower16:(_foo)
+  if (Parser.getTok().is(AsmToken::Identifier) ||
+      Parser.getTok().is(AsmToken::Colon))
+    return MatchOperand_NoMatch;
+
+  // Hash (dollar) is optional as per the ARMARM
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar)) {
+    // Avoid parsing into complex operands (#:)
+    if (Lexer.peekTok().is(AsmToken::Colon))
+      return MatchOperand_NoMatch;
+
+    // Eat the hash (dollar)
+    Parser.Lex();
+  }
+
+  SMLoc Sx1, Ex1;
+  Sx1 = Parser.getTok().getLoc();
+  const MCExpr *Imm1Exp;
+  if (getParser().parseExpression(Imm1Exp, Ex1)) {
+    Error(Sx1, "malformed expression");
+    return MatchOperand_ParseFail;
+  }
+
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm1Exp);
+
+  if (CE) {
+    // Immediate must fit within 32-bits
+    Imm1 = CE->getValue();
+    int Enc = ARM_AM::getSOImmVal(Imm1);
+    if (Enc != -1 && Parser.getTok().is(AsmToken::EndOfStatement)) {
+      // We have a match!
+      Operands.push_back(ARMOperand::CreateModImm((Enc & 0xFF),
+                                                  (Enc & 0xF00) >> 7,
+                                                  Sx1, Ex1));
+      return MatchOperand_Success;
+    }
+
+    // We have parsed an immediate which is not for us, fallback to a plain
+    // immediate. This can happen for instruction aliases. For an example,
+    // ARMInstrInfo.td defines the alias [mov <-> mvn] which can transform
+    // a mov (mvn) with a mod_imm_neg/mod_imm_not operand into the opposite
+    // instruction with a mod_imm operand. The alias is defined such that the
+    // parser method is shared, that's why we have to do this here.
+    if (Parser.getTok().is(AsmToken::EndOfStatement)) {
+      Operands.push_back(ARMOperand::CreateImm(Imm1Exp, Sx1, Ex1));
+      return MatchOperand_Success;
+    }
+  } else {
+    // Operands like #(l1 - l2) can only be evaluated at a later stage (via an
+    // MCFixup). Fallback to a plain immediate.
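+    // Editor's note (illustrative only, not part of the original patch;
+    // 'l1' and 'l2' stand for any labels resolved later by the assembler):
+    //   add r0, r0, #(l1 - l2)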
+    Operands.push_back(ARMOperand::CreateImm(Imm1Exp, Sx1, Ex1));
+    return MatchOperand_Success;
+  }
+
+  // From this point onward, we expect the input to be a (#bits, #rot) pair
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(Sx1, "expected modified immediate operand: #[0, 255], #even[0-30]");
+    return MatchOperand_ParseFail;
+  }
+
+  if (Imm1 & ~0xFF) {
+    Error(Sx1, "immediate operand must be a number in the range [0, 255]");
+    return MatchOperand_ParseFail;
+  }
+
+  // Eat the comma
+  Parser.Lex();
+
+  // Repeat for #rot
+  SMLoc Sx2, Ex2;
+  Sx2 = Parser.getTok().getLoc();
+
+  // Eat the optional hash (dollar)
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar))
+    Parser.Lex();
+
+  const MCExpr *Imm2Exp;
+  if (getParser().parseExpression(Imm2Exp, Ex2)) {
+    Error(Sx2, "malformed expression");
+    return MatchOperand_ParseFail;
+  }
+
+  CE = dyn_cast<MCConstantExpr>(Imm2Exp);
+
+  if (CE) {
+    Imm2 = CE->getValue();
+    if (!(Imm2 & ~0x1E)) {
+      // We have a match!
+      Operands.push_back(ARMOperand::CreateModImm(Imm1, Imm2, S, Ex2));
+      return MatchOperand_Success;
+    }
+    Error(Sx2, "immediate operand must be an even number in the range [0, 30]");
+    return MatchOperand_ParseFail;
+  } else {
+    Error(Sx2, "constant expression expected");
+    return MatchOperand_ParseFail;
+  }
+}
+
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseBitfield(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  // The bitfield descriptor is really two operands, the LSB and the width.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *LSBExpr;
+  SMLoc E = Parser.getTok().getLoc();
+  if (getParser().parseExpression(LSBExpr)) {
+    Error(E, "malformed immediate expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LSBExpr);
+  if (!CE) {
+    Error(E, "'lsb' operand must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t LSB = CE->getValue();
+  // The LSB must be in the range [0,31]
+  if (LSB < 0 || LSB > 31) {
+    Error(E, "'lsb' operand must be in the range [0,31]");
+    return MatchOperand_ParseFail;
+  }
+  E = Parser.getTok().getLoc();
+
+  // Expect another immediate operand.
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(Parser.getTok().getLoc(), "too few operands");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat comma token.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
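+  // Editor's note (illustrative only, not part of the original patch): the
+  // bitfield instructions take an LSB followed by a width, e.g.:
+  //   bfi  r0, r1, #8, #4   @ insert r1[3:0] into r0[11:8]
+  //   ubfx r2, r3, #4, #8   @ extract r3[11:4], zero-extended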
+
+  const MCExpr *WidthExpr;
+  SMLoc EndLoc;
+  if (getParser().parseExpression(WidthExpr, EndLoc)) {
+    Error(E, "malformed immediate expression");
+    return MatchOperand_ParseFail;
+  }
+  CE = dyn_cast<MCConstantExpr>(WidthExpr);
+  if (!CE) {
+    Error(E, "'width' operand must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Width = CE->getValue();
+  // The width must be in the range [1,32-lsb]
+  if (Width < 1 || Width > 32 - LSB) {
+    Error(E, "'width' operand must be in the range [1,32-lsb]");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateBitfield(LSB, Width, S, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parsePostIdxReg(OperandVector &Operands) {
+  // Check for a post-index addressing register operand. Specifically:
+  // postidx_reg := '+' register {, shift}
+  //              | '-' register {, shift}
+  //              | register {, shift}
+
+  // This method must return MatchOperand_NoMatch without consuming any tokens
+  // in the case where there is no match, as other alternatives take other
+  // parse methods.
+  MCAsmParser &Parser = getParser();
+  AsmToken Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  bool haveEaten = false;
+  bool isAdd = true;
+  if (Tok.is(AsmToken::Plus)) {
+    Parser.Lex(); // Eat the '+' token.
+    haveEaten = true;
+  } else if (Tok.is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the '-' token.
+    isAdd = false;
+    haveEaten = true;
+  }
+
+  SMLoc E = Parser.getTok().getEndLoc();
+  int Reg = tryParseRegister();
+  if (Reg == -1) {
+    if (!haveEaten)
+      return MatchOperand_NoMatch;
+    Error(Parser.getTok().getLoc(), "register expected");
+    return MatchOperand_ParseFail;
+  }
+
+  ARM_AM::ShiftOpc ShiftTy = ARM_AM::no_shift;
+  unsigned ShiftImm = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the ','.
+    if (parseMemRegOffsetShift(ShiftTy, ShiftImm))
+      return MatchOperand_ParseFail;
+
+    // FIXME: Only approximates end...may include intervening whitespace.
+    E = Parser.getTok().getLoc();
+  }
+
+  Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ShiftTy,
+                                                  ShiftImm, S, E));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseAM3Offset(OperandVector &Operands) {
+  // Check for a post-index addressing register operand. Specifically:
+  // am3offset := '+' register
+  //            | '-' register
+  //            | register
+  //            | # imm
+  //            | # + imm
+  //            | # - imm
+
+  // This method must return MatchOperand_NoMatch without consuming any tokens
+  // in the case where there is no match, as other alternatives take other
+  // parse methods.
+  MCAsmParser &Parser = getParser();
+  AsmToken Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+
+  // Do immediates first, as we always parse those if we have a '#'.
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar)) {
+    Parser.Lex(); // Eat '#' or '$'.
+    // Explicitly look for a '-', as we need to encode negative zero
+    // differently.
+    bool isNegative = Parser.getTok().is(AsmToken::Minus);
+    const MCExpr *Offset;
+    SMLoc E;
+    if (getParser().parseExpression(Offset, E))
+      return MatchOperand_ParseFail;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+    if (!CE) {
+      Error(S, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+    // Negative zero is encoded as the flag value INT32_MIN.
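+    // Editor's note (illustrative only, not part of the original patch):
+    // the distinction matters for post-indexed forms, where the add/subtract
+    // direction is encoded separately (the U bit), e.g.:
+    //   ldrh r0, [r1], #-0   @ subtract zero
+    //   ldrh r0, [r1], #0    @ add zero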
+    int32_t Val = CE->getValue();
+    if (isNegative && Val == 0)
+      Val = INT32_MIN;
+
+    Operands.push_back(
+        ARMOperand::CreateImm(MCConstantExpr::create(Val, getContext()), S, E));
+
+    return MatchOperand_Success;
+  }
+
+  bool haveEaten = false;
+  bool isAdd = true;
+  if (Tok.is(AsmToken::Plus)) {
+    Parser.Lex(); // Eat the '+' token.
+    haveEaten = true;
+  } else if (Tok.is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the '-' token.
+    isAdd = false;
+    haveEaten = true;
+  }
+
+  Tok = Parser.getTok();
+  int Reg = tryParseRegister();
+  if (Reg == -1) {
+    if (!haveEaten)
+      return MatchOperand_NoMatch;
+    Error(Tok.getLoc(), "register expected");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ARM_AM::no_shift,
+                                                  0, S, Tok.getEndLoc()));
+
+  return MatchOperand_Success;
+}
+
+/// Convert parsed operands to MCInst. Needed here because this instruction
+/// only has two register operands, but multiplication is commutative so
+/// assemblers should accept both "mul rD, rN, rD" and "mul rD, rD, rN".
+void ARMAsmParser::cvtThumbMultiply(MCInst &Inst,
+                                    const OperandVector &Operands) {
+  ((ARMOperand &)*Operands[3]).addRegOperands(Inst, 1);
+  ((ARMOperand &)*Operands[1]).addCCOutOperands(Inst, 1);
+  // If we have a three-operand form, make sure to set Rn to be the operand
+  // that isn't the same as Rd.
+  unsigned RegOp = 4;
+  if (Operands.size() == 6 &&
+      ((ARMOperand &)*Operands[4]).getReg() ==
+          ((ARMOperand &)*Operands[3]).getReg())
+    RegOp = 5;
+  ((ARMOperand &)*Operands[RegOp]).addRegOperands(Inst, 1);
+  Inst.addOperand(Inst.getOperand(0));
+  ((ARMOperand &)*Operands[2]).addCondCodeOperands(Inst, 2);
+}
+
+void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
+                                    const OperandVector &Operands) {
+  int CondOp = -1, ImmOp = -1;
+  switch(Inst.getOpcode()) {
+    case ARM::tB:
+    case ARM::tBcc: CondOp = 1; ImmOp = 2; break;
+
+    case ARM::t2B:
+    case ARM::t2Bcc: CondOp = 1; ImmOp = 3; break;
+
+    default: llvm_unreachable("Unexpected instruction in cvtThumbBranches");
+  }
+  // First decide whether or not the branch should be conditional
+  // by looking at its location relative to an IT block.
+  if(inITBlock()) {
+    // Inside an IT block we cannot have any conditional branches. Any such
+    // instruction needs to be converted to unconditional form.
+    switch(Inst.getOpcode()) {
+      case ARM::tBcc: Inst.setOpcode(ARM::tB); break;
+      case ARM::t2Bcc: Inst.setOpcode(ARM::t2B); break;
+    }
+  } else {
+    // Outside IT blocks we can only have unconditional branches with AL
+    // condition code or conditional branches with non-AL condition code.
+    unsigned Cond = static_cast<ARMOperand &>(*Operands[CondOp]).getCondCode();
+    switch(Inst.getOpcode()) {
+      case ARM::tB:
+      case ARM::tBcc:
+        Inst.setOpcode(Cond == ARMCC::AL ? ARM::tB : ARM::tBcc);
+        break;
+      case ARM::t2B:
+      case ARM::t2Bcc:
+        Inst.setOpcode(Cond == ARMCC::AL ?
+                       ARM::t2B : ARM::t2Bcc);
+        break;
+    }
+  }
+
+  // Now decide on the encoding size based on the branch target range.
+  switch(Inst.getOpcode()) {
+    // classify tB as either t2B or t1B based on range of immediate operand
+    case ARM::tB: {
+      ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
+      if (!op.isSignedOffset<11, 1>() && isThumbTwo())
+        Inst.setOpcode(ARM::t2B);
+      break;
+    }
+    // classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand
+    case ARM::tBcc: {
+      ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
+      if (!op.isSignedOffset<8, 1>() && isThumbTwo())
+        Inst.setOpcode(ARM::t2Bcc);
+      break;
+    }
+  }
+  ((ARMOperand &)*Operands[ImmOp]).addImmOperands(Inst, 1);
+  ((ARMOperand &)*Operands[CondOp]).addCondCodeOperands(Inst, 2);
+}
+
+/// Parse an ARM memory expression. Returns false on success, or true after
+/// emitting an error. The first token must be a '[' when called.
+bool ARMAsmParser::parseMemory(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S, E;
+  assert(Parser.getTok().is(AsmToken::LBrac) &&
+         "Token is not a Left Bracket");
+  S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat left bracket token.
+
+  const AsmToken &BaseRegTok = Parser.getTok();
+  int BaseRegNum = tryParseRegister();
+  if (BaseRegNum == -1)
+    return Error(BaseRegTok.getLoc(), "register expected");
+
+  // The next token must either be a comma, a colon or a closing bracket.
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Colon) && !Tok.is(AsmToken::Comma) &&
+      !Tok.is(AsmToken::RBrac))
+    return Error(Tok.getLoc(), "malformed memory operand");
+
+  if (Tok.is(AsmToken::RBrac)) {
+    E = Tok.getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0,
+                                             ARM_AM::no_shift, 0, 0, false,
+                                             S, E));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand. It's rather odd, but syntactically valid.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+
+    return false;
+  }
+
+  assert((Tok.is(AsmToken::Colon) || Tok.is(AsmToken::Comma)) &&
+         "Lost colon or comma in memory operand?!");
+  if (Tok.is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the comma.
+  }
+
+  // If we have a ':', it's an alignment specifier.
+  if (Parser.getTok().is(AsmToken::Colon)) {
+    Parser.Lex(); // Eat the ':'.
+    E = Parser.getTok().getLoc();
+    SMLoc AlignmentLoc = Tok.getLoc();
+
+    const MCExpr *Expr;
+    if (getParser().parseExpression(Expr))
+      return true;
+
+    // The expression has to be a constant. Memory references with relocations
+    // don't come through here, as they use the <label> forms of the relevant
+    // instructions.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    if (!CE)
+      return Error (E, "constant expression expected");
+
+    unsigned Align = 0;
+    switch (CE->getValue()) {
+    default:
+      return Error(E,
+                   "alignment specifier must be 16, 32, 64, 128, or 256 bits");
+    case 16:  Align = 2; break;
+    case 32:  Align = 4; break;
+    case 64:  Align = 8; break;
+    case 128: Align = 16; break;
+    case 256: Align = 32; break;
+    }
+
+    // Now we should have the closing ']'
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+    E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    // Don't worry about range checking the value here. That's handled by
+    // the is*() predicates.
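+    // Editor's note (illustrative only, not part of the original patch):
+    // this branch accepts NEON alignment specifiers given in bits; because
+    // the comma before the ':' was optional above, both spellings parse:
+    //   vld1.8 {d0}, [r0:64]    @ base address is 64-bit aligned
+    //   vld1.8 {d0}, [r0, :64]  @ older comma-separated spelling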
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0,
+                                             ARM_AM::no_shift, 0, Align,
+                                             false, S, E, AlignmentLoc));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+
+    return false;
+  }
+
+  // If we have a '#', it's an immediate offset, else assume it's a register
+  // offset. Be friendly and also accept a plain integer (without a leading
+  // hash) for gas compatibility.
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar) ||
+      Parser.getTok().is(AsmToken::Integer)) {
+    if (Parser.getTok().isNot(AsmToken::Integer))
+      Parser.Lex(); // Eat '#' or '$'.
+    E = Parser.getTok().getLoc();
+
+    bool isNegative = getParser().getTok().is(AsmToken::Minus);
+    const MCExpr *Offset;
+    if (getParser().parseExpression(Offset))
+      return true;
+
+    // The expression has to be a constant. Memory references with relocations
+    // don't come through here, as they use the <label> forms of the relevant
+    // instructions.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+    if (!CE)
+      return Error (E, "constant expression expected");
+
+    // If the constant was #-0, represent it as INT32_MIN.
+    int32_t Val = CE->getValue();
+    if (isNegative && Val == 0)
+      CE = MCConstantExpr::create(INT32_MIN, getContext());
+
+    // Now we should have the closing ']'
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+    E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    // Don't worry about range checking the value here. That's handled by
+    // the is*() predicates.
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, CE, 0,
+                                             ARM_AM::no_shift, 0, 0,
+                                             false, S, E));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+
+    return false;
+  }
+
+  // The register offset is optionally preceded by a '+' or '-'
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex(); // Eat the '-'.
+  } else if (Parser.getTok().is(AsmToken::Plus)) {
+    // Nothing to do.
+    Parser.Lex(); // Eat the '+'.
+  }
+
+  E = Parser.getTok().getLoc();
+  int OffsetRegNum = tryParseRegister();
+  if (OffsetRegNum == -1)
+    return Error(E, "register expected");
+
+  // If there's a shift operator, handle it.
+  ARM_AM::ShiftOpc ShiftType = ARM_AM::no_shift;
+  unsigned ShiftImm = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the ','.
+    if (parseMemRegOffsetShift(ShiftType, ShiftImm))
+      return true;
+  }
+
+  // Now we should have the closing ']'
+  if (Parser.getTok().isNot(AsmToken::RBrac))
+    return Error(Parser.getTok().getLoc(), "']' expected");
+  E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat right bracket token.
+
+  Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, OffsetRegNum,
+                                           ShiftType, ShiftImm, 0, isNegative,
+                                           S, E));
+
+  // If there's a pre-indexing writeback marker, '!', just add it as a token
+  // operand.
+  if (Parser.getTok().is(AsmToken::Exclaim)) {
+    Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat the '!'.
+  }
+
+  return false;
+}
+
+/// parseMemRegOffsetShift - one of these two:
+///   ( lsl | lsr | asr | ror ) , # shift_amount
+///   rrx
+/// Returns false if a shift was successfully parsed; true on error.
+bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
+                                          unsigned &Amount) {
+  MCAsmParser &Parser = getParser();
+  SMLoc Loc = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return true;
+  StringRef ShiftName = Tok.getString();
+  if (ShiftName == "lsl" || ShiftName == "LSL" ||
+      ShiftName == "asl" || ShiftName == "ASL")
+    St = ARM_AM::lsl;
+  else if (ShiftName == "lsr" || ShiftName == "LSR")
+    St = ARM_AM::lsr;
+  else if (ShiftName == "asr" || ShiftName == "ASR")
+    St = ARM_AM::asr;
+  else if (ShiftName == "ror" || ShiftName == "ROR")
+    St = ARM_AM::ror;
+  else if (ShiftName == "rrx" || ShiftName == "RRX")
+    St = ARM_AM::rrx;
+  else
+    return Error(Loc, "illegal shift operator");
+  Parser.Lex(); // Eat shift type token.
+
+  // rrx stands alone.
+  Amount = 0;
+  if (St != ARM_AM::rrx) {
+    Loc = Parser.getTok().getLoc();
+    // A '#' and a shift amount.
+    const AsmToken &HashTok = Parser.getTok();
+    if (HashTok.isNot(AsmToken::Hash) &&
+        HashTok.isNot(AsmToken::Dollar))
+      return Error(HashTok.getLoc(), "'#' expected");
+    Parser.Lex(); // Eat hash token.
+
+    const MCExpr *Expr;
+    if (getParser().parseExpression(Expr))
+      return true;
+    // Range check the immediate.
+    // lsl, ror: 0 <= imm <= 31
+    // lsr, asr: 0 <= imm <= 32
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    if (!CE)
+      return Error(Loc, "shift amount must be an immediate");
+    int64_t Imm = CE->getValue();
+    if (Imm < 0 ||
+        ((St == ARM_AM::lsl || St == ARM_AM::ror) && Imm > 31) ||
+        ((St == ARM_AM::lsr || St == ARM_AM::asr) && Imm > 32))
+      return Error(Loc, "immediate shift value out of range");
+    // If <ShiftTy> #0, turn it into a no_shift.
+    if (Imm == 0)
+      St = ARM_AM::lsl;
+    // For consistency, treat lsr #32 and asr #32 as having immediate value 0.
+    if (Imm == 32)
+      Imm = 0;
+    Amount = Imm;
+  }
+
+  return false;
+}
+
+/// parseFPImm - A floating point immediate expression operand.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseFPImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  // Anything that can accept a floating point constant as an operand
+  // needs to go through here, as the regular parseExpression is
+  // integer only.
+  //
+  // This routine still creates a generic Immediate operand, containing
+  // a bitcast of the 64-bit floating point value. The various operands
+  // that accept floats can check whether the value is valid for them
+  // via the standard is*() predicates.
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+
+  // Disambiguate the VMOV forms that can accept an FP immediate.
+  // vmov.f32 <sreg>, #imm
+  // vmov.f64 <dreg>, #imm
+  // vmov.f32 <dreg>, #imm  @ vector f32x2
+  // vmov.f32 <qreg>, #imm  @ vector f32x4
+  //
+  // There are also the NEON VMOV instructions which expect an
+  // integer constant. Make sure we don't try to parse an FPImm
+  // for these:
+  //   vmov.i{8|16|32|64} <dreg|qreg>, #imm
+  ARMOperand &TyOp = static_cast<ARMOperand &>(*Operands[2]);
+  bool isVmovf = TyOp.isToken() &&
+                 (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64");
+  ARMOperand &Mnemonic = static_cast<ARMOperand &>(*Operands[0]);
+  bool isFconst = Mnemonic.isToken() && (Mnemonic.getToken() == "fconstd" ||
+                                         Mnemonic.getToken() == "fconsts");
+  if (!(isVmovf || isFconst))
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat '#' or '$'.
+
+  // Handle negation, as that still comes through as a separate token.
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex();
+  }
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc Loc = Tok.getLoc();
+  if (Tok.is(AsmToken::Real) && isVmovf) {
+    APFloat RealVal(APFloat::IEEEsingle, Tok.getString());
+    uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+    // If we had a '-' in front, toggle the sign bit.
+    IntVal ^= (uint64_t)isNegative << 31;
+    Parser.Lex(); // Eat the token.
+    Operands.push_back(ARMOperand::CreateImm(
+        MCConstantExpr::create(IntVal, getContext()),
+        S, Parser.getTok().getLoc()));
+    return MatchOperand_Success;
+  }
+  // Also handle plain integers. Instructions which allow floating point
+  // immediates also allow a raw encoded 8-bit value.
+  if (Tok.is(AsmToken::Integer) && isFconst) {
+    int64_t Val = Tok.getIntVal();
+    Parser.Lex(); // Eat the token.
+    if (Val > 255 || Val < 0) {
+      Error(Loc, "encoded floating point value out of range");
+      return MatchOperand_ParseFail;
+    }
+    float RealVal = ARM_AM::getFPImmFloat(Val);
+    Val = APFloat(RealVal).bitcastToAPInt().getZExtValue();
+
+    Operands.push_back(ARMOperand::CreateImm(
+        MCConstantExpr::create(Val, getContext()), S,
+        Parser.getTok().getLoc()));
+    return MatchOperand_Success;
+  }
+
+  Error(Loc, "invalid floating point immediate");
+  return MatchOperand_ParseFail;
+}
+
+/// Parse an ARM instruction operand. For now this parses the operand
+/// regardless of the mnemonic.
+bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S, E;
+
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  switch (getLexer().getKind()) {
+  default:
+    Error(Parser.getTok().getLoc(), "unexpected token in operand");
+    return true;
+  case AsmToken::Identifier: {
+    // If we've seen a branch mnemonic, the next operand must be a label. This
+    // is true even if the label is a register name. So "b r1" means branch to
+    // label "r1".
+    bool ExpectLabel = Mnemonic == "b" || Mnemonic == "bl";
+    if (!ExpectLabel) {
+      if (!tryParseRegisterWithWriteBack(Operands))
+        return false;
+      int Res = tryParseShiftRegister(Operands);
+      if (Res == 0) // success
+        return false;
+      else if (Res == -1) // irrecoverable error
+        return true;
+      // If this is VMRS, check for the apsr_nzcv operand.
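+      // Editor's note (illustrative only, not part of the original patch):
+      //   vmrs apsr_nzcv, fpscr   @ copy the FP comparison flags into APSR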
+ if (Mnemonic == "vmrs" && + Parser.getTok().getString().equals_lower("apsr_nzcv")) { + S = Parser.getTok().getLoc(); + Parser.Lex(); + Operands.push_back(ARMOperand::CreateToken("APSR_nzcv", S)); + return false; + } + } + + // Fall though for the Identifier case that is not a register or a + // special name. + } + case AsmToken::LParen: // parenthesized expressions like (_strcmp-4) + case AsmToken::Integer: // things like 1f and 2b as a branch targets + case AsmToken::String: // quoted label names. + case AsmToken::Dot: { // . as a branch target + // This was not a register so parse other operands that start with an + // identifier (like labels) as expressions and create them as immediates. + const MCExpr *IdVal; + S = Parser.getTok().getLoc(); + if (getParser().parseExpression(IdVal)) + return true; + E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(ARMOperand::CreateImm(IdVal, S, E)); + return false; + } + case AsmToken::LBrac: + return parseMemory(Operands); + case AsmToken::LCurly: + return parseRegisterList(Operands); + case AsmToken::Dollar: + case AsmToken::Hash: { + // #42 -> immediate. + S = Parser.getTok().getLoc(); + Parser.Lex(); + + if (Parser.getTok().isNot(AsmToken::Colon)) { + bool isNegative = Parser.getTok().is(AsmToken::Minus); + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return true; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal); + if (CE) { + int32_t Val = CE->getValue(); + if (isNegative && Val == 0) + ImmVal = MCConstantExpr::create(INT32_MIN, getContext()); + } + E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E)); + + // There can be a trailing '!' on operands that we want as a separate + // '!' Token operand. Handle that here. For example, the compatibility + // alias for 'srsdb sp!, #imm' is 'srsdb #imm!'. + if (Parser.getTok().is(AsmToken::Exclaim)) { + Operands.push_back(ARMOperand::CreateToken(Parser.getTok().getString(), + Parser.getTok().getLoc())); + Parser.Lex(); // Eat exclaim token + } + return false; + } + // w/ a ':' after the '#', it's just like a plain ':'. + // FALLTHROUGH + } + case AsmToken::Colon: { + S = Parser.getTok().getLoc(); + // ":lower16:" and ":upper16:" expression prefixes + // FIXME: Check it's an expression prefix, + // e.g. (FOO - :lower16:BAR) isn't legal. + ARMMCExpr::VariantKind RefKind; + if (parsePrefix(RefKind)) + return true; + + const MCExpr *SubExprVal; + if (getParser().parseExpression(SubExprVal)) + return true; + + const MCExpr *ExprVal = ARMMCExpr::create(RefKind, SubExprVal, + getContext()); + E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E)); + return false; + } + case AsmToken::Equal: { + S = Parser.getTok().getLoc(); + if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val) + return Error(S, "unexpected token in operand"); + + Parser.Lex(); // Eat '=' + const MCExpr *SubExprVal; + if (getParser().parseExpression(SubExprVal)) + return true; + E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + + const MCExpr *CPLoc = + getTargetStreamer().addConstantPoolEntry(SubExprVal, S); + Operands.push_back(ARMOperand::CreateImm(CPLoc, S, E)); + return false; + } + } +} + +// parsePrefix - Parse ARM 16-bit relocations expression prefix, i.e. +// :lower16: and :upper16:. 
+bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
+  MCAsmParser &Parser = getParser();
+  RefKind = ARMMCExpr::VK_ARM_None;
+
+  // consume an optional '#' (GNU compatibility)
+  if (getLexer().is(AsmToken::Hash))
+    Parser.Lex();
+
+  // :lower16: and :upper16: modifiers
+  assert(getLexer().is(AsmToken::Colon) && "expected a :");
+  Parser.Lex(); // Eat ':'
+
+  if (getLexer().isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), "expected prefix identifier in operand");
+    return true;
+  }
+
+  enum {
+    COFF = (1 << MCObjectFileInfo::IsCOFF),
+    ELF = (1 << MCObjectFileInfo::IsELF),
+    MACHO = (1 << MCObjectFileInfo::IsMachO)
+  };
+  static const struct PrefixEntry {
+    const char *Spelling;
+    ARMMCExpr::VariantKind VariantKind;
+    uint8_t SupportedFormats;
+  } PrefixEntries[] = {
+    { "lower16", ARMMCExpr::VK_ARM_LO16, COFF | ELF | MACHO },
+    { "upper16", ARMMCExpr::VK_ARM_HI16, COFF | ELF | MACHO },
+  };
+
+  StringRef IDVal = Parser.getTok().getIdentifier();
+
+  const auto &Prefix =
+      std::find_if(std::begin(PrefixEntries), std::end(PrefixEntries),
+                   [&IDVal](const PrefixEntry &PE) {
+                     return PE.Spelling == IDVal;
+                   });
+  if (Prefix == std::end(PrefixEntries)) {
+    Error(Parser.getTok().getLoc(), "unexpected prefix in operand");
+    return true;
+  }
+
+  uint8_t CurrentFormat;
+  switch (getContext().getObjectFileInfo()->getObjectFileType()) {
+  case MCObjectFileInfo::IsMachO:
+    CurrentFormat = MACHO;
+    break;
+  case MCObjectFileInfo::IsELF:
+    CurrentFormat = ELF;
+    break;
+  case MCObjectFileInfo::IsCOFF:
+    CurrentFormat = COFF;
+    break;
+  }
+
+  if (~Prefix->SupportedFormats & CurrentFormat) {
+    Error(Parser.getTok().getLoc(),
+          "cannot represent relocation in the current file format");
+    return true;
+  }
+
+  RefKind = Prefix->VariantKind;
+  Parser.Lex();
+
+  if (getLexer().isNot(AsmToken::Colon)) {
+    Error(Parser.getTok().getLoc(), "unexpected token after prefix");
+    return true;
+  }
+  Parser.Lex(); // Eat the last ':'
+
+  return false;
+}
+
+/// \brief Given a mnemonic, split out possible predication code and carry
+/// setting letters to form a canonical mnemonic and flags.
+//
+// FIXME: Would be nice to autogen this.
+// FIXME: This is a bit of a maze of special cases.
+StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
+                                      unsigned &PredicationCode,
+                                      bool &CarrySetting,
+                                      unsigned &ProcessorIMod,
+                                      StringRef &ITMask) {
+  PredicationCode = ARMCC::AL;
+  CarrySetting = false;
+  ProcessorIMod = 0;
+
+  // Ignore some mnemonics we know aren't predicated forms.
+  //
+  // FIXME: Would be nice to autogen this.
+  if ((Mnemonic == "movs" && isThumb()) ||
+      Mnemonic == "teq" || Mnemonic == "vceq" || Mnemonic == "svc" ||
+      Mnemonic == "mls" || Mnemonic == "smmls" || Mnemonic == "vcls" ||
+      Mnemonic == "vmls" || Mnemonic == "vnmls" || Mnemonic == "vacge" ||
+      Mnemonic == "vcge" || Mnemonic == "vclt" || Mnemonic == "vacgt" ||
+      Mnemonic == "vaclt" || Mnemonic == "vacle" || Mnemonic == "hlt" ||
+      Mnemonic == "vcgt" || Mnemonic == "vcle" || Mnemonic == "smlal" ||
+      Mnemonic == "umaal" || Mnemonic == "umlal" || Mnemonic == "vabal" ||
+      Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal" ||
+      Mnemonic == "fmuls" || Mnemonic == "vmaxnm" || Mnemonic == "vminnm" ||
+      Mnemonic == "vcvta" || Mnemonic == "vcvtn" || Mnemonic == "vcvtp" ||
+      Mnemonic == "vcvtm" || Mnemonic == "vrinta" || Mnemonic == "vrintn" ||
+      Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic == "hvc" ||
+      Mnemonic.startswith("vsel"))
+    return Mnemonic;
+
+  // First, split out any predication code.
+  // Ignore mnemonics we know aren't predicated but do have a carry-set and
+  // so weren't caught above.
+  if (Mnemonic != "adcs" && Mnemonic != "bics" && Mnemonic != "movs" &&
+      Mnemonic != "muls" && Mnemonic != "smlals" && Mnemonic != "smulls" &&
+      Mnemonic != "umlals" && Mnemonic != "umulls" && Mnemonic != "lsls" &&
+      Mnemonic != "sbcs" && Mnemonic != "rscs") {
+    unsigned CC = StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2))
+      .Case("eq", ARMCC::EQ)
+      .Case("ne", ARMCC::NE)
+      .Case("hs", ARMCC::HS)
+      .Case("cs", ARMCC::HS)
+      .Case("lo", ARMCC::LO)
+      .Case("cc", ARMCC::LO)
+      .Case("mi", ARMCC::MI)
+      .Case("pl", ARMCC::PL)
+      .Case("vs", ARMCC::VS)
+      .Case("vc", ARMCC::VC)
+      .Case("hi", ARMCC::HI)
+      .Case("ls", ARMCC::LS)
+      .Case("ge", ARMCC::GE)
+      .Case("lt", ARMCC::LT)
+      .Case("gt", ARMCC::GT)
+      .Case("le", ARMCC::LE)
+      .Case("al", ARMCC::AL)
+      .Default(~0U);
+    if (CC != ~0U) {
+      Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 2);
+      PredicationCode = CC;
+    }
+  }
+
+  // Next, determine if we have a carry setting bit. We explicitly ignore all
+  // the instructions we know end in 's'.
+  if (Mnemonic.endswith("s") &&
+      !(Mnemonic == "cps" || Mnemonic == "mls" ||
+        Mnemonic == "mrs" || Mnemonic == "smmls" || Mnemonic == "vabs" ||
+        Mnemonic == "vcls" || Mnemonic == "vmls" || Mnemonic == "vmrs" ||
+        Mnemonic == "vnmls" || Mnemonic == "vqabs" || Mnemonic == "vrecps" ||
+        Mnemonic == "vrsqrts" || Mnemonic == "srs" || Mnemonic == "flds" ||
+        Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" ||
+        Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
+        Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
+        Mnemonic == "vfms" || Mnemonic == "vfnms" || Mnemonic == "fconsts" ||
+        (Mnemonic == "movs" && isThumb()))) {
+    Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
+    CarrySetting = true;
+  }
+
+  // The "cps" instruction can have an interrupt mode operand which is glued
+  // into the mnemonic. Check if this is the case, split it and parse the
+  // imod op.
+  if (Mnemonic.startswith("cps")) {
+    // Split out any imod code.
+    unsigned IMod =
+        StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2, 2))
+        .Case("ie", ARM_PROC::IE)
+        .Case("id", ARM_PROC::ID)
+        .Default(~0U);
+    if (IMod != ~0U) {
+      Mnemonic = Mnemonic.slice(0, Mnemonic.size()-2);
+      ProcessorIMod = IMod;
+    }
+  }
+
+  // The "it" instruction has the condition mask on the end of the mnemonic.
+  if (Mnemonic.startswith("it")) {
+    ITMask = Mnemonic.slice(2, Mnemonic.size());
+    Mnemonic = Mnemonic.slice(0, 2);
+  }
+
+  return Mnemonic;
+}
+
+/// \brief Given a canonical mnemonic, determine if the instruction ever allows
+/// inclusion of carry set or predication code operands.
+//
+// FIXME: It would be nice to autogen this.
+void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+                                         bool &CanAcceptCarrySet,
+                                         bool &CanAcceptPredicationCode) {
+  CanAcceptCarrySet =
+      Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+      Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
+      Mnemonic == "add" || Mnemonic == "adc" || Mnemonic == "mul" ||
+      Mnemonic == "bic" || Mnemonic == "asr" || Mnemonic == "orr" ||
+      Mnemonic == "mvn" || Mnemonic == "rsb" || Mnemonic == "rsc" ||
+      Mnemonic == "orn" || Mnemonic == "sbc" || Mnemonic == "eor" ||
+      Mnemonic == "neg" || Mnemonic == "vfm" || Mnemonic == "vfnm" ||
+      (!isThumb() &&
+       (Mnemonic == "smull" || Mnemonic == "mov" || Mnemonic == "mla" ||
+        Mnemonic == "smlal" || Mnemonic == "umlal" || Mnemonic == "umull"));
+
+  if (Mnemonic == "bkpt" || Mnemonic == "cbnz" || Mnemonic == "setend" ||
+      Mnemonic == "cps" || Mnemonic == "it" || Mnemonic == "cbz" ||
+      Mnemonic == "trap" || Mnemonic == "hlt" || Mnemonic == "udf" ||
+      Mnemonic.startswith("crc32") || Mnemonic.startswith("cps") ||
+      Mnemonic.startswith("vsel") || Mnemonic == "vmaxnm" ||
+      Mnemonic == "vminnm" || Mnemonic == "vcvta" || Mnemonic == "vcvtn" ||
+      Mnemonic == "vcvtp" || Mnemonic == "vcvtm" || Mnemonic == "vrinta" ||
+      Mnemonic == "vrintn" || Mnemonic == "vrintp" || Mnemonic == "vrintm" ||
+      Mnemonic.startswith("aes") || Mnemonic == "hvc" || Mnemonic == "setpan" ||
+      Mnemonic.startswith("sha1") || Mnemonic.startswith("sha256") ||
+      (FullInst.startswith("vmull") && FullInst.endswith(".p64"))) {
+    // These mnemonics are never predicable
+    CanAcceptPredicationCode = false;
+  } else if (!isThumb()) {
+    // Some instructions are only predicable in Thumb mode
+    CanAcceptPredicationCode =
+        Mnemonic != "cdp2" && Mnemonic != "clrex" && Mnemonic != "mcr2" &&
+        Mnemonic != "mcrr2" && Mnemonic != "mrc2" && Mnemonic != "mrrc2" &&
+        Mnemonic != "dmb" && Mnemonic != "dsb" && Mnemonic != "isb" &&
+        Mnemonic != "pld" && Mnemonic != "pli" && Mnemonic != "pldw" &&
+        Mnemonic != "ldc2" && Mnemonic != "ldc2l" && Mnemonic != "stc2" &&
+        Mnemonic != "stc2l" && !Mnemonic.startswith("rfe") &&
+        !Mnemonic.startswith("srs");
+  } else if (isThumbOne()) {
+    if (hasV6MOps())
+      CanAcceptPredicationCode = Mnemonic != "movs";
+    else
+      CanAcceptPredicationCode = Mnemonic != "nop" && Mnemonic != "movs";
+  } else
+    CanAcceptPredicationCode = true;
+}
+
+// \brief Some Thumb instructions have two operand forms that are not
+// available as three operand, convert to two operand form if possible.
+//
+// FIXME: We would really like to be able to tablegen'erate this.
+void ARMAsmParser::tryConvertingToTwoOperandForm(StringRef Mnemonic,
+                                                 bool CarrySetting,
+                                                 OperandVector &Operands) {
+  if (Operands.size() != 6)
+    return;
+
+  const auto &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+  auto &Op4 = static_cast<ARMOperand &>(*Operands[4]);
+  if (!Op3.isReg() || !Op4.isReg())
+    return;
+
+  auto Op3Reg = Op3.getReg();
+  auto Op4Reg = Op4.getReg();
+
+  // For most Thumb2 cases we just generate the 3 operand form and reduce
+  // it in processInstruction(), but the 3 operand form of ADD (t2ADDrr)
+  // won't accept SP or PC so we do the transformation here taking care
+  // with immediate range in the 'add sp, sp #imm' case.
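+  // Editor's note (illustrative only, not part of the original patch):
+  //   adds r0, r0, #1    =>  adds r0, #1   (Thumb1 two-operand form)
+  //   add  sp, sp, #16   =>  add  sp, #16  (only while the immediate still
+  //                                         satisfies isImm0_508s4())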
+  auto &Op5 = static_cast<ARMOperand &>(*Operands[5]);
+  if (isThumbTwo()) {
+    if (Mnemonic != "add")
+      return;
+    bool TryTransform = Op3Reg == ARM::PC || Op4Reg == ARM::PC ||
+                        (Op5.isReg() && Op5.getReg() == ARM::PC);
+    if (!TryTransform) {
+      TryTransform = (Op3Reg == ARM::SP || Op4Reg == ARM::SP ||
+                      (Op5.isReg() && Op5.getReg() == ARM::SP)) &&
+                     !(Op3Reg == ARM::SP && Op4Reg == ARM::SP &&
+                       Op5.isImm() && !Op5.isImm0_508s4());
+    }
+    if (!TryTransform)
+      return;
+  } else if (!isThumbOne())
+    return;
+
+  if (!(Mnemonic == "add" || Mnemonic == "sub" || Mnemonic == "and" ||
+        Mnemonic == "eor" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+        Mnemonic == "asr" || Mnemonic == "adc" || Mnemonic == "sbc" ||
+        Mnemonic == "ror" || Mnemonic == "orr" || Mnemonic == "bic"))
+    return;
+
+  // If first 2 operands of a 3 operand instruction are the same
+  // then transform to 2 operand version of the same instruction
+  // e.g. 'adds r0, r0, #1' transforms to 'adds r0, #1'
+  bool Transform = Op3Reg == Op4Reg;
+
+  // For commutative operations, we might be able to transform if we swap
+  // Op4 and Op5.  The 'ADD Rdm, SP, Rdm' form is already handled specially
+  // as tADDrsp.
+  const ARMOperand *LastOp = &Op5;
+  bool Swap = false;
+  if (!Transform && Op5.isReg() && Op3Reg == Op5.getReg() &&
+      ((Mnemonic == "add" && Op4Reg != ARM::SP) ||
+       Mnemonic == "and" || Mnemonic == "eor" ||
+       Mnemonic == "adc" || Mnemonic == "orr")) {
+    Swap = true;
+    LastOp = &Op4;
+    Transform = true;
+  }
+
+  // If both registers are the same then remove one of them from
+  // the operand list, with certain exceptions.
+  if (Transform) {
+    // Don't transform 'adds Rd, Rd, Rm' or 'sub{s} Rd, Rd, Rm' because the
+    // 2 operand forms don't exist.
+    if (((Mnemonic == "add" && CarrySetting) || Mnemonic == "sub") &&
+        LastOp->isReg())
+      Transform = false;
+
+    // Don't transform 'add/sub{s} Rd, Rd, #imm' if the immediate fits into
+    // 3-bits because the ARMARM says not to.
+    if ((Mnemonic == "add" || Mnemonic == "sub") && LastOp->isImm0_7())
+      Transform = false;
+  }
+
+  if (Transform) {
+    if (Swap)
+      std::swap(Op4, Op5);
+    Operands.erase(Operands.begin() + 3);
+  }
+}
+
+bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
+                                          OperandVector &Operands) {
+  // FIXME: This is all horribly hacky. We really need a better way to deal
+  // with optional operands like this in the matcher table.
+
+  // The 'mov' mnemonic is special. One variant has a cc_out operand, while
+  // another does not. Specifically, the MOVW instruction does not. So we
+  // special case it here and remove the defaulted (non-setting) cc_out
+  // operand if that's the instruction we're trying to match.
+  //
+  // We do this as post-processing of the explicit operands rather than just
+  // conditionally adding the cc_out in the first place because we need
+  // to check the type of the parsed immediate operand.
+  if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() &&
+      !static_cast<ARMOperand &>(*Operands[4]).isModImm() &&
+      static_cast<ARMOperand &>(*Operands[4]).isImm0_65535Expr() &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
+    return true;
+
+  // Register-register 'add' for thumb does not have a cc_out operand
+  // when there are only two register operands.
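+  // Editor's note (illustrative only, not part of the original patch): e.g.
+  // the two-register Thumb form 'add r0, r1' never sets the flags, so no
+  // cc_out operand is present.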
+ if (isThumb() && Mnemonic == "add" && Operands.size() == 5 && + static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).isReg() && + static_cast<ARMOperand &>(*Operands[1]).getReg() == 0) + return true; + // Register-register 'add' for thumb does not have a cc_out operand + // when it's an ADD Rdm, SP, {Rdm|#imm0_255} instruction. We do + // have to check the immediate range here since Thumb2 has a variant + // that can handle a different range and has a cc_out operand. + if (((isThumb() && Mnemonic == "add") || + (isThumbTwo() && Mnemonic == "sub")) && + Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::SP && + static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 && + ((Mnemonic == "add" && static_cast<ARMOperand &>(*Operands[5]).isReg()) || + static_cast<ARMOperand &>(*Operands[5]).isImm0_1020s4())) + return true; + // For Thumb2, add/sub immediate does not have a cc_out operand for the + // imm0_4095 variant. That's the least-preferred variant when + // selecting via the generic "add" mnemonic, so to know that we + // should remove the cc_out operand, we have to explicitly check that + // it's not one of the other variants. Ugh. + if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") && + Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).isReg() && + static_cast<ARMOperand &>(*Operands[5]).isImm()) { + // Nest conditions rather than one big 'if' statement for readability. + // + // If both registers are low, we're in an IT block, and the immediate is + // in range, we should use encoding T1 instead, which has a cc_out. + if (inITBlock() && + isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) && + isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) && + static_cast<ARMOperand &>(*Operands[5]).isImm0_7()) + return false; + // Check against T3. If the second register is the PC, this is an + // alternate form of ADR, which uses encoding T4, so check for that too. + if (static_cast<ARMOperand &>(*Operands[4]).getReg() != ARM::PC && + static_cast<ARMOperand &>(*Operands[5]).isT2SOImm()) + return false; + + // Otherwise, we use encoding T4, which does not have a cc_out + // operand. + return true; + } + + // The thumb2 multiply instruction doesn't have a CCOut register, so + // if we have a "mul" mnemonic in Thumb mode, check if we'll be able to + // use the 16-bit encoding or not. + if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 && + static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 && + static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).isReg() && + static_cast<ARMOperand &>(*Operands[5]).isReg() && + // If the registers aren't low regs, the destination reg isn't the + // same as one of the source regs, or the cc_out operand is zero + // outside of an IT block, we have to use the 32-bit encoding, so + // remove the cc_out operand. 
+ (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) || + !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) || + !isARMLowRegister(static_cast<ARMOperand &>(*Operands[5]).getReg()) || + !inITBlock() || (static_cast<ARMOperand &>(*Operands[3]).getReg() != + static_cast<ARMOperand &>(*Operands[5]).getReg() && + static_cast<ARMOperand &>(*Operands[3]).getReg() != + static_cast<ARMOperand &>(*Operands[4]).getReg()))) + return true; + + // Also check the 'mul' syntax variant that doesn't specify an explicit + // destination register. + if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 5 && + static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 && + static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).isReg() && + // If the registers aren't low regs or the cc_out operand is zero + // outside of an IT block, we have to use the 32-bit encoding, so + // remove the cc_out operand. + (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) || + !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) || + !inITBlock())) + return true; + + + + // Register-register 'add/sub' for thumb does not have a cc_out operand + // when it's an ADD/SUB SP, #imm. Be lenient on count since there's also + // the "add/sub SP, SP, #imm" version. If the follow-up operands aren't + // right, this will result in better diagnostics (which operand is off) + // anyway. + if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") && + (Operands.size() == 5 || Operands.size() == 6) && + static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::SP && + static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 && + (static_cast<ARMOperand &>(*Operands[4]).isImm() || + (Operands.size() == 6 && + static_cast<ARMOperand &>(*Operands[5]).isImm()))) + return true; + + return false; +} + +bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic, + OperandVector &Operands) { + // VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON + unsigned RegIdx = 3; + if ((Mnemonic == "vrintz" || Mnemonic == "vrintx" || Mnemonic == "vrintr") && + (static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32" || + static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f16")) { + if (static_cast<ARMOperand &>(*Operands[3]).isToken() && + (static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32" || + static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f16")) + RegIdx = 4; + + if (static_cast<ARMOperand &>(*Operands[RegIdx]).isReg() && + (ARMMCRegisterClasses[ARM::DPRRegClassID].contains( + static_cast<ARMOperand &>(*Operands[RegIdx]).getReg()) || + ARMMCRegisterClasses[ARM::QPRRegClassID].contains( + static_cast<ARMOperand &>(*Operands[RegIdx]).getReg()))) + return true; + } + return false; +} + +static bool isDataTypeToken(StringRef Tok) { + return Tok == ".8" || Tok == ".16" || Tok == ".32" || Tok == ".64" || + Tok == ".i8" || Tok == ".i16" || Tok == ".i32" || Tok == ".i64" || + Tok == ".u8" || Tok == ".u16" || Tok == ".u32" || Tok == ".u64" || + Tok == ".s8" || Tok == ".s16" || Tok == ".s32" || Tok == ".s64" || + Tok == ".p8" || Tok == ".p16" || Tok == ".f32" || Tok == ".f64" || + Tok == ".f" || Tok == ".d"; +} + +// FIXME: This bit should probably be handled via an explicit match class +// in the .td files that matches the suffix instead of having it be +// a literal string token the way it is now. 
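+// e.g. 'vldmia.f64 r0!, {d0-d3}' parses the '.f64' token, but for the
+// vldm/vstm family the suffix carries no information and is simply dropped.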
+static bool doesIgnoreDataTypeSuffix(StringRef Mnemonic, StringRef DT) {
+  return Mnemonic.startswith("vldm") || Mnemonic.startswith("vstm");
+}
+static void applyMnemonicAliases(StringRef &Mnemonic, uint64_t Features,
+                                 unsigned VariantID);
+
+static bool RequiresVFPRegListValidation(StringRef Inst,
+                                         bool &AcceptSinglePrecisionOnly,
+                                         bool &AcceptDoublePrecisionOnly) {
+  if (Inst.size() < 7)
+    return false;
+
+  if (Inst.startswith("fldm") || Inst.startswith("fstm")) {
+    StringRef AddressingMode = Inst.substr(4, 2);
+    if (AddressingMode == "ia" || AddressingMode == "db" ||
+        AddressingMode == "ea" || AddressingMode == "fd") {
+      AcceptSinglePrecisionOnly = Inst[6] == 's';
+      AcceptDoublePrecisionOnly = Inst[6] == 'd' || Inst[6] == 'x';
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Parse an ARM instruction mnemonic followed by its operands.
+bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                    SMLoc NameLoc, OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  // FIXME: Can this be done via tablegen in some fashion?
+  bool RequireVFPRegisterListCheck;
+  bool AcceptSinglePrecisionOnly;
+  bool AcceptDoublePrecisionOnly;
+  RequireVFPRegisterListCheck =
+    RequiresVFPRegListValidation(Name, AcceptSinglePrecisionOnly,
+                                 AcceptDoublePrecisionOnly);
+
+  // Apply mnemonic aliases before doing anything else, as the destination
+  // mnemonic may include suffixes and we want to handle them normally.
+  // The generic tblgen'erated code does this later, at the start of
+  // MatchInstructionImpl(), but that's too late for aliases that include
+  // any sort of suffix.
+  uint64_t AvailableFeatures = getAvailableFeatures();
+  unsigned AssemblerDialect = getParser().getAssemblerDialect();
+  applyMnemonicAliases(Name, AvailableFeatures, AssemblerDialect);
+
+  // First check for the ARM-specific .req directive.
+  if (Parser.getTok().is(AsmToken::Identifier) &&
+      Parser.getTok().getIdentifier() == ".req") {
+    parseDirectiveReq(Name, NameLoc);
+    // We always return 'error' for this, as we're done with this
+    // statement and don't need to match the instruction.
+    return true;
+  }
+
+  // Create the leading tokens for the mnemonic, split by '.' characters.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Mnemonic = Name.slice(Start, Next);
+
+  // Split out the predication code and carry setting flag from the mnemonic.
+  unsigned PredicationCode;
+  unsigned ProcessorIMod;
+  bool CarrySetting;
+  StringRef ITMask;
+  Mnemonic = splitMnemonic(Mnemonic, PredicationCode, CarrySetting,
+                           ProcessorIMod, ITMask);
+
+  // In Thumb1, only the branch (B) instruction can be predicated.
+  if (isThumbOne() && PredicationCode != ARMCC::AL && Mnemonic != "b") {
+    Parser.eatToEndOfStatement();
+    return Error(NameLoc, "conditional execution not supported in Thumb1");
+  }
+
+  Operands.push_back(ARMOperand::CreateToken(Mnemonic, NameLoc));
+
+  // Handle the IT instruction ITMask. Convert it to a bitmask. This
+  // is the mask as it will be for the IT encoding if the conditional
+  // encoding has a '1' as its bit0 (i.e. 't' ==> '1'). In the case
+  // where the conditional bit0 is zero, the instruction post-processing
+  // will adjust the mask accordingly.
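+  // Worked example (sketch): for 'itte', ITMask is "te". Starting from
+  // 0b1000 and scanning right to left, 'e' gives 0b0100, then 't' gives
+  // 0b1010, i.e. the mask assuming the condition's bit0 is 1; the
+  // post-processing mentioned above flips the mask bits otherwise.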
+  if (Mnemonic == "it") {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + 2);
+    if (ITMask.size() > 3) {
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "too many conditions on IT instruction");
+    }
+    unsigned Mask = 8;
+    for (unsigned i = ITMask.size(); i != 0; --i) {
+      char pos = ITMask[i - 1];
+      if (pos != 't' && pos != 'e') {
+        Parser.eatToEndOfStatement();
+        return Error(Loc, "illegal IT block condition mask '" + ITMask + "'");
+      }
+      Mask >>= 1;
+      if (ITMask[i - 1] == 't')
+        Mask |= 8;
+    }
+    Operands.push_back(ARMOperand::CreateITMask(Mask, Loc));
+  }
+
+  // FIXME: This is all a pretty gross hack. We should automatically handle
+  // optional operands like this via tblgen.
+
+  // Next, add the CCOut and ConditionCode operands, if needed.
+  //
+  // For mnemonics which can ever incorporate a carry setting bit or
+  // predication code, our matching model involves us always generating CCOut
+  // and ConditionCode operands to match the mnemonic "as written" and then we
+  // let the matcher deal with finding the right instruction or generating an
+  // appropriate error.
+  bool CanAcceptCarrySet, CanAcceptPredicationCode;
+  getMnemonicAcceptInfo(Mnemonic, Name, CanAcceptCarrySet,
+                        CanAcceptPredicationCode);
+
+  // If we had a carry-set on an instruction that can't do that, issue an
+  // error.
+  if (!CanAcceptCarrySet && CarrySetting) {
+    Parser.eatToEndOfStatement();
+    return Error(NameLoc, "instruction '" + Mnemonic +
+                 "' can not set flags, but 's' suffix specified");
+  }
+  // If we had a predication code on an instruction that can't do that, issue
+  // an error.
+  if (!CanAcceptPredicationCode && PredicationCode != ARMCC::AL) {
+    Parser.eatToEndOfStatement();
+    return Error(NameLoc, "instruction '" + Mnemonic +
+                 "' is not predicable, but condition code specified");
+  }
+
+  // Add the carry setting operand, if necessary.
+  if (CanAcceptCarrySet) {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size());
+    Operands.push_back(ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0,
+                                               Loc));
+  }
+
+  // Add the predication code operand, if necessary.
+  if (CanAcceptPredicationCode) {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
+                                      CarrySetting);
+    Operands.push_back(ARMOperand::CreateCondCode(
+        ARMCC::CondCodes(PredicationCode), Loc));
+  }
+
+  // Add the processor imod operand, if necessary.
+  if (ProcessorIMod) {
+    Operands.push_back(ARMOperand::CreateImm(
+        MCConstantExpr::create(ProcessorIMod, getContext()),
+        NameLoc, NameLoc));
+  } else if (Mnemonic == "cps" && isMClass()) {
+    return Error(NameLoc, "instruction 'cps' requires effect for M-class");
+  }
+
+  // Add the remaining tokens in the mnemonic.
+  while (Next != StringRef::npos) {
+    Start = Next;
+    Next = Name.find('.', Start + 1);
+    StringRef ExtraToken = Name.slice(Start, Next);
+
+    // Some NEON instructions have an optional datatype suffix that is
+    // completely ignored. Check for that.
+    if (isDataTypeToken(ExtraToken) &&
+        doesIgnoreDataTypeSuffix(Mnemonic, ExtraToken))
+      continue;
+
+    // In ARM mode, generate an error if the .n qualifier is used.
+    if (ExtraToken == ".n" && !isThumb()) {
+      SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "instruction with .n (narrow) qualifier not allowed "
+                   "in arm mode");
+    }
+
+    // The .n qualifier is always discarded as that is what the tables
+    // and matcher expect. In ARM mode the .w qualifier has no effect, so
+    // discard it to avoid errors that can be caused by the matcher.
+    if (ExtraToken != ".n" && (isThumb() || ExtraToken != ".w")) {
+      SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+      Operands.push_back(ARMOperand::CreateToken(ExtraToken, Loc));
+    }
+  }
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (parseOperand(Operands, Mnemonic)) {
+      Parser.eatToEndOfStatement();
+      return true;
+    }
+
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex(); // Eat the comma.
+
+      // Parse and remember the operand.
+      if (parseOperand(Operands, Mnemonic)) {
+        Parser.eatToEndOfStatement();
+        return true;
+      }
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    Parser.eatToEndOfStatement();
+    return Error(Loc, "unexpected token in argument list");
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+
+  if (RequireVFPRegisterListCheck) {
+    ARMOperand &Op = static_cast<ARMOperand &>(*Operands.back());
+    if (AcceptSinglePrecisionOnly && !Op.isSPRRegList())
+      return Error(Op.getStartLoc(),
+                   "VFP/Neon single precision register expected");
+    if (AcceptDoublePrecisionOnly && !Op.isDPRRegList())
+      return Error(Op.getStartLoc(),
+                   "VFP/Neon double precision register expected");
+  }
+
+  tryConvertingToTwoOperandForm(Mnemonic, CarrySetting, Operands);
+
+  // Some instructions, mostly Thumb, have forms for the same mnemonic that
+  // do and don't have a cc_out optional-def operand. With some spot-checks
+  // of the operand list, we can figure out which variant we're trying to
+  // parse and adjust accordingly before actually matching. We shouldn't ever
+  // try to remove a cc_out operand that was explicitly set on the
+  // mnemonic, of course (CarrySetting == true). Reason #317 why the
+  // table-driven matcher doesn't fit well with the ARM instruction set.
+  if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands))
+    Operands.erase(Operands.begin() + 1);
+
+  // Some instructions have the same mnemonic, but don't always
+  // have a predicate. Distinguish them here and delete the
+  // predicate if needed.
+  if (shouldOmitPredicateOperand(Mnemonic, Operands))
+    Operands.erase(Operands.begin() + 1);
+
+  // ARM mode 'blx' needs special handling, as the register operand version
+  // is predicable, but the label operand version is not. So, we can't rely
+  // on the Mnemonic based checking to correctly figure out when to put
+  // a k_CondCode operand in the list. If we're trying to match the label
+  // version, remove the k_CondCode operand here.
+  if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 &&
+      static_cast<ARMOperand &>(*Operands[2]).isImm())
+    Operands.erase(Operands.begin() + 1);
+
+  // Adjust operands of ldrexd/strexd to MCK_GPRPair.
+  // ldrexd/strexd require an even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when parsing from asm, the two GPRs cannot be
+  // automatically expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  if (!isThumb() && Operands.size() > 4 &&
+      (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
+       Mnemonic == "stlexd")) {
+    bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
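+    // e.g. 'ldrexd r0, r1, [r2]' has r0/r1 merged into a single GPRPair
+    // operand below, while 'ldrexd r1, r2, [r3]' is rejected because Rt
+    // must be even-numbered.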
+    unsigned Idx = isLoad ? 2 : 3;
+    ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]);
+    ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]);
+
+    const MCRegisterClass& MRC = MRI->getRegClass(ARM::GPRRegClassID);
+    // Adjust only if Op1 and Op2 are GPRs.
+    if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) &&
+        MRC.contains(Op2.getReg())) {
+      unsigned Reg1 = Op1.getReg();
+      unsigned Reg2 = Op2.getReg();
+      unsigned Rt = MRI->getEncodingValue(Reg1);
+      unsigned Rt2 = MRI->getEncodingValue(Reg2);
+
+      // Rt2 must be Rt + 1 and Rt must be even.
+      if (Rt + 1 != Rt2 || (Rt & 1)) {
+        Error(Op2.getStartLoc(), isLoad
+                                     ? "destination operands must be sequential"
+                                     : "source operands must be sequential");
+        return true;
+      }
+      unsigned NewReg = MRI->getMatchingSuperReg(Reg1, ARM::gsub_0,
+          &(MRI->getRegClass(ARM::GPRPairRegClassID)));
+      Operands[Idx] =
+          ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc());
+      Operands.erase(Operands.begin() + Idx + 1);
+    }
+  }
+
+  // GNU Assembler extension (compatibility)
+  if ((Mnemonic == "ldrd" || Mnemonic == "strd")) {
+    ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
+    ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+    if (Op3.isMem()) {
+      assert(Op2.isReg() && "expected register argument");
+
+      unsigned SuperReg = MRI->getMatchingSuperReg(
+          Op2.getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID));
+
+      assert(SuperReg && "expected register pair");
+
+      unsigned PairedReg = MRI->getSubReg(SuperReg, ARM::gsub_1);
+
+      Operands.insert(
+          Operands.begin() + 3,
+          ARMOperand::CreateReg(PairedReg, Op2.getStartLoc(), Op2.getEndLoc()));
+    }
+  }
+
+  // FIXME: As said above, this is all a pretty gross hack. This instruction
+  // does not fit with other "subs" and tblgen.
+  // Adjust operands of B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction
+  // so the Mnemonic is the original name "subs" and delete the predicate
+  // operand so it will match the table entry.
+  if (isThumbTwo() && Mnemonic == "sub" && Operands.size() == 6 &&
+      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::PC &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::LR &&
+      static_cast<ARMOperand &>(*Operands[5]).isImm()) {
+    Operands.front() = ARMOperand::CreateToken(Name, NameLoc);
+    Operands.erase(Operands.begin() + 1);
+  }
+  return false;
+}
+
+// Validate context-sensitive operand constraints.
+
+// Return 'true' if the register list contains non-low GPR registers,
+// 'false' otherwise. If Reg is in the register list or is HiReg, set
+// 'containsReg' to true.
+static bool checkLowRegisterList(const MCInst &Inst, unsigned OpNo,
+                                 unsigned Reg, unsigned HiReg,
+                                 bool &containsReg) {
+  containsReg = false;
+  for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
+    unsigned OpReg = Inst.getOperand(i).getReg();
+    if (OpReg == Reg)
+      containsReg = true;
+    // Anything other than a low register isn't legal here.
+    if (!isARMLowRegister(OpReg) && (!HiReg || OpReg != HiReg))
+      return true;
+  }
+  return false;
+}
+
+// Check if the specified register is in the register list of the inst,
+// starting at the indicated operand number.
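+// (Simple linear scan; the validation cases below use it to spot a given
+// register such as SP, PC, or the writeback base inside a parsed list.)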
+static bool listContainsReg(const MCInst &Inst, unsigned OpNo, unsigned Reg) { + for (unsigned i = OpNo, e = Inst.getNumOperands(); i < e; ++i) { + unsigned OpReg = Inst.getOperand(i).getReg(); + if (OpReg == Reg) + return true; + } + return false; +} + +// Return true if instruction has the interesting property of being +// allowed in IT blocks, but not being predicable. +static bool instIsBreakpoint(const MCInst &Inst) { + return Inst.getOpcode() == ARM::tBKPT || + Inst.getOpcode() == ARM::BKPT || + Inst.getOpcode() == ARM::tHLT || + Inst.getOpcode() == ARM::HLT; + +} + +bool ARMAsmParser::validatetLDMRegList(const MCInst &Inst, + const OperandVector &Operands, + unsigned ListNo, bool IsARPop) { + const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]); + bool HasWritebackToken = Op.isToken() && Op.getToken() == "!"; + + bool ListContainsSP = listContainsReg(Inst, ListNo, ARM::SP); + bool ListContainsLR = listContainsReg(Inst, ListNo, ARM::LR); + bool ListContainsPC = listContainsReg(Inst, ListNo, ARM::PC); + + if (!IsARPop && ListContainsSP) + return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(), + "SP may not be in the register list"); + else if (ListContainsPC && ListContainsLR) + return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(), + "PC and LR may not be in the register list simultaneously"); + else if (inITBlock() && !lastInITBlock() && ListContainsPC) + return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(), + "instruction must be outside of IT block or the last " + "instruction in an IT block"); + return false; +} + +bool ARMAsmParser::validatetSTMRegList(const MCInst &Inst, + const OperandVector &Operands, + unsigned ListNo) { + const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]); + bool HasWritebackToken = Op.isToken() && Op.getToken() == "!"; + + bool ListContainsSP = listContainsReg(Inst, ListNo, ARM::SP); + bool ListContainsPC = listContainsReg(Inst, ListNo, ARM::PC); + + if (ListContainsSP && ListContainsPC) + return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(), + "SP and PC may not be in the register list"); + else if (ListContainsSP) + return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(), + "SP may not be in the register list"); + else if (ListContainsPC) + return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(), + "PC may not be in the register list"); + return false; +} + +// FIXME: We would really like to be able to tablegen'erate this. +bool ARMAsmParser::validateInstruction(MCInst &Inst, + const OperandVector &Operands) { + const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); + SMLoc Loc = Operands[0]->getStartLoc(); + + // Check the IT block state first. + // NOTE: BKPT and HLT instructions have the interesting property of being + // allowed in IT blocks, but not being predicable. They just always execute. + if (inITBlock() && !instIsBreakpoint(Inst)) { + unsigned Bit = 1; + if (ITState.FirstCond) + ITState.FirstCond = false; + else + Bit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1; + // The instruction must be predicable. + if (!MCID.isPredicable()) + return Error(Loc, "instructions in IT block must be predicable"); + unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm(); + unsigned ITCond = Bit ? ITState.Cond : + ARMCC::getOppositeCondition(ITState.Cond); + if (Cond != ITCond) { + // Find the condition code Operand to get its SMLoc information. 
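+      // e.g. 'it eq' followed by 'addne r0, r1' lands here: the 't' slot
+      // requires 'eq', so the mismatched condition is diagnosed below.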
+ SMLoc CondLoc; + for (unsigned I = 1; I < Operands.size(); ++I) + if (static_cast<ARMOperand &>(*Operands[I]).isCondCode()) + CondLoc = Operands[I]->getStartLoc(); + return Error(CondLoc, "incorrect condition in IT block; got '" + + StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) + + "', but expected '" + + ARMCondCodeToString(ARMCC::CondCodes(ITCond)) + "'"); + } + // Check for non-'al' condition codes outside of the IT block. + } else if (isThumbTwo() && MCID.isPredicable() && + Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() != + ARMCC::AL && Inst.getOpcode() != ARM::tBcc && + Inst.getOpcode() != ARM::t2Bcc) + return Error(Loc, "predicated instructions must be in IT block"); + + const unsigned Opcode = Inst.getOpcode(); + switch (Opcode) { + case ARM::LDRD: + case ARM::LDRD_PRE: + case ARM::LDRD_POST: { + const unsigned RtReg = Inst.getOperand(0).getReg(); + + // Rt can't be R14. + if (RtReg == ARM::LR) + return Error(Operands[3]->getStartLoc(), + "Rt can't be R14"); + + const unsigned Rt = MRI->getEncodingValue(RtReg); + // Rt must be even-numbered. + if ((Rt & 1) == 1) + return Error(Operands[3]->getStartLoc(), + "Rt must be even-numbered"); + + // Rt2 must be Rt + 1. + const unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg()); + if (Rt2 != Rt + 1) + return Error(Operands[3]->getStartLoc(), + "destination operands must be sequential"); + + if (Opcode == ARM::LDRD_PRE || Opcode == ARM::LDRD_POST) { + const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(3).getReg()); + // For addressing modes with writeback, the base register needs to be + // different from the destination registers. + if (Rn == Rt || Rn == Rt2) + return Error(Operands[3]->getStartLoc(), + "base register needs to be different from destination " + "registers"); + } + + return false; + } + case ARM::t2LDRDi8: + case ARM::t2LDRD_PRE: + case ARM::t2LDRD_POST: { + // Rt2 must be different from Rt. + unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg()); + unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg()); + if (Rt2 == Rt) + return Error(Operands[3]->getStartLoc(), + "destination operands can't be identical"); + return false; + } + case ARM::t2BXJ: { + const unsigned RmReg = Inst.getOperand(0).getReg(); + // Rm = SP is no longer unpredictable in v8-A + if (RmReg == ARM::SP && !hasV8Ops()) + return Error(Operands[2]->getStartLoc(), + "r13 (SP) is an unpredictable operand to BXJ"); + return false; + } + case ARM::STRD: { + // Rt2 must be Rt + 1. + unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg()); + unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg()); + if (Rt2 != Rt + 1) + return Error(Operands[3]->getStartLoc(), + "source operands must be sequential"); + return false; + } + case ARM::STRD_PRE: + case ARM::STRD_POST: { + // Rt2 must be Rt + 1. + unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg()); + unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(2).getReg()); + if (Rt2 != Rt + 1) + return Error(Operands[3]->getStartLoc(), + "source operands must be sequential"); + return false; + } + case ARM::STR_PRE_IMM: + case ARM::STR_PRE_REG: + case ARM::STR_POST_IMM: + case ARM::STR_POST_REG: + case ARM::STRH_PRE: + case ARM::STRH_POST: + case ARM::STRB_PRE_IMM: + case ARM::STRB_PRE_REG: + case ARM::STRB_POST_IMM: + case ARM::STRB_POST_REG: { + // Rt must be different from Rn. 
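+    // e.g. 'strb r0, [r0], #4' writes back into the register being stored,
+    // which is unpredictable, so it is rejected here.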
+ const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg()); + const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg()); + + if (Rt == Rn) + return Error(Operands[3]->getStartLoc(), + "source register and base register can't be identical"); + return false; + } + case ARM::LDR_PRE_IMM: + case ARM::LDR_PRE_REG: + case ARM::LDR_POST_IMM: + case ARM::LDR_POST_REG: + case ARM::LDRH_PRE: + case ARM::LDRH_POST: + case ARM::LDRSH_PRE: + case ARM::LDRSH_POST: + case ARM::LDRB_PRE_IMM: + case ARM::LDRB_PRE_REG: + case ARM::LDRB_POST_IMM: + case ARM::LDRB_POST_REG: + case ARM::LDRSB_PRE: + case ARM::LDRSB_POST: { + // Rt must be different from Rn. + const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg()); + const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg()); + + if (Rt == Rn) + return Error(Operands[3]->getStartLoc(), + "destination register and base register can't be identical"); + return false; + } + case ARM::SBFX: + case ARM::UBFX: { + // Width must be in range [1, 32-lsb]. + unsigned LSB = Inst.getOperand(2).getImm(); + unsigned Widthm1 = Inst.getOperand(3).getImm(); + if (Widthm1 >= 32 - LSB) + return Error(Operands[5]->getStartLoc(), + "bitfield width must be in range [1,32-lsb]"); + return false; + } + // Notionally handles ARM::tLDMIA_UPD too. + case ARM::tLDMIA: { + // If we're parsing Thumb2, the .w variant is available and handles + // most cases that are normally illegal for a Thumb1 LDM instruction. + // We'll make the transformation in processInstruction() if necessary. + // + // Thumb LDM instructions are writeback iff the base register is not + // in the register list. + unsigned Rn = Inst.getOperand(0).getReg(); + bool HasWritebackToken = + (static_cast<ARMOperand &>(*Operands[3]).isToken() && + static_cast<ARMOperand &>(*Operands[3]).getToken() == "!"); + bool ListContainsBase; + if (checkLowRegisterList(Inst, 3, Rn, 0, ListContainsBase) && !isThumbTwo()) + return Error(Operands[3 + HasWritebackToken]->getStartLoc(), + "registers must be in range r0-r7"); + // If we should have writeback, then there should be a '!' token. + if (!ListContainsBase && !HasWritebackToken && !isThumbTwo()) + return Error(Operands[2]->getStartLoc(), + "writeback operator '!' expected"); + // If we should not have writeback, there must not be a '!'. This is + // true even for the 32-bit wide encodings. + if (ListContainsBase && HasWritebackToken) + return Error(Operands[3]->getStartLoc(), + "writeback operator '!' not allowed when base register " + "in register list"); + + if (validatetLDMRegList(Inst, Operands, 3)) + return true; + break; + } + case ARM::LDMIA_UPD: + case ARM::LDMDB_UPD: + case ARM::LDMIB_UPD: + case ARM::LDMDA_UPD: + // ARM variants loading and updating the same register are only officially + // UNPREDICTABLE on v7 upwards. Goodness knows what they did before. 
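+    // e.g. on ARMv7, 'ldmia r3!, {r2, r3}' is rejected here because the
+    // writeback base also appears in the register list.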
+    if (!hasV7Ops())
+      break;
+    if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
+      return Error(Operands.back()->getStartLoc(),
+                   "writeback register not allowed in register list");
+    break;
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+    if (validatetLDMRegList(Inst, Operands, 3))
+      return true;
+    break;
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+    if (validatetSTMRegList(Inst, Operands, 3))
+      return true;
+    break;
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
+    if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
+      return Error(Operands.back()->getStartLoc(),
+                   "writeback register not allowed in register list");
+
+    if (Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
+      if (validatetLDMRegList(Inst, Operands, 3))
+        return true;
+    } else {
+      if (validatetSTMRegList(Inst, Operands, 3))
+        return true;
+    }
+    break;
+  }
+  case ARM::sysLDMIA_UPD:
+  case ARM::sysLDMDA_UPD:
+  case ARM::sysLDMDB_UPD:
+  case ARM::sysLDMIB_UPD:
+    if (!listContainsReg(Inst, 3, ARM::PC))
+      return Error(Operands[4]->getStartLoc(),
+                   "writeback register only allowed on system LDM "
+                   "if PC in register-list");
+    break;
+  case ARM::sysSTMIA_UPD:
+  case ARM::sysSTMDA_UPD:
+  case ARM::sysSTMDB_UPD:
+  case ARM::sysSTMIB_UPD:
+    return Error(Operands[2]->getStartLoc(),
+                 "system STM cannot have writeback register");
+  case ARM::tMUL: {
+    // The second source operand must be the same register as the destination
+    // operand.
+    //
+    // In this case, we must directly check the parsed operands because the
+    // cvtThumbMultiply() function is written in such a way that it guarantees
+    // this first statement is always true for the new Inst. Essentially, the
+    // destination is unconditionally copied into the second source operand
+    // without checking to see if it matches what we actually parsed.
+    if (Operands.size() == 6 && (((ARMOperand &)*Operands[3]).getReg() !=
+                                 ((ARMOperand &)*Operands[5]).getReg()) &&
+        (((ARMOperand &)*Operands[3]).getReg() !=
+         ((ARMOperand &)*Operands[4]).getReg())) {
+      return Error(Operands[3]->getStartLoc(),
+                   "destination register must match source register");
+    }
+    break;
+  }
+  // Like for ldm/stm, push and pop have hi-reg handling versions in Thumb2,
+  // so only issue a diagnostic for Thumb1. The instructions will be
+  // switched to the t2 encodings in processInstruction() if necessary.
+  case ARM::tPOP: {
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 2, 0, ARM::PC, ListContainsBase) &&
+        !isThumbTwo())
+      return Error(Operands[2]->getStartLoc(),
+                   "registers must be in range r0-r7 or pc");
+    if (validatetLDMRegList(Inst, Operands, 2, !isMClass()))
+      return true;
+    break;
+  }
+  case ARM::tPUSH: {
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 2, 0, ARM::LR, ListContainsBase) &&
+        !isThumbTwo())
+      return Error(Operands[2]->getStartLoc(),
+                   "registers must be in range r0-r7 or lr");
+    if (validatetSTMRegList(Inst, Operands, 2))
+      return true;
+    break;
+  }
+  case ARM::tSTMIA_UPD: {
+    bool ListContainsBase, InvalidLowList;
+    InvalidLowList = checkLowRegisterList(Inst, 4, Inst.getOperand(0).getReg(),
+                                          0, ListContainsBase);
+    if (InvalidLowList && !isThumbTwo())
+      return Error(Operands[4]->getStartLoc(),
+                   "registers must be in range r0-r7");
+
+    // This would be converted to a 32-bit stm, but that's not valid if the
+    // writeback register is in the list.
+    if (InvalidLowList && ListContainsBase)
+      return Error(Operands[4]->getStartLoc(),
+                   "writeback operator '!' not allowed when base register "
+                   "in register list");
+
+    if (validatetSTMRegList(Inst, Operands, 4))
+      return true;
+    break;
+  }
+  case ARM::tADDrSP: {
+    // If the non-SP source operand and the destination operand are not the
+    // same, we need thumb2 (for the wide encoding), or we have an error.
+    if (!isThumbTwo() &&
+        Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
+      return Error(Operands[4]->getStartLoc(),
+                   "source register must be the same as destination");
+    }
+    break;
+  }
+  // Final range checking for Thumb unconditional branch instructions.
+  case ARM::tB:
+    if (!(static_cast<ARMOperand &>(*Operands[2])).isSignedOffset<11, 1>())
+      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    break;
+  case ARM::t2B: {
+    int op = (Operands[2]->isImm()) ? 2 : 3;
+    if (!static_cast<ARMOperand &>(*Operands[op]).isSignedOffset<24, 1>())
+      return Error(Operands[op]->getStartLoc(), "branch target out of range");
+    break;
+  }
+  // Final range checking for Thumb conditional branch instructions.
+  case ARM::tBcc:
+    if (!static_cast<ARMOperand &>(*Operands[2]).isSignedOffset<8, 1>())
+      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    break;
+  case ARM::t2Bcc: {
+    int Op = (Operands[2]->isImm()) ? 2 : 3;
+    if (!static_cast<ARMOperand &>(*Operands[Op]).isSignedOffset<20, 1>())
+      return Error(Operands[Op]->getStartLoc(), "branch target out of range");
+    break;
+  }
+  case ARM::MOVi16:
+  case ARM::t2MOVi16:
+  case ARM::t2MOVTi16:
+    {
+    // We want to avoid misleadingly allowing something like "mov r0, <symbol>"
+    // especially when we turn it into a movw and the expression <symbol> does
+    // not have a :lower16: or :upper16: as part of the expression. We don't
+    // want the behavior of silently truncating, which can be unexpected and
+    // lead to bugs that are difficult to find since this is an easy mistake
+    // to make.
+    int i = (Operands[3]->isImm()) ?
3 : 4; + ARMOperand &Op = static_cast<ARMOperand &>(*Operands[i]); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()); + if (CE) break; + const MCExpr *E = dyn_cast<MCExpr>(Op.getImm()); + if (!E) break; + const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E); + if (!ARM16Expr || (ARM16Expr->getKind() != ARMMCExpr::VK_ARM_HI16 && + ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16)) + return Error( + Op.getStartLoc(), + "immediate expression for mov requires :lower16: or :upper16"); + break; + } + } + + return false; +} + +static unsigned getRealVSTOpcode(unsigned Opc, unsigned &Spacing) { + switch(Opc) { + default: llvm_unreachable("unexpected opcode!"); + // VST1LN + case ARM::VST1LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VST1LNd8_UPD; + case ARM::VST1LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST1LNd16_UPD; + case ARM::VST1LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST1LNd32_UPD; + case ARM::VST1LNdWB_register_Asm_8: Spacing = 1; return ARM::VST1LNd8_UPD; + case ARM::VST1LNdWB_register_Asm_16: Spacing = 1; return ARM::VST1LNd16_UPD; + case ARM::VST1LNdWB_register_Asm_32: Spacing = 1; return ARM::VST1LNd32_UPD; + case ARM::VST1LNdAsm_8: Spacing = 1; return ARM::VST1LNd8; + case ARM::VST1LNdAsm_16: Spacing = 1; return ARM::VST1LNd16; + case ARM::VST1LNdAsm_32: Spacing = 1; return ARM::VST1LNd32; + + // VST2LN + case ARM::VST2LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VST2LNd8_UPD; + case ARM::VST2LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST2LNd16_UPD; + case ARM::VST2LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST2LNd32_UPD; + case ARM::VST2LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VST2LNq16_UPD; + case ARM::VST2LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST2LNq32_UPD; + + case ARM::VST2LNdWB_register_Asm_8: Spacing = 1; return ARM::VST2LNd8_UPD; + case ARM::VST2LNdWB_register_Asm_16: Spacing = 1; return ARM::VST2LNd16_UPD; + case ARM::VST2LNdWB_register_Asm_32: Spacing = 1; return ARM::VST2LNd32_UPD; + case ARM::VST2LNqWB_register_Asm_16: Spacing = 2; return ARM::VST2LNq16_UPD; + case ARM::VST2LNqWB_register_Asm_32: Spacing = 2; return ARM::VST2LNq32_UPD; + + case ARM::VST2LNdAsm_8: Spacing = 1; return ARM::VST2LNd8; + case ARM::VST2LNdAsm_16: Spacing = 1; return ARM::VST2LNd16; + case ARM::VST2LNdAsm_32: Spacing = 1; return ARM::VST2LNd32; + case ARM::VST2LNqAsm_16: Spacing = 2; return ARM::VST2LNq16; + case ARM::VST2LNqAsm_32: Spacing = 2; return ARM::VST2LNq32; + + // VST3LN + case ARM::VST3LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VST3LNd8_UPD; + case ARM::VST3LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST3LNd16_UPD; + case ARM::VST3LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST3LNd32_UPD; + case ARM::VST3LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VST3LNq16_UPD; + case ARM::VST3LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST3LNq32_UPD; + case ARM::VST3LNdWB_register_Asm_8: Spacing = 1; return ARM::VST3LNd8_UPD; + case ARM::VST3LNdWB_register_Asm_16: Spacing = 1; return ARM::VST3LNd16_UPD; + case ARM::VST3LNdWB_register_Asm_32: Spacing = 1; return ARM::VST3LNd32_UPD; + case ARM::VST3LNqWB_register_Asm_16: Spacing = 2; return ARM::VST3LNq16_UPD; + case ARM::VST3LNqWB_register_Asm_32: Spacing = 2; return ARM::VST3LNq32_UPD; + case ARM::VST3LNdAsm_8: Spacing = 1; return ARM::VST3LNd8; + case ARM::VST3LNdAsm_16: Spacing = 1; return ARM::VST3LNd16; + case ARM::VST3LNdAsm_32: Spacing = 1; return ARM::VST3LNd32; + case ARM::VST3LNqAsm_16: Spacing = 2; return ARM::VST3LNq16; + case ARM::VST3LNqAsm_32: Spacing = 2; return ARM::VST3LNq32; + + // VST3 + case 
ARM::VST3dWB_fixed_Asm_8: Spacing = 1; return ARM::VST3d8_UPD; + case ARM::VST3dWB_fixed_Asm_16: Spacing = 1; return ARM::VST3d16_UPD; + case ARM::VST3dWB_fixed_Asm_32: Spacing = 1; return ARM::VST3d32_UPD; + case ARM::VST3qWB_fixed_Asm_8: Spacing = 2; return ARM::VST3q8_UPD; + case ARM::VST3qWB_fixed_Asm_16: Spacing = 2; return ARM::VST3q16_UPD; + case ARM::VST3qWB_fixed_Asm_32: Spacing = 2; return ARM::VST3q32_UPD; + case ARM::VST3dWB_register_Asm_8: Spacing = 1; return ARM::VST3d8_UPD; + case ARM::VST3dWB_register_Asm_16: Spacing = 1; return ARM::VST3d16_UPD; + case ARM::VST3dWB_register_Asm_32: Spacing = 1; return ARM::VST3d32_UPD; + case ARM::VST3qWB_register_Asm_8: Spacing = 2; return ARM::VST3q8_UPD; + case ARM::VST3qWB_register_Asm_16: Spacing = 2; return ARM::VST3q16_UPD; + case ARM::VST3qWB_register_Asm_32: Spacing = 2; return ARM::VST3q32_UPD; + case ARM::VST3dAsm_8: Spacing = 1; return ARM::VST3d8; + case ARM::VST3dAsm_16: Spacing = 1; return ARM::VST3d16; + case ARM::VST3dAsm_32: Spacing = 1; return ARM::VST3d32; + case ARM::VST3qAsm_8: Spacing = 2; return ARM::VST3q8; + case ARM::VST3qAsm_16: Spacing = 2; return ARM::VST3q16; + case ARM::VST3qAsm_32: Spacing = 2; return ARM::VST3q32; + + // VST4LN + case ARM::VST4LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VST4LNd8_UPD; + case ARM::VST4LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST4LNd16_UPD; + case ARM::VST4LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST4LNd32_UPD; + case ARM::VST4LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VST4LNq16_UPD; + case ARM::VST4LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST4LNq32_UPD; + case ARM::VST4LNdWB_register_Asm_8: Spacing = 1; return ARM::VST4LNd8_UPD; + case ARM::VST4LNdWB_register_Asm_16: Spacing = 1; return ARM::VST4LNd16_UPD; + case ARM::VST4LNdWB_register_Asm_32: Spacing = 1; return ARM::VST4LNd32_UPD; + case ARM::VST4LNqWB_register_Asm_16: Spacing = 2; return ARM::VST4LNq16_UPD; + case ARM::VST4LNqWB_register_Asm_32: Spacing = 2; return ARM::VST4LNq32_UPD; + case ARM::VST4LNdAsm_8: Spacing = 1; return ARM::VST4LNd8; + case ARM::VST4LNdAsm_16: Spacing = 1; return ARM::VST4LNd16; + case ARM::VST4LNdAsm_32: Spacing = 1; return ARM::VST4LNd32; + case ARM::VST4LNqAsm_16: Spacing = 2; return ARM::VST4LNq16; + case ARM::VST4LNqAsm_32: Spacing = 2; return ARM::VST4LNq32; + + // VST4 + case ARM::VST4dWB_fixed_Asm_8: Spacing = 1; return ARM::VST4d8_UPD; + case ARM::VST4dWB_fixed_Asm_16: Spacing = 1; return ARM::VST4d16_UPD; + case ARM::VST4dWB_fixed_Asm_32: Spacing = 1; return ARM::VST4d32_UPD; + case ARM::VST4qWB_fixed_Asm_8: Spacing = 2; return ARM::VST4q8_UPD; + case ARM::VST4qWB_fixed_Asm_16: Spacing = 2; return ARM::VST4q16_UPD; + case ARM::VST4qWB_fixed_Asm_32: Spacing = 2; return ARM::VST4q32_UPD; + case ARM::VST4dWB_register_Asm_8: Spacing = 1; return ARM::VST4d8_UPD; + case ARM::VST4dWB_register_Asm_16: Spacing = 1; return ARM::VST4d16_UPD; + case ARM::VST4dWB_register_Asm_32: Spacing = 1; return ARM::VST4d32_UPD; + case ARM::VST4qWB_register_Asm_8: Spacing = 2; return ARM::VST4q8_UPD; + case ARM::VST4qWB_register_Asm_16: Spacing = 2; return ARM::VST4q16_UPD; + case ARM::VST4qWB_register_Asm_32: Spacing = 2; return ARM::VST4q32_UPD; + case ARM::VST4dAsm_8: Spacing = 1; return ARM::VST4d8; + case ARM::VST4dAsm_16: Spacing = 1; return ARM::VST4d16; + case ARM::VST4dAsm_32: Spacing = 1; return ARM::VST4d32; + case ARM::VST4qAsm_8: Spacing = 2; return ARM::VST4q8; + case ARM::VST4qAsm_16: Spacing = 2; return ARM::VST4q16; + case ARM::VST4qAsm_32: Spacing = 2; return ARM::VST4q32; + } 
+} + +static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) { + switch(Opc) { + default: llvm_unreachable("unexpected opcode!"); + // VLD1LN + case ARM::VLD1LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD1LNd8_UPD; + case ARM::VLD1LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD1LNd16_UPD; + case ARM::VLD1LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD1LNd32_UPD; + case ARM::VLD1LNdWB_register_Asm_8: Spacing = 1; return ARM::VLD1LNd8_UPD; + case ARM::VLD1LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD1LNd16_UPD; + case ARM::VLD1LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD1LNd32_UPD; + case ARM::VLD1LNdAsm_8: Spacing = 1; return ARM::VLD1LNd8; + case ARM::VLD1LNdAsm_16: Spacing = 1; return ARM::VLD1LNd16; + case ARM::VLD1LNdAsm_32: Spacing = 1; return ARM::VLD1LNd32; + + // VLD2LN + case ARM::VLD2LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD2LNd8_UPD; + case ARM::VLD2LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD2LNd16_UPD; + case ARM::VLD2LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD2LNd32_UPD; + case ARM::VLD2LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD2LNq16_UPD; + case ARM::VLD2LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD2LNq32_UPD; + case ARM::VLD2LNdWB_register_Asm_8: Spacing = 1; return ARM::VLD2LNd8_UPD; + case ARM::VLD2LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD2LNd16_UPD; + case ARM::VLD2LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD2LNd32_UPD; + case ARM::VLD2LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD2LNq16_UPD; + case ARM::VLD2LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD2LNq32_UPD; + case ARM::VLD2LNdAsm_8: Spacing = 1; return ARM::VLD2LNd8; + case ARM::VLD2LNdAsm_16: Spacing = 1; return ARM::VLD2LNd16; + case ARM::VLD2LNdAsm_32: Spacing = 1; return ARM::VLD2LNd32; + case ARM::VLD2LNqAsm_16: Spacing = 2; return ARM::VLD2LNq16; + case ARM::VLD2LNqAsm_32: Spacing = 2; return ARM::VLD2LNq32; + + // VLD3DUP + case ARM::VLD3DUPdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3DUPd8_UPD; + case ARM::VLD3DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD; + case ARM::VLD3DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD; + case ARM::VLD3DUPqWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3DUPq8_UPD; + case ARM::VLD3DUPqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD3DUPq16_UPD; + case ARM::VLD3DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD; + case ARM::VLD3DUPdWB_register_Asm_8: Spacing = 1; return ARM::VLD3DUPd8_UPD; + case ARM::VLD3DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD; + case ARM::VLD3DUPdWB_register_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD; + case ARM::VLD3DUPqWB_register_Asm_8: Spacing = 2; return ARM::VLD3DUPq8_UPD; + case ARM::VLD3DUPqWB_register_Asm_16: Spacing = 2; return ARM::VLD3DUPq16_UPD; + case ARM::VLD3DUPqWB_register_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD; + case ARM::VLD3DUPdAsm_8: Spacing = 1; return ARM::VLD3DUPd8; + case ARM::VLD3DUPdAsm_16: Spacing = 1; return ARM::VLD3DUPd16; + case ARM::VLD3DUPdAsm_32: Spacing = 1; return ARM::VLD3DUPd32; + case ARM::VLD3DUPqAsm_8: Spacing = 2; return ARM::VLD3DUPq8; + case ARM::VLD3DUPqAsm_16: Spacing = 2; return ARM::VLD3DUPq16; + case ARM::VLD3DUPqAsm_32: Spacing = 2; return ARM::VLD3DUPq32; + + // VLD3LN + case ARM::VLD3LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3LNd8_UPD; + case ARM::VLD3LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3LNd16_UPD; + case ARM::VLD3LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3LNd32_UPD; + case ARM::VLD3LNqWB_fixed_Asm_16: Spacing = 1; return 
ARM::VLD3LNq16_UPD; + case ARM::VLD3LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3LNq32_UPD; + case ARM::VLD3LNdWB_register_Asm_8: Spacing = 1; return ARM::VLD3LNd8_UPD; + case ARM::VLD3LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD3LNd16_UPD; + case ARM::VLD3LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD3LNd32_UPD; + case ARM::VLD3LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD3LNq16_UPD; + case ARM::VLD3LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD3LNq32_UPD; + case ARM::VLD3LNdAsm_8: Spacing = 1; return ARM::VLD3LNd8; + case ARM::VLD3LNdAsm_16: Spacing = 1; return ARM::VLD3LNd16; + case ARM::VLD3LNdAsm_32: Spacing = 1; return ARM::VLD3LNd32; + case ARM::VLD3LNqAsm_16: Spacing = 2; return ARM::VLD3LNq16; + case ARM::VLD3LNqAsm_32: Spacing = 2; return ARM::VLD3LNq32; + + // VLD3 + case ARM::VLD3dWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3d8_UPD; + case ARM::VLD3dWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3d16_UPD; + case ARM::VLD3dWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3d32_UPD; + case ARM::VLD3qWB_fixed_Asm_8: Spacing = 2; return ARM::VLD3q8_UPD; + case ARM::VLD3qWB_fixed_Asm_16: Spacing = 2; return ARM::VLD3q16_UPD; + case ARM::VLD3qWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3q32_UPD; + case ARM::VLD3dWB_register_Asm_8: Spacing = 1; return ARM::VLD3d8_UPD; + case ARM::VLD3dWB_register_Asm_16: Spacing = 1; return ARM::VLD3d16_UPD; + case ARM::VLD3dWB_register_Asm_32: Spacing = 1; return ARM::VLD3d32_UPD; + case ARM::VLD3qWB_register_Asm_8: Spacing = 2; return ARM::VLD3q8_UPD; + case ARM::VLD3qWB_register_Asm_16: Spacing = 2; return ARM::VLD3q16_UPD; + case ARM::VLD3qWB_register_Asm_32: Spacing = 2; return ARM::VLD3q32_UPD; + case ARM::VLD3dAsm_8: Spacing = 1; return ARM::VLD3d8; + case ARM::VLD3dAsm_16: Spacing = 1; return ARM::VLD3d16; + case ARM::VLD3dAsm_32: Spacing = 1; return ARM::VLD3d32; + case ARM::VLD3qAsm_8: Spacing = 2; return ARM::VLD3q8; + case ARM::VLD3qAsm_16: Spacing = 2; return ARM::VLD3q16; + case ARM::VLD3qAsm_32: Spacing = 2; return ARM::VLD3q32; + + // VLD4LN + case ARM::VLD4LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD4LNd8_UPD; + case ARM::VLD4LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4LNd16_UPD; + case ARM::VLD4LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4LNd32_UPD; + case ARM::VLD4LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD4LNq16_UPD; + case ARM::VLD4LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4LNq32_UPD; + case ARM::VLD4LNdWB_register_Asm_8: Spacing = 1; return ARM::VLD4LNd8_UPD; + case ARM::VLD4LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD4LNd16_UPD; + case ARM::VLD4LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD4LNd32_UPD; + case ARM::VLD4LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD4LNq16_UPD; + case ARM::VLD4LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD4LNq32_UPD; + case ARM::VLD4LNdAsm_8: Spacing = 1; return ARM::VLD4LNd8; + case ARM::VLD4LNdAsm_16: Spacing = 1; return ARM::VLD4LNd16; + case ARM::VLD4LNdAsm_32: Spacing = 1; return ARM::VLD4LNd32; + case ARM::VLD4LNqAsm_16: Spacing = 2; return ARM::VLD4LNq16; + case ARM::VLD4LNqAsm_32: Spacing = 2; return ARM::VLD4LNq32; + + // VLD4DUP + case ARM::VLD4DUPdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD4DUPd8_UPD; + case ARM::VLD4DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4DUPd16_UPD; + case ARM::VLD4DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4DUPd32_UPD; + case ARM::VLD4DUPqWB_fixed_Asm_8: Spacing = 1; return ARM::VLD4DUPq8_UPD; + case ARM::VLD4DUPqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4DUPq16_UPD; + case 
ARM::VLD4DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4DUPq32_UPD; + case ARM::VLD4DUPdWB_register_Asm_8: Spacing = 1; return ARM::VLD4DUPd8_UPD; + case ARM::VLD4DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD4DUPd16_UPD; + case ARM::VLD4DUPdWB_register_Asm_32: Spacing = 1; return ARM::VLD4DUPd32_UPD; + case ARM::VLD4DUPqWB_register_Asm_8: Spacing = 2; return ARM::VLD4DUPq8_UPD; + case ARM::VLD4DUPqWB_register_Asm_16: Spacing = 2; return ARM::VLD4DUPq16_UPD; + case ARM::VLD4DUPqWB_register_Asm_32: Spacing = 2; return ARM::VLD4DUPq32_UPD; + case ARM::VLD4DUPdAsm_8: Spacing = 1; return ARM::VLD4DUPd8; + case ARM::VLD4DUPdAsm_16: Spacing = 1; return ARM::VLD4DUPd16; + case ARM::VLD4DUPdAsm_32: Spacing = 1; return ARM::VLD4DUPd32; + case ARM::VLD4DUPqAsm_8: Spacing = 2; return ARM::VLD4DUPq8; + case ARM::VLD4DUPqAsm_16: Spacing = 2; return ARM::VLD4DUPq16; + case ARM::VLD4DUPqAsm_32: Spacing = 2; return ARM::VLD4DUPq32; + + // VLD4 + case ARM::VLD4dWB_fixed_Asm_8: Spacing = 1; return ARM::VLD4d8_UPD; + case ARM::VLD4dWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4d16_UPD; + case ARM::VLD4dWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4d32_UPD; + case ARM::VLD4qWB_fixed_Asm_8: Spacing = 2; return ARM::VLD4q8_UPD; + case ARM::VLD4qWB_fixed_Asm_16: Spacing = 2; return ARM::VLD4q16_UPD; + case ARM::VLD4qWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4q32_UPD; + case ARM::VLD4dWB_register_Asm_8: Spacing = 1; return ARM::VLD4d8_UPD; + case ARM::VLD4dWB_register_Asm_16: Spacing = 1; return ARM::VLD4d16_UPD; + case ARM::VLD4dWB_register_Asm_32: Spacing = 1; return ARM::VLD4d32_UPD; + case ARM::VLD4qWB_register_Asm_8: Spacing = 2; return ARM::VLD4q8_UPD; + case ARM::VLD4qWB_register_Asm_16: Spacing = 2; return ARM::VLD4q16_UPD; + case ARM::VLD4qWB_register_Asm_32: Spacing = 2; return ARM::VLD4q32_UPD; + case ARM::VLD4dAsm_8: Spacing = 1; return ARM::VLD4d8; + case ARM::VLD4dAsm_16: Spacing = 1; return ARM::VLD4d16; + case ARM::VLD4dAsm_32: Spacing = 1; return ARM::VLD4d32; + case ARM::VLD4qAsm_8: Spacing = 2; return ARM::VLD4q8; + case ARM::VLD4qAsm_16: Spacing = 2; return ARM::VLD4q16; + case ARM::VLD4qAsm_32: Spacing = 2; return ARM::VLD4q32; + } +} + +bool ARMAsmParser::processInstruction(MCInst &Inst, + const OperandVector &Operands, + MCStreamer &Out) { + switch (Inst.getOpcode()) { + // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction. + case ARM::LDRT_POST: + case ARM::LDRBT_POST: { + const unsigned Opcode = + (Inst.getOpcode() == ARM::LDRT_POST) ? ARM::LDRT_POST_IMM + : ARM::LDRBT_POST_IMM; + MCInst TmpInst; + TmpInst.setOpcode(Opcode); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::createReg(0)); + TmpInst.addOperand(MCOperand::createImm(0)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + Inst = TmpInst; + return true; + } + // Alias for alternate form of 'str{,b}t Rt, [Rn], #imm' instruction. + case ARM::STRT_POST: + case ARM::STRBT_POST: { + const unsigned Opcode = + (Inst.getOpcode() == ARM::STRT_POST) ? 
ARM::STRT_POST_IMM + : ARM::STRBT_POST_IMM; + MCInst TmpInst; + TmpInst.setOpcode(Opcode); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::createReg(0)); + TmpInst.addOperand(MCOperand::createImm(0)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + Inst = TmpInst; + return true; + } + // Alias for alternate form of 'ADR Rd, #imm' instruction. + case ARM::ADDri: { + if (Inst.getOperand(1).getReg() != ARM::PC || + Inst.getOperand(5).getReg() != 0 || + !(Inst.getOperand(2).isExpr() || Inst.getOperand(2).isImm())) + return false; + MCInst TmpInst; + TmpInst.setOpcode(ARM::ADR); + TmpInst.addOperand(Inst.getOperand(0)); + if (Inst.getOperand(2).isImm()) { + // Immediate (mod_imm) will be in its encoded form, we must unencode it + // before passing it to the ADR instruction. + unsigned Enc = Inst.getOperand(2).getImm(); + TmpInst.addOperand(MCOperand::createImm( + ARM_AM::rotr32(Enc & 0xFF, (Enc & 0xF00) >> 7))); + } else { + // Turn PC-relative expression into absolute expression. + // Reading PC provides the start of the current instruction + 8 and + // the transform to adr is biased by that. + MCSymbol *Dot = getContext().createTempSymbol(); + Out.EmitLabel(Dot); + const MCExpr *OpExpr = Inst.getOperand(2).getExpr(); + const MCExpr *InstPC = MCSymbolRefExpr::create(Dot, + MCSymbolRefExpr::VK_None, + getContext()); + const MCExpr *Const8 = MCConstantExpr::create(8, getContext()); + const MCExpr *ReadPC = MCBinaryExpr::createAdd(InstPC, Const8, + getContext()); + const MCExpr *FixupAddr = MCBinaryExpr::createAdd(ReadPC, OpExpr, + getContext()); + TmpInst.addOperand(MCOperand::createExpr(FixupAddr)); + } + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + // Aliases for alternate PC+imm syntax of LDR instructions. + case ARM::t2LDRpcrel: + // Select the narrow version if the immediate will fit. + if (Inst.getOperand(1).getImm() > 0 && + Inst.getOperand(1).getImm() <= 0xff && + !(static_cast<ARMOperand &>(*Operands[2]).isToken() && + static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w")) + Inst.setOpcode(ARM::tLDRpci); + else + Inst.setOpcode(ARM::t2LDRpci); + return true; + case ARM::t2LDRBpcrel: + Inst.setOpcode(ARM::t2LDRBpci); + return true; + case ARM::t2LDRHpcrel: + Inst.setOpcode(ARM::t2LDRHpci); + return true; + case ARM::t2LDRSBpcrel: + Inst.setOpcode(ARM::t2LDRSBpci); + return true; + case ARM::t2LDRSHpcrel: + Inst.setOpcode(ARM::t2LDRSHpci); + return true; + // Handle NEON VST complex aliases. + case ARM::VST1LNdWB_register_Asm_8: + case ARM::VST1LNdWB_register_Asm_16: + case ARM::VST1LNdWB_register_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. 
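+    // Parsed order for e.g. 'vst1.8 {d0[1]}, [r0], r2' is (Vd, lane, Rn,
+    // alignment, Rm, pred); the real VST1LNd*_UPD opcodes expect
+    // (Rn_wb, Rn, alignment, Rm, Vd, lane, pred), as rebuilt below.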
+ unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(4)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(5)); // CondCode + TmpInst.addOperand(Inst.getOperand(6)); + Inst = TmpInst; + return true; + } + + case ARM::VST2LNdWB_register_Asm_8: + case ARM::VST2LNdWB_register_Asm_16: + case ARM::VST2LNdWB_register_Asm_32: + case ARM::VST2LNqWB_register_Asm_16: + case ARM::VST2LNqWB_register_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(4)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(5)); // CondCode + TmpInst.addOperand(Inst.getOperand(6)); + Inst = TmpInst; + return true; + } + + case ARM::VST3LNdWB_register_Asm_8: + case ARM::VST3LNdWB_register_Asm_16: + case ARM::VST3LNdWB_register_Asm_32: + case ARM::VST3LNqWB_register_Asm_16: + case ARM::VST3LNqWB_register_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(4)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(5)); // CondCode + TmpInst.addOperand(Inst.getOperand(6)); + Inst = TmpInst; + return true; + } + + case ARM::VST4LNdWB_register_Asm_8: + case ARM::VST4LNdWB_register_Asm_16: + case ARM::VST4LNdWB_register_Asm_32: + case ARM::VST4LNqWB_register_Asm_16: + case ARM::VST4LNqWB_register_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. 
+ unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(4)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(5)); // CondCode + TmpInst.addOperand(Inst.getOperand(6)); + Inst = TmpInst; + return true; + } + + case ARM::VST1LNdWB_fixed_Asm_8: + case ARM::VST1LNdWB_fixed_Asm_16: + case ARM::VST1LNdWB_fixed_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(MCOperand::createReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VST2LNdWB_fixed_Asm_8: + case ARM::VST2LNdWB_fixed_Asm_16: + case ARM::VST2LNdWB_fixed_Asm_32: + case ARM::VST2LNqWB_fixed_Asm_16: + case ARM::VST2LNqWB_fixed_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(MCOperand::createReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VST3LNdWB_fixed_Asm_8: + case ARM::VST3LNdWB_fixed_Asm_16: + case ARM::VST3LNdWB_fixed_Asm_32: + case ARM::VST3LNqWB_fixed_Asm_16: + case ARM::VST3LNqWB_fixed_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. 
+ unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(MCOperand::createReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VST4LNdWB_fixed_Asm_8: + case ARM::VST4LNdWB_fixed_Asm_16: + case ARM::VST4LNdWB_fixed_Asm_32: + case ARM::VST4LNqWB_fixed_Asm_16: + case ARM::VST4LNqWB_fixed_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(MCOperand::createReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VST1LNdAsm_8: + case ARM::VST1LNdAsm_16: + case ARM::VST1LNdAsm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VST2LNdAsm_8: + case ARM::VST2LNdAsm_16: + case ARM::VST2LNdAsm_32: + case ARM::VST2LNqAsm_16: + case ARM::VST2LNqAsm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VST3LNdAsm_8: + case ARM::VST3LNdAsm_16: + case ARM::VST3LNdAsm_32: + case ARM::VST3LNqAsm_16: + case ARM::VST3LNqAsm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. 
+ unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VST4LNdAsm_8: + case ARM::VST4LNdAsm_16: + case ARM::VST4LNdAsm_32: + case ARM::VST4LNqAsm_16: + case ARM::VST4LNqAsm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + // Handle NEON VLD complex aliases. + case ARM::VLD1LNdWB_register_Asm_8: + case ARM::VLD1LNdWB_register_Asm_16: + case ARM::VLD1LNdWB_register_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(4)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(5)); // CondCode + TmpInst.addOperand(Inst.getOperand(6)); + Inst = TmpInst; + return true; + } + + case ARM::VLD2LNdWB_register_Asm_8: + case ARM::VLD2LNdWB_register_Asm_16: + case ARM::VLD2LNdWB_register_Asm_32: + case ARM::VLD2LNqWB_register_Asm_16: + case ARM::VLD2LNqWB_register_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. 
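+    // For illustration: unlike the stores above, lane loads also read the
+    // registers they write (the untouched lanes pass through), so e.g.
+    // "vld2.8 {d0[3], d1[3]}, [r0], r2" adds Vd again as a tied source
+    // operand just before the lane index below.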
+ unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(4)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(5)); // CondCode + TmpInst.addOperand(Inst.getOperand(6)); + Inst = TmpInst; + return true; + } + + case ARM::VLD3LNdWB_register_Asm_8: + case ARM::VLD3LNdWB_register_Asm_16: + case ARM::VLD3LNdWB_register_Asm_32: + case ARM::VLD3LNqWB_register_Asm_16: + case ARM::VLD3LNqWB_register_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(4)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(5)); // CondCode + TmpInst.addOperand(Inst.getOperand(6)); + Inst = TmpInst; + return true; + } + + case ARM::VLD4LNdWB_register_Asm_8: + case ARM::VLD4LNdWB_register_Asm_16: + case ARM::VLD4LNdWB_register_Asm_32: + case ARM::VLD4LNqWB_register_Asm_16: + case ARM::VLD4LNqWB_register_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. 
+ unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(4)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(5)); // CondCode + TmpInst.addOperand(Inst.getOperand(6)); + Inst = TmpInst; + return true; + } + + case ARM::VLD1LNdWB_fixed_Asm_8: + case ARM::VLD1LNdWB_fixed_Asm_16: + case ARM::VLD1LNdWB_fixed_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(MCOperand::createReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VLD2LNdWB_fixed_Asm_8: + case ARM::VLD2LNdWB_fixed_Asm_16: + case ARM::VLD2LNdWB_fixed_Asm_32: + case ARM::VLD2LNqWB_fixed_Asm_16: + case ARM::VLD2LNqWB_fixed_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(MCOperand::createReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VLD3LNdWB_fixed_Asm_8: + case ARM::VLD3LNdWB_fixed_Asm_16: + case ARM::VLD3LNdWB_fixed_Asm_32: + case ARM::VLD3LNqWB_fixed_Asm_16: + case ARM::VLD3LNqWB_fixed_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. 
+ unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(MCOperand::createReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VLD4LNdWB_fixed_Asm_8: + case ARM::VLD4LNdWB_fixed_Asm_16: + case ARM::VLD4LNdWB_fixed_Asm_32: + case ARM::VLD4LNqWB_fixed_Asm_16: + case ARM::VLD4LNqWB_fixed_Asm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(MCOperand::createReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VLD1LNdAsm_8: + case ARM::VLD1LNdAsm_16: + case ARM::VLD1LNdAsm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VLD2LNdAsm_8: + case ARM::VLD2LNdAsm_16: + case ARM::VLD2LNdAsm_32: + case ARM::VLD2LNqAsm_16: + case ARM::VLD2LNqAsm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. 
+ unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VLD3LNdAsm_8: + case ARM::VLD3LNdAsm_16: + case ARM::VLD3LNdAsm_32: + case ARM::VLD3LNqAsm_16: + case ARM::VLD3LNqAsm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + case ARM::VLD4LNdAsm_8: + case ARM::VLD4LNdAsm_16: + case ARM::VLD4LNdAsm_32: + case ARM::VLD4LNqAsm_16: + case ARM::VLD4LNqAsm_32: { + MCInst TmpInst; + // Shuffle the operands around so the lane index operand is in the + // right place. + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(2)); // Rn + TmpInst.addOperand(Inst.getOperand(3)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd) + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(1)); // lane + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + // VLD3DUP single 3-element structure to all lanes instructions. 
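+  // For illustration: "vld3.8 {d0[], d1[], d2[]}, [r0]" is the
+  // single-spaced d form, while the double-spaced list
+  // "vld3.8 {d0[], d2[], d4[]}, [r0]" presumably maps to the q pseudos
+  // with a doubled Spacing; only d0 survives parsing, the remaining
+  // registers are rebuilt from Spacing below.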
+ case ARM::VLD3DUPdAsm_8: + case ARM::VLD3DUPdAsm_16: + case ARM::VLD3DUPdAsm_32: + case ARM::VLD3DUPqAsm_8: + case ARM::VLD3DUPqAsm_16: + case ARM::VLD3DUPqAsm_32: { + MCInst TmpInst; + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(2)); // alignment + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + + case ARM::VLD3DUPdWB_fixed_Asm_8: + case ARM::VLD3DUPdWB_fixed_Asm_16: + case ARM::VLD3DUPdWB_fixed_Asm_32: + case ARM::VLD3DUPqWB_fixed_Asm_8: + case ARM::VLD3DUPqWB_fixed_Asm_16: + case ARM::VLD3DUPqWB_fixed_Asm_32: { + MCInst TmpInst; + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn + TmpInst.addOperand(Inst.getOperand(2)); // alignment + TmpInst.addOperand(MCOperand::createReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + + case ARM::VLD3DUPdWB_register_Asm_8: + case ARM::VLD3DUPdWB_register_Asm_16: + case ARM::VLD3DUPdWB_register_Asm_32: + case ARM::VLD3DUPqWB_register_Asm_8: + case ARM::VLD3DUPqWB_register_Asm_16: + case ARM::VLD3DUPqWB_register_Asm_32: { + MCInst TmpInst; + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn + TmpInst.addOperand(Inst.getOperand(2)); // alignment + TmpInst.addOperand(Inst.getOperand(3)); // Rm + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + // VLD3 multiple 3-element structure instructions. 
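+  // For illustration, the three variants handled below correspond to
+  // "vld3.8 {d0, d1, d2}, [r0]" (no writeback),
+  // "vld3.8 {d0, d1, d2}, [r0]!" (fixed writeback, Rm emitted as reg 0)
+  // and "vld3.8 {d0, d1, d2}, [r0], r2" (register writeback).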
+  case ARM::VLD3dAsm_8:
+  case ARM::VLD3dAsm_16:
+  case ARM::VLD3dAsm_32:
+  case ARM::VLD3qAsm_8:
+  case ARM::VLD3qAsm_16:
+  case ARM::VLD3qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3dWB_fixed_Asm_8:
+  case ARM::VLD3dWB_fixed_Asm_16:
+  case ARM::VLD3dWB_fixed_Asm_32:
+  case ARM::VLD3qWB_fixed_Asm_8:
+  case ARM::VLD3qWB_fixed_Asm_16:
+  case ARM::VLD3qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3dWB_register_Asm_8:
+  case ARM::VLD3dWB_register_Asm_16:
+  case ARM::VLD3dWB_register_Asm_32:
+  case ARM::VLD3qWB_register_Asm_8:
+  case ARM::VLD3qWB_register_Asm_16:
+  case ARM::VLD3qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD4DUP single 4-element structure to all lanes instructions.
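+  // For illustration: "vld4.8 {d0[], d1[], d2[], d3[]}, [r0]", the
+  // 4-element counterpart of the VLD3DUP handling above.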
+ case ARM::VLD4DUPdAsm_8: + case ARM::VLD4DUPdAsm_16: + case ARM::VLD4DUPdAsm_32: + case ARM::VLD4DUPqAsm_8: + case ARM::VLD4DUPqAsm_16: + case ARM::VLD4DUPqAsm_32: { + MCInst TmpInst; + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(2)); // alignment + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + + case ARM::VLD4DUPdWB_fixed_Asm_8: + case ARM::VLD4DUPdWB_fixed_Asm_16: + case ARM::VLD4DUPdWB_fixed_Asm_32: + case ARM::VLD4DUPqWB_fixed_Asm_8: + case ARM::VLD4DUPqWB_fixed_Asm_16: + case ARM::VLD4DUPqWB_fixed_Asm_32: { + MCInst TmpInst; + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn + TmpInst.addOperand(Inst.getOperand(2)); // alignment + TmpInst.addOperand(MCOperand::createReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + + case ARM::VLD4DUPdWB_register_Asm_8: + case ARM::VLD4DUPdWB_register_Asm_16: + case ARM::VLD4DUPdWB_register_Asm_32: + case ARM::VLD4DUPqWB_register_Asm_8: + case ARM::VLD4DUPqWB_register_Asm_16: + case ARM::VLD4DUPqWB_register_Asm_32: { + MCInst TmpInst; + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn + TmpInst.addOperand(Inst.getOperand(2)); // alignment + TmpInst.addOperand(Inst.getOperand(3)); // Rm + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + // VLD4 multiple 4-element structure instructions. 
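+  // For illustration: "vld4.8 {d0, d1, d2, d3}, [r0]". No tied source
+  // operands are needed here, since every lane of the destination
+  // registers is written, unlike the single-lane loads earlier.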
+ case ARM::VLD4dAsm_8: + case ARM::VLD4dAsm_16: + case ARM::VLD4dAsm_32: + case ARM::VLD4qAsm_8: + case ARM::VLD4qAsm_16: + case ARM::VLD4qAsm_32: { + MCInst TmpInst; + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(2)); // alignment + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + + case ARM::VLD4dWB_fixed_Asm_8: + case ARM::VLD4dWB_fixed_Asm_16: + case ARM::VLD4dWB_fixed_Asm_32: + case ARM::VLD4qWB_fixed_Asm_8: + case ARM::VLD4qWB_fixed_Asm_16: + case ARM::VLD4qWB_fixed_Asm_32: { + MCInst TmpInst; + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn + TmpInst.addOperand(Inst.getOperand(2)); // alignment + TmpInst.addOperand(MCOperand::createReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + + case ARM::VLD4dWB_register_Asm_8: + case ARM::VLD4dWB_register_Asm_16: + case ARM::VLD4dWB_register_Asm_32: + case ARM::VLD4qWB_register_Asm_8: + case ARM::VLD4qWB_register_Asm_16: + case ARM::VLD4qWB_register_Asm_32: { + MCInst TmpInst; + unsigned Spacing; + TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn + TmpInst.addOperand(Inst.getOperand(2)); // alignment + TmpInst.addOperand(Inst.getOperand(3)); // Rm + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + // VST3 multiple 3-element structure instructions. 
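+  // For illustration: "vst3.8 {d0, d1, d2}, [r0]". The store forms mirror
+  // the loads: the address operands (Rn, alignment, Rm) come first and the
+  // register list follows, since a store defines nothing except the
+  // optional writeback base.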
+  case ARM::VST3dAsm_8:
+  case ARM::VST3dAsm_16:
+  case ARM::VST3dAsm_32:
+  case ARM::VST3qAsm_8:
+  case ARM::VST3qAsm_16:
+  case ARM::VST3qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3dWB_fixed_Asm_8:
+  case ARM::VST3dWB_fixed_Asm_16:
+  case ARM::VST3dWB_fixed_Asm_32:
+  case ARM::VST3qWB_fixed_Asm_8:
+  case ARM::VST3qWB_fixed_Asm_16:
+  case ARM::VST3qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3dWB_register_Asm_8:
+  case ARM::VST3dWB_register_Asm_16:
+  case ARM::VST3dWB_register_Asm_32:
+  case ARM::VST3qWB_register_Asm_8:
+  case ARM::VST3qWB_register_Asm_16:
+  case ARM::VST3qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VST4 multiple 4-element structure instructions.
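+  // For illustration: "vst4.8 {d0, d1, d2, d3}, [r0:256]!". In the
+  // writeback forms below the base register operand is added twice, once
+  // as the Rn_wb result tied to the incoming Rn and once as the Rn source.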
+ case ARM::VST4dAsm_8: + case ARM::VST4dAsm_16: + case ARM::VST4dAsm_32: + case ARM::VST4qAsm_8: + case ARM::VST4qAsm_16: + case ARM::VST4qAsm_32: { + MCInst TmpInst; + unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(2)); // alignment + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + + case ARM::VST4dWB_fixed_Asm_8: + case ARM::VST4dWB_fixed_Asm_16: + case ARM::VST4dWB_fixed_Asm_32: + case ARM::VST4qWB_fixed_Asm_8: + case ARM::VST4qWB_fixed_Asm_16: + case ARM::VST4qWB_fixed_Asm_32: { + MCInst TmpInst; + unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn + TmpInst.addOperand(Inst.getOperand(2)); // alignment + TmpInst.addOperand(MCOperand::createReg(0)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + + case ARM::VST4dWB_register_Asm_8: + case ARM::VST4dWB_register_Asm_16: + case ARM::VST4dWB_register_Asm_32: + case ARM::VST4qWB_register_Asm_8: + case ARM::VST4qWB_register_Asm_16: + case ARM::VST4qWB_register_Asm_32: { + MCInst TmpInst; + unsigned Spacing; + TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn + TmpInst.addOperand(Inst.getOperand(2)); // alignment + TmpInst.addOperand(Inst.getOperand(3)); // Rm + TmpInst.addOperand(Inst.getOperand(0)); // Vd + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 2)); + TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() + + Spacing * 3)); + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + Inst = TmpInst; + return true; + } + + // Handle encoding choice for the shift-immediate instructions. + case ARM::t2LSLri: + case ARM::t2LSRri: + case ARM::t2ASRri: { + if (isARMLowRegister(Inst.getOperand(0).getReg()) && + Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && + Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + !(static_cast<ARMOperand &>(*Operands[3]).isToken() && + static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) { + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case ARM::t2LSLri: NewOpc = ARM::tLSLri; break; + case ARM::t2LSRri: NewOpc = ARM::tLSRri; break; + case ARM::t2ASRri: NewOpc = ARM::tASRri; break; + } + // The Thumb1 operands aren't in the same order. Awesome, eh? 
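+      // For illustration: the wide t2 form carries the optional cc_out
+      // operand last, while the narrow Thumb1 form puts it right after Rd,
+      // which is why operand 5 is moved up front below.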
+ MCInst TmpInst; + TmpInst.setOpcode(NewOpc); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(5)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + return false; + } + + // Handle the Thumb2 mode MOV complex aliases. + case ARM::t2MOVsr: + case ARM::t2MOVSsr: { + // Which instruction to expand to depends on the CCOut operand and + // whether we're in an IT block if the register operands are low + // registers. + bool isNarrow = false; + if (isARMLowRegister(Inst.getOperand(0).getReg()) && + isARMLowRegister(Inst.getOperand(1).getReg()) && + isARMLowRegister(Inst.getOperand(2).getReg()) && + Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && + inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr)) + isNarrow = true; + MCInst TmpInst; + unsigned newOpc; + switch(ARM_AM::getSORegShOp(Inst.getOperand(3).getImm())) { + default: llvm_unreachable("unexpected opcode!"); + case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRrr : ARM::t2ASRrr; break; + case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRrr : ARM::t2LSRrr; break; + case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLrr : ARM::t2LSLrr; break; + case ARM_AM::ror: newOpc = isNarrow ? ARM::tROR : ARM::t2RORrr; break; + } + TmpInst.setOpcode(newOpc); + TmpInst.addOperand(Inst.getOperand(0)); // Rd + if (isNarrow) + TmpInst.addOperand(MCOperand::createReg( + Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(2)); // Rm + TmpInst.addOperand(Inst.getOperand(4)); // CondCode + TmpInst.addOperand(Inst.getOperand(5)); + if (!isNarrow) + TmpInst.addOperand(MCOperand::createReg( + Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0)); + Inst = TmpInst; + return true; + } + case ARM::t2MOVsi: + case ARM::t2MOVSsi: { + // Which instruction to expand to depends on the CCOut operand and + // whether we're in an IT block if the register operands are low + // registers. + bool isNarrow = false; + if (isARMLowRegister(Inst.getOperand(0).getReg()) && + isARMLowRegister(Inst.getOperand(1).getReg()) && + inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi)) + isNarrow = true; + MCInst TmpInst; + unsigned newOpc; + switch(ARM_AM::getSORegShOp(Inst.getOperand(2).getImm())) { + default: llvm_unreachable("unexpected opcode!"); + case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRri : ARM::t2ASRri; break; + case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRri : ARM::t2LSRri; break; + case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLri : ARM::t2LSLri; break; + case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break; + case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break; + } + unsigned Amount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()); + if (Amount == 32) Amount = 0; + TmpInst.setOpcode(newOpc); + TmpInst.addOperand(Inst.getOperand(0)); // Rd + if (isNarrow) + TmpInst.addOperand(MCOperand::createReg( + Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0)); + TmpInst.addOperand(Inst.getOperand(1)); // Rn + if (newOpc != ARM::t2RRX) + TmpInst.addOperand(MCOperand::createImm(Amount)); + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + if (!isNarrow) + TmpInst.addOperand(MCOperand::createReg( + Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0)); + Inst = TmpInst; + return true; + } + // Handle the ARM mode MOV complex aliases. 
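+  // For illustration: "asr r0, r1, r2" is canonically "mov r0, r1, asr r2"
+  // (MOVsr) and "ror r0, r1, #4" is "mov r0, r1, ror #4" (MOVsi); the
+  // cases below rebuild those MOV forms with a packed shift operand.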
+ case ARM::ASRr: + case ARM::LSRr: + case ARM::LSLr: + case ARM::RORr: { + ARM_AM::ShiftOpc ShiftTy; + switch(Inst.getOpcode()) { + default: llvm_unreachable("unexpected opcode!"); + case ARM::ASRr: ShiftTy = ARM_AM::asr; break; + case ARM::LSRr: ShiftTy = ARM_AM::lsr; break; + case ARM::LSLr: ShiftTy = ARM_AM::lsl; break; + case ARM::RORr: ShiftTy = ARM_AM::ror; break; + } + unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, 0); + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVsr); + TmpInst.addOperand(Inst.getOperand(0)); // Rd + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(2)); // Rm + TmpInst.addOperand(MCOperand::createImm(Shifter)); // Shift value and ty + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + TmpInst.addOperand(Inst.getOperand(5)); // cc_out + Inst = TmpInst; + return true; + } + case ARM::ASRi: + case ARM::LSRi: + case ARM::LSLi: + case ARM::RORi: { + ARM_AM::ShiftOpc ShiftTy; + switch(Inst.getOpcode()) { + default: llvm_unreachable("unexpected opcode!"); + case ARM::ASRi: ShiftTy = ARM_AM::asr; break; + case ARM::LSRi: ShiftTy = ARM_AM::lsr; break; + case ARM::LSLi: ShiftTy = ARM_AM::lsl; break; + case ARM::RORi: ShiftTy = ARM_AM::ror; break; + } + // A shift by zero is a plain MOVr, not a MOVsi. + unsigned Amt = Inst.getOperand(2).getImm(); + unsigned Opc = Amt == 0 ? ARM::MOVr : ARM::MOVsi; + // A shift by 32 should be encoded as 0 when permitted + if (Amt == 32 && (ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr)) + Amt = 0; + unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, Amt); + MCInst TmpInst; + TmpInst.setOpcode(Opc); + TmpInst.addOperand(Inst.getOperand(0)); // Rd + TmpInst.addOperand(Inst.getOperand(1)); // Rn + if (Opc == ARM::MOVsi) + TmpInst.addOperand(MCOperand::createImm(Shifter)); // Shift value and ty + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + TmpInst.addOperand(Inst.getOperand(5)); // cc_out + Inst = TmpInst; + return true; + } + case ARM::RRXi: { + unsigned Shifter = ARM_AM::getSORegOpc(ARM_AM::rrx, 0); + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVsi); + TmpInst.addOperand(Inst.getOperand(0)); // Rd + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(MCOperand::createImm(Shifter)); // Shift value and ty + TmpInst.addOperand(Inst.getOperand(2)); // CondCode + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); // cc_out + Inst = TmpInst; + return true; + } + case ARM::t2LDMIA_UPD: { + // If this is a load of a single register, then we should use + // a post-indexed LDR instruction instead, per the ARM ARM. + if (Inst.getNumOperands() != 5) + return false; + MCInst TmpInst; + TmpInst.setOpcode(ARM::t2LDR_POST); + TmpInst.addOperand(Inst.getOperand(4)); // Rt + TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(MCOperand::createImm(4)); + TmpInst.addOperand(Inst.getOperand(2)); // CondCode + TmpInst.addOperand(Inst.getOperand(3)); + Inst = TmpInst; + return true; + } + case ARM::t2STMDB_UPD: { + // If this is a store of a single register, then we should use + // a pre-indexed STR instruction instead, per the ARM ARM. 
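+    // For illustration: "stmdb r0!, {r1}" becomes "str r1, [r0, #-4]!",
+    // just as the single-register "ldmia r0!, {r1}" above became
+    // "ldr r1, [r0], #4".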
+ if (Inst.getNumOperands() != 5) + return false; + MCInst TmpInst; + TmpInst.setOpcode(ARM::t2STR_PRE); + TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(4)); // Rt + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(MCOperand::createImm(-4)); + TmpInst.addOperand(Inst.getOperand(2)); // CondCode + TmpInst.addOperand(Inst.getOperand(3)); + Inst = TmpInst; + return true; + } + case ARM::LDMIA_UPD: + // If this is a load of a single register via a 'pop', then we should use + // a post-indexed LDR instruction instead, per the ARM ARM. + if (static_cast<ARMOperand &>(*Operands[0]).getToken() == "pop" && + Inst.getNumOperands() == 5) { + MCInst TmpInst; + TmpInst.setOpcode(ARM::LDR_POST_IMM); + TmpInst.addOperand(Inst.getOperand(4)); // Rt + TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(MCOperand::createReg(0)); // am2offset + TmpInst.addOperand(MCOperand::createImm(4)); + TmpInst.addOperand(Inst.getOperand(2)); // CondCode + TmpInst.addOperand(Inst.getOperand(3)); + Inst = TmpInst; + return true; + } + break; + case ARM::STMDB_UPD: + // If this is a store of a single register via a 'push', then we should use + // a pre-indexed STR instruction instead, per the ARM ARM. + if (static_cast<ARMOperand &>(*Operands[0]).getToken() == "push" && + Inst.getNumOperands() == 5) { + MCInst TmpInst; + TmpInst.setOpcode(ARM::STR_PRE_IMM); + TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(4)); // Rt + TmpInst.addOperand(Inst.getOperand(1)); // addrmode_imm12 + TmpInst.addOperand(MCOperand::createImm(-4)); + TmpInst.addOperand(Inst.getOperand(2)); // CondCode + TmpInst.addOperand(Inst.getOperand(3)); + Inst = TmpInst; + } + break; + case ARM::t2ADDri12: + // If the immediate fits for encoding T3 (t2ADDri) and the generic "add" + // mnemonic was used (not "addw"), encoding T3 is preferred. + if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "add" || + ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1) + break; + Inst.setOpcode(ARM::t2ADDri); + Inst.addOperand(MCOperand::createReg(0)); // cc_out + break; + case ARM::t2SUBri12: + // If the immediate fits for encoding T3 (t2SUBri) and the generic "sub" + // mnemonic was used (not "subw"), encoding T3 is preferred. + if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "sub" || + ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1) + break; + Inst.setOpcode(ARM::t2SUBri); + Inst.addOperand(MCOperand::createReg(0)); // cc_out + break; + case ARM::tADDi8: + // If the immediate is in the range 0-7, we want tADDi3 iff Rd was + // explicitly specified. From the ARM ARM: "Encoding T1 is preferred + // to encoding T2 if <Rd> is specified and encoding T2 is preferred + // to encoding T1 if <Rd> is omitted." + if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) { + Inst.setOpcode(ARM::tADDi3); + return true; + } + break; + case ARM::tSUBi8: + // If the immediate is in the range 0-7, we want tADDi3 iff Rd was + // explicitly specified. From the ARM ARM: "Encoding T1 is preferred + // to encoding T2 if <Rd> is specified and encoding T2 is preferred + // to encoding T1 if <Rd> is omitted." 
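+    // For illustration: "subs r2, r2, #5" spells out <Rd> and so prefers
+    // the three-operand encoding T1 (tSUBi3), while "subs r2, #5" omits it
+    // and stays on encoding T2 (tSUBi8).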
+ if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) { + Inst.setOpcode(ARM::tSUBi3); + return true; + } + break; + case ARM::t2ADDri: + case ARM::t2SUBri: { + // If the destination and first source operand are the same, and + // the flags are compatible with the current IT status, use encoding T2 + // instead of T3. For compatibility with the system 'as'. Make sure the + // wide encoding wasn't explicit. + if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() || + !isARMLowRegister(Inst.getOperand(0).getReg()) || + (unsigned)Inst.getOperand(2).getImm() > 255 || + ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) || + (inITBlock() && Inst.getOperand(5).getReg() != 0)) || + (static_cast<ARMOperand &>(*Operands[3]).isToken() && + static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) + break; + MCInst TmpInst; + TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ? + ARM::tADDi8 : ARM::tSUBi8); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(5)); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + case ARM::t2ADDrr: { + // If the destination and first source operand are the same, and + // there's no setting of the flags, use encoding T2 instead of T3. + // Note that this is only for ADD, not SUB. This mirrors the system + // 'as' behaviour. Also take advantage of ADD being commutative. + // Make sure the wide encoding wasn't explicit. + bool Swap = false; + auto DestReg = Inst.getOperand(0).getReg(); + bool Transform = DestReg == Inst.getOperand(1).getReg(); + if (!Transform && DestReg == Inst.getOperand(2).getReg()) { + Transform = true; + Swap = true; + } + if (!Transform || + Inst.getOperand(5).getReg() != 0 || + (static_cast<ARMOperand &>(*Operands[3]).isToken() && + static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) + break; + MCInst TmpInst; + TmpInst.setOpcode(ARM::tADDhirr); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(Swap ? 1 : 2)); + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + case ARM::tADDrSP: { + // If the non-SP source operand and the destination operand are not the + // same, we need to use the 32-bit encoding if it's available. + if (Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) { + Inst.setOpcode(ARM::t2ADDrr); + Inst.addOperand(MCOperand::createReg(0)); // cc_out + return true; + } + break; + } + case ARM::tB: + // A Thumb conditional branch outside of an IT block is a tBcc. + if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()) { + Inst.setOpcode(ARM::tBcc); + return true; + } + break; + case ARM::t2B: + // A Thumb2 conditional branch outside of an IT block is a t2Bcc. + if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()){ + Inst.setOpcode(ARM::t2Bcc); + return true; + } + break; + case ARM::t2Bcc: + // If the conditional is AL or we're in an IT block, we really want t2B. + if (Inst.getOperand(1).getImm() == ARMCC::AL || inITBlock()) { + Inst.setOpcode(ARM::t2B); + return true; + } + break; + case ARM::tBcc: + // If the conditional is AL, we really want tB. 
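+    // For illustration: "beq label" outside an IT block matches the
+    // conditional tBcc, while "b label" (condition AL) is the plain tB;
+    // this case and the ones above flip between the two as needed.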
+ if (Inst.getOperand(1).getImm() == ARMCC::AL) { + Inst.setOpcode(ARM::tB); + return true; + } + break; + case ARM::tLDMIA: { + // If the register list contains any high registers, or if the writeback + // doesn't match what tLDMIA can do, we need to use the 32-bit encoding + // instead if we're in Thumb2. Otherwise, this should have generated + // an error in validateInstruction(). + unsigned Rn = Inst.getOperand(0).getReg(); + bool hasWritebackToken = + (static_cast<ARMOperand &>(*Operands[3]).isToken() && + static_cast<ARMOperand &>(*Operands[3]).getToken() == "!"); + bool listContainsBase; + if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) || + (!listContainsBase && !hasWritebackToken) || + (listContainsBase && hasWritebackToken)) { + // 16-bit encoding isn't sufficient. Switch to the 32-bit version. + assert (isThumbTwo()); + Inst.setOpcode(hasWritebackToken ? ARM::t2LDMIA_UPD : ARM::t2LDMIA); + // If we're switching to the updating version, we need to insert + // the writeback tied operand. + if (hasWritebackToken) + Inst.insert(Inst.begin(), + MCOperand::createReg(Inst.getOperand(0).getReg())); + return true; + } + break; + } + case ARM::tSTMIA_UPD: { + // If the register list contains any high registers, we need to use + // the 32-bit encoding instead if we're in Thumb2. Otherwise, this + // should have generated an error in validateInstruction(). + unsigned Rn = Inst.getOperand(0).getReg(); + bool listContainsBase; + if (checkLowRegisterList(Inst, 4, Rn, 0, listContainsBase)) { + // 16-bit encoding isn't sufficient. Switch to the 32-bit version. + assert (isThumbTwo()); + Inst.setOpcode(ARM::t2STMIA_UPD); + return true; + } + break; + } + case ARM::tPOP: { + bool listContainsBase; + // If the register list contains any high registers, we need to use + // the 32-bit encoding instead if we're in Thumb2. Otherwise, this + // should have generated an error in validateInstruction(). + if (!checkLowRegisterList(Inst, 2, 0, ARM::PC, listContainsBase)) + return false; + assert (isThumbTwo()); + Inst.setOpcode(ARM::t2LDMIA_UPD); + // Add the base register and writeback operands. + Inst.insert(Inst.begin(), MCOperand::createReg(ARM::SP)); + Inst.insert(Inst.begin(), MCOperand::createReg(ARM::SP)); + return true; + } + case ARM::tPUSH: { + bool listContainsBase; + if (!checkLowRegisterList(Inst, 2, 0, ARM::LR, listContainsBase)) + return false; + assert (isThumbTwo()); + Inst.setOpcode(ARM::t2STMDB_UPD); + // Add the base register and writeback operands. + Inst.insert(Inst.begin(), MCOperand::createReg(ARM::SP)); + Inst.insert(Inst.begin(), MCOperand::createReg(ARM::SP)); + return true; + } + case ARM::t2MOVi: { + // If we can use the 16-bit encoding and the user didn't explicitly + // request the 32-bit variant, transform it here. + if (isARMLowRegister(Inst.getOperand(0).getReg()) && + (unsigned)Inst.getOperand(1).getImm() <= 255 && + ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL && + Inst.getOperand(4).getReg() == ARM::CPSR) || + (inITBlock() && Inst.getOperand(4).getReg() == 0)) && + (!static_cast<ARMOperand &>(*Operands[2]).isToken() || + static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { + // The operands aren't in the same order for tMOVi8... 
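+      // For illustration: tMOVi8 expects (Rd, cc_out, imm, pred) while
+      // t2MOVi carries cc_out last, so operand 4 is hoisted forward below.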
+ MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVi8); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(4)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + Inst = TmpInst; + return true; + } + break; + } + case ARM::t2MOVr: { + // If we can use the 16-bit encoding and the user didn't explicitly + // request the 32-bit variant, transform it here. + if (isARMLowRegister(Inst.getOperand(0).getReg()) && + isARMLowRegister(Inst.getOperand(1).getReg()) && + Inst.getOperand(2).getImm() == ARMCC::AL && + Inst.getOperand(4).getReg() == ARM::CPSR && + (!static_cast<ARMOperand &>(*Operands[2]).isToken() || + static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { + // The operands aren't the same for tMOV[S]r... (no cc_out) + MCInst TmpInst; + TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + Inst = TmpInst; + return true; + } + break; + } + case ARM::t2SXTH: + case ARM::t2SXTB: + case ARM::t2UXTH: + case ARM::t2UXTB: { + // If we can use the 16-bit encoding and the user didn't explicitly + // request the 32-bit variant, transform it here. + if (isARMLowRegister(Inst.getOperand(0).getReg()) && + isARMLowRegister(Inst.getOperand(1).getReg()) && + Inst.getOperand(2).getImm() == 0 && + (!static_cast<ARMOperand &>(*Operands[2]).isToken() || + static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Illegal opcode!"); + case ARM::t2SXTH: NewOpc = ARM::tSXTH; break; + case ARM::t2SXTB: NewOpc = ARM::tSXTB; break; + case ARM::t2UXTH: NewOpc = ARM::tUXTH; break; + case ARM::t2UXTB: NewOpc = ARM::tUXTB; break; + } + // The operands aren't the same for thumb1 (no rotate operand). 
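+      // For illustration: "sxth r0, r1" can narrow to tSXTH, but
+      // "sxth r0, r1, ror #8" has a nonzero rotate (operand 2 above) and
+      // must keep the 32-bit encoding.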
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  }
+  case ARM::MOVsi: {
+    ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(2).getImm());
+    // rrx shifts and asr/lsr of #32 are encoded as 0.
+    if (SOpc == ARM_AM::rrx || SOpc == ARM_AM::asr || SOpc == ARM_AM::lsr)
+      return false;
+    if (ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()) == 0) {
+      // Shifting by zero is accepted as a vanilla 'MOVr'.
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVr);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      TmpInst.addOperand(Inst.getOperand(5));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  case ARM::ANDrsi:
+  case ARM::ORRrsi:
+  case ARM::EORrsi:
+  case ARM::BICrsi:
+  case ARM::SUBrsi:
+  case ARM::ADDrsi: {
+    unsigned newOpc;
+    ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(3).getImm());
+    if (SOpc == ARM_AM::rrx) return false;
+    switch (Inst.getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::ANDrsi: newOpc = ARM::ANDrr; break;
+    case ARM::ORRrsi: newOpc = ARM::ORRrr; break;
+    case ARM::EORrsi: newOpc = ARM::EORrr; break;
+    case ARM::BICrsi: newOpc = ARM::BICrr; break;
+    case ARM::SUBrsi: newOpc = ARM::SUBrr; break;
+    case ARM::ADDrsi: newOpc = ARM::ADDrr; break;
+    }
+    // If the shift is by zero, use the non-shifted instruction definition.
+    // The exception is for right shifts, where 0 == 32.
+    if (ARM_AM::getSORegOffset(Inst.getOperand(3).getImm()) == 0 &&
+        !(SOpc == ARM_AM::lsr || SOpc == ARM_AM::asr)) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(newOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(4));
+      TmpInst.addOperand(Inst.getOperand(5));
+      TmpInst.addOperand(Inst.getOperand(6));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  case ARM::ITasm:
+  case ARM::t2IT: {
+    // The mask bits for all but the first condition are encoded relative
+    // to the low bit of the condition code: a mask bit equal to that low
+    // bit means 't'. The parser always builds the mask with 1 meaning 't',
+    // so when the low bit of the condition code is zero, XOR the condition
+    // bits (everything above the trailing terminator bit) to flip them.
+    // E.g. for "itt ge" (GE = 0b1010) the parsed mask 0b1100 becomes the
+    // encoded mask 0b0100.
+    MCOperand &MO = Inst.getOperand(1);
+    unsigned Mask = MO.getImm();
+    unsigned OrigMask = Mask;
+    unsigned TZ = countTrailingZeros(Mask);
+    if ((Inst.getOperand(0).getImm() & 1) == 0) {
+      assert(Mask && TZ <= 3 && "illegal IT mask value!");
+      Mask ^= (0xE << TZ) & 0xF;
+    }
+    MO.setImm(Mask);
+
+    // Set up the IT block state according to the IT instruction we just
+    // matched.
+    assert(!inITBlock() && "nested IT blocks?!");
+    ITState.Cond = ARMCC::CondCodes(Inst.getOperand(0).getImm());
+    ITState.Mask = OrigMask; // Use the original mask, not the updated one.
+    ITState.CurPosition = 0;
+    ITState.FirstCond = true;
+    break;
+  }
+  case ARM::t2LSLrr:
+  case ARM::t2LSRrr:
+  case ARM::t2ASRrr:
+  case ARM::t2SBCrr:
+  case ARM::t2RORrr:
+  case ARM::t2BICrr:
+  {
+    // Assemblers should use the narrow encodings of these instructions
+    // when permissible.
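+    // For illustration: outside an IT block the flag-setting
+    // "lsls r0, r0, r1" can narrow to tLSLrr; inside an IT block it is the
+    // non-flag-setting "lsl r0, r0, r1" that narrows instead, matching the
+    // cc_out test below.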
+ if ((isARMLowRegister(Inst.getOperand(1).getReg()) && + isARMLowRegister(Inst.getOperand(2).getReg())) && + Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && + ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || + (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && + (!static_cast<ARMOperand &>(*Operands[3]).isToken() || + !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower( + ".w"))) { + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case ARM::t2LSLrr: NewOpc = ARM::tLSLrr; break; + case ARM::t2LSRrr: NewOpc = ARM::tLSRrr; break; + case ARM::t2ASRrr: NewOpc = ARM::tASRrr; break; + case ARM::t2SBCrr: NewOpc = ARM::tSBC; break; + case ARM::t2RORrr: NewOpc = ARM::tROR; break; + case ARM::t2BICrr: NewOpc = ARM::tBIC; break; + } + MCInst TmpInst; + TmpInst.setOpcode(NewOpc); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(5)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + return false; + } + case ARM::t2ANDrr: + case ARM::t2EORrr: + case ARM::t2ADCrr: + case ARM::t2ORRrr: + { + // Assemblers should use the narrow encodings of these instructions when permissible. + // These instructions are special in that they are commutable, so shorter encodings + // are available more often. + if ((isARMLowRegister(Inst.getOperand(1).getReg()) && + isARMLowRegister(Inst.getOperand(2).getReg())) && + (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() || + Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) && + ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || + (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && + (!static_cast<ARMOperand &>(*Operands[3]).isToken() || + !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower( + ".w"))) { + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case ARM::t2ADCrr: NewOpc = ARM::tADC; break; + case ARM::t2ANDrr: NewOpc = ARM::tAND; break; + case ARM::t2EORrr: NewOpc = ARM::tEOR; break; + case ARM::t2ORRrr: NewOpc = ARM::tORR; break; + } + MCInst TmpInst; + TmpInst.setOpcode(NewOpc); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(5)); + if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) { + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(2)); + } else { + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(1)); + } + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + return false; + } + } + return false; +} + +unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) { + // 16-bit thumb arithmetic instructions either require or preclude the 'S' + // suffix depending on whether they're in an IT block or not. + unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &MCID = MII.get(Opc); + if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) { + assert(MCID.hasOptionalDef() && + "optionally flag setting instruction missing optional def operand"); + assert(MCID.NumOperands == Inst.getNumOperands() && + "operand count mismatch!"); + // Find the optional-def operand (cc_out). 
+ unsigned OpNo; + for (OpNo = 0; + !MCID.OpInfo[OpNo].isOptionalDef() && OpNo < MCID.NumOperands; + ++OpNo) + ; + // If we're parsing Thumb1, reject it completely. + if (isThumbOne() && Inst.getOperand(OpNo).getReg() != ARM::CPSR) + return Match_MnemonicFail; + // If we're parsing Thumb2, which form is legal depends on whether we're + // in an IT block. + if (isThumbTwo() && Inst.getOperand(OpNo).getReg() != ARM::CPSR && + !inITBlock()) + return Match_RequiresITBlock; + if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR && + inITBlock()) + return Match_RequiresNotITBlock; + } else if (isThumbOne()) { + // Some high-register supporting Thumb1 encodings only allow both registers + // to be from r0-r7 when in Thumb2. + if (Opc == ARM::tADDhirr && !hasV6MOps() && + isARMLowRegister(Inst.getOperand(1).getReg()) && + isARMLowRegister(Inst.getOperand(2).getReg())) + return Match_RequiresThumb2; + // Others only require ARMv6 or later. + else if (Opc == ARM::tMOVr && !hasV6Ops() && + isARMLowRegister(Inst.getOperand(0).getReg()) && + isARMLowRegister(Inst.getOperand(1).getReg())) + return Match_RequiresV6; + } + + for (unsigned I = 0; I < MCID.NumOperands; ++I) + if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) { + // rGPRRegClass excludes PC, and also excluded SP before ARMv8 + if ((Inst.getOperand(I).getReg() == ARM::SP) && !hasV8Ops()) + return Match_RequiresV8; + else if (Inst.getOperand(I).getReg() == ARM::PC) + return Match_InvalidOperand; + } + + return Match_Success; +} + +namespace llvm { +template <> inline bool IsCPSRDead<MCInst>(MCInst *Instr) { + return true; // In an assembly source, no need to second-guess +} +} + +static const char *getSubtargetFeatureName(uint64_t Val); +bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + MCInst Inst; + unsigned MatchResult; + + MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, + MatchingInlineAsm); + switch (MatchResult) { + case Match_Success: + // Context sensitive operand constraints aren't handled by the matcher, + // so check them here. + if (validateInstruction(Inst, Operands)) { + // Still progress the IT block, otherwise one wrong condition causes + // nasty cascading errors. + forwardITPosition(); + return true; + } + + { // processInstruction() updates inITBlock state, we need to save it away + bool wasInITBlock = inITBlock(); + + // Some instructions need post-processing to, for example, tweak which + // encoding is selected. Loop on it while changes happen so the + // individual transformations can chain off each other. E.g., + // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8) + while (processInstruction(Inst, Operands, Out)) + ; + + // Only after the instruction is fully processed, we can validate it + if (wasInITBlock && hasV8Ops() && isThumb() && + !isV8EligibleForIT(&Inst)) { + Warning(IDLoc, "deprecated instruction in IT block"); + } + } + + // Only move forward at the very end so that everything in validate + // and process gets a consistent answer about whether we're in an IT + // block. + forwardITPosition(); + + // ITasm is an ARM mode pseudo-instruction that just sets the ITblock and + // doesn't actually encode. 
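+    // Returning false from here reports success to the caller; for ITasm
+    // only the IT-block bookkeeping done in processInstruction() survives,
+    // and no bytes are emitted.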
+ if (Inst.getOpcode() == ARM::ITasm) + return false; + + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, getSTI()); + return false; + case Match_MissingFeature: { + assert(ErrorInfo && "Unknown missing feature!"); + // Special case the error message for the very common case where only + // a single subtarget feature is missing (Thumb vs. ARM, e.g.). + std::string Msg = "instruction requires:"; + uint64_t Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) { + Msg += " "; + Msg += getSubtargetFeatureName(ErrorInfo & Mask); + } + Mask <<= 1; + } + return Error(IDLoc, Msg); + } + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0ULL) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + } + + return Error(ErrorLoc, "invalid operand for instruction"); + } + case Match_MnemonicFail: + return Error(IDLoc, "invalid instruction", + ((ARMOperand &)*Operands[0]).getLocRange()); + case Match_RequiresNotITBlock: + return Error(IDLoc, "flag setting instruction only valid outside IT block"); + case Match_RequiresITBlock: + return Error(IDLoc, "instruction only valid inside IT block"); + case Match_RequiresV6: + return Error(IDLoc, "instruction variant requires ARMv6 or later"); + case Match_RequiresThumb2: + return Error(IDLoc, "instruction variant requires Thumb2"); + case Match_RequiresV8: + return Error(IDLoc, "instruction variant requires ARMv8 or later"); + case Match_ImmRange0_15: { + SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate operand must be in the range [0,15]"); + } + case Match_ImmRange0_239: { + SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate operand must be in the range [0,239]"); + } + case Match_AlignedMemoryRequiresNone: + case Match_DupAlignedMemoryRequiresNone: + case Match_AlignedMemoryRequires16: + case Match_DupAlignedMemoryRequires16: + case Match_AlignedMemoryRequires32: + case Match_DupAlignedMemoryRequires32: + case Match_AlignedMemoryRequires64: + case Match_DupAlignedMemoryRequires64: + case Match_AlignedMemoryRequires64or128: + case Match_DupAlignedMemoryRequires64or128: + case Match_AlignedMemoryRequires64or128or256: + { + SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getAlignmentLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + switch (MatchResult) { + default: + llvm_unreachable("Missing Match_Aligned type"); + case Match_AlignedMemoryRequiresNone: + case Match_DupAlignedMemoryRequiresNone: + return Error(ErrorLoc, "alignment must be omitted"); + case Match_AlignedMemoryRequires16: + case Match_DupAlignedMemoryRequires16: + return Error(ErrorLoc, "alignment must be 16 or omitted"); + case Match_AlignedMemoryRequires32: + case Match_DupAlignedMemoryRequires32: + return Error(ErrorLoc, "alignment must be 32 or omitted"); + case Match_AlignedMemoryRequires64: + case Match_DupAlignedMemoryRequires64: + return Error(ErrorLoc, "alignment must be 64 or omitted"); + case Match_AlignedMemoryRequires64or128: + case Match_DupAlignedMemoryRequires64or128: + return Error(ErrorLoc, "alignment must be 64, 128 or omitted"); + case Match_AlignedMemoryRequires64or128or256: + return Error(ErrorLoc, "alignment must be 64, 128, 256 or omitted"); + } + } + } 
+ + llvm_unreachable("Implement any new match types added!"); +} + +/// parseDirective parses the arm specific directives +bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { + const MCObjectFileInfo::Environment Format = + getContext().getObjectFileInfo()->getObjectFileType(); + bool IsMachO = Format == MCObjectFileInfo::IsMachO; + bool IsCOFF = Format == MCObjectFileInfo::IsCOFF; + + StringRef IDVal = DirectiveID.getIdentifier(); + if (IDVal == ".word") + return parseLiteralValues(4, DirectiveID.getLoc()); + else if (IDVal == ".short" || IDVal == ".hword") + return parseLiteralValues(2, DirectiveID.getLoc()); + else if (IDVal == ".thumb") + return parseDirectiveThumb(DirectiveID.getLoc()); + else if (IDVal == ".arm") + return parseDirectiveARM(DirectiveID.getLoc()); + else if (IDVal == ".thumb_func") + return parseDirectiveThumbFunc(DirectiveID.getLoc()); + else if (IDVal == ".code") + return parseDirectiveCode(DirectiveID.getLoc()); + else if (IDVal == ".syntax") + return parseDirectiveSyntax(DirectiveID.getLoc()); + else if (IDVal == ".unreq") + return parseDirectiveUnreq(DirectiveID.getLoc()); + else if (IDVal == ".fnend") + return parseDirectiveFnEnd(DirectiveID.getLoc()); + else if (IDVal == ".cantunwind") + return parseDirectiveCantUnwind(DirectiveID.getLoc()); + else if (IDVal == ".personality") + return parseDirectivePersonality(DirectiveID.getLoc()); + else if (IDVal == ".handlerdata") + return parseDirectiveHandlerData(DirectiveID.getLoc()); + else if (IDVal == ".setfp") + return parseDirectiveSetFP(DirectiveID.getLoc()); + else if (IDVal == ".pad") + return parseDirectivePad(DirectiveID.getLoc()); + else if (IDVal == ".save") + return parseDirectiveRegSave(DirectiveID.getLoc(), false); + else if (IDVal == ".vsave") + return parseDirectiveRegSave(DirectiveID.getLoc(), true); + else if (IDVal == ".ltorg" || IDVal == ".pool") + return parseDirectiveLtorg(DirectiveID.getLoc()); + else if (IDVal == ".even") + return parseDirectiveEven(DirectiveID.getLoc()); + else if (IDVal == ".personalityindex") + return parseDirectivePersonalityIndex(DirectiveID.getLoc()); + else if (IDVal == ".unwind_raw") + return parseDirectiveUnwindRaw(DirectiveID.getLoc()); + else if (IDVal == ".movsp") + return parseDirectiveMovSP(DirectiveID.getLoc()); + else if (IDVal == ".arch_extension") + return parseDirectiveArchExtension(DirectiveID.getLoc()); + else if (IDVal == ".align") + return parseDirectiveAlign(DirectiveID.getLoc()); + else if (IDVal == ".thumb_set") + return parseDirectiveThumbSet(DirectiveID.getLoc()); + + if (!IsMachO && !IsCOFF) { + if (IDVal == ".arch") + return parseDirectiveArch(DirectiveID.getLoc()); + else if (IDVal == ".cpu") + return parseDirectiveCPU(DirectiveID.getLoc()); + else if (IDVal == ".eabi_attribute") + return parseDirectiveEabiAttr(DirectiveID.getLoc()); + else if (IDVal == ".fpu") + return parseDirectiveFPU(DirectiveID.getLoc()); + else if (IDVal == ".fnstart") + return parseDirectiveFnStart(DirectiveID.getLoc()); + else if (IDVal == ".inst") + return parseDirectiveInst(DirectiveID.getLoc()); + else if (IDVal == ".inst.n") + return parseDirectiveInst(DirectiveID.getLoc(), 'n'); + else if (IDVal == ".inst.w") + return parseDirectiveInst(DirectiveID.getLoc(), 'w'); + else if (IDVal == ".object_arch") + return parseDirectiveObjectArch(DirectiveID.getLoc()); + else if (IDVal == ".tlsdescseq") + return parseDirectiveTLSDescSeq(DirectiveID.getLoc()); + } + + return true; +} + +/// parseLiteralValues +/// ::= .hword expression [, expression]* +/// ::= .short expression 
[, expression]*
+/// ::= .word expression [, expression]*
+bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value)) {
+        Parser.eatToEndOfStatement();
+        return false;
+      }
+
+      getParser().getStreamer().EmitValue(Value, Size, L);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma)) {
+        Error(L, "unexpected token in directive");
+        return false;
+      }
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// parseDirectiveThumb
+/// ::= .thumb
+bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+  Parser.Lex();
+
+  if (!hasThumb()) {
+    Error(L, "target does not support Thumb mode");
+    return false;
+  }
+
+  if (!isThumb())
+    SwitchMode();
+
+  getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+  return false;
+}
+
+/// parseDirectiveARM
+/// ::= .arm
+bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+  Parser.Lex();
+
+  if (!hasARM()) {
+    Error(L, "target does not support ARM mode");
+    return false;
+  }
+
+  if (isThumb())
+    SwitchMode();
+
+  getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+  return false;
+}
+
+void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
+  if (NextSymbolIsThumb) {
+    getParser().getStreamer().EmitThumbFunc(Symbol);
+    NextSymbolIsThumb = false;
+  }
+}
+
+/// parseDirectiveThumbFunc
+/// ::= .thumb_func symbol_name
+bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  const auto Format = getContext().getObjectFileInfo()->getObjectFileType();
+  bool IsMachO = Format == MCObjectFileInfo::IsMachO;
+
+  // Darwin asm has an (optional) function name after the .thumb_func
+  // directive; ELF doesn't.
+  if (IsMachO) {
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.isNot(AsmToken::EndOfStatement)) {
+      if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String)) {
+        Error(L, "unexpected token in .thumb_func directive");
+        return false;
+      }
+
+      MCSymbol *Func =
+          getParser().getContext().getOrCreateSymbol(Tok.getIdentifier());
+      getParser().getStreamer().EmitThumbFunc(Func);
+      Parser.Lex(); // Consume the identifier token.
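+      // The explicit-symbol (Darwin) form is fully handled at this point;
+      // the fall-through path below instead defers the marking to the next
+      // label parsed, via NextSymbolIsThumb and onLabelParsed() above.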
+      return false;
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(Parser.getTok().getLoc(), "unexpected token in directive");
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  NextSymbolIsThumb = true;
+  return false;
+}
+
+/// parseDirectiveSyntax
+/// ::= .syntax unified | divided
+bool ARMAsmParser::parseDirectiveSyntax(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(L, "unexpected token in .syntax directive");
+    return false;
+  }
+
+  StringRef Mode = Tok.getString();
+  if (Mode == "unified" || Mode == "UNIFIED") {
+    Parser.Lex();
+  } else if (Mode == "divided" || Mode == "DIVIDED") {
+    Error(L, "'.syntax divided' arm assembly not supported");
+    return false;
+  } else {
+    Error(L, "unrecognized syntax mode in .syntax directive");
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(Parser.getTok().getLoc(), "unexpected token in directive");
+    return false;
+  }
+  Parser.Lex();
+
+  // TODO: tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// parseDirectiveCode
+/// ::= .code 16 | 32
+bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Integer)) {
+    Error(L, "unexpected token in .code directive");
+    return false;
+  }
+  int64_t Val = Parser.getTok().getIntVal();
+  if (Val != 16 && Val != 32) {
+    Error(L, "invalid operand to .code directive");
+    return false;
+  }
+  Parser.Lex();
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(Parser.getTok().getLoc(), "unexpected token in directive");
+    return false;
+  }
+  Parser.Lex();
+
+  if (Val == 16) {
+    if (!hasThumb()) {
+      Error(L, "target does not support Thumb mode");
+      return false;
+    }
+
+    if (!isThumb())
+      SwitchMode();
+    getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+  } else {
+    if (!hasARM()) {
+      Error(L, "target does not support ARM mode");
+      return false;
+    }
+
+    if (isThumb())
+      SwitchMode();
+    getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+  }
+
+  return false;
+}
+
+/// parseDirectiveReq
+/// ::= name .req registername
+bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat the '.req' token.
+  unsigned Reg;
+  SMLoc SRegLoc, ERegLoc;
+  if (ParseRegister(Reg, SRegLoc, ERegLoc)) {
+    Parser.eatToEndOfStatement();
+    Error(SRegLoc, "register name expected");
+    return false;
+  }
+
+  // Shouldn't be anything else.
+  if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+    Parser.eatToEndOfStatement();
+    Error(Parser.getTok().getLoc(), "unexpected input in .req directive.");
+    return false;
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+
+  if (RegisterReqs.insert(std::make_pair(Name, Reg)).first->second != Reg) {
+    Error(SRegLoc, "redefinition of '" + Name + "' does not match original.");
+    return false;
+  }
+
+  return false;
+}
+
+/// parseDirectiveUnreq
+/// ::= .unreq registername
+bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    Parser.eatToEndOfStatement();
+    Error(L, "unexpected input in .unreq directive.");
+    return false;
+  }
+  RegisterReqs.erase(Parser.getTok().getIdentifier().lower());
+  Parser.Lex(); // Eat the identifier.
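+  // (Assumption worth stating: alias names are keyed in lower case, matching
+  // the case-insensitive lookup performed when registers are parsed, hence
+  // the .lower() above.)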
+ return false; +} + +/// parseDirectiveArch +/// ::= .arch token +bool ARMAsmParser::parseDirectiveArch(SMLoc L) { + StringRef Arch = getParser().parseStringToEndOfStatement().trim(); + + unsigned ID = ARM::parseArch(Arch); + + if (ID == ARM::AK_INVALID) { + Error(L, "Unknown arch name"); + return false; + } + + Triple T; + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures("", ("+" + ARM::getArchName(ID)).str()); + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + + getTargetStreamer().emitArch(ID); + return false; +} + +/// parseDirectiveEabiAttr +/// ::= .eabi_attribute int, int [, "str"] +/// ::= .eabi_attribute Tag_name, int [, "str"] +bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) { + MCAsmParser &Parser = getParser(); + int64_t Tag; + SMLoc TagLoc; + TagLoc = Parser.getTok().getLoc(); + if (Parser.getTok().is(AsmToken::Identifier)) { + StringRef Name = Parser.getTok().getIdentifier(); + Tag = ARMBuildAttrs::AttrTypeFromString(Name); + if (Tag == -1) { + Error(TagLoc, "attribute name not recognised: " + Name); + Parser.eatToEndOfStatement(); + return false; + } + Parser.Lex(); + } else { + const MCExpr *AttrExpr; + + TagLoc = Parser.getTok().getLoc(); + if (Parser.parseExpression(AttrExpr)) { + Parser.eatToEndOfStatement(); + return false; + } + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(AttrExpr); + if (!CE) { + Error(TagLoc, "expected numeric constant"); + Parser.eatToEndOfStatement(); + return false; + } + + Tag = CE->getValue(); + } + + if (Parser.getTok().isNot(AsmToken::Comma)) { + Error(Parser.getTok().getLoc(), "comma expected"); + Parser.eatToEndOfStatement(); + return false; + } + Parser.Lex(); // skip comma + + StringRef StringValue = ""; + bool IsStringValue = false; + + int64_t IntegerValue = 0; + bool IsIntegerValue = false; + + if (Tag == ARMBuildAttrs::CPU_raw_name || Tag == ARMBuildAttrs::CPU_name) + IsStringValue = true; + else if (Tag == ARMBuildAttrs::compatibility) { + IsStringValue = true; + IsIntegerValue = true; + } else if (Tag < 32 || Tag % 2 == 0) + IsIntegerValue = true; + else if (Tag % 2 == 1) + IsStringValue = true; + else + llvm_unreachable("invalid tag type"); + + if (IsIntegerValue) { + const MCExpr *ValueExpr; + SMLoc ValueExprLoc = Parser.getTok().getLoc(); + if (Parser.parseExpression(ValueExpr)) { + Parser.eatToEndOfStatement(); + return false; + } + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ValueExpr); + if (!CE) { + Error(ValueExprLoc, "expected numeric constant"); + Parser.eatToEndOfStatement(); + return false; + } + + IntegerValue = CE->getValue(); + } + + if (Tag == ARMBuildAttrs::compatibility) { + if (Parser.getTok().isNot(AsmToken::Comma)) + IsStringValue = false; + if (Parser.getTok().isNot(AsmToken::Comma)) { + Error(Parser.getTok().getLoc(), "comma expected"); + Parser.eatToEndOfStatement(); + return false; + } else { + Parser.Lex(); + } + } + + if (IsStringValue) { + if (Parser.getTok().isNot(AsmToken::String)) { + Error(Parser.getTok().getLoc(), "bad string constant"); + Parser.eatToEndOfStatement(); + return false; + } + + StringValue = Parser.getTok().getStringContents(); + Parser.Lex(); + } + + if (IsIntegerValue && IsStringValue) { + assert(Tag == ARMBuildAttrs::compatibility); + getTargetStreamer().emitIntTextAttribute(Tag, IntegerValue, StringValue); + } else if (IsIntegerValue) + getTargetStreamer().emitAttribute(Tag, IntegerValue); + else if (IsStringValue) + getTargetStreamer().emitTextAttribute(Tag, StringValue); + return false; +} + +/// parseDirectiveCPU +/// ::= 
.cpu str +bool ARMAsmParser::parseDirectiveCPU(SMLoc L) { + StringRef CPU = getParser().parseStringToEndOfStatement().trim(); + getTargetStreamer().emitTextAttribute(ARMBuildAttrs::CPU_name, CPU); + + // FIXME: This is using table-gen data, but should be moved to + // ARMTargetParser once that is table-gen'd. + if (!getSTI().isCPUStringValid(CPU)) { + Error(L, "Unknown CPU name"); + return false; + } + + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures(CPU, ""); + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + + return false; +} +/// parseDirectiveFPU +/// ::= .fpu str +bool ARMAsmParser::parseDirectiveFPU(SMLoc L) { + SMLoc FPUNameLoc = getTok().getLoc(); + StringRef FPU = getParser().parseStringToEndOfStatement().trim(); + + unsigned ID = ARM::parseFPU(FPU); + std::vector<const char *> Features; + if (!ARM::getFPUFeatures(ID, Features)) { + Error(FPUNameLoc, "Unknown FPU name"); + return false; + } + + MCSubtargetInfo &STI = copySTI(); + for (auto Feature : Features) + STI.ApplyFeatureFlag(Feature); + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + + getTargetStreamer().emitFPU(ID); + return false; +} + +/// parseDirectiveFnStart +/// ::= .fnstart +bool ARMAsmParser::parseDirectiveFnStart(SMLoc L) { + if (UC.hasFnStart()) { + Error(L, ".fnstart starts before the end of previous one"); + UC.emitFnStartLocNotes(); + return false; + } + + // Reset the unwind directives parser state + UC.reset(); + + getTargetStreamer().emitFnStart(); + + UC.recordFnStart(L); + return false; +} + +/// parseDirectiveFnEnd +/// ::= .fnend +bool ARMAsmParser::parseDirectiveFnEnd(SMLoc L) { + // Check the ordering of unwind directives + if (!UC.hasFnStart()) { + Error(L, ".fnstart must precede .fnend directive"); + return false; + } + + // Reset the unwind directives parser state + getTargetStreamer().emitFnEnd(); + + UC.reset(); + return false; +} + +/// parseDirectiveCantUnwind +/// ::= .cantunwind +bool ARMAsmParser::parseDirectiveCantUnwind(SMLoc L) { + UC.recordCantUnwind(L); + + // Check the ordering of unwind directives + if (!UC.hasFnStart()) { + Error(L, ".fnstart must precede .cantunwind directive"); + return false; + } + if (UC.hasHandlerData()) { + Error(L, ".cantunwind can't be used with .handlerdata directive"); + UC.emitHandlerDataLocNotes(); + return false; + } + if (UC.hasPersonality()) { + Error(L, ".cantunwind can't be used with .personality directive"); + UC.emitPersonalityLocNotes(); + return false; + } + + getTargetStreamer().emitCantUnwind(); + return false; +} + +/// parseDirectivePersonality +/// ::= .personality name +bool ARMAsmParser::parseDirectivePersonality(SMLoc L) { + MCAsmParser &Parser = getParser(); + bool HasExistingPersonality = UC.hasPersonality(); + + UC.recordPersonality(L); + + // Check the ordering of unwind directives + if (!UC.hasFnStart()) { + Error(L, ".fnstart must precede .personality directive"); + return false; + } + if (UC.cantUnwind()) { + Error(L, ".personality can't be used with .cantunwind directive"); + UC.emitCantUnwindLocNotes(); + return false; + } + if (UC.hasHandlerData()) { + Error(L, ".personality must precede .handlerdata directive"); + UC.emitHandlerDataLocNotes(); + return false; + } + if (HasExistingPersonality) { + Parser.eatToEndOfStatement(); + Error(L, "multiple personality directives"); + UC.emitPersonalityLocNotes(); + return false; + } + + // Parse the name of the personality routine + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Parser.eatToEndOfStatement(); + Error(L, 
"unexpected input in .personality directive."); + return false; + } + StringRef Name(Parser.getTok().getIdentifier()); + Parser.Lex(); + + MCSymbol *PR = getParser().getContext().getOrCreateSymbol(Name); + getTargetStreamer().emitPersonality(PR); + return false; +} + +/// parseDirectiveHandlerData +/// ::= .handlerdata +bool ARMAsmParser::parseDirectiveHandlerData(SMLoc L) { + UC.recordHandlerData(L); + + // Check the ordering of unwind directives + if (!UC.hasFnStart()) { + Error(L, ".fnstart must precede .personality directive"); + return false; + } + if (UC.cantUnwind()) { + Error(L, ".handlerdata can't be used with .cantunwind directive"); + UC.emitCantUnwindLocNotes(); + return false; + } + + getTargetStreamer().emitHandlerData(); + return false; +} + +/// parseDirectiveSetFP +/// ::= .setfp fpreg, spreg [, offset] +bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) { + MCAsmParser &Parser = getParser(); + // Check the ordering of unwind directives + if (!UC.hasFnStart()) { + Error(L, ".fnstart must precede .setfp directive"); + return false; + } + if (UC.hasHandlerData()) { + Error(L, ".setfp must precede .handlerdata directive"); + return false; + } + + // Parse fpreg + SMLoc FPRegLoc = Parser.getTok().getLoc(); + int FPReg = tryParseRegister(); + if (FPReg == -1) { + Error(FPRegLoc, "frame pointer register expected"); + return false; + } + + // Consume comma + if (Parser.getTok().isNot(AsmToken::Comma)) { + Error(Parser.getTok().getLoc(), "comma expected"); + return false; + } + Parser.Lex(); // skip comma + + // Parse spreg + SMLoc SPRegLoc = Parser.getTok().getLoc(); + int SPReg = tryParseRegister(); + if (SPReg == -1) { + Error(SPRegLoc, "stack pointer register expected"); + return false; + } + + if (SPReg != ARM::SP && SPReg != UC.getFPReg()) { + Error(SPRegLoc, "register should be either $sp or the latest fp register"); + return false; + } + + // Update the frame pointer register + UC.saveFPReg(FPReg); + + // Parse offset + int64_t Offset = 0; + if (Parser.getTok().is(AsmToken::Comma)) { + Parser.Lex(); // skip comma + + if (Parser.getTok().isNot(AsmToken::Hash) && + Parser.getTok().isNot(AsmToken::Dollar)) { + Error(Parser.getTok().getLoc(), "'#' expected"); + return false; + } + Parser.Lex(); // skip hash token. + + const MCExpr *OffsetExpr; + SMLoc ExLoc = Parser.getTok().getLoc(); + SMLoc EndLoc; + if (getParser().parseExpression(OffsetExpr, EndLoc)) { + Error(ExLoc, "malformed setfp offset"); + return false; + } + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr); + if (!CE) { + Error(ExLoc, "setfp offset must be an immediate"); + return false; + } + + Offset = CE->getValue(); + } + + getTargetStreamer().emitSetFP(static_cast<unsigned>(FPReg), + static_cast<unsigned>(SPReg), Offset); + return false; +} + +/// parseDirective +/// ::= .pad offset +bool ARMAsmParser::parseDirectivePad(SMLoc L) { + MCAsmParser &Parser = getParser(); + // Check the ordering of unwind directives + if (!UC.hasFnStart()) { + Error(L, ".fnstart must precede .pad directive"); + return false; + } + if (UC.hasHandlerData()) { + Error(L, ".pad must precede .handlerdata directive"); + return false; + } + + // Parse the offset + if (Parser.getTok().isNot(AsmToken::Hash) && + Parser.getTok().isNot(AsmToken::Dollar)) { + Error(Parser.getTok().getLoc(), "'#' expected"); + return false; + } + Parser.Lex(); // skip hash token. 
+ + const MCExpr *OffsetExpr; + SMLoc ExLoc = Parser.getTok().getLoc(); + SMLoc EndLoc; + if (getParser().parseExpression(OffsetExpr, EndLoc)) { + Error(ExLoc, "malformed pad offset"); + return false; + } + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr); + if (!CE) { + Error(ExLoc, "pad offset must be an immediate"); + return false; + } + + getTargetStreamer().emitPad(CE->getValue()); + return false; +} + +/// parseDirectiveRegSave +/// ::= .save { registers } +/// ::= .vsave { registers } +bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) { + // Check the ordering of unwind directives + if (!UC.hasFnStart()) { + Error(L, ".fnstart must precede .save or .vsave directives"); + return false; + } + if (UC.hasHandlerData()) { + Error(L, ".save or .vsave must precede .handlerdata directive"); + return false; + } + + // RAII object to make sure parsed operands are deleted. + SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands; + + // Parse the register list + if (parseRegisterList(Operands)) + return false; + ARMOperand &Op = (ARMOperand &)*Operands[0]; + if (!IsVector && !Op.isRegList()) { + Error(L, ".save expects GPR registers"); + return false; + } + if (IsVector && !Op.isDPRRegList()) { + Error(L, ".vsave expects DPR registers"); + return false; + } + + getTargetStreamer().emitRegSave(Op.getRegList(), IsVector); + return false; +} + +/// parseDirectiveInst +/// ::= .inst opcode [, ...] +/// ::= .inst.n opcode [, ...] +/// ::= .inst.w opcode [, ...] +bool ARMAsmParser::parseDirectiveInst(SMLoc Loc, char Suffix) { + MCAsmParser &Parser = getParser(); + int Width; + + if (isThumb()) { + switch (Suffix) { + case 'n': + Width = 2; + break; + case 'w': + Width = 4; + break; + default: + Parser.eatToEndOfStatement(); + Error(Loc, "cannot determine Thumb instruction size, " + "use inst.n/inst.w instead"); + return false; + } + } else { + if (Suffix) { + Parser.eatToEndOfStatement(); + Error(Loc, "width suffixes are invalid in ARM mode"); + return false; + } + Width = 4; + } + + if (getLexer().is(AsmToken::EndOfStatement)) { + Parser.eatToEndOfStatement(); + Error(Loc, "expected expression following directive"); + return false; + } + + for (;;) { + const MCExpr *Expr; + + if (getParser().parseExpression(Expr)) { + Error(Loc, "expected expression"); + return false; + } + + const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr); + if (!Value) { + Error(Loc, "expected constant expression"); + return false; + } + + switch (Width) { + case 2: + if (Value->getValue() > 0xffff) { + Error(Loc, "inst.n operand is too big, use inst.w instead"); + return false; + } + break; + case 4: + if (Value->getValue() > 0xffffffff) { + Error(Loc, + StringRef(Suffix ? 
"inst.w" : "inst") + " operand is too big"); + return false; + } + break; + default: + llvm_unreachable("only supported widths are 2 and 4"); + } + + getTargetStreamer().emitInst(Value->getValue(), Suffix); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + if (getLexer().isNot(AsmToken::Comma)) { + Error(Loc, "unexpected token in directive"); + return false; + } + + Parser.Lex(); + } + + Parser.Lex(); + return false; +} + +/// parseDirectiveLtorg +/// ::= .ltorg | .pool +bool ARMAsmParser::parseDirectiveLtorg(SMLoc L) { + getTargetStreamer().emitCurrentConstantPool(); + return false; +} + +bool ARMAsmParser::parseDirectiveEven(SMLoc L) { + const MCSection *Section = getStreamer().getCurrentSection().first; + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + TokError("unexpected token in directive"); + return false; + } + + if (!Section) { + getStreamer().InitSections(false); + Section = getStreamer().getCurrentSection().first; + } + + assert(Section && "must have section to emit alignment"); + if (Section->UseCodeAlign()) + getStreamer().EmitCodeAlignment(2); + else + getStreamer().EmitValueToAlignment(2); + + return false; +} + +/// parseDirectivePersonalityIndex +/// ::= .personalityindex index +bool ARMAsmParser::parseDirectivePersonalityIndex(SMLoc L) { + MCAsmParser &Parser = getParser(); + bool HasExistingPersonality = UC.hasPersonality(); + + UC.recordPersonalityIndex(L); + + if (!UC.hasFnStart()) { + Parser.eatToEndOfStatement(); + Error(L, ".fnstart must precede .personalityindex directive"); + return false; + } + if (UC.cantUnwind()) { + Parser.eatToEndOfStatement(); + Error(L, ".personalityindex cannot be used with .cantunwind"); + UC.emitCantUnwindLocNotes(); + return false; + } + if (UC.hasHandlerData()) { + Parser.eatToEndOfStatement(); + Error(L, ".personalityindex must precede .handlerdata directive"); + UC.emitHandlerDataLocNotes(); + return false; + } + if (HasExistingPersonality) { + Parser.eatToEndOfStatement(); + Error(L, "multiple personality directives"); + UC.emitPersonalityLocNotes(); + return false; + } + + const MCExpr *IndexExpression; + SMLoc IndexLoc = Parser.getTok().getLoc(); + if (Parser.parseExpression(IndexExpression)) { + Parser.eatToEndOfStatement(); + return false; + } + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(IndexExpression); + if (!CE) { + Parser.eatToEndOfStatement(); + Error(IndexLoc, "index must be a constant number"); + return false; + } + if (CE->getValue() < 0 || + CE->getValue() >= ARM::EHABI::NUM_PERSONALITY_INDEX) { + Parser.eatToEndOfStatement(); + Error(IndexLoc, "personality routine index should be in range [0-3]"); + return false; + } + + getTargetStreamer().emitPersonalityIndex(CE->getValue()); + return false; +} + +/// parseDirectiveUnwindRaw +/// ::= .unwind_raw offset, opcode [, opcode...] 
+bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) { + MCAsmParser &Parser = getParser(); + if (!UC.hasFnStart()) { + Parser.eatToEndOfStatement(); + Error(L, ".fnstart must precede .unwind_raw directives"); + return false; + } + + int64_t StackOffset; + + const MCExpr *OffsetExpr; + SMLoc OffsetLoc = getLexer().getLoc(); + if (getLexer().is(AsmToken::EndOfStatement) || + getParser().parseExpression(OffsetExpr)) { + Error(OffsetLoc, "expected expression"); + Parser.eatToEndOfStatement(); + return false; + } + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr); + if (!CE) { + Error(OffsetLoc, "offset must be a constant"); + Parser.eatToEndOfStatement(); + return false; + } + + StackOffset = CE->getValue(); + + if (getLexer().isNot(AsmToken::Comma)) { + Error(getLexer().getLoc(), "expected comma"); + Parser.eatToEndOfStatement(); + return false; + } + Parser.Lex(); + + SmallVector<uint8_t, 16> Opcodes; + for (;;) { + const MCExpr *OE; + + SMLoc OpcodeLoc = getLexer().getLoc(); + if (getLexer().is(AsmToken::EndOfStatement) || Parser.parseExpression(OE)) { + Error(OpcodeLoc, "expected opcode expression"); + Parser.eatToEndOfStatement(); + return false; + } + + const MCConstantExpr *OC = dyn_cast<MCConstantExpr>(OE); + if (!OC) { + Error(OpcodeLoc, "opcode value must be a constant"); + Parser.eatToEndOfStatement(); + return false; + } + + const int64_t Opcode = OC->getValue(); + if (Opcode & ~0xff) { + Error(OpcodeLoc, "invalid opcode"); + Parser.eatToEndOfStatement(); + return false; + } + + Opcodes.push_back(uint8_t(Opcode)); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + if (getLexer().isNot(AsmToken::Comma)) { + Error(getLexer().getLoc(), "unexpected token in directive"); + Parser.eatToEndOfStatement(); + return false; + } + + Parser.Lex(); + } + + getTargetStreamer().emitUnwindRaw(StackOffset, Opcodes); + + Parser.Lex(); + return false; +} + +/// parseDirectiveTLSDescSeq +/// ::= .tlsdescseq tls-variable +bool ARMAsmParser::parseDirectiveTLSDescSeq(SMLoc L) { + MCAsmParser &Parser = getParser(); + + if (getLexer().isNot(AsmToken::Identifier)) { + TokError("expected variable after '.tlsdescseq' directive"); + Parser.eatToEndOfStatement(); + return false; + } + + const MCSymbolRefExpr *SRE = + MCSymbolRefExpr::create(Parser.getTok().getIdentifier(), + MCSymbolRefExpr::VK_ARM_TLSDESCSEQ, getContext()); + Lex(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Error(Parser.getTok().getLoc(), "unexpected token"); + Parser.eatToEndOfStatement(); + return false; + } + + getTargetStreamer().AnnotateTLSDescriptorSequence(SRE); + return false; +} + +/// parseDirectiveMovSP +/// ::= .movsp reg [, #offset] +bool ARMAsmParser::parseDirectiveMovSP(SMLoc L) { + MCAsmParser &Parser = getParser(); + if (!UC.hasFnStart()) { + Parser.eatToEndOfStatement(); + Error(L, ".fnstart must precede .movsp directives"); + return false; + } + if (UC.getFPReg() != ARM::SP) { + Parser.eatToEndOfStatement(); + Error(L, "unexpected .movsp directive"); + return false; + } + + SMLoc SPRegLoc = Parser.getTok().getLoc(); + int SPReg = tryParseRegister(); + if (SPReg == -1) { + Parser.eatToEndOfStatement(); + Error(SPRegLoc, "register expected"); + return false; + } + + if (SPReg == ARM::SP || SPReg == ARM::PC) { + Parser.eatToEndOfStatement(); + Error(SPRegLoc, "sp and pc are not permitted in .movsp directive"); + return false; + } + + int64_t Offset = 0; + if (Parser.getTok().is(AsmToken::Comma)) { + Parser.Lex(); + + if (Parser.getTok().isNot(AsmToken::Hash)) { + 
Error(Parser.getTok().getLoc(), "expected #constant"); + Parser.eatToEndOfStatement(); + return false; + } + Parser.Lex(); + + const MCExpr *OffsetExpr; + SMLoc OffsetLoc = Parser.getTok().getLoc(); + if (Parser.parseExpression(OffsetExpr)) { + Parser.eatToEndOfStatement(); + Error(OffsetLoc, "malformed offset expression"); + return false; + } + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr); + if (!CE) { + Parser.eatToEndOfStatement(); + Error(OffsetLoc, "offset must be an immediate constant"); + return false; + } + + Offset = CE->getValue(); + } + + getTargetStreamer().emitMovSP(SPReg, Offset); + UC.saveFPReg(SPReg); + + return false; +} + +/// parseDirectiveObjectArch +/// ::= .object_arch name +bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) { + MCAsmParser &Parser = getParser(); + if (getLexer().isNot(AsmToken::Identifier)) { + Error(getLexer().getLoc(), "unexpected token"); + Parser.eatToEndOfStatement(); + return false; + } + + StringRef Arch = Parser.getTok().getString(); + SMLoc ArchLoc = Parser.getTok().getLoc(); + getLexer().Lex(); + + unsigned ID = ARM::parseArch(Arch); + + if (ID == ARM::AK_INVALID) { + Error(ArchLoc, "unknown architecture '" + Arch + "'"); + Parser.eatToEndOfStatement(); + return false; + } + + getTargetStreamer().emitObjectArch(ID); + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Error(getLexer().getLoc(), "unexpected token"); + Parser.eatToEndOfStatement(); + } + + return false; +} + +/// parseDirectiveAlign +/// ::= .align +bool ARMAsmParser::parseDirectiveAlign(SMLoc L) { + // NOTE: if this is not the end of the statement, fall back to the target + // agnostic handling for this directive which will correctly handle this. + if (getLexer().isNot(AsmToken::EndOfStatement)) + return true; + + // '.align' is target specifically handled to mean 2**2 byte alignment. + if (getStreamer().getCurrentSection().first->UseCodeAlign()) + getStreamer().EmitCodeAlignment(4, 0); + else + getStreamer().EmitValueToAlignment(4, 0, 1, 0); + + return false; +} + +/// parseDirectiveThumbSet +/// ::= .thumb_set name, value +bool ARMAsmParser::parseDirectiveThumbSet(SMLoc L) { + MCAsmParser &Parser = getParser(); + + StringRef Name; + if (Parser.parseIdentifier(Name)) { + TokError("expected identifier after '.thumb_set'"); + Parser.eatToEndOfStatement(); + return false; + } + + if (getLexer().isNot(AsmToken::Comma)) { + TokError("expected comma after name '" + Name + "'"); + Parser.eatToEndOfStatement(); + return false; + } + Lex(); + + MCSymbol *Sym; + const MCExpr *Value; + if (MCParserUtils::parseAssignmentExpression(Name, /* allow_redef */ true, + Parser, Sym, Value)) + return true; + + getTargetStreamer().emitThumbSet(Sym, Value); + return false; +} + +/// Force static initialization. +extern "C" void LLVMInitializeARMAsmParser() { + RegisterMCAsmParser<ARMAsmParser> X(TheARMLETarget); + RegisterMCAsmParser<ARMAsmParser> Y(TheARMBETarget); + RegisterMCAsmParser<ARMAsmParser> A(TheThumbLETarget); + RegisterMCAsmParser<ARMAsmParser> B(TheThumbBETarget); +} + +#define GET_REGISTER_MATCHER +#define GET_SUBTARGET_FEATURE_NAME +#define GET_MATCHER_IMPLEMENTATION +#include "ARMGenAsmMatcher.inc" + +// FIXME: This structure should be moved inside ARMTargetParser +// when we start to table-generate them, and we can use the ARM +// flags below, that were generated by table-gen. 
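+//
+// Illustrative reading of the table below: ".arch_extension crc" requires a
+// v8 base architecture (Feature_HasV8) and enables FeatureCRC, while the
+// "nocrc" form clears the same feature bits instead.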
+static const struct { + const unsigned Kind; + const uint64_t ArchCheck; + const FeatureBitset Features; +} Extensions[] = { + { ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} }, + { ARM::AEK_CRYPTO, Feature_HasV8, + {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} }, + { ARM::AEK_FP, Feature_HasV8, {ARM::FeatureFPARMv8} }, + { (ARM::AEK_HWDIV | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass, + {ARM::FeatureHWDiv, ARM::FeatureHWDivARM} }, + { ARM::AEK_MP, Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} }, + { ARM::AEK_SIMD, Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} }, + { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} }, + // FIXME: Only available in A-class, isel not predicated + { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} }, + { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} }, + // FIXME: Unsupported extensions. + { ARM::AEK_OS, Feature_None, {} }, + { ARM::AEK_IWMMXT, Feature_None, {} }, + { ARM::AEK_IWMMXT2, Feature_None, {} }, + { ARM::AEK_MAVERICK, Feature_None, {} }, + { ARM::AEK_XSCALE, Feature_None, {} }, +}; + +/// parseDirectiveArchExtension +/// ::= .arch_extension [no]feature +bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { + MCAsmParser &Parser = getParser(); + + if (getLexer().isNot(AsmToken::Identifier)) { + Error(getLexer().getLoc(), "unexpected token"); + Parser.eatToEndOfStatement(); + return false; + } + + StringRef Name = Parser.getTok().getString(); + SMLoc ExtLoc = Parser.getTok().getLoc(); + getLexer().Lex(); + + bool EnableFeature = true; + if (Name.startswith_lower("no")) { + EnableFeature = false; + Name = Name.substr(2); + } + unsigned FeatureKind = ARM::parseArchExt(Name); + if (FeatureKind == ARM::AEK_INVALID) + Error(ExtLoc, "unknown architectural extension: " + Name); + + for (const auto &Extension : Extensions) { + if (Extension.Kind != FeatureKind) + continue; + + if (Extension.Features.none()) + report_fatal_error("unsupported architectural extension: " + Name); + + if ((getAvailableFeatures() & Extension.ArchCheck) != Extension.ArchCheck) { + Error(ExtLoc, "architectural extension '" + Name + "' is not " + "allowed for the current base architecture"); + return false; + } + + MCSubtargetInfo &STI = copySTI(); + FeatureBitset ToggleFeatures = EnableFeature + ? (~STI.getFeatureBits() & Extension.Features) + : ( STI.getFeatureBits() & Extension.Features); + + uint64_t Features = + ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); + setAvailableFeatures(Features); + return false; + } + + Error(ExtLoc, "unknown architectural extension: " + Name); + Parser.eatToEndOfStatement(); + return false; +} + +// Define this matcher function after the auto-generated include so we +// have the match class enum definitions. +unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, + unsigned Kind) { + ARMOperand &Op = static_cast<ARMOperand &>(AsmOp); + // If the kind is a token for a literal immediate, check if our asm + // operand matches. This is for InstAliases which have a fixed-value + // immediate in the syntax. 
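+  // For instance, MCK__35_0 below is the auto-generated match class for the
+  // literal token "#0", so an alias whose syntax hard-codes "#0" accepts any
+  // parsed immediate that folds to zero.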
+  switch (Kind) {
+  default: break;
+  case MCK__35_0:
+    if (Op.isImm())
+      if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()))
+        if (CE->getValue() == 0)
+          return Match_Success;
+    break;
+  case MCK_ModImm:
+    if (Op.isImm()) {
+      const MCExpr *SOExpr = Op.getImm();
+      int64_t Value;
+      if (!SOExpr->evaluateAsAbsolute(Value))
+        return Match_Success;
+      assert((Value >= INT32_MIN && Value <= UINT32_MAX) &&
+             "expression value must be representable in 32 bits");
+    }
+    break;
+  case MCK_rGPR:
+    if (hasV8Ops() && Op.isReg() && Op.getReg() == ARM::SP)
+      return Match_Success;
+    break;
+  case MCK_GPRPair:
+    if (Op.isReg() &&
+        MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg()))
+      return Match_Success;
+    break;
+  }
+  return Match_InvalidOperand;
+}
diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
new file mode 100644
index 0000000..e63defe
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -0,0 +1,5223 @@
+//===-- ARMDisassembler.cpp - Disassembler for ARM/Thumb ISA --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCDisassembler.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+  // Handles the condition code status of instructions in IT blocks
+  class ITStatus
+  {
+    public:
+      // Returns the condition code for instruction in IT block
+      unsigned getITCC() {
+        unsigned CC = ARMCC::AL;
+        if (instrInITBlock())
+          CC = ITStates.back();
+        return CC;
+      }
+
+      // Advances the IT block state to the next T or E
+      void advanceITState() {
+        ITStates.pop_back();
+      }
+
+      // Returns true if the current instruction is in an IT block
+      bool instrInITBlock() {
+        return !ITStates.empty();
+      }
+
+      // Returns true if current instruction is the last instruction in an IT block
+      bool instrLastInITBlock() {
+        return ITStates.size() == 1;
+      }
+
+      // Called when decoding an IT instruction. Sets the IT state for the
+      // following instructions that form the IT block. Firstcond and Mask
+      // correspond to the fields in the IT instruction encoding.
+      void setITState(char Firstcond, char Mask) {
+        // (3 - the number of trailing zeros) is the number of then / else.
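+        // Worked example (illustrative, not from the original source): for
+        // "ITTE EQ", Firstcond = 0b0000 (EQ, so CondBit0 = 0) and
+        // Mask = 0b0110, giving NumTZ = 1. The loop below inspects mask bits
+        // 2 and 3: bit 2 is 1 != CondBit0, pushing NE for the 'E'; bit 3 is
+        // 0 == CondBit0, pushing EQ for the second 'T'. The final push_back
+        // covers the first 'T', so the stack pops EQ, EQ, NE for the three
+        // predicated instructions.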
+        unsigned CondBit0 = Firstcond & 1;
+        unsigned NumTZ = countTrailingZeros<uint8_t>(Mask);
+        unsigned char CCBits = static_cast<unsigned char>(Firstcond & 0xf);
+        assert(NumTZ <= 3 && "Invalid IT mask!");
+        // push condition codes onto the stack in the correct order for the pops
+        for (unsigned Pos = NumTZ+1; Pos <= 3; ++Pos) {
+          bool T = ((Mask >> Pos) & 1) == CondBit0;
+          if (T)
+            ITStates.push_back(CCBits);
+          else
+            ITStates.push_back(CCBits ^ 1);
+        }
+        ITStates.push_back(CCBits);
+      }
+
+    private:
+      std::vector<unsigned char> ITStates;
+  };
+}
+
+namespace {
+/// ARM disassembler for all ARM platforms.
+class ARMDisassembler : public MCDisassembler {
+public:
+  ARMDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx) {
+  }
+
+  ~ARMDisassembler() override {}
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+
+/// Thumb disassembler for all Thumb platforms.
+class ThumbDisassembler : public MCDisassembler {
+public:
+  ThumbDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx) {
+  }
+
+  ~ThumbDisassembler() override {}
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+
+private:
+  mutable ITStatus ITBlock;
+  DecodeStatus AddThumbPredicate(MCInst&) const;
+  void UpdateThumbVFPPredicate(MCInst&) const;
+};
+}
+
+static bool Check(DecodeStatus &Out, DecodeStatus In) {
+  switch (In) {
+  case MCDisassembler::Success:
+    // Out stays the same.
+    return true;
+  case MCDisassembler::SoftFail:
+    Out = In;
+    return true;
+  case MCDisassembler::Fail:
+    Out = In;
+    return false;
+  }
+  llvm_unreachable("Invalid DecodeStatus!");
+}
+
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
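+// All of them follow the callback signature that the table-generated
+// decodeInstruction() in ARMGenDisassemblerTables.inc expects, which is why
+// even single-purpose helpers take an Address and an opaque Decoder pointer.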
+static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst,
+                                                   unsigned RegNo, uint64_t Address,
+                                                   const void *Decoder);
+static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst,
+                                                unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
+                                   unsigned RegNo, uint64_t Address,
+                                   const void *Decoder);
+
+static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode2IdxInstruction(MCInst &Inst,
+                                                  unsigned Insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
+                                                          unsigned Insn,
+                                                          uint64_t Address,
+                                                          const void *Decoder);
+static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst,unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); 
+static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + + +static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn, + uint64_t Address, const void* Decoder); +static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn, + uint64_t 
Address, const void* Decoder); +static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, + uint64_t Address, const void* Decoder); +static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn, + uint64_t Address, const void* Decoder); +static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst,unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeIT(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst,unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst,unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2Adr(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); + +static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +#include "ARMGenDisassemblerTables.inc" + +static MCDisassembler *createARMDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new ARMDisassembler(STI, Ctx); +} + +static MCDisassembler *createThumbDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new ThumbDisassembler(STI, Ctx); +} + +// Post-decoding checks +static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size, + uint64_t Address, raw_ostream &OS, + raw_ostream &CS, + uint32_t Insn, + DecodeStatus Result) +{ + switch (MI.getOpcode()) { + case ARM::HVC: { + // HVC is undefined if 
condition = 0xf, and unpredictable + // if condition != 0xe + uint32_t Cond = (Insn >> 28) & 0xF; + if (Cond == 0xF) + return MCDisassembler::Fail; + if (Cond != 0xE) + return MCDisassembler::SoftFail; + return Result; + } + default: return Result; + } +} + +DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address, raw_ostream &OS, + raw_ostream &CS) const { + CommentStream = &CS; + + assert(!STI.getFeatureBits()[ARM::ModeThumb] && + "Asked to disassemble an ARM instruction but Subtarget is in Thumb " + "mode!"); + + // We want to read exactly 4 bytes of data. + if (Bytes.size() < 4) { + Size = 0; + return MCDisassembler::Fail; + } + + // Encoded as a little-endian 32-bit word in the stream. + uint32_t Insn = + (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0); + + // Call the auto-generated decoder function. + DecodeStatus Result = + decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn, Result); + } + + // VFP and NEON instructions, similarly, are shared between ARM + // and Thumb modes. + Result = decodeInstruction(DecoderTableVFP32, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + + Result = decodeInstruction(DecoderTableVFPV832, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + + Result = + decodeInstruction(DecoderTableNEONData32, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + // Add a fake predicate operand, because we share these instruction + // definitions with Thumb2 where these instructions are predicable. + if (!DecodePredicateOperand(MI, 0xE, Address, this)) + return MCDisassembler::Fail; + return Result; + } + + Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, Insn, Address, + this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + // Add a fake predicate operand, because we share these instruction + // definitions with Thumb2 where these instructions are predicable. + if (!DecodePredicateOperand(MI, 0xE, Address, this)) + return MCDisassembler::Fail; + return Result; + } + + Result = + decodeInstruction(DecoderTableNEONDup32, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + // Add a fake predicate operand, because we share these instruction + // definitions with Thumb2 where these instructions are predicable. + if (!DecodePredicateOperand(MI, 0xE, Address, this)) + return MCDisassembler::Fail; + return Result; + } + + Result = + decodeInstruction(DecoderTablev8NEON32, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + + Result = + decodeInstruction(DecoderTablev8Crypto32, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + + Size = 0; + return MCDisassembler::Fail; +} + +namespace llvm { +extern const MCInstrDesc ARMInsts[]; +} + +/// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the +/// immediate Value in the MCInst. The immediate Value has had any PC +/// adjustment made by the caller. If the instruction is a branch instruction +/// then isBranch is true, else false. 
If the getOpInfo() function was set as +/// part of the setupForSymbolicDisassembly() call then that function is called +/// to get any symbolic information at the Address for this instruction. If +/// that returns non-zero then the symbolic information it returns is used to +/// create an MCExpr and that is added as an operand to the MCInst. If +/// getOpInfo() returns zero and isBranch is true then a symbol look up for +/// Value is done and if a symbol is found an MCExpr is created with that, else +/// an MCExpr with Value is created. This function returns true if it adds an +/// operand to the MCInst and false otherwise. +static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value, + bool isBranch, uint64_t InstSize, + MCInst &MI, const void *Decoder) { + const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); + // FIXME: Does it make sense for value to be negative? + return Dis->tryAddingSymbolicOperand(MI, (uint32_t)Value, Address, isBranch, + /* Offset */ 0, InstSize); +} + +/// tryAddingPcLoadReferenceComment - tries to add a comment as to what is being +/// referenced by a load instruction with the base register that is the PC. +/// These can often be values in a literal pool near the Address of the +/// instruction. The Address of the instruction and its immediate Value are +/// used as a possible literal pool entry. The SymbolLookUp callback will +/// return the name of a symbol referenced by the literal pool's entry if +/// the referenced address is that of a symbol. Or it will return a pointer to +/// a literal 'C' string if the referenced address of the literal pool's entry +/// is an address into a section with 'C' string literals. +static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value, + const void *Decoder) { + const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); + Dis->tryAddingPcLoadReferenceComment(Value, Address); +} + +// Thumb1 instructions don't have explicit S bits. Rather, they +// implicitly set CPSR. Since it's not represented in the encoding, the +// auto-generated decoder won't inject the CPSR operand. We need to fix +// that as a post-pass. +static void AddThumb1SBit(MCInst &MI, bool InITBlock) { + const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo; + unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands; + MCInst::iterator I = MI.begin(); + for (unsigned i = 0; i < NumOps; ++i, ++I) { + if (I == MI.end()) break; + if (OpInfo[i].isOptionalDef() && OpInfo[i].RegClass == ARM::CCRRegClassID) { + if (i > 0 && OpInfo[i-1].isPredicate()) continue; + MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR)); + return; + } + } + + MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR)); +} + +// Most Thumb instructions don't have explicit predicates in the +// encoding, but rather get their predicates from IT context. We need +// to fix up the predicate operands using this context information as a +// post-pass. +MCDisassembler::DecodeStatus +ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { + MCDisassembler::DecodeStatus S = Success; + + // A few instructions actually have predicates encoded in them. Don't + // try to overwrite them if we're seeing one of those. + switch (MI.getOpcode()) { + case ARM::tBcc: + case ARM::t2Bcc: + case ARM::tCBZ: + case ARM::tCBNZ: + case ARM::tCPS: + case ARM::t2CPS3p: + case ARM::t2CPS2p: + case ARM::t2CPS1p: + case ARM::tMOVSr: + case ARM::tSETEND: + // Some instructions (mostly conditional branches) are not + // allowed in IT blocks. 
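+      // For example, a CBZ decodes mechanically inside an IT block but is
+      // architecturally UNPREDICTABLE there, so it is reported as SoftFail
+      // below rather than rejected outright.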
+ if (ITBlock.instrInITBlock()) + S = SoftFail; + else + return Success; + break; + case ARM::tB: + case ARM::t2B: + case ARM::t2TBB: + case ARM::t2TBH: + // Some instructions (mostly unconditional branches) can + // only appear at the end of, or outside of, an IT block. + if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock()) + S = SoftFail; + break; + default: + break; + } + + // If we're in an IT block, base the predicate on that. Otherwise, + // assume a predicate of AL. + unsigned CC; + CC = ITBlock.getITCC(); + if (CC == 0xF) + CC = ARMCC::AL; + if (ITBlock.instrInITBlock()) + ITBlock.advanceITState(); + + const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo; + unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands; + MCInst::iterator I = MI.begin(); + for (unsigned i = 0; i < NumOps; ++i, ++I) { + if (I == MI.end()) break; + if (OpInfo[i].isPredicate()) { + I = MI.insert(I, MCOperand::createImm(CC)); + ++I; + if (CC == ARMCC::AL) + MI.insert(I, MCOperand::createReg(0)); + else + MI.insert(I, MCOperand::createReg(ARM::CPSR)); + return S; + } + } + + I = MI.insert(I, MCOperand::createImm(CC)); + ++I; + if (CC == ARMCC::AL) + MI.insert(I, MCOperand::createReg(0)); + else + MI.insert(I, MCOperand::createReg(ARM::CPSR)); + + return S; +} + +// Thumb VFP instructions are a special case. Because we share their +// encodings between ARM and Thumb modes, and they are predicable in ARM +// mode, the auto-generated decoder will give them an (incorrect) +// predicate operand. We need to rewrite these operands based on the IT +// context as a post-pass. +void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const { + unsigned CC; + CC = ITBlock.getITCC(); + if (ITBlock.instrInITBlock()) + ITBlock.advanceITState(); + + const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo; + MCInst::iterator I = MI.begin(); + unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands; + for (unsigned i = 0; i < NumOps; ++i, ++I) { + if (OpInfo[i].isPredicate()) { + I->setImm(CC); + ++I; + if (CC == ARMCC::AL) + I->setReg(0); + else + I->setReg(ARM::CPSR); + return; + } + } +} + +DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address, + raw_ostream &OS, + raw_ostream &CS) const { + CommentStream = &CS; + + assert(STI.getFeatureBits()[ARM::ModeThumb] && + "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!"); + + // We want to read exactly 2 bytes of data. + if (Bytes.size() < 2) { + Size = 0; + return MCDisassembler::Fail; + } + + uint16_t Insn16 = (Bytes[1] << 8) | Bytes[0]; + DecodeStatus Result = + decodeInstruction(DecoderTableThumb16, MI, Insn16, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 2; + Check(Result, AddThumbPredicate(MI)); + return Result; + } + + Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this, + STI); + if (Result) { + Size = 2; + bool InITBlock = ITBlock.instrInITBlock(); + Check(Result, AddThumbPredicate(MI)); + AddThumb1SBit(MI, InITBlock); + return Result; + } + + Result = + decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 2; + + // Nested IT blocks are UNPREDICTABLE. Must be checked before we add + // the Thumb predicate. 
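+
+  // A sketch of the IT bookkeeping used from here on (assuming the usual
+  // ARM IT semantics): t2IT carries firstcond and a mask; setITState()
+  // records them, getITCC() returns the condition for the current slot
+  // (firstcond for a 'T' slot, its inverse for an 'E' slot), and
+  // advanceITState() consumes one slot per predicated instruction. For
+  // example, "ITTE EQ" predicates the next three instructions as EQ, EQ
+  // and NE.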
+ if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock()) + Result = MCDisassembler::SoftFail; + + Check(Result, AddThumbPredicate(MI)); + + // If we find an IT instruction, we need to parse its condition + // code and mask operands so that we can apply them correctly + // to the subsequent instructions. + if (MI.getOpcode() == ARM::t2IT) { + + unsigned Firstcond = MI.getOperand(0).getImm(); + unsigned Mask = MI.getOperand(1).getImm(); + ITBlock.setITState(Firstcond, Mask); + } + + return Result; + } + + // We want to read exactly 4 bytes of data. + if (Bytes.size() < 4) { + Size = 0; + return MCDisassembler::Fail; + } + + uint32_t Insn32 = + (Bytes[3] << 8) | (Bytes[2] << 0) | (Bytes[1] << 24) | (Bytes[0] << 16); + Result = + decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + bool InITBlock = ITBlock.instrInITBlock(); + Check(Result, AddThumbPredicate(MI)); + AddThumb1SBit(MI, InITBlock); + return Result; + } + + Result = + decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + Check(Result, AddThumbPredicate(MI)); + return Result; + } + + if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { + Result = + decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + UpdateThumbVFPPredicate(MI); + return Result; + } + } + + Result = + decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + + if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { + Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this, + STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + Check(Result, AddThumbPredicate(MI)); + return Result; + } + } + + if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) { + uint32_t NEONLdStInsn = Insn32; + NEONLdStInsn &= 0xF0FFFFFF; + NEONLdStInsn |= 0x04000000; + Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + Check(Result, AddThumbPredicate(MI)); + return Result; + } + } + + if (fieldFromInstruction(Insn32, 24, 4) == 0xF) { + uint32_t NEONDataInsn = Insn32; + NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24 + NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 + NEONDataInsn |= 0x12000000; // Set bits 28 and 25 + Result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + Check(Result, AddThumbPredicate(MI)); + return Result; + } + + uint32_t NEONCryptoInsn = Insn32; + NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24 + NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 + NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25 + Result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + + uint32_t NEONv8Insn = Insn32; + NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26 + Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address, + this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } + + Size = 0; + return MCDisassembler::Fail; +} + + +extern "C" void LLVMInitializeARMDisassembler() { + TargetRegistry::RegisterMCDisassembler(TheARMLETarget, + createARMDisassembler); + 
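+  // Both endiannesses share one factory: ARM instruction fetch is
+  // little-endian even on big-endian (BE8) systems, so byte order is
+  // handled once in the getInstruction() implementations above, which is
+  // presumably why no separate big-endian decoder is registered.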
TargetRegistry::RegisterMCDisassembler(TheARMBETarget, + createARMDisassembler); + TargetRegistry::RegisterMCDisassembler(TheThumbLETarget, + createThumbDisassembler); + TargetRegistry::RegisterMCDisassembler(TheThumbBETarget, + createThumbDisassembler); +} + +static const uint16_t GPRDecoderTable[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R4, ARM::R5, ARM::R6, ARM::R7, + ARM::R8, ARM::R9, ARM::R10, ARM::R11, + ARM::R12, ARM::SP, ARM::LR, ARM::PC +}; + +static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 15) + return MCDisassembler::Fail; + + unsigned Register = GPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus +DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (RegNo == 15) + S = MCDisassembler::SoftFail; + + Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder)); + + return S; +} + +static DecodeStatus +DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (RegNo == 15) + { + Inst.addOperand(MCOperand::createReg(ARM::APSR_NZCV)); + return MCDisassembler::Success; + } + + Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder)); + return S; +} + +static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 7) + return MCDisassembler::Fail; + return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); +} + +static const uint16_t GPRPairDecoderTable[] = { + ARM::R0_R1, ARM::R2_R3, ARM::R4_R5, ARM::R6_R7, + ARM::R8_R9, ARM::R10_R11, ARM::R12_SP +}; + +static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (RegNo > 13) + return MCDisassembler::Fail; + + if ((RegNo & 1) || RegNo == 0xe) + S = MCDisassembler::SoftFail; + + unsigned RegisterPair = GPRPairDecoderTable[RegNo/2]; + Inst.addOperand(MCOperand::createReg(RegisterPair)); + return S; +} + +static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + unsigned Register = 0; + switch (RegNo) { + case 0: + Register = ARM::R0; + break; + case 1: + Register = ARM::R1; + break; + case 2: + Register = ARM::R2; + break; + case 3: + Register = ARM::R3; + break; + case 9: + Register = ARM::R9; + break; + case 12: + Register = ARM::R12; + break; + default: + return MCDisassembler::Fail; + } + + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + const FeatureBitset &featureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + + if ((RegNo == 13 && !featureBits[ARM::HasV8Ops]) || RegNo == 15) + S = MCDisassembler::SoftFail; + + Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder)); + return S; +} + +static const uint16_t SPRDecoderTable[] = { + ARM::S0, ARM::S1, ARM::S2, ARM::S3, + ARM::S4, ARM::S5, ARM::S6, ARM::S7, + ARM::S8, ARM::S9, ARM::S10, ARM::S11, + ARM::S12, ARM::S13, ARM::S14, ARM::S15, + ARM::S16, ARM::S17, ARM::S18, ARM::S19, + ARM::S20, ARM::S21, ARM::S22, ARM::S23, + 
ARM::S24, ARM::S25, ARM::S26, ARM::S27, + ARM::S28, ARM::S29, ARM::S30, ARM::S31 +}; + +static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Register = SPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static const uint16_t DPRDecoderTable[] = { + ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7, + ARM::D8, ARM::D9, ARM::D10, ARM::D11, + ARM::D12, ARM::D13, ARM::D14, ARM::D15, + ARM::D16, ARM::D17, ARM::D18, ARM::D19, + ARM::D20, ARM::D21, ARM::D22, ARM::D23, + ARM::D24, ARM::D25, ARM::D26, ARM::D27, + ARM::D28, ARM::D29, ARM::D30, ARM::D31 +}; + +static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + const FeatureBitset &featureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + + bool hasD16 = featureBits[ARM::FeatureD16]; + + if (RegNo > 31 || (hasD16 && RegNo > 15)) + return MCDisassembler::Fail; + + unsigned Register = DPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 7) + return MCDisassembler::Fail; + return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder); +} + +static DecodeStatus +DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 15) + return MCDisassembler::Fail; + return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder); +} + +static const uint16_t QPRDecoderTable[] = { + ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3, + ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7, + ARM::Q8, ARM::Q9, ARM::Q10, ARM::Q11, + ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15 +}; + + +static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31 || (RegNo & 1) != 0) + return MCDisassembler::Fail; + RegNo >>= 1; + + unsigned Register = QPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static const uint16_t DPairDecoderTable[] = { + ARM::Q0, ARM::D1_D2, ARM::Q1, ARM::D3_D4, ARM::Q2, ARM::D5_D6, + ARM::Q3, ARM::D7_D8, ARM::Q4, ARM::D9_D10, ARM::Q5, ARM::D11_D12, + ARM::Q6, ARM::D13_D14, ARM::Q7, ARM::D15_D16, ARM::Q8, ARM::D17_D18, + ARM::Q9, ARM::D19_D20, ARM::Q10, ARM::D21_D22, ARM::Q11, ARM::D23_D24, + ARM::Q12, ARM::D25_D26, ARM::Q13, ARM::D27_D28, ARM::Q14, ARM::D29_D30, + ARM::Q15 +}; + +static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 30) + return MCDisassembler::Fail; + + unsigned Register = DPairDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static const uint16_t DPairSpacedDecoderTable[] = { + ARM::D0_D2, ARM::D1_D3, ARM::D2_D4, ARM::D3_D5, + ARM::D4_D6, ARM::D5_D7, ARM::D6_D8, ARM::D7_D9, + ARM::D8_D10, ARM::D9_D11, ARM::D10_D12, ARM::D11_D13, + ARM::D12_D14, ARM::D13_D15, ARM::D14_D16, ARM::D15_D17, + ARM::D16_D18, ARM::D17_D19, ARM::D18_D20, ARM::D19_D21, + ARM::D20_D22, ARM::D21_D23, ARM::D22_D24, ARM::D23_D25, + ARM::D24_D26, ARM::D25_D27, ARM::D26_D28, ARM::D27_D29, + ARM::D28_D30, ARM::D29_D31 +}; + +static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst, + 
unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 29) + return MCDisassembler::Fail; + + unsigned Register = DPairSpacedDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (Val == 0xF) return MCDisassembler::Fail; + // AL predicate is not allowed on Thumb1 branches. + if (Inst.getOpcode() == ARM::tBcc && Val == 0xE) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(Val)); + if (Val == ARMCC::AL) { + Inst.addOperand(MCOperand::createReg(0)); + } else + Inst.addOperand(MCOperand::createReg(ARM::CPSR)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (Val) + Inst.addOperand(MCOperand::createReg(ARM::CPSR)); + else + Inst.addOperand(MCOperand::createReg(0)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rm = fieldFromInstruction(Val, 0, 4); + unsigned type = fieldFromInstruction(Val, 5, 2); + unsigned imm = fieldFromInstruction(Val, 7, 5); + + // Register-immediate + if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + + ARM_AM::ShiftOpc Shift = ARM_AM::lsl; + switch (type) { + case 0: + Shift = ARM_AM::lsl; + break; + case 1: + Shift = ARM_AM::lsr; + break; + case 2: + Shift = ARM_AM::asr; + break; + case 3: + Shift = ARM_AM::ror; + break; + } + + if (Shift == ARM_AM::ror && imm == 0) + Shift = ARM_AM::rrx; + + unsigned Op = Shift | (imm << 3); + Inst.addOperand(MCOperand::createImm(Op)); + + return S; +} + +static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rm = fieldFromInstruction(Val, 0, 4); + unsigned type = fieldFromInstruction(Val, 5, 2); + unsigned Rs = fieldFromInstruction(Val, 8, 4); + + // Register-register + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rs, Address, Decoder))) + return MCDisassembler::Fail; + + ARM_AM::ShiftOpc Shift = ARM_AM::lsl; + switch (type) { + case 0: + Shift = ARM_AM::lsl; + break; + case 1: + Shift = ARM_AM::lsr; + break; + case 2: + Shift = ARM_AM::asr; + break; + case 3: + Shift = ARM_AM::ror; + break; + } + + Inst.addOperand(MCOperand::createImm(Shift)); + + return S; +} + +static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + bool NeedDisjointWriteback = false; + unsigned WritebackReg = 0; + switch (Inst.getOpcode()) { + default: + break; + case ARM::LDMIA_UPD: + case ARM::LDMDB_UPD: + case ARM::LDMIB_UPD: + case ARM::LDMDA_UPD: + case ARM::t2LDMIA_UPD: + case ARM::t2LDMDB_UPD: + case ARM::t2STMIA_UPD: + case ARM::t2STMDB_UPD: + NeedDisjointWriteback = true; + WritebackReg = Inst.getOperand(0).getReg(); + break; + } + + // Empty register lists are not allowed. 
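+  // Worked example (illustrative): Val = 0x4003 has bits 0, 1 and 14 set,
+  // so the loop below appends r0, r1 and lr. For the _UPD forms above, a
+  // base register that reappears in the list triggers the SoftFail check.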
+ if (Val == 0) return MCDisassembler::Fail; + for (unsigned i = 0; i < 16; ++i) { + if (Val & (1 << i)) { + if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder))) + return MCDisassembler::Fail; + // Writeback not allowed if Rn is in the target list. + if (NeedDisjointWriteback && WritebackReg == Inst.end()[-1].getReg()) + Check(S, MCDisassembler::SoftFail); + } + } + + return S; +} + +static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Vd = fieldFromInstruction(Val, 8, 5); + unsigned regs = fieldFromInstruction(Val, 0, 8); + + // In case of unpredictable encoding, tweak the operands. + if (regs == 0 || (Vd + regs) > 32) { + regs = Vd + regs > 32 ? 32 - Vd : regs; + regs = std::max(1u, regs); + S = MCDisassembler::SoftFail; + } + + if (!Check(S, DecodeSPRRegisterClass(Inst, Vd, Address, Decoder))) + return MCDisassembler::Fail; + for (unsigned i = 0; i < (regs - 1); ++i) { + if (!Check(S, DecodeSPRRegisterClass(Inst, ++Vd, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Vd = fieldFromInstruction(Val, 8, 5); + unsigned regs = fieldFromInstruction(Val, 1, 7); + + // In case of unpredictable encoding, tweak the operands. + if (regs == 0 || regs > 16 || (Vd + regs) > 32) { + regs = Vd + regs > 32 ? 32 - Vd : regs; + regs = std::max(1u, regs); + regs = std::min(16u, regs); + S = MCDisassembler::SoftFail; + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder))) + return MCDisassembler::Fail; + for (unsigned i = 0; i < (regs - 1); ++i) { + if (!Check(S, DecodeDPRRegisterClass(Inst, ++Vd, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + // This operand encodes a mask of contiguous zeros between a specified MSB + // and LSB. To decode it, we create the mask of all bits MSB-and-lower, + // the mask of all bits LSB-and-lower, and then xor them to create + // the mask that's all ones on [msb, lsb]. Finally we invert it to + // create the final mask. + unsigned msb = fieldFromInstruction(Val, 5, 5); + unsigned lsb = fieldFromInstruction(Val, 0, 5); + + DecodeStatus S = MCDisassembler::Success; + if (lsb > msb) { + Check(S, MCDisassembler::SoftFail); + // The check above will cause the warning for the "potentially undefined + // instruction encoding" but we can't build a bad MCOperand value here + // with an lsb > msb or else printing the MCInst will cause a crash. 
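+    // (For the well-formed path below, e.g. msb=15, lsb=8: msb_mask =
+    // 0xFFFF, lsb_mask = 0xFF, and the emitted operand is
+    // ~(0xFFFF ^ 0xFF) = 0xFFFF00FF.)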
+ lsb = msb; + } + + uint32_t msb_mask = 0xFFFFFFFF; + if (msb != 31) msb_mask = (1U << (msb+1)) - 1; + uint32_t lsb_mask = (1U << lsb) - 1; + + Inst.addOperand(MCOperand::createImm(~(msb_mask ^ lsb_mask))); + return S; +} + +static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned pred = fieldFromInstruction(Insn, 28, 4); + unsigned CRd = fieldFromInstruction(Insn, 12, 4); + unsigned coproc = fieldFromInstruction(Insn, 8, 4); + unsigned imm = fieldFromInstruction(Insn, 0, 8); + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned U = fieldFromInstruction(Insn, 23, 1); + + switch (Inst.getOpcode()) { + case ARM::LDC_OFFSET: + case ARM::LDC_PRE: + case ARM::LDC_POST: + case ARM::LDC_OPTION: + case ARM::LDCL_OFFSET: + case ARM::LDCL_PRE: + case ARM::LDCL_POST: + case ARM::LDCL_OPTION: + case ARM::STC_OFFSET: + case ARM::STC_PRE: + case ARM::STC_POST: + case ARM::STC_OPTION: + case ARM::STCL_OFFSET: + case ARM::STCL_PRE: + case ARM::STCL_POST: + case ARM::STCL_OPTION: + case ARM::t2LDC_OFFSET: + case ARM::t2LDC_PRE: + case ARM::t2LDC_POST: + case ARM::t2LDC_OPTION: + case ARM::t2LDCL_OFFSET: + case ARM::t2LDCL_PRE: + case ARM::t2LDCL_POST: + case ARM::t2LDCL_OPTION: + case ARM::t2STC_OFFSET: + case ARM::t2STC_PRE: + case ARM::t2STC_POST: + case ARM::t2STC_OPTION: + case ARM::t2STCL_OFFSET: + case ARM::t2STCL_PRE: + case ARM::t2STCL_POST: + case ARM::t2STCL_OPTION: + if (coproc == 0xA || coproc == 0xB) + return MCDisassembler::Fail; + break; + default: + break; + } + + const FeatureBitset &featureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + if (featureBits[ARM::HasV8Ops] && (coproc != 14)) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createImm(coproc)); + Inst.addOperand(MCOperand::createImm(CRd)); + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + switch (Inst.getOpcode()) { + case ARM::t2LDC2_OFFSET: + case ARM::t2LDC2L_OFFSET: + case ARM::t2LDC2_PRE: + case ARM::t2LDC2L_PRE: + case ARM::t2STC2_OFFSET: + case ARM::t2STC2L_OFFSET: + case ARM::t2STC2_PRE: + case ARM::t2STC2L_PRE: + case ARM::LDC2_OFFSET: + case ARM::LDC2L_OFFSET: + case ARM::LDC2_PRE: + case ARM::LDC2L_PRE: + case ARM::STC2_OFFSET: + case ARM::STC2L_OFFSET: + case ARM::STC2_PRE: + case ARM::STC2L_PRE: + case ARM::t2LDC_OFFSET: + case ARM::t2LDCL_OFFSET: + case ARM::t2LDC_PRE: + case ARM::t2LDCL_PRE: + case ARM::t2STC_OFFSET: + case ARM::t2STCL_OFFSET: + case ARM::t2STC_PRE: + case ARM::t2STCL_PRE: + case ARM::LDC_OFFSET: + case ARM::LDCL_OFFSET: + case ARM::LDC_PRE: + case ARM::LDCL_PRE: + case ARM::STC_OFFSET: + case ARM::STCL_OFFSET: + case ARM::STC_PRE: + case ARM::STCL_PRE: + imm = ARM_AM::getAM5Opc(U ? ARM_AM::add : ARM_AM::sub, imm); + Inst.addOperand(MCOperand::createImm(imm)); + break; + case ARM::t2LDC2_POST: + case ARM::t2LDC2L_POST: + case ARM::t2STC2_POST: + case ARM::t2STC2L_POST: + case ARM::LDC2_POST: + case ARM::LDC2L_POST: + case ARM::STC2_POST: + case ARM::STC2L_POST: + case ARM::t2LDC_POST: + case ARM::t2LDCL_POST: + case ARM::t2STC_POST: + case ARM::t2STCL_POST: + case ARM::LDC_POST: + case ARM::LDCL_POST: + case ARM::STC_POST: + case ARM::STCL_POST: + imm |= U << 8; + // fall through. + default: + // The 'option' variant doesn't encode 'U' in the immediate since + // the immediate is unsigned [0,255]. 
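+    // For the POST forms that fell through, bit 8 of imm now carries the
+    // add/sub direction (imm |= U << 8), mirroring what getAM5Opc() packs
+    // for the offset/pre-indexed forms; e.g. U=1 with imm8=0x20 yields an
+    // operand of 0x120.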
+ Inst.addOperand(MCOperand::createImm(imm)); + break; + } + + switch (Inst.getOpcode()) { + case ARM::LDC_OFFSET: + case ARM::LDC_PRE: + case ARM::LDC_POST: + case ARM::LDC_OPTION: + case ARM::LDCL_OFFSET: + case ARM::LDCL_PRE: + case ARM::LDCL_POST: + case ARM::LDCL_OPTION: + case ARM::STC_OFFSET: + case ARM::STC_PRE: + case ARM::STC_POST: + case ARM::STC_OPTION: + case ARM::STCL_OFFSET: + case ARM::STCL_PRE: + case ARM::STCL_POST: + case ARM::STCL_OPTION: + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + return S; +} + +static DecodeStatus +DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + unsigned imm = fieldFromInstruction(Insn, 0, 12); + unsigned pred = fieldFromInstruction(Insn, 28, 4); + unsigned reg = fieldFromInstruction(Insn, 25, 1); + unsigned P = fieldFromInstruction(Insn, 24, 1); + unsigned W = fieldFromInstruction(Insn, 21, 1); + + // On stores, the writeback operand precedes Rt. + switch (Inst.getOpcode()) { + case ARM::STR_POST_IMM: + case ARM::STR_POST_REG: + case ARM::STRB_POST_IMM: + case ARM::STRB_POST_REG: + case ARM::STRT_POST_REG: + case ARM::STRT_POST_IMM: + case ARM::STRBT_POST_REG: + case ARM::STRBT_POST_IMM: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + + // On loads, the writeback operand comes after Rt. 
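+  // Operand order has to match the MCInstrDesc for each opcode: the tied
+  // writeback result was already added before Rt for the store forms above,
+  // while for the load forms it is added here, after Rt; the base register
+  // Rn is then appended once more as part of the addressing mode itself.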
+ switch (Inst.getOpcode()) { + case ARM::LDR_POST_IMM: + case ARM::LDR_POST_REG: + case ARM::LDRB_POST_IMM: + case ARM::LDRB_POST_REG: + case ARM::LDRBT_POST_REG: + case ARM::LDRBT_POST_IMM: + case ARM::LDRT_POST_REG: + case ARM::LDRT_POST_IMM: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + ARM_AM::AddrOpc Op = ARM_AM::add; + if (!fieldFromInstruction(Insn, 23, 1)) + Op = ARM_AM::sub; + + bool writeback = (P == 0) || (W == 1); + unsigned idx_mode = 0; + if (P && writeback) + idx_mode = ARMII::IndexModePre; + else if (!P && writeback) + idx_mode = ARMII::IndexModePost; + + if (writeback && (Rn == 15 || Rn == Rt)) + S = MCDisassembler::SoftFail; // UNPREDICTABLE + + if (reg) { + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + ARM_AM::ShiftOpc Opc = ARM_AM::lsl; + switch( fieldFromInstruction(Insn, 5, 2)) { + case 0: + Opc = ARM_AM::lsl; + break; + case 1: + Opc = ARM_AM::lsr; + break; + case 2: + Opc = ARM_AM::asr; + break; + case 3: + Opc = ARM_AM::ror; + break; + default: + return MCDisassembler::Fail; + } + unsigned amt = fieldFromInstruction(Insn, 7, 5); + if (Opc == ARM_AM::ror && amt == 0) + Opc = ARM_AM::rrx; + unsigned imm = ARM_AM::getAM2Opc(Op, amt, Opc, idx_mode); + + Inst.addOperand(MCOperand::createImm(imm)); + } else { + Inst.addOperand(MCOperand::createReg(0)); + unsigned tmp = ARM_AM::getAM2Opc(Op, imm, ARM_AM::lsl, idx_mode); + Inst.addOperand(MCOperand::createImm(tmp)); + } + + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 13, 4); + unsigned Rm = fieldFromInstruction(Val, 0, 4); + unsigned type = fieldFromInstruction(Val, 5, 2); + unsigned imm = fieldFromInstruction(Val, 7, 5); + unsigned U = fieldFromInstruction(Val, 12, 1); + + ARM_AM::ShiftOpc ShOp = ARM_AM::lsl; + switch (type) { + case 0: + ShOp = ARM_AM::lsl; + break; + case 1: + ShOp = ARM_AM::lsr; + break; + case 2: + ShOp = ARM_AM::asr; + break; + case 3: + ShOp = ARM_AM::ror; + break; + } + + if (ShOp == ARM_AM::ror && imm == 0) + ShOp = ARM_AM::rrx; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + unsigned shift; + if (U) + shift = ARM_AM::getAM2Opc(ARM_AM::add, imm, ShOp); + else + shift = ARM_AM::getAM2Opc(ARM_AM::sub, imm, ShOp); + Inst.addOperand(MCOperand::createImm(shift)); + + return S; +} + +static DecodeStatus +DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + unsigned type = fieldFromInstruction(Insn, 22, 1); + unsigned imm = fieldFromInstruction(Insn, 8, 4); + unsigned U = ((~fieldFromInstruction(Insn, 23, 1)) & 1) << 8; + unsigned pred = fieldFromInstruction(Insn, 28, 4); + unsigned W = fieldFromInstruction(Insn, 21, 1); + unsigned P = fieldFromInstruction(Insn, 24, 1); + unsigned Rt2 = Rt + 
1; + + bool writeback = (W == 1) | (P == 0); + + // For {LD,ST}RD, Rt must be even, else undefined. + switch (Inst.getOpcode()) { + case ARM::STRD: + case ARM::STRD_PRE: + case ARM::STRD_POST: + case ARM::LDRD: + case ARM::LDRD_PRE: + case ARM::LDRD_POST: + if (Rt & 0x1) S = MCDisassembler::SoftFail; + break; + default: + break; + } + switch (Inst.getOpcode()) { + case ARM::STRD: + case ARM::STRD_PRE: + case ARM::STRD_POST: + if (P == 0 && W == 1) + S = MCDisassembler::SoftFail; + + if (writeback && (Rn == 15 || Rn == Rt || Rn == Rt2)) + S = MCDisassembler::SoftFail; + if (type && Rm == 15) + S = MCDisassembler::SoftFail; + if (Rt2 == 15) + S = MCDisassembler::SoftFail; + if (!type && fieldFromInstruction(Insn, 8, 4)) + S = MCDisassembler::SoftFail; + break; + case ARM::STRH: + case ARM::STRH_PRE: + case ARM::STRH_POST: + if (Rt == 15) + S = MCDisassembler::SoftFail; + if (writeback && (Rn == 15 || Rn == Rt)) + S = MCDisassembler::SoftFail; + if (!type && Rm == 15) + S = MCDisassembler::SoftFail; + break; + case ARM::LDRD: + case ARM::LDRD_PRE: + case ARM::LDRD_POST: + if (type && Rn == 15){ + if (Rt2 == 15) + S = MCDisassembler::SoftFail; + break; + } + if (P == 0 && W == 1) + S = MCDisassembler::SoftFail; + if (!type && (Rt2 == 15 || Rm == 15 || Rm == Rt || Rm == Rt2)) + S = MCDisassembler::SoftFail; + if (!type && writeback && Rn == 15) + S = MCDisassembler::SoftFail; + if (writeback && (Rn == Rt || Rn == Rt2)) + S = MCDisassembler::SoftFail; + break; + case ARM::LDRH: + case ARM::LDRH_PRE: + case ARM::LDRH_POST: + if (type && Rn == 15){ + if (Rt == 15) + S = MCDisassembler::SoftFail; + break; + } + if (Rt == 15) + S = MCDisassembler::SoftFail; + if (!type && Rm == 15) + S = MCDisassembler::SoftFail; + if (!type && writeback && (Rn == 15 || Rn == Rt)) + S = MCDisassembler::SoftFail; + break; + case ARM::LDRSH: + case ARM::LDRSH_PRE: + case ARM::LDRSH_POST: + case ARM::LDRSB: + case ARM::LDRSB_PRE: + case ARM::LDRSB_POST: + if (type && Rn == 15){ + if (Rt == 15) + S = MCDisassembler::SoftFail; + break; + } + if (type && (Rt == 15 || (writeback && Rn == Rt))) + S = MCDisassembler::SoftFail; + if (!type && (Rt == 15 || Rm == 15)) + S = MCDisassembler::SoftFail; + if (!type && writeback && (Rn == 15 || Rn == Rt)) + S = MCDisassembler::SoftFail; + break; + default: + break; + } + + if (writeback) { // Writeback + if (P) + U |= ARMII::IndexModePre << 9; + else + U |= ARMII::IndexModePost << 9; + + // On stores, the writeback operand precedes Rt. + switch (Inst.getOpcode()) { + case ARM::STRD: + case ARM::STRD_PRE: + case ARM::STRD_POST: + case ARM::STRH: + case ARM::STRH_PRE: + case ARM::STRH_POST: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + switch (Inst.getOpcode()) { + case ARM::STRD: + case ARM::STRD_PRE: + case ARM::STRD_POST: + case ARM::LDRD: + case ARM::LDRD_PRE: + case ARM::LDRD_POST: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + if (writeback) { + // On loads, the writeback operand comes after Rt. 
+ switch (Inst.getOpcode()) { + case ARM::LDRD: + case ARM::LDRD_PRE: + case ARM::LDRD_POST: + case ARM::LDRH: + case ARM::LDRH_PRE: + case ARM::LDRH_POST: + case ARM::LDRSH: + case ARM::LDRSH_PRE: + case ARM::LDRSH_POST: + case ARM::LDRSB: + case ARM::LDRSB_PRE: + case ARM::LDRSB_POST: + case ARM::LDRHTr: + case ARM::LDRSBTr: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + if (type) { + Inst.addOperand(MCOperand::createReg(0)); + Inst.addOperand(MCOperand::createImm(U | (imm << 4) | Rm)); + } else { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(U)); + } + + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned mode = fieldFromInstruction(Insn, 23, 2); + + switch (mode) { + case 0: + mode = ARM_AM::da; + break; + case 1: + mode = ARM_AM::ia; + break; + case 2: + mode = ARM_AM::db; + break; + case 3: + mode = ARM_AM::ib; + break; + } + + Inst.addOperand(MCOperand::createImm(mode)); + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 12, 4); + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned pred = fieldFromInstruction(Insn, 28, 4); + + if (pred == 0xF) + return DecodeCPSInstruction(Inst, Insn, Address, Decoder); + + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + return S; +} + +static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned pred = fieldFromInstruction(Insn, 28, 4); + unsigned reglist = fieldFromInstruction(Insn, 0, 16); + + if (pred == 0xF) { + // Ambiguous with RFE and SRS + switch (Inst.getOpcode()) { + case ARM::LDMDA: + Inst.setOpcode(ARM::RFEDA); + break; + case ARM::LDMDA_UPD: + Inst.setOpcode(ARM::RFEDA_UPD); + break; + case ARM::LDMDB: + Inst.setOpcode(ARM::RFEDB); + break; + case ARM::LDMDB_UPD: + Inst.setOpcode(ARM::RFEDB_UPD); + break; + case ARM::LDMIA: + Inst.setOpcode(ARM::RFEIA); + break; + case ARM::LDMIA_UPD: + Inst.setOpcode(ARM::RFEIA_UPD); + break; + case ARM::LDMIB: + Inst.setOpcode(ARM::RFEIB); + break; + case ARM::LDMIB_UPD: + Inst.setOpcode(ARM::RFEIB_UPD); + break; + case ARM::STMDA: + Inst.setOpcode(ARM::SRSDA); + break; + case ARM::STMDA_UPD: + Inst.setOpcode(ARM::SRSDA_UPD); + break; + case ARM::STMDB: + 
Inst.setOpcode(ARM::SRSDB); + break; + case ARM::STMDB_UPD: + Inst.setOpcode(ARM::SRSDB_UPD); + break; + case ARM::STMIA: + Inst.setOpcode(ARM::SRSIA); + break; + case ARM::STMIA_UPD: + Inst.setOpcode(ARM::SRSIA_UPD); + break; + case ARM::STMIB: + Inst.setOpcode(ARM::SRSIB); + break; + case ARM::STMIB_UPD: + Inst.setOpcode(ARM::SRSIB_UPD); + break; + default: + return MCDisassembler::Fail; + } + + // For stores (which become SRSs), the only operand is the mode. + if (fieldFromInstruction(Insn, 20, 1) == 0) { + // Check SRS encoding constraints + if (!(fieldFromInstruction(Insn, 22, 1) == 1 && + fieldFromInstruction(Insn, 20, 1) == 0)) + return MCDisassembler::Fail; + + Inst.addOperand( + MCOperand::createImm(fieldFromInstruction(Insn, 0, 4))); + return S; + } + + return DecodeRFEInstruction(Inst, Insn, Address, Decoder); + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; // Tied + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeRegListOperand(Inst, reglist, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + unsigned imod = fieldFromInstruction(Insn, 18, 2); + unsigned M = fieldFromInstruction(Insn, 17, 1); + unsigned iflags = fieldFromInstruction(Insn, 6, 3); + unsigned mode = fieldFromInstruction(Insn, 0, 5); + + DecodeStatus S = MCDisassembler::Success; + + // This decoder is called from multiple locations that do not check that + // the full encoding is valid before calling it. + if (fieldFromInstruction(Insn, 5, 1) != 0 || + fieldFromInstruction(Insn, 16, 1) != 0 || + fieldFromInstruction(Insn, 20, 8) != 0x10) + return MCDisassembler::Fail; + + // imod == '01' --> UNPREDICTABLE + // NOTE: Even though this is technically UNPREDICTABLE, we choose to + // return failure here. The '01' imod value is unprintable, so there's + // nothing useful we could do even if we returned UNPREDICTABLE. + + if (imod == 1) return MCDisassembler::Fail; + + if (imod && M) { + Inst.setOpcode(ARM::CPS3p); + Inst.addOperand(MCOperand::createImm(imod)); + Inst.addOperand(MCOperand::createImm(iflags)); + Inst.addOperand(MCOperand::createImm(mode)); + } else if (imod && !M) { + Inst.setOpcode(ARM::CPS2p); + Inst.addOperand(MCOperand::createImm(imod)); + Inst.addOperand(MCOperand::createImm(iflags)); + if (mode) S = MCDisassembler::SoftFail; + } else if (!imod && M) { + Inst.setOpcode(ARM::CPS1p); + Inst.addOperand(MCOperand::createImm(mode)); + if (iflags) S = MCDisassembler::SoftFail; + } else { + // imod == '00' && M == '0' --> UNPREDICTABLE + Inst.setOpcode(ARM::CPS1p); + Inst.addOperand(MCOperand::createImm(mode)); + S = MCDisassembler::SoftFail; + } + + return S; +} + +static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + unsigned imod = fieldFromInstruction(Insn, 9, 2); + unsigned M = fieldFromInstruction(Insn, 8, 1); + unsigned iflags = fieldFromInstruction(Insn, 5, 3); + unsigned mode = fieldFromInstruction(Insn, 0, 5); + + DecodeStatus S = MCDisassembler::Success; + + // imod == '01' --> UNPREDICTABLE + // NOTE: Even though this is technically UNPREDICTABLE, we choose to + // return failure here. 
The '01' imod value is unprintable, so there's + // nothing useful we could do even if we returned UNPREDICTABLE. + + if (imod == 1) return MCDisassembler::Fail; + + if (imod && M) { + Inst.setOpcode(ARM::t2CPS3p); + Inst.addOperand(MCOperand::createImm(imod)); + Inst.addOperand(MCOperand::createImm(iflags)); + Inst.addOperand(MCOperand::createImm(mode)); + } else if (imod && !M) { + Inst.setOpcode(ARM::t2CPS2p); + Inst.addOperand(MCOperand::createImm(imod)); + Inst.addOperand(MCOperand::createImm(iflags)); + if (mode) S = MCDisassembler::SoftFail; + } else if (!imod && M) { + Inst.setOpcode(ARM::t2CPS1p); + Inst.addOperand(MCOperand::createImm(mode)); + if (iflags) S = MCDisassembler::SoftFail; + } else { + // imod == '00' && M == '0' --> this is a HINT instruction + int imm = fieldFromInstruction(Insn, 0, 8); + // HINTs are defined only for immediates in [0..4] + if (imm > 4) return MCDisassembler::Fail; + Inst.setOpcode(ARM::t2HINT); + Inst.addOperand(MCOperand::createImm(imm)); + } + + return S; +} + +static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 8, 4); + unsigned imm = 0; + + imm |= (fieldFromInstruction(Insn, 0, 8) << 0); + imm |= (fieldFromInstruction(Insn, 12, 3) << 8); + imm |= (fieldFromInstruction(Insn, 16, 4) << 12); + imm |= (fieldFromInstruction(Insn, 26, 1) << 11); + + if (Inst.getOpcode() == ARM::t2MOVTi16) + if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + + if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::createImm(imm)); + + return S; +} + +static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 12, 4); + unsigned pred = fieldFromInstruction(Insn, 28, 4); + unsigned imm = 0; + + imm |= (fieldFromInstruction(Insn, 0, 12) << 0); + imm |= (fieldFromInstruction(Insn, 16, 4) << 12); + + if (Inst.getOpcode() == ARM::MOVTi16) + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + + if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::createImm(imm)); + + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 16, 4); + unsigned Rn = fieldFromInstruction(Insn, 0, 4); + unsigned Rm = fieldFromInstruction(Insn, 8, 4); + unsigned Ra = fieldFromInstruction(Insn, 12, 4); + unsigned pred = fieldFromInstruction(Insn, 28, 4); + + if (pred == 0xF) + return DecodeCPSInstruction(Inst, Insn, Address, Decoder); + + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + if 
(!Check(S, DecodeGPRnopcRegisterClass(Inst, Ra, Address, Decoder))) + return MCDisassembler::Fail; + + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Pred = fieldFromInstruction(Insn, 28, 4); + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + + if (Pred == 0xF) + return DecodeSETPANInstruction(Inst, Insn, Address, Decoder); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, Pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Imm = fieldFromInstruction(Insn, 9, 1); + + const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); + const FeatureBitset &FeatureBits = Dis->getSubtargetInfo().getFeatureBits(); + + if (!FeatureBits[ARM::HasV8_1aOps] || + !FeatureBits[ARM::HasV8Ops]) + return MCDisassembler::Fail; + + // Decoder can be called from DecodeTST, which does not check the full + // encoding is valid. + if (fieldFromInstruction(Insn, 20,12) != 0xf11 || + fieldFromInstruction(Insn, 4,4) != 0) + return MCDisassembler::Fail; + if (fieldFromInstruction(Insn, 10,10) != 0 || + fieldFromInstruction(Insn, 0,4) != 0) + S = MCDisassembler::SoftFail; + + Inst.setOpcode(ARM::SETPAN); + Inst.addOperand(MCOperand::createImm(Imm)); + + return S; +} + +static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned add = fieldFromInstruction(Val, 12, 1); + unsigned imm = fieldFromInstruction(Val, 0, 12); + unsigned Rn = fieldFromInstruction(Val, 13, 4); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + if (!add) imm *= -1; + if (imm == 0 && !add) imm = INT32_MIN; + Inst.addOperand(MCOperand::createImm(imm)); + if (Rn == 15) + tryAddingPcLoadReferenceComment(Address, Address + imm + 8, Decoder); + + return S; +} + +static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 9, 4); + unsigned U = fieldFromInstruction(Val, 8, 1); + unsigned imm = fieldFromInstruction(Val, 0, 8); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + if (U) + Inst.addOperand(MCOperand::createImm(ARM_AM::getAM5Opc(ARM_AM::add, imm))); + else + Inst.addOperand(MCOperand::createImm(ARM_AM::getAM5Opc(ARM_AM::sub, imm))); + + return S; +} + +static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + return DecodeGPRRegisterClass(Inst, Val, Address, Decoder); +} + +static DecodeStatus +DecodeT2BInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus Status = MCDisassembler::Success; + + // Note the J1 and J2 values are from the encoded instruction. 
So here + // we convert them to I1 and I2 values as documented: + // I1 = NOT(J1 EOR S); + // I2 = NOT(J2 EOR S); + // and build the imm32 with one trailing zero as documented: + // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32); + unsigned S = fieldFromInstruction(Insn, 26, 1); + unsigned J1 = fieldFromInstruction(Insn, 13, 1); + unsigned J2 = fieldFromInstruction(Insn, 11, 1); + unsigned I1 = !(J1 ^ S); + unsigned I2 = !(J2 ^ S); + unsigned imm10 = fieldFromInstruction(Insn, 16, 10); + unsigned imm11 = fieldFromInstruction(Insn, 0, 11); + unsigned tmp = (S << 23) | (I1 << 22) | (I2 << 21) | (imm10 << 11) | imm11; + int imm32 = SignExtend32<25>(tmp << 1); + if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4, + true, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::createImm(imm32)); + + return Status; +} + +static DecodeStatus +DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned pred = fieldFromInstruction(Insn, 28, 4); + unsigned imm = fieldFromInstruction(Insn, 0, 24) << 2; + + if (pred == 0xF) { + Inst.setOpcode(ARM::BLXi); + imm |= fieldFromInstruction(Insn, 24, 1) << 1; + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8, + true, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::createImm(SignExtend32<26>(imm))); + return S; + } + + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8, + true, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::createImm(SignExtend32<26>(imm))); + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + + +static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rm = fieldFromInstruction(Val, 0, 4); + unsigned align = fieldFromInstruction(Val, 4, 2); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + if (!align) + Inst.addOperand(MCOperand::createImm(0)); + else + Inst.addOperand(MCOperand::createImm(4 << align)); + + return S; +} + +static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 12, 4); + Rd |= fieldFromInstruction(Insn, 22, 1) << 4; + unsigned wb = fieldFromInstruction(Insn, 16, 4); + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + Rn |= fieldFromInstruction(Insn, 4, 2) << 4; + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + + // First output register + switch (Inst.getOpcode()) { + case ARM::VLD1q16: case ARM::VLD1q32: case ARM::VLD1q64: case ARM::VLD1q8: + case ARM::VLD1q16wb_fixed: case ARM::VLD1q16wb_register: + case ARM::VLD1q32wb_fixed: case ARM::VLD1q32wb_register: + case ARM::VLD1q64wb_fixed: case ARM::VLD1q64wb_register: + case ARM::VLD1q8wb_fixed: case ARM::VLD1q8wb_register: + case ARM::VLD2d16: case ARM::VLD2d32: case ARM::VLD2d8: + case ARM::VLD2d16wb_fixed: case ARM::VLD2d16wb_register: + case ARM::VLD2d32wb_fixed: case ARM::VLD2d32wb_register: + case ARM::VLD2d8wb_fixed: case ARM::VLD2d8wb_register: + if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VLD2b16: + case ARM::VLD2b32: + case ARM::VLD2b8: + case ARM::VLD2b16wb_fixed: + case ARM::VLD2b16wb_register: + case ARM::VLD2b32wb_fixed: + case ARM::VLD2b32wb_register: + case 
ARM::VLD2b8wb_fixed: + case ARM::VLD2b8wb_register: + if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + } + + // Second output register + switch (Inst.getOpcode()) { + case ARM::VLD3d8: + case ARM::VLD3d16: + case ARM::VLD3d32: + case ARM::VLD3d8_UPD: + case ARM::VLD3d16_UPD: + case ARM::VLD3d32_UPD: + case ARM::VLD4d8: + case ARM::VLD4d16: + case ARM::VLD4d32: + case ARM::VLD4d8_UPD: + case ARM::VLD4d16_UPD: + case ARM::VLD4d32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VLD3q8: + case ARM::VLD3q16: + case ARM::VLD3q32: + case ARM::VLD3q8_UPD: + case ARM::VLD3q16_UPD: + case ARM::VLD3q32_UPD: + case ARM::VLD4q8: + case ARM::VLD4q16: + case ARM::VLD4q32: + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder))) + return MCDisassembler::Fail; + default: + break; + } + + // Third output register + switch(Inst.getOpcode()) { + case ARM::VLD3d8: + case ARM::VLD3d16: + case ARM::VLD3d32: + case ARM::VLD3d8_UPD: + case ARM::VLD3d16_UPD: + case ARM::VLD3d32_UPD: + case ARM::VLD4d8: + case ARM::VLD4d16: + case ARM::VLD4d32: + case ARM::VLD4d8_UPD: + case ARM::VLD4d16_UPD: + case ARM::VLD4d32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VLD3q8: + case ARM::VLD3q16: + case ARM::VLD3q32: + case ARM::VLD3q8_UPD: + case ARM::VLD3q16_UPD: + case ARM::VLD3q32_UPD: + case ARM::VLD4q8: + case ARM::VLD4q16: + case ARM::VLD4q32: + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + // Fourth output register + switch (Inst.getOpcode()) { + case ARM::VLD4d8: + case ARM::VLD4d16: + case ARM::VLD4d32: + case ARM::VLD4d8_UPD: + case ARM::VLD4d16_UPD: + case ARM::VLD4d32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VLD4q8: + case ARM::VLD4q16: + case ARM::VLD4q32: + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + // Writeback operand + switch (Inst.getOpcode()) { + case ARM::VLD1d8wb_fixed: + case ARM::VLD1d16wb_fixed: + case ARM::VLD1d32wb_fixed: + case ARM::VLD1d64wb_fixed: + case ARM::VLD1d8wb_register: + case ARM::VLD1d16wb_register: + case ARM::VLD1d32wb_register: + case ARM::VLD1d64wb_register: + case ARM::VLD1q8wb_fixed: + case ARM::VLD1q16wb_fixed: + case ARM::VLD1q32wb_fixed: + case ARM::VLD1q64wb_fixed: + case ARM::VLD1q8wb_register: + case ARM::VLD1q16wb_register: + case ARM::VLD1q32wb_register: + case ARM::VLD1q64wb_register: + case ARM::VLD1d8Twb_fixed: + case ARM::VLD1d8Twb_register: + case ARM::VLD1d16Twb_fixed: + case ARM::VLD1d16Twb_register: + case ARM::VLD1d32Twb_fixed: + case ARM::VLD1d32Twb_register: + case ARM::VLD1d64Twb_fixed: + case ARM::VLD1d64Twb_register: + case ARM::VLD1d8Qwb_fixed: + case ARM::VLD1d8Qwb_register: + case ARM::VLD1d16Qwb_fixed: + case ARM::VLD1d16Qwb_register: + case ARM::VLD1d32Qwb_fixed: + case 
ARM::VLD1d32Qwb_register: + case ARM::VLD1d64Qwb_fixed: + case ARM::VLD1d64Qwb_register: + case ARM::VLD2d8wb_fixed: + case ARM::VLD2d16wb_fixed: + case ARM::VLD2d32wb_fixed: + case ARM::VLD2q8wb_fixed: + case ARM::VLD2q16wb_fixed: + case ARM::VLD2q32wb_fixed: + case ARM::VLD2d8wb_register: + case ARM::VLD2d16wb_register: + case ARM::VLD2d32wb_register: + case ARM::VLD2q8wb_register: + case ARM::VLD2q16wb_register: + case ARM::VLD2q32wb_register: + case ARM::VLD2b8wb_fixed: + case ARM::VLD2b16wb_fixed: + case ARM::VLD2b32wb_fixed: + case ARM::VLD2b8wb_register: + case ARM::VLD2b16wb_register: + case ARM::VLD2b32wb_register: + Inst.addOperand(MCOperand::createImm(0)); + break; + case ARM::VLD3d8_UPD: + case ARM::VLD3d16_UPD: + case ARM::VLD3d32_UPD: + case ARM::VLD3q8_UPD: + case ARM::VLD3q16_UPD: + case ARM::VLD3q32_UPD: + case ARM::VLD4d8_UPD: + case ARM::VLD4d16_UPD: + case ARM::VLD4d32_UPD: + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + // AddrMode6 Base (register+alignment) + if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + // AddrMode6 Offset (register) + switch (Inst.getOpcode()) { + default: + // The below have been updated to have explicit am6offset split + // between fixed and register offset. For those instructions not + // yet updated, we need to add an additional reg0 operand for the + // fixed variant. + // + // The fixed offset encodes as Rm == 0xd, so we check for that. + if (Rm == 0xd) { + Inst.addOperand(MCOperand::createReg(0)); + break; + } + // Fall through to handle the register offset variant. + case ARM::VLD1d8wb_fixed: + case ARM::VLD1d16wb_fixed: + case ARM::VLD1d32wb_fixed: + case ARM::VLD1d64wb_fixed: + case ARM::VLD1d8Twb_fixed: + case ARM::VLD1d16Twb_fixed: + case ARM::VLD1d32Twb_fixed: + case ARM::VLD1d64Twb_fixed: + case ARM::VLD1d8Qwb_fixed: + case ARM::VLD1d16Qwb_fixed: + case ARM::VLD1d32Qwb_fixed: + case ARM::VLD1d64Qwb_fixed: + case ARM::VLD1d8wb_register: + case ARM::VLD1d16wb_register: + case ARM::VLD1d32wb_register: + case ARM::VLD1d64wb_register: + case ARM::VLD1q8wb_fixed: + case ARM::VLD1q16wb_fixed: + case ARM::VLD1q32wb_fixed: + case ARM::VLD1q64wb_fixed: + case ARM::VLD1q8wb_register: + case ARM::VLD1q16wb_register: + case ARM::VLD1q32wb_register: + case ARM::VLD1q64wb_register: + // The fixed offset post-increment encodes Rm == 0xd. The no-writeback + // variant encodes Rm == 0xf. Anything else is a register offset post- + // increment and we need to add the register operand to the instruction. 
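+  // For example (illustrative assembly for the three Rm cases):
+  //   vld1.8 {d0}, [r2]      encodes Rm == 0xF (no writeback)
+  //   vld1.8 {d0}, [r2]!     encodes Rm == 0xD (fixed post-increment)
+  //   vld1.8 {d0}, [r2], r3  encodes Rm == r3  (register post-increment)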
+ if (Rm != 0xD && Rm != 0xF && + !Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VLD2d8wb_fixed: + case ARM::VLD2d16wb_fixed: + case ARM::VLD2d32wb_fixed: + case ARM::VLD2b8wb_fixed: + case ARM::VLD2b16wb_fixed: + case ARM::VLD2b32wb_fixed: + case ARM::VLD2q8wb_fixed: + case ARM::VLD2q16wb_fixed: + case ARM::VLD2q32wb_fixed: + break; + } + + return S; +} + +static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + unsigned type = fieldFromInstruction(Insn, 8, 4); + unsigned align = fieldFromInstruction(Insn, 4, 2); + if (type == 6 && (align & 2)) return MCDisassembler::Fail; + if (type == 7 && (align & 2)) return MCDisassembler::Fail; + if (type == 10 && align == 3) return MCDisassembler::Fail; + + unsigned load = fieldFromInstruction(Insn, 21, 1); + return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder) + : DecodeVSTInstruction(Inst, Insn, Address, Decoder); +} + +static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + unsigned size = fieldFromInstruction(Insn, 6, 2); + if (size == 3) return MCDisassembler::Fail; + + unsigned type = fieldFromInstruction(Insn, 8, 4); + unsigned align = fieldFromInstruction(Insn, 4, 2); + if (type == 8 && align == 3) return MCDisassembler::Fail; + if (type == 9 && align == 3) return MCDisassembler::Fail; + + unsigned load = fieldFromInstruction(Insn, 21, 1); + return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder) + : DecodeVSTInstruction(Inst, Insn, Address, Decoder); +} + +static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + unsigned size = fieldFromInstruction(Insn, 6, 2); + if (size == 3) return MCDisassembler::Fail; + + unsigned align = fieldFromInstruction(Insn, 4, 2); + if (align & 2) return MCDisassembler::Fail; + + unsigned load = fieldFromInstruction(Insn, 21, 1); + return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder) + : DecodeVSTInstruction(Inst, Insn, Address, Decoder); +} + +static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + unsigned size = fieldFromInstruction(Insn, 6, 2); + if (size == 3) return MCDisassembler::Fail; + + unsigned load = fieldFromInstruction(Insn, 21, 1); + return load ? 
DecodeVLDInstruction(Inst, Insn, Address, Decoder) + : DecodeVSTInstruction(Inst, Insn, Address, Decoder); +} + +static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 12, 4); + Rd |= fieldFromInstruction(Insn, 22, 1) << 4; + unsigned wb = fieldFromInstruction(Insn, 16, 4); + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + Rn |= fieldFromInstruction(Insn, 4, 2) << 4; + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + + // Writeback Operand + switch (Inst.getOpcode()) { + case ARM::VST1d8wb_fixed: + case ARM::VST1d16wb_fixed: + case ARM::VST1d32wb_fixed: + case ARM::VST1d64wb_fixed: + case ARM::VST1d8wb_register: + case ARM::VST1d16wb_register: + case ARM::VST1d32wb_register: + case ARM::VST1d64wb_register: + case ARM::VST1q8wb_fixed: + case ARM::VST1q16wb_fixed: + case ARM::VST1q32wb_fixed: + case ARM::VST1q64wb_fixed: + case ARM::VST1q8wb_register: + case ARM::VST1q16wb_register: + case ARM::VST1q32wb_register: + case ARM::VST1q64wb_register: + case ARM::VST1d8Twb_fixed: + case ARM::VST1d16Twb_fixed: + case ARM::VST1d32Twb_fixed: + case ARM::VST1d64Twb_fixed: + case ARM::VST1d8Twb_register: + case ARM::VST1d16Twb_register: + case ARM::VST1d32Twb_register: + case ARM::VST1d64Twb_register: + case ARM::VST1d8Qwb_fixed: + case ARM::VST1d16Qwb_fixed: + case ARM::VST1d32Qwb_fixed: + case ARM::VST1d64Qwb_fixed: + case ARM::VST1d8Qwb_register: + case ARM::VST1d16Qwb_register: + case ARM::VST1d32Qwb_register: + case ARM::VST1d64Qwb_register: + case ARM::VST2d8wb_fixed: + case ARM::VST2d16wb_fixed: + case ARM::VST2d32wb_fixed: + case ARM::VST2d8wb_register: + case ARM::VST2d16wb_register: + case ARM::VST2d32wb_register: + case ARM::VST2q8wb_fixed: + case ARM::VST2q16wb_fixed: + case ARM::VST2q32wb_fixed: + case ARM::VST2q8wb_register: + case ARM::VST2q16wb_register: + case ARM::VST2q32wb_register: + case ARM::VST2b8wb_fixed: + case ARM::VST2b16wb_fixed: + case ARM::VST2b32wb_fixed: + case ARM::VST2b8wb_register: + case ARM::VST2b16wb_register: + case ARM::VST2b32wb_register: + if (Rm == 0xF) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(0)); + break; + case ARM::VST3d8_UPD: + case ARM::VST3d16_UPD: + case ARM::VST3d32_UPD: + case ARM::VST3q8_UPD: + case ARM::VST3q16_UPD: + case ARM::VST3q32_UPD: + case ARM::VST4d8_UPD: + case ARM::VST4d16_UPD: + case ARM::VST4d32_UPD: + case ARM::VST4q8_UPD: + case ARM::VST4q16_UPD: + case ARM::VST4q32_UPD: + if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + // AddrMode6 Base (register+alignment) + if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + // AddrMode6 Offset (register) + switch (Inst.getOpcode()) { + default: + if (Rm == 0xD) + Inst.addOperand(MCOperand::createReg(0)); + else if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } + break; + case ARM::VST1d8wb_fixed: + case ARM::VST1d16wb_fixed: + case ARM::VST1d32wb_fixed: + case ARM::VST1d64wb_fixed: + case ARM::VST1q8wb_fixed: + case ARM::VST1q16wb_fixed: + case ARM::VST1q32wb_fixed: + case ARM::VST1q64wb_fixed: + case ARM::VST1d8Twb_fixed: + case ARM::VST1d16Twb_fixed: + case ARM::VST1d32Twb_fixed: + case ARM::VST1d64Twb_fixed: + case ARM::VST1d8Qwb_fixed: + case ARM::VST1d16Qwb_fixed: + case ARM::VST1d32Qwb_fixed: + case 
ARM::VST1d64Qwb_fixed: + case ARM::VST2d8wb_fixed: + case ARM::VST2d16wb_fixed: + case ARM::VST2d32wb_fixed: + case ARM::VST2q8wb_fixed: + case ARM::VST2q16wb_fixed: + case ARM::VST2q32wb_fixed: + case ARM::VST2b8wb_fixed: + case ARM::VST2b16wb_fixed: + case ARM::VST2b32wb_fixed: + break; + } + + + // First input register + switch (Inst.getOpcode()) { + case ARM::VST1q16: + case ARM::VST1q32: + case ARM::VST1q64: + case ARM::VST1q8: + case ARM::VST1q16wb_fixed: + case ARM::VST1q16wb_register: + case ARM::VST1q32wb_fixed: + case ARM::VST1q32wb_register: + case ARM::VST1q64wb_fixed: + case ARM::VST1q64wb_register: + case ARM::VST1q8wb_fixed: + case ARM::VST1q8wb_register: + case ARM::VST2d16: + case ARM::VST2d32: + case ARM::VST2d8: + case ARM::VST2d16wb_fixed: + case ARM::VST2d16wb_register: + case ARM::VST2d32wb_fixed: + case ARM::VST2d32wb_register: + case ARM::VST2d8wb_fixed: + case ARM::VST2d8wb_register: + if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VST2b16: + case ARM::VST2b32: + case ARM::VST2b8: + case ARM::VST2b16wb_fixed: + case ARM::VST2b16wb_register: + case ARM::VST2b32wb_fixed: + case ARM::VST2b32wb_register: + case ARM::VST2b8wb_fixed: + case ARM::VST2b8wb_register: + if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + } + + // Second input register + switch (Inst.getOpcode()) { + case ARM::VST3d8: + case ARM::VST3d16: + case ARM::VST3d32: + case ARM::VST3d8_UPD: + case ARM::VST3d16_UPD: + case ARM::VST3d32_UPD: + case ARM::VST4d8: + case ARM::VST4d16: + case ARM::VST4d32: + case ARM::VST4d8_UPD: + case ARM::VST4d16_UPD: + case ARM::VST4d32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VST3q8: + case ARM::VST3q16: + case ARM::VST3q32: + case ARM::VST3q8_UPD: + case ARM::VST3q16_UPD: + case ARM::VST3q32_UPD: + case ARM::VST4q8: + case ARM::VST4q16: + case ARM::VST4q32: + case ARM::VST4q8_UPD: + case ARM::VST4q16_UPD: + case ARM::VST4q32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + // Third input register + switch (Inst.getOpcode()) { + case ARM::VST3d8: + case ARM::VST3d16: + case ARM::VST3d32: + case ARM::VST3d8_UPD: + case ARM::VST3d16_UPD: + case ARM::VST3d32_UPD: + case ARM::VST4d8: + case ARM::VST4d16: + case ARM::VST4d32: + case ARM::VST4d8_UPD: + case ARM::VST4d16_UPD: + case ARM::VST4d32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VST3q8: + case ARM::VST3q16: + case ARM::VST3q32: + case ARM::VST3q8_UPD: + case ARM::VST3q16_UPD: + case ARM::VST3q32_UPD: + case ARM::VST4q8: + case ARM::VST4q16: + case ARM::VST4q32: + case ARM::VST4q8_UPD: + case ARM::VST4q16_UPD: + case ARM::VST4q32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + // Fourth input register + switch (Inst.getOpcode()) { + case ARM::VST4d8: + case ARM::VST4d16: + case ARM::VST4d32: + case ARM::VST4d8_UPD: + case ARM::VST4d16_UPD: + case ARM::VST4d32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + case 
ARM::VST4q8: + case ARM::VST4q16: + case ARM::VST4q32: + case ARM::VST4q8_UPD: + case ARM::VST4q16_UPD: + case ARM::VST4q32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + return S; +} + +static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 12, 4); + Rd |= fieldFromInstruction(Insn, 22, 1) << 4; + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + unsigned align = fieldFromInstruction(Insn, 4, 1); + unsigned size = fieldFromInstruction(Insn, 6, 2); + + if (size == 0 && align == 1) + return MCDisassembler::Fail; + align *= (1 << size); + + switch (Inst.getOpcode()) { + case ARM::VLD1DUPq16: case ARM::VLD1DUPq32: case ARM::VLD1DUPq8: + case ARM::VLD1DUPq16wb_fixed: case ARM::VLD1DUPq16wb_register: + case ARM::VLD1DUPq32wb_fixed: case ARM::VLD1DUPq32wb_register: + case ARM::VLD1DUPq8wb_fixed: case ARM::VLD1DUPq8wb_register: + if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + } + if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(align)); + + // The fixed offset post-increment encodes Rm == 0xd. The no-writeback + // variant encodes Rm == 0xf. Anything else is a register offset post- + // increment and we need to add the register operand to the instruction. 
+ if (Rm != 0xD && Rm != 0xF && + !Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 12, 4); + Rd |= fieldFromInstruction(Insn, 22, 1) << 4; + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + unsigned align = fieldFromInstruction(Insn, 4, 1); + unsigned size = 1 << fieldFromInstruction(Insn, 6, 2); + align *= 2*size; + + switch (Inst.getOpcode()) { + case ARM::VLD2DUPd16: case ARM::VLD2DUPd32: case ARM::VLD2DUPd8: + case ARM::VLD2DUPd16wb_fixed: case ARM::VLD2DUPd16wb_register: + case ARM::VLD2DUPd32wb_fixed: case ARM::VLD2DUPd32wb_register: + case ARM::VLD2DUPd8wb_fixed: case ARM::VLD2DUPd8wb_register: + if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VLD2DUPd16x2: case ARM::VLD2DUPd32x2: case ARM::VLD2DUPd8x2: + case ARM::VLD2DUPd16x2wb_fixed: case ARM::VLD2DUPd16x2wb_register: + case ARM::VLD2DUPd32x2wb_fixed: case ARM::VLD2DUPd32x2wb_register: + case ARM::VLD2DUPd8x2wb_fixed: case ARM::VLD2DUPd8x2wb_register: + if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + } + + if (Rm != 0xF) + Inst.addOperand(MCOperand::createImm(0)); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(align)); + + if (Rm != 0xD && Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 12, 4); + Rd |= fieldFromInstruction(Insn, 22, 1) << 4; + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + unsigned inc = fieldFromInstruction(Insn, 5, 1) + 1; + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder))) + return MCDisassembler::Fail; + if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(0)); + + if (Rm == 0xD) + Inst.addOperand(MCOperand::createReg(0)); + else if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 12, 4); + Rd |= fieldFromInstruction(Insn, 22, 1) << 4; + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + unsigned size = 
fieldFromInstruction(Insn, 6, 2); + unsigned inc = fieldFromInstruction(Insn, 5, 1) + 1; + unsigned align = fieldFromInstruction(Insn, 4, 1); + + if (size == 0x3) { + if (align == 0) + return MCDisassembler::Fail; + align = 16; + } else { + if (size == 2) { + align *= 8; + } else { + size = 1 << size; + align *= 4*size; + } + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3*inc)%32, Address, Decoder))) + return MCDisassembler::Fail; + if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(align)); + + if (Rm == 0xD) + Inst.addOperand(MCOperand::createReg(0)); + else if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus +DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 12, 4); + Rd |= fieldFromInstruction(Insn, 22, 1) << 4; + unsigned imm = fieldFromInstruction(Insn, 0, 4); + imm |= fieldFromInstruction(Insn, 16, 3) << 4; + imm |= fieldFromInstruction(Insn, 24, 1) << 7; + imm |= fieldFromInstruction(Insn, 8, 4) << 8; + imm |= fieldFromInstruction(Insn, 5, 1) << 12; + unsigned Q = fieldFromInstruction(Insn, 6, 1); + + if (Q) { + if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + } else { + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + } + + Inst.addOperand(MCOperand::createImm(imm)); + + switch (Inst.getOpcode()) { + case ARM::VORRiv4i16: + case ARM::VORRiv2i32: + case ARM::VBICiv4i16: + case ARM::VBICiv2i32: + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VORRiv8i16: + case ARM::VORRiv4i32: + case ARM::VBICiv8i16: + case ARM::VBICiv4i32: + if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + return S; +} + +static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 12, 4); + Rd |= fieldFromInstruction(Insn, 22, 1) << 4; + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + Rm |= fieldFromInstruction(Insn, 5, 1) << 4; + unsigned size = fieldFromInstruction(Insn, 18, 2); + + if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(8 << size)); + + return S; +} + +static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::createImm(8 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val, + uint64_t Address, const 
void *Decoder) { + Inst.addOperand(MCOperand::createImm(16 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::createImm(32 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::createImm(64 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 12, 4); + Rd |= fieldFromInstruction(Insn, 22, 1) << 4; + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + Rn |= fieldFromInstruction(Insn, 7, 1) << 4; + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + Rm |= fieldFromInstruction(Insn, 5, 1) << 4; + unsigned op = fieldFromInstruction(Insn, 6, 1); + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (op) { + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; // Writeback + } + + switch (Inst.getOpcode()) { + case ARM::VTBL2: + case ARM::VTBX2: + if (!Check(S, DecodeDPairRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + if (!Check(S, DecodeDPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned dst = fieldFromInstruction(Insn, 8, 3); + unsigned imm = fieldFromInstruction(Insn, 0, 8); + + if (!Check(S, DecodetGPRRegisterClass(Inst, dst, Address, Decoder))) + return MCDisassembler::Fail; + + switch(Inst.getOpcode()) { + default: + return MCDisassembler::Fail; + case ARM::tADR: + break; // tADR does not explicitly represent the PC as an operand. 
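+  // (For instance, "adr r0, label" decodes with just Rd and the immediate,
+  // while "add r0, sp, #imm" below gains an explicit SP source operand.)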
+ case ARM::tADDrSPi: + Inst.addOperand(MCOperand::createReg(ARM::SP)); + break; + } + + Inst.addOperand(MCOperand::createImm(imm)); + return S; +} + +static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<12>(Val<<1) + 4, + true, 2, Inst, Decoder)) + Inst.addOperand(MCOperand::createImm(SignExtend32<12>(Val << 1))); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<21>(Val) + 4, + true, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::createImm(SignExtend32<21>(Val))); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (!tryAddingSymbolicOperand(Address, Address + (Val<<1) + 4, + true, 2, Inst, Decoder)) + Inst.addOperand(MCOperand::createImm(Val << 1)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 0, 3); + unsigned Rm = fieldFromInstruction(Val, 3, 3); + + if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodetGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 0, 3); + unsigned imm = fieldFromInstruction(Val, 3, 5); + + if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(imm)); + + return S; +} + +static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + unsigned imm = Val << 2; + + Inst.addOperand(MCOperand::createImm(imm)); + tryAddingPcLoadReferenceComment(Address, (Address & ~2u) + imm + 4, Decoder); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::createReg(ARM::SP)); + Inst.addOperand(MCOperand::createImm(Val)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 6, 4); + unsigned Rm = fieldFromInstruction(Val, 2, 4); + unsigned imm = fieldFromInstruction(Val, 0, 2); + + // Thumb stores cannot use PC as dest register. 
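+  // (Rn is the base address register of the store here; e.g. a T32
+  // "str r0, [pc, r1]" encoding is rejected rather than decoded.)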
+ switch (Inst.getOpcode()) { + case ARM::t2STRHs: + case ARM::t2STRBs: + case ARM::t2STRs: + if (Rn == 15) + return MCDisassembler::Fail; + default: + break; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(imm)); + + return S; +} + +static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + + const FeatureBitset &featureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + + bool hasMP = featureBits[ARM::FeatureMP]; + bool hasV7Ops = featureBits[ARM::HasV7Ops]; + + if (Rn == 15) { + switch (Inst.getOpcode()) { + case ARM::t2LDRBs: + Inst.setOpcode(ARM::t2LDRBpci); + break; + case ARM::t2LDRHs: + Inst.setOpcode(ARM::t2LDRHpci); + break; + case ARM::t2LDRSHs: + Inst.setOpcode(ARM::t2LDRSHpci); + break; + case ARM::t2LDRSBs: + Inst.setOpcode(ARM::t2LDRSBpci); + break; + case ARM::t2LDRs: + Inst.setOpcode(ARM::t2LDRpci); + break; + case ARM::t2PLDs: + Inst.setOpcode(ARM::t2PLDpci); + break; + case ARM::t2PLIs: + Inst.setOpcode(ARM::t2PLIpci); + break; + default: + return MCDisassembler::Fail; + } + + return DecodeT2LoadLabel(Inst, Insn, Address, Decoder); + } + + if (Rt == 15) { + switch (Inst.getOpcode()) { + case ARM::t2LDRSHs: + return MCDisassembler::Fail; + case ARM::t2LDRHs: + Inst.setOpcode(ARM::t2PLDWs); + break; + case ARM::t2LDRSBs: + Inst.setOpcode(ARM::t2PLIs); + default: + break; + } + } + + switch (Inst.getOpcode()) { + case ARM::t2PLDs: + break; + case ARM::t2PLIs: + if (!hasV7Ops) + return MCDisassembler::Fail; + break; + case ARM::t2PLDWs: + if (!hasV7Ops || !hasMP) + return MCDisassembler::Fail; + break; + default: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + } + + unsigned addrmode = fieldFromInstruction(Insn, 4, 2); + addrmode |= fieldFromInstruction(Insn, 0, 4) << 2; + addrmode |= fieldFromInstruction(Insn, 16, 4) << 6; + if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn, + uint64_t Address, const void* Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned U = fieldFromInstruction(Insn, 9, 1); + unsigned imm = fieldFromInstruction(Insn, 0, 8); + imm |= (U << 8); + imm |= (Rn << 9); + unsigned add = fieldFromInstruction(Insn, 9, 1); + + const FeatureBitset &featureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + + bool hasMP = featureBits[ARM::FeatureMP]; + bool hasV7Ops = featureBits[ARM::HasV7Ops]; + + if (Rn == 15) { + switch (Inst.getOpcode()) { + case ARM::t2LDRi8: + Inst.setOpcode(ARM::t2LDRpci); + break; + case ARM::t2LDRBi8: + Inst.setOpcode(ARM::t2LDRBpci); + break; + case ARM::t2LDRSBi8: + Inst.setOpcode(ARM::t2LDRSBpci); + break; + case ARM::t2LDRHi8: + Inst.setOpcode(ARM::t2LDRHpci); + break; + case ARM::t2LDRSHi8: + Inst.setOpcode(ARM::t2LDRSHpci); + break; + case ARM::t2PLDi8: + Inst.setOpcode(ARM::t2PLDpci); + break; + case ARM::t2PLIi8: + Inst.setOpcode(ARM::t2PLIpci); + break; + default: + return 
MCDisassembler::Fail; + } + return DecodeT2LoadLabel(Inst, Insn, Address, Decoder); + } + + if (Rt == 15) { + switch (Inst.getOpcode()) { + case ARM::t2LDRSHi8: + return MCDisassembler::Fail; + case ARM::t2LDRHi8: + if (!add) + Inst.setOpcode(ARM::t2PLDWi8); + break; + case ARM::t2LDRSBi8: + Inst.setOpcode(ARM::t2PLIi8); + break; + default: + break; + } + } + + switch (Inst.getOpcode()) { + case ARM::t2PLDi8: + break; + case ARM::t2PLIi8: + if (!hasV7Ops) + return MCDisassembler::Fail; + break; + case ARM::t2PLDWi8: + if (!hasV7Ops || !hasMP) + return MCDisassembler::Fail; + break; + default: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + return S; +} + +static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn, + uint64_t Address, const void* Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned imm = fieldFromInstruction(Insn, 0, 12); + imm |= (Rn << 13); + + const FeatureBitset &featureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + + bool hasMP = featureBits[ARM::FeatureMP]; + bool hasV7Ops = featureBits[ARM::HasV7Ops]; + + if (Rn == 15) { + switch (Inst.getOpcode()) { + case ARM::t2LDRi12: + Inst.setOpcode(ARM::t2LDRpci); + break; + case ARM::t2LDRHi12: + Inst.setOpcode(ARM::t2LDRHpci); + break; + case ARM::t2LDRSHi12: + Inst.setOpcode(ARM::t2LDRSHpci); + break; + case ARM::t2LDRBi12: + Inst.setOpcode(ARM::t2LDRBpci); + break; + case ARM::t2LDRSBi12: + Inst.setOpcode(ARM::t2LDRSBpci); + break; + case ARM::t2PLDi12: + Inst.setOpcode(ARM::t2PLDpci); + break; + case ARM::t2PLIi12: + Inst.setOpcode(ARM::t2PLIpci); + break; + default: + return MCDisassembler::Fail; + } + return DecodeT2LoadLabel(Inst, Insn, Address, Decoder); + } + + if (Rt == 15) { + switch (Inst.getOpcode()) { + case ARM::t2LDRSHi12: + return MCDisassembler::Fail; + case ARM::t2LDRHi12: + Inst.setOpcode(ARM::t2PLDWi12); + break; + case ARM::t2LDRSBi12: + Inst.setOpcode(ARM::t2PLIi12); + break; + default: + break; + } + } + + switch (Inst.getOpcode()) { + case ARM::t2PLDi12: + break; + case ARM::t2PLIi12: + if (!hasV7Ops) + return MCDisassembler::Fail; + break; + case ARM::t2PLDWi12: + if (!hasV7Ops || !hasMP) + return MCDisassembler::Fail; + break; + default: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeT2AddrModeImm12(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + return S; +} + +static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn, + uint64_t Address, const void* Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned imm = fieldFromInstruction(Insn, 0, 8); + imm |= (Rn << 9); + + if (Rn == 15) { + switch (Inst.getOpcode()) { + case ARM::t2LDRT: + Inst.setOpcode(ARM::t2LDRpci); + break; + case ARM::t2LDRBT: + Inst.setOpcode(ARM::t2LDRBpci); + break; + case ARM::t2LDRHT: + Inst.setOpcode(ARM::t2LDRHpci); + break; + case ARM::t2LDRSBT: + Inst.setOpcode(ARM::t2LDRSBpci); + break; + case ARM::t2LDRSHT: + Inst.setOpcode(ARM::t2LDRSHpci); + break; + default: + return MCDisassembler::Fail; + } + return DecodeT2LoadLabel(Inst, Insn, Address, Decoder); + } + + if (!Check(S, 
DecoderGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + return S; +} + +static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn, + uint64_t Address, const void* Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned U = fieldFromInstruction(Insn, 23, 1); + int imm = fieldFromInstruction(Insn, 0, 12); + + const FeatureBitset &featureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + + bool hasV7Ops = featureBits[ARM::HasV7Ops]; + + if (Rt == 15) { + switch (Inst.getOpcode()) { + case ARM::t2LDRBpci: + case ARM::t2LDRHpci: + Inst.setOpcode(ARM::t2PLDpci); + break; + case ARM::t2LDRSBpci: + Inst.setOpcode(ARM::t2PLIpci); + break; + case ARM::t2LDRSHpci: + return MCDisassembler::Fail; + default: + break; + } + } + + switch(Inst.getOpcode()) { + case ARM::t2PLDpci: + break; + case ARM::t2PLIpci: + if (!hasV7Ops) + return MCDisassembler::Fail; + break; + default: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!U) { + // Special case for #-0. + if (imm == 0) + imm = INT32_MIN; + else + imm = -imm; + } + Inst.addOperand(MCOperand::createImm(imm)); + + return S; +} + +static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (Val == 0) + Inst.addOperand(MCOperand::createImm(INT32_MIN)); + else { + int imm = Val & 0xFF; + + if (!(Val & 0x100)) imm *= -1; + Inst.addOperand(MCOperand::createImm(imm * 4)); + } + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 9, 4); + unsigned imm = fieldFromInstruction(Val, 0, 9); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeT2Imm8S4(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 8, 4); + unsigned imm = fieldFromInstruction(Val, 0, 8); + + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createImm(imm)); + + return S; +} + +static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + int imm = Val & 0xFF; + if (Val == 0) + imm = INT32_MIN; + else if (!(Val & 0x100)) + imm *= -1; + Inst.addOperand(MCOperand::createImm(imm)); + + return MCDisassembler::Success; +} + + +static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 9, 4); + unsigned imm = fieldFromInstruction(Val, 0, 9); + + // Thumb stores cannot use PC as dest register. + switch (Inst.getOpcode()) { + case ARM::t2STRT: + case ARM::t2STRBT: + case ARM::t2STRHT: + case ARM::t2STRi8: + case ARM::t2STRHi8: + case ARM::t2STRBi8: + if (Rn == 15) + return MCDisassembler::Fail; + break; + default: + break; + } + + // Some instructions always use an additive offset. 
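+  // (Bit 8 of the composed 9-bit value is the add/subtract flag that
+  // DecodeT2Imm8 above inspects, so or-ing in 0x100 forces an added offset;
+  // e.g. imm == 4 becomes 0x104 and decodes as +4.)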
+ switch (Inst.getOpcode()) { + case ARM::t2LDRT: + case ARM::t2LDRBT: + case ARM::t2LDRHT: + case ARM::t2LDRSBT: + case ARM::t2LDRSHT: + case ARM::t2STRT: + case ARM::t2STRBT: + case ARM::t2STRHT: + imm |= 0x100; + break; + default: + break; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeT2Imm8(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned addr = fieldFromInstruction(Insn, 0, 8); + addr |= fieldFromInstruction(Insn, 9, 1) << 8; + addr |= Rn << 9; + unsigned load = fieldFromInstruction(Insn, 20, 1); + + if (Rn == 15) { + switch (Inst.getOpcode()) { + case ARM::t2LDR_PRE: + case ARM::t2LDR_POST: + Inst.setOpcode(ARM::t2LDRpci); + break; + case ARM::t2LDRB_PRE: + case ARM::t2LDRB_POST: + Inst.setOpcode(ARM::t2LDRBpci); + break; + case ARM::t2LDRH_PRE: + case ARM::t2LDRH_POST: + Inst.setOpcode(ARM::t2LDRHpci); + break; + case ARM::t2LDRSB_PRE: + case ARM::t2LDRSB_POST: + if (Rt == 15) + Inst.setOpcode(ARM::t2PLIpci); + else + Inst.setOpcode(ARM::t2LDRSBpci); + break; + case ARM::t2LDRSH_PRE: + case ARM::t2LDRSH_POST: + Inst.setOpcode(ARM::t2LDRSHpci); + break; + default: + return MCDisassembler::Fail; + } + return DecodeT2LoadLabel(Inst, Insn, Address, Decoder); + } + + if (!load) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + + if (load) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeT2AddrModeImm8(Inst, addr, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 13, 4); + unsigned imm = fieldFromInstruction(Val, 0, 12); + + // Thumb stores cannot use PC as dest register. 
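+  // (Val packs imm12 in bits [11:0] and Rn in bits [16:13], mirroring the
+  // "imm |= (Rn << 13)" composition in DecodeT2LoadImm12; e.g. Rn == 2 with
+  // imm12 == 8 arrives here as Val == 0x4008.)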
+ switch (Inst.getOpcode()) { + case ARM::t2STRi12: + case ARM::t2STRBi12: + case ARM::t2STRHi12: + if (Rn == 15) + return MCDisassembler::Fail; + default: + break; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(imm)); + + return S; +} + + +static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder) { + unsigned imm = fieldFromInstruction(Insn, 0, 7); + + Inst.addOperand(MCOperand::createReg(ARM::SP)); + Inst.addOperand(MCOperand::createReg(ARM::SP)); + Inst.addOperand(MCOperand::createImm(imm)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (Inst.getOpcode() == ARM::tADDrSP) { + unsigned Rdm = fieldFromInstruction(Insn, 0, 3); + Rdm |= fieldFromInstruction(Insn, 7, 1) << 3; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(ARM::SP)); + if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder))) + return MCDisassembler::Fail; + } else if (Inst.getOpcode() == ARM::tADDspr) { + unsigned Rm = fieldFromInstruction(Insn, 3, 4); + + Inst.addOperand(MCOperand::createReg(ARM::SP)); + Inst.addOperand(MCOperand::createReg(ARM::SP)); + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder) { + unsigned imod = fieldFromInstruction(Insn, 4, 1) | 0x2; + unsigned flags = fieldFromInstruction(Insn, 0, 3); + + Inst.addOperand(MCOperand::createImm(imod)); + Inst.addOperand(MCOperand::createImm(flags)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + unsigned add = fieldFromInstruction(Insn, 4, 1); + + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(add)); + + return S; +} + +static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + // Val is passed in as S:J1:J2:imm10H:imm10L:'0' + // Note only one trailing zero not two. Also the J1 and J2 values are from + // the encoded instruction. 
So here change to I1 and I2 values via: + // I1 = NOT(J1 EOR S); + // I2 = NOT(J2 EOR S); + // and build the imm32 with two trailing zeros as documented: + // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:'00', 32); + unsigned S = (Val >> 23) & 1; + unsigned J1 = (Val >> 22) & 1; + unsigned J2 = (Val >> 21) & 1; + unsigned I1 = !(J1 ^ S); + unsigned I2 = !(J2 ^ S); + unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21); + int imm32 = SignExtend32<25>(tmp << 1); + + if (!tryAddingSymbolicOperand(Address, + (Address & ~2u) + imm32 + 4, + true, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::createImm(imm32)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (Val == 0xA || Val == 0xB) + return MCDisassembler::Fail; + + const FeatureBitset &featureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + + if (featureBits[ARM::HasV8Ops] && !(Val == 14 || Val == 15)) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus +DecodeThumbTableBranch(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + + if (Rn == ARM::SP) S = MCDisassembler::SoftFail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + return S; +} + +static DecodeStatus +DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned pred = fieldFromInstruction(Insn, 22, 4); + if (pred == 0xE || pred == 0xF) { + unsigned opc = fieldFromInstruction(Insn, 4, 28); + switch (opc) { + default: + return MCDisassembler::Fail; + case 0xf3bf8f4: + Inst.setOpcode(ARM::t2DSB); + break; + case 0xf3bf8f5: + Inst.setOpcode(ARM::t2DMB); + break; + case 0xf3bf8f6: + Inst.setOpcode(ARM::t2ISB); + break; + } + + unsigned imm = fieldFromInstruction(Insn, 0, 4); + return DecodeMemBarrierOption(Inst, imm, Address, Decoder); + } + + unsigned brtarget = fieldFromInstruction(Insn, 0, 11) << 1; + brtarget |= fieldFromInstruction(Insn, 11, 1) << 19; + brtarget |= fieldFromInstruction(Insn, 13, 1) << 18; + brtarget |= fieldFromInstruction(Insn, 16, 6) << 12; + brtarget |= fieldFromInstruction(Insn, 26, 1) << 20; + + if (!Check(S, DecodeT2BROperand(Inst, brtarget, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +// Decode a shifted immediate operand. These basically consist +// of an 8-bit value, and a 4-bit directive that specifies either +// a splat operation or a rotation. 
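+// Two worked examples of the encoding below: Val == 0x1FF has ctrl == 0,
+// byte == 1 and imm == 0xFF, producing 0x00FF00FF, while Val == 0x42B has
+// ctrl != 0 with rot == 8 and unrot == 0x80 | 0x2B == 0xAB, producing
+// 0xAB000000.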
+static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + unsigned ctrl = fieldFromInstruction(Val, 10, 2); + if (ctrl == 0) { + unsigned byte = fieldFromInstruction(Val, 8, 2); + unsigned imm = fieldFromInstruction(Val, 0, 8); + switch (byte) { + case 0: + Inst.addOperand(MCOperand::createImm(imm)); + break; + case 1: + Inst.addOperand(MCOperand::createImm((imm << 16) | imm)); + break; + case 2: + Inst.addOperand(MCOperand::createImm((imm << 24) | (imm << 8))); + break; + case 3: + Inst.addOperand(MCOperand::createImm((imm << 24) | (imm << 16) | + (imm << 8) | imm)); + break; + } + } else { + unsigned unrot = fieldFromInstruction(Val, 0, 7) | 0x80; + unsigned rot = fieldFromInstruction(Val, 7, 5); + unsigned imm = (unrot >> rot) | (unrot << ((32-rot)&31)); + Inst.addOperand(MCOperand::createImm(imm)); + } + + return MCDisassembler::Success; +} + +static DecodeStatus +DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder){ + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<9>(Val<<1) + 4, + true, 2, Inst, Decoder)) + Inst.addOperand(MCOperand::createImm(SignExtend32<9>(Val << 1))); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder){ + // Val is passed in as S:J1:J2:imm10:imm11 + // Note no trailing zero after imm11. Also the J1 and J2 values are from + // the encoded instruction. So here change to I1 and I2 values via: + // I1 = NOT(J1 EOR S); + // I2 = NOT(J2 EOR S); + // and build the imm32 with one trailing zero as documented: + // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32); + unsigned S = (Val >> 23) & 1; + unsigned J1 = (Val >> 22) & 1; + unsigned J2 = (Val >> 21) & 1; + unsigned I1 = !(J1 ^ S); + unsigned I2 = !(J2 ^ S); + unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21); + int imm32 = SignExtend32<25>(tmp << 1); + + if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4, + true, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::createImm(imm32)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (Val & ~0xf) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (Val & ~0xf) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::createImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + const FeatureBitset &FeatureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + + if (FeatureBits[ARM::FeatureMClass]) { + unsigned ValLow = Val & 0xff; + + // Validate the SYSm value first. + switch (ValLow) { + case 0: // apsr + case 1: // iapsr + case 2: // eapsr + case 3: // xpsr + case 5: // ipsr + case 6: // epsr + case 7: // iepsr + case 8: // msp + case 9: // psp + case 16: // primask + case 20: // control + break; + case 17: // basepri + case 18: // basepri_max + case 19: // faultmask + if (!(FeatureBits[ARM::HasV7Ops])) + // Values basepri, basepri_max and faultmask are only valid for v7m. 
+ return MCDisassembler::Fail; + break; + default: + return MCDisassembler::Fail; + } + + if (Inst.getOpcode() == ARM::t2MSR_M) { + unsigned Mask = fieldFromInstruction(Val, 10, 2); + if (!(FeatureBits[ARM::HasV7Ops])) { + // The ARMv6-M MSR bits {11-10} can be only 0b10, other values are + // unpredictable. + if (Mask != 2) + S = MCDisassembler::SoftFail; + } + else { + // The ARMv7-M architecture stores an additional 2-bit mask value in + // MSR bits {11-10}. The mask is used only with apsr, iapsr, eapsr and + // xpsr, it has to be 0b10 in other cases. Bit mask{1} indicates if + // the NZCVQ bits should be moved by the instruction. Bit mask{0} + // indicates the move for the GE{3:0} bits, the mask{0} bit can be set + // only if the processor includes the DSP extension. + if (Mask == 0 || (Mask != 2 && ValLow > 3) || + (!(FeatureBits[ARM::FeatureDSP]) && (Mask & 1))) + S = MCDisassembler::SoftFail; + } + } + } else { + // A/R class + if (Val == 0) + return MCDisassembler::Fail; + } + Inst.addOperand(MCOperand::createImm(Val)); + return S; +} + +static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + + unsigned R = fieldFromInstruction(Val, 5, 1); + unsigned SysM = fieldFromInstruction(Val, 0, 5); + + // The table of encodings for these banked registers comes from B9.2.3 of the + // ARM ARM. There are patterns, but nothing regular enough to make this logic + // neater. So by fiat, these values are UNPREDICTABLE: + if (!R) { + if (SysM == 0x7 || SysM == 0xf || SysM == 0x18 || SysM == 0x19 || + SysM == 0x1a || SysM == 0x1b) + return MCDisassembler::SoftFail; + } else { + if (SysM != 0xe && SysM != 0x10 && SysM != 0x12 && SysM != 0x14 && + SysM != 0x16 && SysM != 0x1c && SysM != 0x1e) + return MCDisassembler::SoftFail; + } + + Inst.addOperand(MCOperand::createImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned pred = fieldFromInstruction(Insn, 28, 4); + + if (Rn == 0xF) + S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeGPRPairRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder){ + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction(Insn, 12, 4); + unsigned Rt = fieldFromInstruction(Insn, 0, 4); + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned pred = fieldFromInstruction(Insn, 28, 4); + + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + + if (Rn == 0xF || Rd == Rn || Rd == Rt || Rd == Rt+1) + S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeGPRPairRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn, + uint64_t Address, 
const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned imm = fieldFromInstruction(Insn, 0, 12); + imm |= fieldFromInstruction(Insn, 16, 4) << 13; + imm |= fieldFromInstruction(Insn, 23, 1) << 12; + unsigned pred = fieldFromInstruction(Insn, 28, 4); + + if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned imm = fieldFromInstruction(Insn, 0, 12); + imm |= fieldFromInstruction(Insn, 16, 4) << 13; + imm |= fieldFromInstruction(Insn, 23, 1) << 12; + unsigned pred = fieldFromInstruction(Insn, 28, 4); + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + + if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail; + if (Rm == 0xF) S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + + +static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned imm = fieldFromInstruction(Insn, 0, 12); + imm |= fieldFromInstruction(Insn, 16, 4) << 13; + imm |= fieldFromInstruction(Insn, 23, 1) << 12; + unsigned pred = fieldFromInstruction(Insn, 28, 4); + + if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rt = fieldFromInstruction(Insn, 12, 4); + unsigned imm = fieldFromInstruction(Insn, 0, 12); + imm |= fieldFromInstruction(Insn, 16, 4) << 13; + imm |= fieldFromInstruction(Insn, 23, 1) << 12; + unsigned pred = fieldFromInstruction(Insn, 28, 4); + + if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) 
+static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  switch (size) {
+  default:
+    return MCDisassembler::Fail;
+  case 0:
+    if (fieldFromInstruction(Insn, 4, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 5, 3);
+    break;
+  case 1:
+    if (fieldFromInstruction(Insn, 5, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 4, 1))
+      align = 2;
+    break;
+  case 2:
+    if (fieldFromInstruction(Insn, 6, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 7, 1);
+
+    switch (fieldFromInstruction(Insn, 4, 2)) {
+    case 0:
+      align = 0; break;
+    case 3:
+      align = 4; break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  switch (size) {
+  default:
+    return MCDisassembler::Fail;
+  case 0:
+    if (fieldFromInstruction(Insn, 4, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 5, 3);
+    break;
+  case 1:
+    if (fieldFromInstruction(Insn, 5, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 4, 1))
+      align = 2;
+    break;
+  case 2:
+    if (fieldFromInstruction(Insn, 6, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 7, 1);
+
+    switch (fieldFromInstruction(Insn, 4, 2)) {
+    case 0:
+      align = 0; break;
+    case 3:
+      align = 4; break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+  default:
+    return MCDisassembler::Fail;
+  case 0:
+    index = fieldFromInstruction(Insn, 5, 3);
+    if (fieldFromInstruction(Insn, 4, 1))
+      align = 2;
+    break;
+  case 1:
+    index = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 4, 1))
+      align = 4;
+    if (fieldFromInstruction(Insn, 5, 1))
+      inc = 2;
+    break;
+  case 2:
+    if (fieldFromInstruction(Insn, 5, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 7, 1);
+    if (fieldFromInstruction(Insn, 4, 1) != 0)
+      align = 8;
+    if (fieldFromInstruction(Insn, 6, 1))
+      inc = 2;
+    break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+  default:
+    return MCDisassembler::Fail;
+  case 0:
+    index = fieldFromInstruction(Insn, 5, 3);
+    if (fieldFromInstruction(Insn, 4, 1))
+      align = 2;
+    break;
+  case 1:
+    index = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 4, 1))
+      align = 4;
+    if (fieldFromInstruction(Insn, 5, 1))
+      inc = 2;
+    break;
+  case 2:
+    if (fieldFromInstruction(Insn, 5, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 7, 1);
+    if (fieldFromInstruction(Insn, 4, 1) != 0)
+      align = 8;
+    if (fieldFromInstruction(Insn, 6, 1))
+      inc = 2;
+    break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
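The VLDn/VSTn lane decoders differ only in how they carve the lane index, alignment hint, and D-register stride out of the size field. A standalone restatement of the VLD2LN/VST2LN rules above (field layout taken from the code; illustrative only, not an LLVM API):

  #include <cstdint>

  struct LaneInfo { unsigned index, align, inc; bool valid; };

  static LaneInfo decodeVLD2LaneFields(uint32_t Insn) {
    auto field = [&](unsigned lo, unsigned n) {
      return (Insn >> lo) & ((1u << n) - 1u);
    };
    LaneInfo L = {0, 0, 1, true};
    switch (field(10, 2)) {        // element size
    case 0:                        // 8-bit elements
      L.index = field(5, 3);
      if (field(4, 1)) L.align = 2;
      break;
    case 1:                        // 16-bit elements
      L.index = field(6, 2);
      if (field(4, 1)) L.align = 4;
      if (field(5, 1)) L.inc = 2;  // use every other D register
      break;
    case 2:                        // 32-bit elements
      if (field(5, 1)) { L.valid = false; break; } // UNDEFINED
      L.index = field(7, 1);
      if (field(4, 1)) L.align = 8;
      if (field(6, 1)) L.inc = 2;
      break;
    default:
      L.valid = false;             // size == 3 is UNDEFINED here
    }
    return L;
  }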
+
+
+static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+  default:
+    return MCDisassembler::Fail;
+  case 0:
+    if (fieldFromInstruction(Insn, 4, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 5, 3);
+    break;
+  case 1:
+    if (fieldFromInstruction(Insn, 4, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 5, 1))
+      inc = 2;
+    break;
+  case 2:
+    if (fieldFromInstruction(Insn, 4, 2))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 7, 1);
+    if (fieldFromInstruction(Insn, 6, 1))
+      inc = 2;
+    break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+  default:
+    return MCDisassembler::Fail;
+  case 0:
+    if (fieldFromInstruction(Insn, 4, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 5, 3);
+    break;
+  case 1:
+    if (fieldFromInstruction(Insn, 4, 1))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 5, 1))
+      inc = 2;
+    break;
+  case 2:
+    if (fieldFromInstruction(Insn, 4, 2))
+      return MCDisassembler::Fail; // UNDEFINED
+    index = fieldFromInstruction(Insn, 7, 1);
+    if (fieldFromInstruction(Insn, 6, 1))
+      inc = 2;
+    break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+  default:
+    return MCDisassembler::Fail;
+  case 0:
+    if (fieldFromInstruction(Insn, 4, 1))
+      align = 4;
+    index = fieldFromInstruction(Insn, 5, 3);
+    break;
+  case 1:
+    if (fieldFromInstruction(Insn, 4, 1))
+      align = 8;
+    index = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 5, 1))
+      inc = 2;
+    break;
+  case 2:
+    switch (fieldFromInstruction(Insn, 4, 2)) {
+    case 0:
+      align = 0; break;
+    case 3:
+      return MCDisassembler::Fail;
+    default:
+      align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+    }
+
+    index = fieldFromInstruction(Insn, 7, 1);
+    if (fieldFromInstruction(Insn, 6, 1))
+      inc = 2;
+    break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+  default:
+    return MCDisassembler::Fail;
+  case 0:
+    if (fieldFromInstruction(Insn, 4, 1))
+      align = 4;
+    index = fieldFromInstruction(Insn, 5, 3);
+    break;
+  case 1:
+    if (fieldFromInstruction(Insn, 4, 1))
+      align = 8;
+    index = fieldFromInstruction(Insn, 6, 2);
+    if (fieldFromInstruction(Insn, 5, 1))
+      inc = 2;
+    break;
+  case 2:
+    switch (fieldFromInstruction(Insn, 4, 2)) {
+    case 0:
+      align = 0; break;
+    case 3:
+      return MCDisassembler::Fail;
+    default:
+      align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+    }
+
+    index = fieldFromInstruction(Insn, 7, 1);
+    if (fieldFromInstruction(Insn, 6, 1))
+      inc = 2;
+    break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 5, 1);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  Rm |= fieldFromInstruction(Insn, 0, 4) << 1;
+
+  if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 5, 1);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  Rm |= fieldFromInstruction(Insn, 0, 4) << 1;
+
+  if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned pred = fieldFromInstruction(Insn, 4, 4);
+  unsigned mask = fieldFromInstruction(Insn, 0, 4);
+
+  if (pred == 0xF) {
+    pred = 0xE;
+    S = MCDisassembler::SoftFail;
+  }
+
+  if (mask == 0x0)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(pred));
+  Inst.addOperand(MCOperand::createImm(mask));
+  return S;
+}
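DecodeIT only unpacks firstcond and the raw mask; the then/else pattern stays implicit in the mask bits. A hedged worked example of that encoding (the print side, printThumbITMask in ARMInstPrinter.cpp further down, inverts the same mapping):

  // "itte eq": firstcond = 0b0000 (EQ), so firstcond[0] == 0.
  //   suffix 't' -> mask bit equal to firstcond[0] -> 0
  //   suffix 'e' -> inverted bit                   -> 1
  // The mask packs the suffix bits from bit 3 down, then a terminating 1:
  //   itte eq  =>  t=0, e=1, stop=1  =>  mask = 0b0110
  // countTrailingZeros(0b0110) == 1, so exactly two suffix letters print.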
+
+static DecodeStatus
+DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 8, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction(Insn, 0, 8);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  bool writeback = (W == 1) | (P == 0);
+
+  addr |= (U << 8) | (Rn << 9);
+
+  if (writeback && (Rn == Rt || Rn == Rt2))
+    Check(S, MCDisassembler::SoftFail);
+  if (Rt == Rt2)
+    Check(S, MCDisassembler::SoftFail);
+
+  // Rt
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt2
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Writeback operand
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // addr
+  if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus
+DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 8, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction(Insn, 0, 8);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  bool writeback = (W == 1) | (P == 0);
+
+  addr |= (U << 8) | (Rn << 9);
+
+  if (writeback && (Rn == Rt || Rn == Rt2))
+    Check(S, MCDisassembler::SoftFail);
+
+  // Writeback operand
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt2
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // addr
+  if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Adr(MCInst &Inst, uint32_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned sign1 = fieldFromInstruction(Insn, 21, 1);
+  unsigned sign2 = fieldFromInstruction(Insn, 23, 1);
+  if (sign1 != sign2) return MCDisassembler::Fail;
+
+  unsigned Val = fieldFromInstruction(Insn, 0, 8);
+  Val |= fieldFromInstruction(Insn, 12, 3) << 8;
+  Val |= fieldFromInstruction(Insn, 26, 1) << 11;
+  Val |= sign1 << 12;
+  Inst.addOperand(MCOperand::createImm(SignExtend32<13>(Val)));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, uint32_t Val,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  // Shift of "asr #32" is not allowed in Thumb2 mode.
+  if (Val == 0x20) S = MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Val));
+  return S;
+}
+
+static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (pred == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (Rt == Rn || Rn == Rt2)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder) {
+  const FeatureBitset &featureBits =
+      ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+  bool hasFullFP16 = featureBits[ARM::FeatureFullFP16];
+
+  unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0);
+  Vd |= (fieldFromInstruction(Insn, 22, 1) << 4);
+  unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0);
+  Vm |= (fieldFromInstruction(Insn, 5, 1) << 4);
+  unsigned imm = fieldFromInstruction(Insn, 16, 6);
+  unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+  unsigned op = fieldFromInstruction(Insn, 5, 1);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // If the top 3 bits of imm are clear, this is a VMOV (immediate)
+  if (!(imm & 0x38)) {
+    if (cmode == 0xF) {
+      if (op == 1) return MCDisassembler::Fail;
+      Inst.setOpcode(ARM::VMOVv2f32);
+    }
+    if (hasFullFP16) {
+      if (cmode == 0xE) {
+        if (op == 1) {
+          Inst.setOpcode(ARM::VMOVv1i64);
+        } else {
+          Inst.setOpcode(ARM::VMOVv8i8);
+        }
+      }
+      if (cmode == 0xD) {
+        if (op == 1) {
+          Inst.setOpcode(ARM::VMVNv2i32);
+        } else {
+          Inst.setOpcode(ARM::VMOVv2i32);
+        }
+      }
+      if (cmode == 0xC) {
+        if (op == 1) {
+          Inst.setOpcode(ARM::VMVNv2i32);
+        } else {
+          Inst.setOpcode(ARM::VMOVv2i32);
+        }
+      }
+    }
+    return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  if (!(imm & 0x20)) return MCDisassembler::Fail;
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Vm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(64 - imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder) {
+  const FeatureBitset &featureBits =
+      ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+  bool hasFullFP16 = featureBits[ARM::FeatureFullFP16];
+
+  unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0);
+  Vd |= (fieldFromInstruction(Insn, 22, 1) << 4);
+  unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0);
+  Vm |= (fieldFromInstruction(Insn, 5, 1) << 4);
+  unsigned imm = fieldFromInstruction(Insn, 16, 6);
+  unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+  unsigned op = fieldFromInstruction(Insn, 5, 1);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // If the top 3 bits of imm are clear, this is a VMOV (immediate)
+  if (!(imm & 0x38)) {
+    if (cmode == 0xF) {
+      if (op == 1) return MCDisassembler::Fail;
+      Inst.setOpcode(ARM::VMOVv4f32);
+    }
+    if (hasFullFP16) {
+      if (cmode == 0xE) {
+        if (op == 1) {
+          Inst.setOpcode(ARM::VMOVv2i64);
+        } else {
+          Inst.setOpcode(ARM::VMOVv16i8);
+        }
+      }
+      if (cmode == 0xD) {
+        if (op == 1) {
+          Inst.setOpcode(ARM::VMVNv4i32);
+        } else {
+          Inst.setOpcode(ARM::VMOVv4i32);
+        }
+      }
+      if (cmode == 0xC) {
+        if (op == 1) {
+          Inst.setOpcode(ARM::VMVNv4i32);
+        } else {
+          Inst.setOpcode(ARM::VMOVv4i32);
+        }
+      }
+    }
+    return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  if (!(imm & 0x20)) return MCDisassembler::Fail;
+
+  if (!Check(S, DecodeQPRRegisterClass(Inst, Vd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeQPRRegisterClass(Inst, Vm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(64 - imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
+                              uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 16, 4);
+  unsigned Rt = fieldFromInstruction(Val, 12, 4);
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  Rm |= (fieldFromInstruction(Val, 23, 1) << 4);
+  unsigned Cond = fieldFromInstruction(Val, 28, 4);
+
+  if (fieldFromInstruction(Val, 8, 4) != 0 || Rn == Rt)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeAddrMode7Operand(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePostIdxReg(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder) {
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned CRm = fieldFromInstruction(Val, 0, 4);
+  unsigned opc1 = fieldFromInstruction(Val, 4, 4);
+  unsigned cop = fieldFromInstruction(Val, 8, 4);
+  unsigned Rt = fieldFromInstruction(Val, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Val, 16, 4);
+
+  if ((cop & ~0x1) == 0xa)
+    return MCDisassembler::Fail;
+
+  if (Rt == Rt2)
+    S = MCDisassembler::SoftFail;
+
+  Inst.addOperand(MCOperand::createImm(cop));
+  Inst.addOperand(MCOperand::createImm(opc1));
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(CRm));
+
+  return S;
+}
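All of these decoder callbacks are wired into tablegen'd decoder tables and ultimately surface through LLVM's public disassembler interface. A minimal sketch of driving that interface from the stable C API (the triple string is an assumption; LLVMCreateDisasm, LLVMDisasmInstruction, and LLVMDisasmDispose are the real entry points from llvm-c/Disassembler.h):

  #include <llvm-c/Disassembler.h>
  #include <llvm-c/Target.h>
  #include <cstdint>
  #include <cstdio>

  int main() {
    LLVMInitializeAllTargetInfos();
    LLVMInitializeAllTargetMCs();
    LLVMInitializeAllDisassemblers();

    LLVMDisasmContextRef DC =
        LLVMCreateDisasm("armv7-unknown-linux-gnueabi", nullptr, 0,
                         nullptr, nullptr);
    if (!DC) return 1;

    // 0xe5912000 is "ldr r2, [r1]", stored little-endian.
    uint8_t Bytes[] = {0x00, 0x20, 0x91, 0xe5};
    char Text[128];
    size_t Len = LLVMDisasmInstruction(DC, Bytes, sizeof(Bytes), /*PC=*/0,
                                       Text, sizeof(Text));
    if (Len)
      std::printf("%s\n", Text);
    LLVMDisasmDispose(DC);
    return 0;
  }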
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
new file mode 100644
index 0000000..33fc85a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -0,0 +1,1647 @@
+//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstPrinter.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "ARMGenAsmWriter.inc"
+
+/// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing.
+///
+/// getSORegOffset returns an integer from 0-31, representing '32' as 0.
+static unsigned translateShiftImm(unsigned imm) {
+  // lsr #32 and asr #32 exist, but should be encoded as a 0.
+  assert((imm & ~0x1f) == 0 && "Invalid shift encoding");
+
+  if (imm == 0)
+    return 32;
+  return imm;
+}
+
+/// Prints the shift value with an immediate value.
+static void printRegImmShift(raw_ostream &O, ARM_AM::ShiftOpc ShOpc,
+                             unsigned ShImm, bool UseMarkup) {
+  if (ShOpc == ARM_AM::no_shift || (ShOpc == ARM_AM::lsl && !ShImm))
+    return;
+  O << ", ";
+
+  assert(!(ShOpc == ARM_AM::ror && !ShImm) && "Cannot have ror #0");
+  O << getShiftOpcStr(ShOpc);
+
+  if (ShOpc != ARM_AM::rrx) {
+    O << " ";
+    if (UseMarkup)
+      O << "<imm:";
+    O << "#" << translateShiftImm(ShImm);
+    if (UseMarkup)
+      O << ">";
+  }
+}
+
+ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                               const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << markup("<reg:") << getRegisterName(RegNo) << markup(">");
+}
+
+void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot, const MCSubtargetInfo &STI) {
+  unsigned Opcode = MI->getOpcode();
+
+  switch (Opcode) {
+
+  // Check for HINT instructions w/ canonical names.
+  case ARM::HINT:
+  case ARM::tHINT:
+  case ARM::t2HINT:
+    switch (MI->getOperand(0).getImm()) {
+    case 0:
+      O << "\tnop";
+      break;
+    case 1:
+      O << "\tyield";
+      break;
+    case 2:
+      O << "\twfe";
+      break;
+    case 3:
+      O << "\twfi";
+      break;
+    case 4:
+      O << "\tsev";
+      break;
+    case 5:
+      if (STI.getFeatureBits()[ARM::HasV8Ops]) {
+        O << "\tsevl";
+        break;
+      } // Fallthrough for non-v8
+    default:
+      // Anything else should just print normally.
+      printInstruction(MI, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    }
+    printPredicateOperand(MI, 1, STI, O);
+    if (Opcode == ARM::t2HINT)
+      O << ".w";
+    printAnnotation(O, Annot);
+    return;
+
+  // Check for MOVs and print canonical forms, instead.
+  case ARM::MOVsr: {
+    // FIXME: Thumb variants?
+    const MCOperand &Dst = MI->getOperand(0);
+    const MCOperand &MO1 = MI->getOperand(1);
+    const MCOperand &MO2 = MI->getOperand(2);
+    const MCOperand &MO3 = MI->getOperand(3);
+
+    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm()));
+    printSBitModifierOperand(MI, 6, STI, O);
+    printPredicateOperand(MI, 4, STI, O);
+
+    O << '\t';
+    printRegName(O, Dst.getReg());
+    O << ", ";
+    printRegName(O, MO1.getReg());
+
+    O << ", ";
+    printRegName(O, MO2.getReg());
+    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  case ARM::MOVsi: {
+    // FIXME: Thumb variants?
+    const MCOperand &Dst = MI->getOperand(0);
+    const MCOperand &MO1 = MI->getOperand(1);
+    const MCOperand &MO2 = MI->getOperand(2);
+
+    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm()));
+    printSBitModifierOperand(MI, 5, STI, O);
+    printPredicateOperand(MI, 3, STI, O);
+
+    O << '\t';
+    printRegName(O, Dst.getReg());
+    O << ", ";
+    printRegName(O, MO1.getReg());
+
+    if (ARM_AM::getSORegShOp(MO2.getImm()) == ARM_AM::rrx) {
+      printAnnotation(O, Annot);
+      return;
+    }
+
+    O << ", " << markup("<imm:") << "#"
+      << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm())) << markup(">");
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // A8.6.123 PUSH
+  case ARM::STMDB_UPD:
+  case ARM::t2STMDB_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
+      // Should only print PUSH if there are at least two registers in the list.
+      O << '\t' << "push";
+      printPredicateOperand(MI, 2, STI, O);
+      if (Opcode == ARM::t2STMDB_UPD)
+        O << ".w";
+      O << '\t';
+      printRegisterList(MI, 4, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::STR_PRE_IMM:
+    if (MI->getOperand(2).getReg() == ARM::SP &&
+        MI->getOperand(3).getImm() == -4) {
+      O << '\t' << "push";
+      printPredicateOperand(MI, 4, STI, O);
+      O << "\t{";
+      printRegName(O, MI->getOperand(1).getReg());
+      O << "}";
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  // A8.6.122 POP
+  case ARM::LDMIA_UPD:
+  case ARM::t2LDMIA_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
+      // Should only print POP if there are at least two registers in the list.
+      O << '\t' << "pop";
+      printPredicateOperand(MI, 2, STI, O);
+      if (Opcode == ARM::t2LDMIA_UPD)
+        O << ".w";
+      O << '\t';
+      printRegisterList(MI, 4, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::LDR_POST_IMM:
+    if (MI->getOperand(2).getReg() == ARM::SP &&
+        MI->getOperand(4).getImm() == 4) {
+      O << '\t' << "pop";
+      printPredicateOperand(MI, 5, STI, O);
+      O << "\t{";
+      printRegName(O, MI->getOperand(0).getReg());
+      O << "}";
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  // A8.6.355 VPUSH
+  case ARM::VSTMSDB_UPD:
+  case ARM::VSTMDDB_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP) {
+      O << '\t' << "vpush";
+      printPredicateOperand(MI, 2, STI, O);
+      O << '\t';
+      printRegisterList(MI, 4, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  // A8.6.354 VPOP
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMDIA_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP) {
+      O << '\t' << "vpop";
+      printPredicateOperand(MI, 2, STI, O);
+      O << '\t';
+      printRegisterList(MI, 4, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::tLDMIA: {
+    bool Writeback = true;
+    unsigned BaseReg = MI->getOperand(0).getReg();
+    for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
+      if (MI->getOperand(i).getReg() == BaseReg)
+        Writeback = false;
+    }
+
+    O << "\tldm";
+
+    printPredicateOperand(MI, 1, STI, O);
+    O << '\t';
+    printRegName(O, BaseReg);
+    if (Writeback)
+      O << "!";
+    O << ", ";
+    printRegisterList(MI, 3, STI, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // Combine 2 GPRs from disassembler into a GPRPair to match with instr def.
+  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when decoding them, the two GPRs cannot be automatically
+  // expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  case ARM::LDREXD:
+  case ARM::STREXD:
+  case ARM::LDAEXD:
+  case ARM::STLEXD: {
+    const MCRegisterClass &MRC = MRI.getRegClass(ARM::GPRRegClassID);
+    bool isStore = Opcode == ARM::STREXD || Opcode == ARM::STLEXD;
+    unsigned Reg = MI->getOperand(isStore ? 1 : 0).getReg();
+    if (MRC.contains(Reg)) {
+      MCInst NewMI;
+      MCOperand NewReg;
+      NewMI.setOpcode(Opcode);
+
+      if (isStore)
+        NewMI.addOperand(MI->getOperand(0));
+      NewReg = MCOperand::createReg(MRI.getMatchingSuperReg(
+          Reg, ARM::gsub_0, &MRI.getRegClass(ARM::GPRPairRegClassID)));
+      NewMI.addOperand(NewReg);
+
+      // Copy the remaining operands into NewMI.
+      for (unsigned i = isStore ? 3 : 2; i < MI->getNumOperands(); ++i)
+        NewMI.addOperand(MI->getOperand(i));
+      printInstruction(&NewMI, STI, O);
+      return;
+    }
+    break;
+  }
+  // B9.3.3 ERET (Thumb)
+  // For a target that has Virtualization Extensions, ERET is the preferred
+  // disassembly of SUBS PC, LR, #0
+  case ARM::t2SUBS_PC_LR: {
+    if (MI->getNumOperands() == 3 && MI->getOperand(0).isImm() &&
+        MI->getOperand(0).getImm() == 0 &&
+        STI.getFeatureBits()[ARM::FeatureVirtualization]) {
+      O << "\teret";
+      printPredicateOperand(MI, 1, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    }
+    break;
+  }
+  }
+
+  printInstruction(MI, STI, O);
+  printAnnotation(O, Annot);
+}
+
+void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    printRegName(O, Reg);
+  } else if (Op.isImm()) {
+    O << markup("<imm:") << '#' << formatImm(Op.getImm()) << markup(">");
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    const MCExpr *Expr = Op.getExpr();
+    switch (Expr->getKind()) {
+    case MCExpr::Binary:
+      O << '#';
+      Expr->print(O, &MAI);
+      break;
+    case MCExpr::Constant: {
+      // If a symbolic branch target was added as a constant expression then
+      // print that address in hex. And only print 32 unsigned bits for the
+      // address.
+      const MCConstantExpr *Constant = cast<MCConstantExpr>(Expr);
+      int64_t TargetAddress;
+      if (!Constant->evaluateAsAbsolute(TargetAddress)) {
+        O << '#';
+        Expr->print(O, &MAI);
+      } else {
+        O << "0x";
+        O.write_hex(static_cast<uint32_t>(TargetAddress));
+      }
+      break;
+    }
+    default:
+      // FIXME: Should we always treat this as if it is a constant literal and
+      // prefix it with '#'?
+      Expr->print(O, &MAI);
+      break;
+    }
+  }
+}
+
+void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  if (MO1.isExpr()) {
+    MO1.getExpr()->print(O, &MAI);
+    return;
+  }
+
+  O << markup("<mem:") << "[pc, ";
+
+  int32_t OffImm = (int32_t)MO1.getImm();
+  bool isSub = OffImm < 0;
+
+  // Special value for #-0. All others are normal.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub) {
+    O << markup("<imm:") << "#-" << formatImm(-OffImm) << markup(">");
+  } else {
+    O << markup("<imm:") << "#" << formatImm(OffImm) << markup(">");
+  }
+  O << "]" << markup(">");
+}
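printThumbLdrLabelOperand above, like most printers below, wraps operands in markup() so tools can request LLVM's annotated-disassembly syntax; with markup disabled the calls return empty strings and plain assembly comes out. A hedged illustration of the two output shapes for a pc-relative load with offset 8 (exact spacing is an assumption):

  // markup off:  ldr r0, [pc, #8]
  // markup on:   ldr <reg:r0>, <mem:[pc, <imm:#8>]>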
+
+// so_reg is a 4-operand unit corresponding to register forms of the A5.1
+// "Addressing Mode 1 - Data-processing operands" forms.  This includes:
+//    REG 0   0           - e.g. R5
+//    REG REG 0,SH_OPC    - e.g. R5, ROR R3
+//    REG 0   IMM,SH_OPC  - e.g. R5, LSL #3
+void ARMInstPrinter::printSORegRegOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+  const MCOperand &MO3 = MI->getOperand(OpNum + 2);
+
+  printRegName(O, MO1.getReg());
+
+  // Print the shift opc.
+  ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm());
+  O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
+  if (ShOpc == ARM_AM::rrx)
+    return;
+
+  O << ' ';
+  printRegName(O, MO2.getReg());
+  assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+}
+
+void ARMInstPrinter::printSORegImmOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  printRegName(O, MO1.getReg());
+
+  // Print the shift opc.
+  printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()),
+                   ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup);
+}
+
+//===--------------------------------------------------------------------===//
+// Addressing Mode #2
+//===--------------------------------------------------------------------===//
+
+void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+  const MCOperand &MO3 = MI->getOperand(Op + 2);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  if (!MO2.getReg()) {
+    if (ARM_AM::getAM2Offset(MO3.getImm())) { // Don't print +0.
+      O << ", " << markup("<imm:") << "#"
+        << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
+        << ARM_AM::getAM2Offset(MO3.getImm()) << markup(">");
+    }
+    O << "]" << markup(">");
+    return;
+  }
+
+  O << ", ";
+  O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()));
+  printRegName(O, MO2.getReg());
+
+  printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO3.getImm()),
+                   ARM_AM::getAM2Offset(MO3.getImm()), UseMarkup);
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << ", ";
+  printRegName(O, MO2.getReg());
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrModeTBH(const MCInst *MI, unsigned Op,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << ", ";
+  printRegName(O, MO2.getReg());
+  O << ", lsl " << markup("<imm:") << "#1" << markup(">") << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+
+  if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op, STI, O);
+    return;
+  }
+
+#ifndef NDEBUG
+  const MCOperand &MO3 = MI->getOperand(Op + 2);
+  unsigned IdxMode = ARM_AM::getAM2IdxMode(MO3.getImm());
+  assert(IdxMode != ARMII::IndexModePost && "Should be pre or offset index op");
+#endif
+
+  printAM2PreOrOffsetIndexOp(MI, Op, STI, O);
+}
+
+void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  if (!MO1.getReg()) {
+    unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
+    O << markup("<imm:") << '#'
+      << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) << ImmOffs
+      << markup(">");
+    return;
+  }
+
+  O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()));
+  printRegName(O, MO1.getReg());
+
+  printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO2.getImm()),
+                   ARM_AM::getAM2Offset(MO2.getImm()), UseMarkup);
+}
+
+//===--------------------------------------------------------------------===//
+// Addressing Mode #3
+//===--------------------------------------------------------------------===//
+
+void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
+                                                raw_ostream &O,
+                                                bool AlwaysPrintImm0) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+  const MCOperand &MO3 = MI->getOperand(Op + 2);
+
+  O << markup("<mem:") << '[';
+  printRegName(O, MO1.getReg());
+
+  if (MO2.getReg()) {
+    O << ", " << getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()));
+    printRegName(O, MO2.getReg());
+    O << ']' << markup(">");
+    return;
+  }
+
+  // If the op is sub we have to print the immediate even if it is 0
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm());
+  ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm());
+
+  if (AlwaysPrintImm0 || ImmOffs || (op == ARM_AM::sub)) {
+    O << ", " << markup("<imm:") << "#" << ARM_AM::getAddrOpcStr(op) << ImmOffs
+      << markup(">");
+  }
+  O << ']' << markup(">");
+}
+
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  if (!MO1.isReg()) { // For label symbolic references.
+    printOperand(MI, Op, STI, O);
+    return;
+  }
+
+  assert(ARM_AM::getAM3IdxMode(MI->getOperand(Op + 2).getImm()) !=
+             ARMII::IndexModePost &&
+         "unexpected idxmode");
+  printAM3PreOrOffsetIndexOp(MI, Op, O, AlwaysPrintImm0);
+}
+
+void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  if (MO1.getReg()) {
+    O << getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm()));
+    printRegName(O, MO1.getReg());
+    return;
+  }
+
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
+  O << markup("<imm:") << '#'
+    << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) << ImmOffs
+    << markup(">");
+}
+
+void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum,
+                                             const MCSubtargetInfo &STI,
+                                             raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  unsigned Imm = MO.getImm();
+  O << markup("<imm:") << '#' << ((Imm & 256) ? "" : "-") << (Imm & 0xff)
+    << markup(">");
+}
+
+void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  O << (MO2.getImm() ? "" : "-");
+  printRegName(O, MO1.getReg());
+}
+
+void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  unsigned Imm = MO.getImm();
+  O << markup("<imm:") << '#' << ((Imm & 256) ? "" : "-") << ((Imm & 0xff) << 2)
+    << markup(">");
+}
+
+void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  ARM_AM::AMSubMode Mode =
+      ARM_AM::getAM4SubMode(MI->getOperand(OpNum).getImm());
+  O << ARM_AM::getAMSubModeStr(Mode);
+}
+
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, OpNum, STI, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm());
+  ARM_AM::AddrOpc Op = ARM_AM::getAM5Op(MO2.getImm());
+  if (AlwaysPrintImm0 || ImmOffs || Op == ARM_AM::sub) {
+    O << ", " << markup("<imm:") << "#" << ARM_AM::getAddrOpcStr(Op)
+      << ImmOffs * 4 << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (MO2.getImm()) {
+    O << ":" << (MO2.getImm() << 3);
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode7Operand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  if (MO.getReg() == 0)
+    O << "!";
+  else {
+    O << ", ";
+    printRegName(O, MO.getReg());
+  }
+}
+
+void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
+                                                    unsigned OpNum,
+                                                    const MCSubtargetInfo &STI,
+                                                    raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  uint32_t v = ~MO.getImm();
+  int32_t lsb = countTrailingZeros(v);
+  int32_t width = (32 - countLeadingZeros(v)) - lsb;
+  assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
+  O << markup("<imm:") << '#' << lsb << markup(">") << ", " << markup("<imm:")
+    << '#' << width << markup(">");
+}
+
+void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  unsigned val = MI->getOperand(OpNum).getImm();
+  O << ARM_MB::MemBOptToString(val, STI.getFeatureBits()[ARM::HasV8Ops]);
+}
+
+void ARMInstPrinter::printInstSyncBOption(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  unsigned val = MI->getOperand(OpNum).getImm();
+  O << ARM_ISB::InstSyncBOptToString(val);
+}
+
+void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  unsigned ShiftOp = MI->getOperand(OpNum).getImm();
+  bool isASR = (ShiftOp & (1 << 5)) != 0;
+  unsigned Amt = ShiftOp & 0x1f;
+  if (isASR) {
+    O << ", asr " << markup("<imm:") << "#" << (Amt == 0 ? 32 : Amt)
+      << markup(">");
+  } else if (Amt) {
+    O << ", lsl " << markup("<imm:") << "#" << Amt << markup(">");
+  }
+}
+
+void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  if (Imm == 0)
+    return;
+  assert(Imm > 0 && Imm < 32 && "Invalid PKH shift immediate value!");
+  O << ", lsl " << markup("<imm:") << "#" << Imm << markup(">");
+}
+
+void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  // A shift amount of 32 is encoded as 0.
+  if (Imm == 0)
+    Imm = 32;
+  assert(Imm > 0 && Imm <= 32 && "Invalid PKH shift immediate value!");
+  O << ", asr " << markup("<imm:") << "#" << Imm << markup(">");
+}
+
+void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  O << "{";
+  for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) {
+    if (i != OpNum)
+      O << ", ";
+    printRegName(O, MI->getOperand(i).getReg());
+  }
+  O << "}";
+}
+
+void ARMInstPrinter::printGPRPairOperand(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  printRegName(O, MRI.getSubReg(Reg, ARM::gsub_0));
+  O << ", ";
+  printRegName(O, MRI.getSubReg(Reg, ARM::gsub_1));
+}
+
+void ARMInstPrinter::printSetendOperand(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  if (Op.getImm())
+    O << "be";
+  else
+    O << "le";
+}
+
+void ARMInstPrinter::printCPSIMod(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  O << ARM_PROC::IModToString(Op.getImm());
+}
+
+void ARMInstPrinter::printCPSIFlag(const MCInst *MI, unsigned OpNum,
+                                   const MCSubtargetInfo &STI, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  unsigned IFlags = Op.getImm();
+  for (int i = 2; i >= 0; --i)
+    if (IFlags & (1 << i))
+      O << ARM_PROC::IFlagsToString(1 << i);
+
+  if (IFlags == 0)
+    O << "none";
+}
+
+void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  unsigned SpecRegRBit = Op.getImm() >> 4;
+  unsigned Mask = Op.getImm() & 0xf;
+  const FeatureBitset &FeatureBits = STI.getFeatureBits();
+
+  if (FeatureBits[ARM::FeatureMClass]) {
+    unsigned SYSm = Op.getImm();
+    unsigned Opcode = MI->getOpcode();
+
+    // For writes, handle extended mask bits if the DSP extension is present.
+    if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSP]) {
+      switch (SYSm) {
+      case 0x400:
+        O << "apsr_g";
+        return;
+      case 0xc00:
+        O << "apsr_nzcvqg";
+        return;
+      case 0x401:
+        O << "iapsr_g";
+        return;
+      case 0xc01:
+        O << "iapsr_nzcvqg";
+        return;
+      case 0x402:
+        O << "eapsr_g";
+        return;
+      case 0xc02:
+        O << "eapsr_nzcvqg";
+        return;
+      case 0x403:
+        O << "xpsr_g";
+        return;
+      case 0xc03:
+        O << "xpsr_nzcvqg";
+        return;
+      }
+    }
+
+    // Handle the basic 8-bit mask.
+    SYSm &= 0xff;
+
+    if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::HasV7Ops]) {
+      // ARMv7-M deprecates using MSR APSR without a _<bits> qualifier as an
+      // alias for MSR APSR_nzcvq.
+      switch (SYSm) {
+      case 0:
+        O << "apsr_nzcvq";
+        return;
+      case 1:
+        O << "iapsr_nzcvq";
+        return;
+      case 2:
+        O << "eapsr_nzcvq";
+        return;
+      case 3:
+        O << "xpsr_nzcvq";
+        return;
+      }
+    }
+
+    switch (SYSm) {
+    default:
+      llvm_unreachable("Unexpected mask value!");
+    case 0:
+      O << "apsr";
+      return;
+    case 1:
+      O << "iapsr";
+      return;
+    case 2:
+      O << "eapsr";
+      return;
+    case 3:
+      O << "xpsr";
+      return;
+    case 5:
+      O << "ipsr";
+      return;
+    case 6:
+      O << "epsr";
+      return;
+    case 7:
+      O << "iepsr";
+      return;
+    case 8:
+      O << "msp";
+      return;
+    case 9:
+      O << "psp";
+      return;
+    case 16:
+      O << "primask";
+      return;
+    case 17:
+      O << "basepri";
+      return;
+    case 18:
+      O << "basepri_max";
+      return;
+    case 19:
+      O << "faultmask";
+      return;
+    case 20:
+      O << "control";
+      return;
+    }
+  }
+
+  // As special cases, CPSR_f, CPSR_s and CPSR_fs prefer printing as
+  // APSR_nzcvq, APSR_g and APSR_nzcvqg, respectively.
+  if (!SpecRegRBit && (Mask == 8 || Mask == 4 || Mask == 12)) {
+    O << "APSR_";
+    switch (Mask) {
+    default:
+      llvm_unreachable("Unexpected mask value!");
+    case 4:
+      O << "g";
+      return;
+    case 8:
+      O << "nzcvq";
+      return;
+    case 12:
+      O << "nzcvqg";
+      return;
+    }
+  }
+
+  if (SpecRegRBit)
+    O << "SPSR";
+  else
+    O << "CPSR";
+
+  if (Mask) {
+    O << '_';
+    if (Mask & 8)
+      O << 'f';
+    if (Mask & 4)
+      O << 's';
+    if (Mask & 2)
+      O << 'x';
+    if (Mask & 1)
+      O << 'c';
+  }
+}
+
+void ARMInstPrinter::printBankedRegOperand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  uint32_t Banked = MI->getOperand(OpNum).getImm();
+  uint32_t R = (Banked & 0x20) >> 5;
+  uint32_t SysM = Banked & 0x1f;
+
+  // Nothing much we can do about this, the encodings are specified in B9.2.3 of
+  // the ARM ARM v7C, and are all over the shop.
+  if (R) {
+    O << "SPSR_";
+
+    switch (SysM) {
+    case 0x0e:
+      O << "fiq";
+      return;
+    case 0x10:
+      O << "irq";
+      return;
+    case 0x12:
+      O << "svc";
+      return;
+    case 0x14:
+      O << "abt";
+      return;
+    case 0x16:
+      O << "und";
+      return;
+    case 0x1c:
+      O << "mon";
+      return;
+    case 0x1e:
+      O << "hyp";
+      return;
+    default:
+      llvm_unreachable("Invalid banked SPSR register");
+    }
+  }
+
+  assert(!R && "should have dealt with SPSR regs");
+  const char *RegNames[] = {
+      "r8_usr", "r9_usr", "r10_usr", "r11_usr", "r12_usr", "sp_usr",  "lr_usr",
+      "",       "r8_fiq", "r9_fiq",  "r10_fiq", "r11_fiq", "r12_fiq", "sp_fiq",
+      "lr_fiq", "",       "lr_irq",  "sp_irq",  "lr_svc",  "sp_svc",  "lr_abt",
+      "sp_abt", "lr_und", "sp_und",  "",        "",        "",        "",
+      "lr_mon", "sp_mon", "elr_hyp", "sp_hyp"};
+  const char *Name = RegNames[SysM];
+  assert(Name[0] && "invalid banked register operand");
+
+  O << Name;
+}
+
+void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+  // Handle the undefined 15 CC value here for printing so we don't abort().
+  if ((unsigned)CC == 15)
+    O << "<und>";
+  else if (CC != ARMCC::AL)
+    O << ARMCondCodeToString(CC);
+}
+
+void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI,
+                                                    unsigned OpNum,
+                                                    const MCSubtargetInfo &STI,
+                                                    raw_ostream &O) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+  O << ARMCondCodeToString(CC);
+}
+
+void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  if (MI->getOperand(OpNum).getReg()) {
+    assert(MI->getOperand(OpNum).getReg() == ARM::CPSR &&
+           "Expect ARM CPSR register!");
+    O << 's';
+  }
+}
+
+void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  O << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printPImmediate(const MCInst *MI, unsigned OpNum,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  O << "p" << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printCImmediate(const MCInst *MI, unsigned OpNum,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  O << "c" << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printCoprocOptionImm(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  O << "{" << MI->getOperand(OpNum).getImm() << "}";
+}
+
+void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  llvm_unreachable("Unhandled PC-relative pseudo-instruction!");
+}
+
+template <unsigned scale>
+void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+
+  if (MO.isExpr()) {
+    MO.getExpr()->print(O, &MAI);
+    return;
+  }
+
+  int32_t OffImm = (int32_t)MO.getImm() << scale;
+
+  O << markup("<imm:");
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else
+    O << "#" << OffImm;
+  O << markup(">");
+}
+
+void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  O << markup("<imm:") << "#" << formatImm(MI->getOperand(OpNum).getImm() * 4)
+    << markup(">");
+}
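printAdrLabelOperand above, like the label and imm12 printers elsewhere in this file, reserves one sentinel for negative zero: a subtract-with-zero offset ("#-0") has no distinct two's-complement representation, so INT32_MIN stands in for it. In sketch form (illustrative helper, assumed to match the convention the printers above use):

  #include <cstdint>
  #include <ostream>

  // Print an offset that distinguishes #-0 from #0, as the operand printers do.
  static void printOffset(std::ostream &O, int32_t OffImm) {
    if (OffImm == INT32_MIN)
      O << "#-0";             // sentinel for the subtract-zero form
    else if (OffImm < 0)
      O << "#-" << -OffImm;
    else
      O << "#" << OffImm;
  }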
MI->getOperand(OpNum).getImm(); + O << markup("<imm:") << "#" << formatImm((Imm == 0 ? 32 : Imm)) + << markup(">"); +} + +void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // (3 - the number of trailing zeros) is the number of then / else. + unsigned Mask = MI->getOperand(OpNum).getImm(); + unsigned Firstcond = MI->getOperand(OpNum - 1).getImm(); + unsigned CondBit0 = Firstcond & 1; + unsigned NumTZ = countTrailingZeros(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { + bool T = ((Mask >> Pos) & 1) == CondBit0; + if (T) + O << 't'; + else + O << 'e'; + } +} + +void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op + 1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, STI, O); + return; + } + + O << markup("<mem:") << "["; + printRegName(O, MO1.getReg()); + if (unsigned RegNum = MO2.getReg()) { + O << ", "; + printRegName(O, RegNum); + } + O << "]" << markup(">"); +} + +void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI, + unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O, + unsigned Scale) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op + 1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, STI, O); + return; + } + + O << markup("<mem:") << "["; + printRegName(O, MO1.getReg()); + if (unsigned ImmOffs = MO2.getImm()) { + O << ", " << markup("<imm:") << "#" << formatImm(ImmOffs * Scale) + << markup(">"); + } + O << "]" << markup(">"); +} + +void ARMInstPrinter::printThumbAddrModeImm5S1Operand(const MCInst *MI, + unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, STI, O, 1); +} + +void ARMInstPrinter::printThumbAddrModeImm5S2Operand(const MCInst *MI, + unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, STI, O, 2); +} + +void ARMInstPrinter::printThumbAddrModeImm5S4Operand(const MCInst *MI, + unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, STI, O, 4); +} + +void ARMInstPrinter::printThumbAddrModeSPOperand(const MCInst *MI, unsigned Op, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, STI, O, 4); +} + +// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2 +// register with shift forms. +// REG 0 0 - e.g. R5 +// REG IMM, SH_OPC - e.g. R5, LSL #3 +void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + unsigned Reg = MO1.getReg(); + printRegName(O, Reg); + + // Print the shift opc. 
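+  // Editor's aside (illustrative, not part of the imported source): for
+  // "r5, lsl #3" the shift operand is getSORegOpc(lsl, 3) == 2 | (3 << 3)
+  // == 0x1a; getSORegShOp(0x1a) recovers lsl and getSORegOffset(0x1a)
+  // recovers 3.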
+ assert(MO2.isImm() && "Not a valid t2_so_reg value!"); + printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()), + ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup); +} + +template <bool AlwaysPrintImm0> +void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, OpNum, STI, O); + return; + } + + O << markup("<mem:") << "["; + printRegName(O, MO1.getReg()); + + int32_t OffImm = (int32_t)MO2.getImm(); + bool isSub = OffImm < 0; + // Special value for #-0. All others are normal. + if (OffImm == INT32_MIN) + OffImm = 0; + if (isSub) { + O << ", " << markup("<imm:") << "#-" << formatImm(-OffImm) << markup(">"); + } else if (AlwaysPrintImm0 || OffImm > 0) { + O << ", " << markup("<imm:") << "#" << formatImm(OffImm) << markup(">"); + } + O << "]" << markup(">"); +} + +template <bool AlwaysPrintImm0> +void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + O << markup("<mem:") << "["; + printRegName(O, MO1.getReg()); + + int32_t OffImm = (int32_t)MO2.getImm(); + bool isSub = OffImm < 0; + // Don't print +0. + if (OffImm == INT32_MIN) + OffImm = 0; + if (isSub) { + O << ", " << markup("<imm:") << "#-" << -OffImm << markup(">"); + } else if (AlwaysPrintImm0 || OffImm > 0) { + O << ", " << markup("<imm:") << "#" << OffImm << markup(">"); + } + O << "]" << markup(">"); +} + +template <bool AlwaysPrintImm0> +void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + if (!MO1.isReg()) { // For label symbolic references. + printOperand(MI, OpNum, STI, O); + return; + } + + O << markup("<mem:") << "["; + printRegName(O, MO1.getReg()); + + int32_t OffImm = (int32_t)MO2.getImm(); + bool isSub = OffImm < 0; + + assert(((OffImm & 0x3) == 0) && "Not a valid immediate!"); + + // Don't print +0. 
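+  // Editor's aside (illustrative, not part of the imported source):
+  // INT32_MIN is the sentinel for "#-0", which is a distinct encoding
+  // (subtract with zero offset) from "#0". It prints as "[rN, #-0]" below,
+  // while a plain 0 prints no offset at all unless AlwaysPrintImm0 is set.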
+ if (OffImm == INT32_MIN) + OffImm = 0; + if (isSub) { + O << ", " << markup("<imm:") << "#-" << -OffImm << markup(">"); + } else if (AlwaysPrintImm0 || OffImm > 0) { + O << ", " << markup("<imm:") << "#" << OffImm << markup(">"); + } + O << "]" << markup(">"); +} + +void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + + O << markup("<mem:") << "["; + printRegName(O, MO1.getReg()); + if (MO2.getImm()) { + O << ", " << markup("<imm:") << "#" << formatImm(MO2.getImm() * 4) + << markup(">"); + } + O << "]" << markup(">"); +} + +void ARMInstPrinter::printT2AddrModeImm8OffsetOperand( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + int32_t OffImm = (int32_t)MO1.getImm(); + O << ", " << markup("<imm:"); + if (OffImm == INT32_MIN) + O << "#-0"; + else if (OffImm < 0) + O << "#-" << -OffImm; + else + O << "#" << OffImm; + O << markup(">"); +} + +void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + int32_t OffImm = (int32_t)MO1.getImm(); + + assert(((OffImm & 0x3) == 0) && "Not a valid immediate!"); + + O << ", " << markup("<imm:"); + if (OffImm == INT32_MIN) + O << "#-0"; + else if (OffImm < 0) + O << "#-" << -OffImm; + else + O << "#" << OffImm; + O << markup(">"); +} + +void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum + 1); + const MCOperand &MO3 = MI->getOperand(OpNum + 2); + + O << markup("<mem:") << "["; + printRegName(O, MO1.getReg()); + + assert(MO2.getReg() && "Invalid so_reg load / store address!"); + O << ", "; + printRegName(O, MO2.getReg()); + + unsigned ShAmt = MO3.getImm(); + if (ShAmt) { + assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!"); + O << ", lsl " << markup("<imm:") << "#" << ShAmt << markup(">"); + } + O << "]" << markup(">"); +} + +void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + O << markup("<imm:") << '#' << ARM_AM::getFPImmFloat(MO.getImm()) + << markup(">"); +} + +void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned EncodedImm = MI->getOperand(OpNum).getImm(); + unsigned EltBits; + uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); + O << markup("<imm:") << "#0x"; + O.write_hex(Val); + O << markup(">"); +} + +void ARMInstPrinter::printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + O << markup("<imm:") << "#" << formatImm(Imm + 1) << markup(">"); +} + +void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + if (Imm == 0) + return; + assert(Imm <= 3 && "illegal ror immediate!"); + O << ", ror " << markup("<imm:") << "#" << 8 * Imm << markup(">"); +} + +void ARMInstPrinter::printModImmOperand(const MCInst *MI, unsigned OpNum, + const 
MCSubtargetInfo &STI, + raw_ostream &O) { + MCOperand Op = MI->getOperand(OpNum); + + // Support for fixups (MCFixup) + if (Op.isExpr()) + return printOperand(MI, OpNum, STI, O); + + unsigned Bits = Op.getImm() & 0xFF; + unsigned Rot = (Op.getImm() & 0xF00) >> 7; + + bool PrintUnsigned = false; + switch (MI->getOpcode()) { + case ARM::MOVi: + // Movs to PC should be treated unsigned + PrintUnsigned = (MI->getOperand(OpNum - 1).getReg() == ARM::PC); + break; + case ARM::MSRi: + // Movs to special registers should be treated unsigned + PrintUnsigned = true; + break; + } + + int32_t Rotated = ARM_AM::rotr32(Bits, Rot); + if (ARM_AM::getSOImmVal(Rotated) == Op.getImm()) { + // #rot has the least possible value + O << "#" << markup("<imm:"); + if (PrintUnsigned) + O << static_cast<uint32_t>(Rotated); + else + O << Rotated; + O << markup(">"); + return; + } + + // Explicit #bits, #rot implied + O << "#" << markup("<imm:") << Bits << markup(">") << ", #" << markup("<imm:") + << Rot << markup(">"); +} + +void ARMInstPrinter::printFBits16(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O) { + O << markup("<imm:") << "#" << 16 - MI->getOperand(OpNum).getImm() + << markup(">"); +} + +void ARMInstPrinter::printFBits32(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O) { + O << markup("<imm:") << "#" << 32 - MI->getOperand(OpNum).getImm() + << markup(">"); +} + +void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << "[" << MI->getOperand(OpNum).getImm() << "]"; +} + +void ARMInstPrinter::printVectorListOne(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << "}"; +} + +void ARMInstPrinter::printVectorListTwo(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); + unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1); + O << "{"; + printRegName(O, Reg0); + O << ", "; + printRegName(O, Reg1); + O << "}"; +} + +void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); + unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2); + O << "{"; + printRegName(O, Reg0); + O << ", "; + printRegName(O, Reg1); + O << "}"; +} + +void ARMInstPrinter::printVectorListThree(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D<n>. + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 1); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << "}"; +} + +void ARMInstPrinter::printVectorListFour(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D<n>. 
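+  // Editor's aside (illustrative): a four-register list starting at d8
+  // prints as "{d8, d9, d10, d11}".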
+ O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 1); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 3); + O << "}"; +} + +void ARMInstPrinter::printVectorListOneAllLanes(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListTwoAllLanes(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); + unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1); + O << "{"; + printRegName(O, Reg0); + O << "[], "; + printRegName(O, Reg1); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListThreeAllLanes(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D<n>. + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 1); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListFourAllLanes(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D<n>. + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 1); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 3); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListTwoSpacedAllLanes( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); + unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2); + O << "{"; + printRegName(O, Reg0); + O << "[], "; + printRegName(O, Reg1); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListThreeSpacedAllLanes( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D<n>. + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 4); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListFourSpacedAllLanes( + const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D<n>. 
+ O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 4); + O << "[], "; + printRegName(O, MI->getOperand(OpNum).getReg() + 6); + O << "[]}"; +} + +void ARMInstPrinter::printVectorListThreeSpaced(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D<n>. + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 4); + O << "}"; +} + +void ARMInstPrinter::printVectorListFourSpaced(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + // Normally, it's not safe to use register enum values directly with + // addition to get the next register, but for VFP registers, the + // sort order is guaranteed because they're all of the form D<n>. + O << "{"; + printRegName(O, MI->getOperand(OpNum).getReg()); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 2); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 4); + O << ", "; + printRegName(O, MI->getOperand(OpNum).getReg() + 6); + O << "}"; +} diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h new file mode 100644 index 0000000..52f7115 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -0,0 +1,229 @@ +//===- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an ARM MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H +#define LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class ARMInstPrinter : public MCInstPrinter { +public: + ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + + // Autogenerated by tblgen. 
+ void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + + void printSORegRegOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printSORegImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printAddrModeTBB(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrModeTBH(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAM2PostIndexOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template <bool AlwaysPrintImm0> + void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O, + bool AlwaysPrintImm0); + void printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template <bool AlwaysPrintImm0> + void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrMode6Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrMode7Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printMemBOption(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printInstSyncBOption(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printShiftImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + template <unsigned scale> + void printAdrLabelOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printThumbSRImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printThumbITMask(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void 
printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printThumbAddrModeImm5SOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O, unsigned Scale); + void printThumbAddrModeImm5S1Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printThumbAddrModeImm5S2Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printThumbAddrModeImm5S4Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printT2SOOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template <bool AlwaysPrintImm0> + void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template <bool AlwaysPrintImm0> + void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template <bool AlwaysPrintImm0> + void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printT2AddrModeImm0_1020s4Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printSetendOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printCPSIMod(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printCPSIFlag(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMSRMaskOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printBankedRegOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printPredicateOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printSBitModifierOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printRegisterList(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNoHashImmediate(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printPImmediate(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printCImmediate(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printCoprocOptionImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printRotImmOperand(const MCInst *MI, 
unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printModImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printGPRPairOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printPCLabel(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFBits16(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFBits32(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorIndex(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListOne(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListTwo(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListThree(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListFour(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListOneAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListTwoAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListThreeAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListFourAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListTwoSpacedAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printVectorListThreeSpacedAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printVectorListFourSpacedAllLanes(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + void printVectorListThreeSpaced(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printVectorListFourSpaced(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/LICENSE.TXT b/contrib/llvm/lib/Target/ARM/LICENSE.TXT new file mode 100755 index 0000000..68afea1 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/LICENSE.TXT @@ -0,0 +1,47 @@ +ARM Limited + +Software Grant License Agreement ("Agreement") + +Except for the license granted herein to you, ARM Limited ("ARM") reserves all +right, title, and interest in and to the Software (defined below). + +Definition + +"Software" means the code and documentation as well as any original work of +authorship, including any modifications or additions to an existing work, that +is intentionally submitted by ARM to llvm.org (http://llvm.org) ("LLVM") for +inclusion in, or documentation of, any of the products owned or managed by LLVM +(the "Work"). 
For the purposes of this definition, "submitted" means any form of +electronic, verbal, or written communication sent to LLVM or its +representatives, including but not limited to communication on electronic +mailing lists, source code control systems, and issue tracking systems that are +managed by, or on behalf of, LLVM for the purpose of discussing and improving +the Work, but excluding communication that is conspicuously marked otherwise. + +1. Grant of Copyright License. Subject to the terms and conditions of this + Agreement, ARM hereby grants to you and to recipients of the Software + distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge, + royalty-free, irrevocable copyright license to reproduce, prepare derivative + works of, publicly display, publicly perform, sublicense, and distribute the + Software and such derivative works. + +2. Grant of Patent License. Subject to the terms and conditions of this + Agreement, ARM hereby grants you and to recipients of the Software + distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge, + royalty-free, irrevocable (except as stated in this section) patent license + to make, have made, use, offer to sell, sell, import, and otherwise transfer + the Work, where such license applies only to those patent claims licensable + by ARM that are necessarily infringed by ARM's Software alone or by + combination of the Software with the Work to which such Software was + submitted. If any entity institutes patent litigation against ARM or any + other entity (including a cross-claim or counterclaim in a lawsuit) alleging + that ARM's Software, or the Work to which ARM has contributed constitutes + direct or contributory patent infringement, then any patent licenses granted + to that entity under this Agreement for the Software or Work shall terminate + as of the date such litigation is filed. + +Unless required by applicable law or agreed to in writing, the software is +provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +either express or implied, including, without limitation, any warranties or +conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h new file mode 100644 index 0000000..b03cada --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h @@ -0,0 +1,713 @@ +//===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM addressing mode implementation stuff. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H +#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include <cassert> + +namespace llvm { + +/// ARM_AM - ARM Addressing Mode Stuff +namespace ARM_AM { + enum ShiftOpc { + no_shift = 0, + asr, + lsl, + lsr, + ror, + rrx + }; + + enum AddrOpc { + sub = 0, + add + }; + + static inline const char *getAddrOpcStr(AddrOpc Op) { + return Op == sub ? 
"-" : ""; + } + + static inline const char *getShiftOpcStr(ShiftOpc Op) { + switch (Op) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::asr: return "asr"; + case ARM_AM::lsl: return "lsl"; + case ARM_AM::lsr: return "lsr"; + case ARM_AM::ror: return "ror"; + case ARM_AM::rrx: return "rrx"; + } + } + + static inline unsigned getShiftOpcEncoding(ShiftOpc Op) { + switch (Op) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::asr: return 2; + case ARM_AM::lsl: return 0; + case ARM_AM::lsr: return 1; + case ARM_AM::ror: return 3; + } + } + + enum AMSubMode { + bad_am_submode = 0, + ia, + ib, + da, + db + }; + + static inline const char *getAMSubModeStr(AMSubMode Mode) { + switch (Mode) { + default: llvm_unreachable("Unknown addressing sub-mode!"); + case ARM_AM::ia: return "ia"; + case ARM_AM::ib: return "ib"; + case ARM_AM::da: return "da"; + case ARM_AM::db: return "db"; + } + } + + /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits. + /// + static inline unsigned rotr32(unsigned Val, unsigned Amt) { + assert(Amt < 32 && "Invalid rotate amount"); + return (Val >> Amt) | (Val << ((32-Amt)&31)); + } + + /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits. + /// + static inline unsigned rotl32(unsigned Val, unsigned Amt) { + assert(Amt < 32 && "Invalid rotate amount"); + return (Val << Amt) | (Val >> ((32-Amt)&31)); + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #1: shift_operand with registers + //===--------------------------------------------------------------------===// + // + // This 'addressing mode' is used for arithmetic instructions. It can + // represent things like: + // reg + // reg [asr|lsl|lsr|ror|rrx] reg + // reg [asr|lsl|lsr|ror|rrx] imm + // + // This is stored three operands [rega, regb, opc]. The first is the base + // reg, the second is the shift amount (or reg0 if not present or imm). The + // third operand encodes the shift opcode and the imm if a reg isn't present. + // + static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) { + return ShOp | (Imm << 3); + } + static inline unsigned getSORegOffset(unsigned Op) { + return Op >> 3; + } + static inline ShiftOpc getSORegShOp(unsigned Op) { + return (ShiftOpc)(Op & 7); + } + + /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return + /// the 8-bit imm value. + static inline unsigned getSOImmValImm(unsigned Imm) { + return Imm & 0xFF; + } + /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return + /// the rotate amount. + static inline unsigned getSOImmValRot(unsigned Imm) { + return (Imm >> 8) * 2; + } + + /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand, + /// computing the rotate amount to use. If this immediate value cannot be + /// handled with a single shifter-op, determine a good rotate amount that will + /// take a maximal chunk of bits out of the immediate. + static inline unsigned getSOImmValRotate(unsigned Imm) { + // 8-bit (or less) immediates are trivially shifter_operands with a rotate + // of zero. + if ((Imm & ~255U) == 0) return 0; + + // Use CTZ to compute the rotate amount. + unsigned TZ = countTrailingZeros(Imm); + + // Rotate amount must be even. Something like 0x200 must be rotated 8 bits, + // not 9. + unsigned RotAmt = TZ & ~1; + + // If we can handle this spread, return it. + if ((rotr32(Imm, RotAmt) & ~255U) == 0) + return (32-RotAmt)&31; // HW rotates right, not left. 
+ + // For values like 0xF000000F, we should ignore the low 6 bits, then + // retry the hunt. + if (Imm & 63U) { + unsigned TZ2 = countTrailingZeros(Imm & ~63U); + unsigned RotAmt2 = TZ2 & ~1; + if ((rotr32(Imm, RotAmt2) & ~255U) == 0) + return (32-RotAmt2)&31; // HW rotates right, not left. + } + + // Otherwise, we have no way to cover this span of bits with a single + // shifter_op immediate. Return a chunk of bits that will be useful to + // handle. + return (32-RotAmt)&31; // HW rotates right, not left. + } + + /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit + /// into an shifter_operand immediate operand, return the 12-bit encoding for + /// it. If not, return -1. + static inline int getSOImmVal(unsigned Arg) { + // 8-bit (or less) immediates are trivially shifter_operands with a rotate + // of zero. + if ((Arg & ~255U) == 0) return Arg; + + unsigned RotAmt = getSOImmValRotate(Arg); + + // If this cannot be handled with a single shifter_op, bail out. + if (rotr32(~255U, RotAmt) & Arg) + return -1; + + // Encode this correctly. + return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8); + } + + /// isSOImmTwoPartVal - Return true if the specified value can be obtained by + /// or'ing together two SOImmVal's. + static inline bool isSOImmTwoPartVal(unsigned V) { + // If this can be handled with a single shifter_op, bail out. + V = rotr32(~255U, getSOImmValRotate(V)) & V; + if (V == 0) + return false; + + // If this can be handled with two shifter_op's, accept. + V = rotr32(~255U, getSOImmValRotate(V)) & V; + return V == 0; + } + + /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal, + /// return the first chunk of it. + static inline unsigned getSOImmTwoPartFirst(unsigned V) { + return rotr32(255U, getSOImmValRotate(V)) & V; + } + + /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal, + /// return the second chunk of it. + static inline unsigned getSOImmTwoPartSecond(unsigned V) { + // Mask out the first hunk. + V = rotr32(~255U, getSOImmValRotate(V)) & V; + + // Take what's left. + assert(V == (rotr32(255U, getSOImmValRotate(V)) & V)); + return V; + } + + /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed + /// by a left shift. Returns the shift amount to use. + static inline unsigned getThumbImmValShift(unsigned Imm) { + // 8-bit (or less) immediates are trivially immediate operand with a shift + // of zero. + if ((Imm & ~255U) == 0) return 0; + + // Use CTZ to compute the shift amount. + return countTrailingZeros(Imm); + } + + /// isThumbImmShiftedVal - Return true if the specified value can be obtained + /// by left shifting a 8-bit immediate. + static inline bool isThumbImmShiftedVal(unsigned V) { + // If this can be handled with + V = (~255U << getThumbImmValShift(V)) & V; + return V == 0; + } + + /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed + /// by a left shift. Returns the shift amount to use. + static inline unsigned getThumbImm16ValShift(unsigned Imm) { + // 16-bit (or less) immediates are trivially immediate operand with a shift + // of zero. + if ((Imm & ~65535U) == 0) return 0; + + // Use CTZ to compute the shift amount. + return countTrailingZeros(Imm); + } + + /// isThumbImm16ShiftedVal - Return true if the specified value can be + /// obtained by left shifting a 16-bit immediate. 
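+  /// (Editor's note, illustrative: 0x12340000 qualifies, since it is the
+  /// 16-bit value 0x1234 shifted left by 16.)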
+ static inline bool isThumbImm16ShiftedVal(unsigned V) { + // If this can be handled with + V = (~65535U << getThumbImm16ValShift(V)) & V; + return V == 0; + } + + /// getThumbImmNonShiftedVal - If V is a value that satisfies + /// isThumbImmShiftedVal, return the non-shiftd value. + static inline unsigned getThumbImmNonShiftedVal(unsigned V) { + return V >> getThumbImmValShift(V); + } + + + /// getT2SOImmValSplat - Return the 12-bit encoded representation + /// if the specified value can be obtained by splatting the low 8 bits + /// into every other byte or every byte of a 32-bit value. i.e., + /// 00000000 00000000 00000000 abcdefgh control = 0 + /// 00000000 abcdefgh 00000000 abcdefgh control = 1 + /// abcdefgh 00000000 abcdefgh 00000000 control = 2 + /// abcdefgh abcdefgh abcdefgh abcdefgh control = 3 + /// Return -1 if none of the above apply. + /// See ARM Reference Manual A6.3.2. + static inline int getT2SOImmValSplatVal(unsigned V) { + unsigned u, Vs, Imm; + // control = 0 + if ((V & 0xffffff00) == 0) + return V; + + // If the value is zeroes in the first byte, just shift those off + Vs = ((V & 0xff) == 0) ? V >> 8 : V; + // Any passing value only has 8 bits of payload, splatted across the word + Imm = Vs & 0xff; + // Likewise, any passing values have the payload splatted into the 3rd byte + u = Imm | (Imm << 16); + + // control = 1 or 2 + if (Vs == u) + return (((Vs == V) ? 1 : 2) << 8) | Imm; + + // control = 3 + if (Vs == (u | (u << 8))) + return (3 << 8) | Imm; + + return -1; + } + + /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the + /// specified value is a rotated 8-bit value. Return -1 if no rotation + /// encoding is possible. + /// See ARM Reference Manual A6.3.2. + static inline int getT2SOImmValRotateVal(unsigned V) { + unsigned RotAmt = countLeadingZeros(V); + if (RotAmt >= 24) + return -1; + + // If 'Arg' can be handled with a single shifter_op return the value. + if ((rotr32(0xff000000U, RotAmt) & V) == V) + return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7); + + return -1; + } + + /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit + /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit + /// encoding for it. If not, return -1. + /// See ARM Reference Manual A6.3.2. + static inline int getT2SOImmVal(unsigned Arg) { + // If 'Arg' is an 8-bit splat, then get the encoded value. + int Splat = getT2SOImmValSplatVal(Arg); + if (Splat != -1) + return Splat; + + // If 'Arg' can be handled with a single shifter_op return the value. + int Rot = getT2SOImmValRotateVal(Arg); + if (Rot != -1) + return Rot; + + return -1; + } + + static inline unsigned getT2SOImmValRotate(unsigned V) { + if ((V & ~255U) == 0) return 0; + // Use CTZ to compute the rotate amount. + unsigned RotAmt = countTrailingZeros(V); + return (32 - RotAmt) & 31; + } + + static inline bool isT2SOImmTwoPartVal (unsigned Imm) { + unsigned V = Imm; + // Passing values can be any combination of splat values and shifter + // values. If this can be handled with a single shifter or splat, bail + // out. Those should be handled directly, not with a two-part val. + if (getT2SOImmValSplatVal(V) != -1) + return false; + V = rotr32 (~255U, getT2SOImmValRotate(V)) & V; + if (V == 0) + return false; + + // If this can be handled as an immediate, accept. + if (getT2SOImmVal(V) != -1) return true; + + // Likewise, try masking out a splat value first. 
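+    // (Editor's note, illustrative: 0x1001 is a two-part value, since 0x01
+    // is a plain 8-bit immediate and 0x1000 is a rotated 8-bit immediate,
+    // so it is accepted by the shifter check above before the splat retry
+    // below.)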
+ V = Imm; + if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1) + V &= ~0xff00ff00U; + else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1) + V &= ~0x00ff00ffU; + // If what's left can be handled as an immediate, accept. + if (getT2SOImmVal(V) != -1) return true; + + // Otherwise, do not accept. + return false; + } + + static inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) { + assert (isT2SOImmTwoPartVal(Imm) && + "Immedate cannot be encoded as two part immediate!"); + // Try a shifter operand as one part + unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm; + // If the rest is encodable as an immediate, then return it. + if (getT2SOImmVal(V) != -1) return V; + + // Try masking out a splat value first. + if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1) + return Imm & 0xff00ff00U; + + // The other splat is all that's left as an option. + assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1); + return Imm & 0x00ff00ffU; + } + + static inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) { + // Mask out the first hunk + Imm ^= getT2SOImmTwoPartFirst(Imm); + // Return what's left + assert (getT2SOImmVal(Imm) != -1 && + "Unable to encode second part of T2 two part SO immediate"); + return Imm; + } + + + //===--------------------------------------------------------------------===// + // Addressing Mode #2 + //===--------------------------------------------------------------------===// + // + // This is used for most simple load/store instructions. + // + // addrmode2 := reg +/- reg shop imm + // addrmode2 := reg +/- imm12 + // + // The first operand is always a Reg. The second operand is a reg if in + // reg/reg form, otherwise it's reg#0. The third field encodes the operation + // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The + // fourth operand 16-17 encodes the index mode. + // + // If this addressing mode is a frame index (before prolog/epilog insertion + // and code rewriting), this operand will have the form: FI#, reg0, <offs> + // with no shift amount for the frame offset. + // + static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO, + unsigned IdxMode = 0) { + assert(Imm12 < (1 << 12) && "Imm too large!"); + bool isSub = Opc == sub; + return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ; + } + static inline unsigned getAM2Offset(unsigned AM2Opc) { + return AM2Opc & ((1 << 12)-1); + } + static inline AddrOpc getAM2Op(unsigned AM2Opc) { + return ((AM2Opc >> 12) & 1) ? sub : add; + } + static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) { + return (ShiftOpc)((AM2Opc >> 13) & 7); + } + static inline unsigned getAM2IdxMode(unsigned AM2Opc) { + return (AM2Opc >> 16); + } + + + //===--------------------------------------------------------------------===// + // Addressing Mode #3 + //===--------------------------------------------------------------------===// + // + // This is used for sign-extending loads, and load/store-pair instructions. + // + // addrmode3 := reg +/- reg + // addrmode3 := reg +/- imm8 + // + // The first operand is always a Reg. The second operand is a reg if in + // reg/reg form, otherwise it's reg#0. The third field encodes the operation + // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the + // index mode. + + /// getAM3Opc - This function encodes the addrmode3 opc field. 
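+  /// (Editor's note, illustrative: getAM3Opc(sub, 4) == (1 << 8) | 4 ==
+  /// 0x104; getAM3Op(0x104) recovers sub and getAM3Offset(0x104) recovers 4.)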
+ static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset, + unsigned IdxMode = 0) { + bool isSub = Opc == sub; + return ((int)isSub << 8) | Offset | (IdxMode << 9); + } + static inline unsigned char getAM3Offset(unsigned AM3Opc) { + return AM3Opc & 0xFF; + } + static inline AddrOpc getAM3Op(unsigned AM3Opc) { + return ((AM3Opc >> 8) & 1) ? sub : add; + } + static inline unsigned getAM3IdxMode(unsigned AM3Opc) { + return (AM3Opc >> 9); + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #4 + //===--------------------------------------------------------------------===// + // + // This is used for load / store multiple instructions. + // + // addrmode4 := reg, <mode> + // + // The four modes are: + // IA - Increment after + // IB - Increment before + // DA - Decrement after + // DB - Decrement before + // For VFP instructions, only the IA and DB modes are valid. + + static inline AMSubMode getAM4SubMode(unsigned Mode) { + return (AMSubMode)(Mode & 0x7); + } + + static inline unsigned getAM4ModeImm(AMSubMode SubMode) { + return (int)SubMode; + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #5 + //===--------------------------------------------------------------------===// + // + // This is used for coprocessor instructions, such as FP load/stores. + // + // addrmode5 := reg +/- imm8*4 + // + // The first operand is always a Reg. The second operand encodes the + // operation in bit 8 and the immediate in bits 0-7. + + /// getAM5Opc - This function encodes the addrmode5 opc field. + static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) { + bool isSub = Opc == sub; + return ((int)isSub << 8) | Offset; + } + static inline unsigned char getAM5Offset(unsigned AM5Opc) { + return AM5Opc & 0xFF; + } + static inline AddrOpc getAM5Op(unsigned AM5Opc) { + return ((AM5Opc >> 8) & 1) ? sub : add; + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #6 + //===--------------------------------------------------------------------===// + // + // This is used for NEON load / store instructions. + // + // addrmode6 := reg with optional alignment + // + // This is stored in two operands [regaddr, align]. The first is the + // address register. The second operand is the value of the alignment + // specifier in bytes or zero if no explicit alignment. + // Valid alignments depend on the specific instruction. + + //===--------------------------------------------------------------------===// + // NEON Modified Immediates + //===--------------------------------------------------------------------===// + // + // Several NEON instructions (e.g., VMOV) take a "modified immediate" + // vector operand, where a small immediate encoded in the instruction + // specifies a full NEON vector value. These modified immediates are + // represented here as encoded integers. The low 8 bits hold the immediate + // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold + // the "Cmode" field of the instruction. The interfaces below treat the + // Op and Cmode values as a single 5-bit value. 
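+  // Editor's aside (illustrative, not part of the imported source): with
+  // OpCmode == 0xa and Imm8 == 0x12, createNEONModImm below packs 0xa12, and
+  // decodeNEONModImm returns Val == 0x1200 with EltBits == 16 (16-bit
+  // elements with the immediate in the upper byte).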
+ + static inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) { + return (OpCmode << 8) | Val; + } + static inline unsigned getNEONModImmOpCmode(unsigned ModImm) { + return (ModImm >> 8) & 0x1f; + } + static inline unsigned getNEONModImmVal(unsigned ModImm) { + return ModImm & 0xff; + } + + /// decodeNEONModImm - Decode a NEON modified immediate value into the + /// element value and the element size in bits. (If the element size is + /// smaller than the vector, it is splatted into all the elements.) + static inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) { + unsigned OpCmode = getNEONModImmOpCmode(ModImm); + unsigned Imm8 = getNEONModImmVal(ModImm); + uint64_t Val = 0; + + if (OpCmode == 0xe) { + // 8-bit vector elements + Val = Imm8; + EltBits = 8; + } else if ((OpCmode & 0xc) == 0x8) { + // 16-bit vector elements + unsigned ByteNum = (OpCmode & 0x6) >> 1; + Val = Imm8 << (8 * ByteNum); + EltBits = 16; + } else if ((OpCmode & 0x8) == 0) { + // 32-bit vector elements, zero with one byte set + unsigned ByteNum = (OpCmode & 0x6) >> 1; + Val = Imm8 << (8 * ByteNum); + EltBits = 32; + } else if ((OpCmode & 0xe) == 0xc) { + // 32-bit vector elements, one byte with low bits set + unsigned ByteNum = 1 + (OpCmode & 0x1); + Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum))); + EltBits = 32; + } else if (OpCmode == 0x1e) { + // 64-bit vector elements + for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { + if ((ModImm >> ByteNum) & 1) + Val |= (uint64_t)0xff << (8 * ByteNum); + } + EltBits = 64; + } else { + llvm_unreachable("Unsupported NEON immediate"); + } + return Val; + } + + // Generic validation for single-byte immediate (0X00, 00X0, etc). + static inline bool isNEONBytesplat(unsigned Value, unsigned Size) { + assert(Size >= 1 && Size <= 4 && "Invalid size"); + unsigned count = 0; + for (unsigned i = 0; i < Size; ++i) { + if (Value & 0xff) count++; + Value >>= 8; + } + return count == 1; + } + + /// Checks if Value is a correct immediate for instructions like VBIC/VORR. + static inline bool isNEONi16splat(unsigned Value) { + if (Value > 0xffff) + return false; + // i16 value with set bits only in one byte X0 or 0X. + return Value == 0 || isNEONBytesplat(Value, 2); + } + + // Encode NEON 16 bits Splat immediate for instructions like VBIC/VORR + static inline unsigned encodeNEONi16splat(unsigned Value) { + assert(isNEONi16splat(Value) && "Invalid NEON splat value"); + if (Value >= 0x100) + Value = (Value >> 8) | 0xa00; + else + Value |= 0x800; + return Value; + } + + /// Checks if Value is a correct immediate for instructions like VBIC/VORR. + static inline bool isNEONi32splat(unsigned Value) { + // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X. + return Value == 0 || isNEONBytesplat(Value, 4); + } + + /// Encode NEON 32 bits Splat immediate for instructions like VBIC/VORR. + static inline unsigned encodeNEONi32splat(unsigned Value) { + assert(isNEONi32splat(Value) && "Invalid NEON splat value"); + if (Value >= 0x100 && Value <= 0xff00) + Value = (Value >> 8) | 0x200; + else if (Value > 0xffff && Value <= 0xff0000) + Value = (Value >> 16) | 0x400; + else if (Value > 0xffffff) + Value = (Value >> 24) | 0x600; + return Value; + } + + //===--------------------------------------------------------------------===// + // Floating-point Immediates + // + static inline float getFPImmFloat(unsigned Imm) { + // We expect an 8-bit binary encoding of a floating-point number here. 
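+    // Editor's aside (illustrative, not part of the imported source):
+    // Imm == 0x70 (sign 0, exp 7, mantissa 0) decodes to 1.0f, which the
+    // instruction printer renders as "#1.000000e+00".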
+ union { + uint32_t I; + float F; + } FPUnion; + + uint8_t Sign = (Imm >> 7) & 0x1; + uint8_t Exp = (Imm >> 4) & 0x7; + uint8_t Mantissa = Imm & 0xf; + + // 8-bit FP iEEEE Float Encoding + // abcd efgh aBbbbbbc defgh000 00000000 00000000 + // + // where B = NOT(b); + + FPUnion.I = 0; + FPUnion.I |= Sign << 31; + FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; + FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; + FPUnion.I |= (Exp & 0x3) << 23; + FPUnion.I |= Mantissa << 19; + return FPUnion.F; + } + + /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit + /// floating-point value. If the value cannot be represented as an 8-bit + /// floating-point value, then return -1. + static inline int getFP32Imm(const APInt &Imm) { + uint32_t Sign = Imm.lshr(31).getZExtValue() & 1; + int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127 + int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0x7ffff) + return -1; + Mantissa >>= 19; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; + } + + static inline int getFP32Imm(const APFloat &FPImm) { + return getFP32Imm(FPImm.bitcastToAPInt()); + } + + /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit + /// floating-point value. If the value cannot be represented as an 8-bit + /// floating-point value, then return -1. + static inline int getFP64Imm(const APInt &Imm) { + uint64_t Sign = Imm.lshr(63).getZExtValue() & 1; + int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023 + uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL; + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0xffffffffffffULL) + return -1; + Mantissa >>= 48; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; + } + + static inline int getFP64Imm(const APFloat &FPImm) { + return getFP64Imm(FPImm.bitcastToAPInt()); + } + +} // end namespace ARM_AM +} // end namespace llvm + +#endif + diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp new file mode 100644 index 0000000..fa52c93 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -0,0 +1,1089 @@ +//===-- ARMAsmBackend.cpp - ARM Assembler Backend -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/ARMMCTargetDesc.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "MCTargetDesc/ARMAsmBackend.h" +#include "MCTargetDesc/ARMAsmBackendDarwin.h" +#include "MCTargetDesc/ARMAsmBackendELF.h" +#include "MCTargetDesc/ARMAsmBackendWinCOFF.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMFixupKinds.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/MachO.h" +#include "llvm/Support/TargetParser.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +namespace { +class ARMELFObjectWriter : public MCELFObjectTargetWriter { +public: + ARMELFObjectWriter(uint8_t OSABI) + : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_ARM, + /*HasRelocationAddend*/ false) {} +}; + +const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { + const static MCFixupKindInfo InfosLE[ARM::NumTargetFixupKinds] = { + // This table *must* be in the order that the fixup_* kinds are defined in + // ARMFixupKinds.h. + // + // Name Offset (bits) Size (bits) Flags + {"fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_t2_ldst_pcrel_12", 0, 32, + MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, + {"fixup_arm_pcrel_10_unscaled", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_t2_pcrel_10", 0, 32, + MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, + {"fixup_thumb_adr_pcrel_10", 0, 8, + MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, + {"fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_t2_adr_pcrel_12", 0, 32, + MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, + {"fixup_arm_condbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_uncondbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_uncondbl", 0, 24, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_condbl", 0, 24, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_blx", 0, 24, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_thumb_cp", 0, 8, + MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, + {"fixup_arm_thumb_bcc", 0, 8, MCFixupKindInfo::FKF_IsPCRel}, + // movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 + // - 19. 
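+      // (Editor's note, illustrative: ARM-mode MOVW places imm4 in bits
+      // 19:16 and imm12 in bits 11:0, which is why these fixups span 20
+      // bits.)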
+ {"fixup_arm_movt_hi16", 0, 20, 0}, + {"fixup_arm_movw_lo16", 0, 20, 0}, + {"fixup_t2_movt_hi16", 0, 20, 0}, + {"fixup_t2_movw_lo16", 0, 20, 0}, + }; + const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = { + // This table *must* be in the order that the fixup_* kinds are defined in + // ARMFixupKinds.h. + // + // Name Offset (bits) Size (bits) Flags + {"fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_t2_ldst_pcrel_12", 0, 32, + MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, + {"fixup_arm_pcrel_10_unscaled", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_t2_pcrel_10", 0, 32, + MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, + {"fixup_thumb_adr_pcrel_10", 8, 8, + MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, + {"fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_t2_adr_pcrel_12", 0, 32, + MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, + {"fixup_arm_condbranch", 8, 24, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_uncondbranch", 8, 24, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_uncondbl", 8, 24, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_condbl", 8, 24, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_blx", 8, 24, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_arm_thumb_cp", 8, 8, + MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, + {"fixup_arm_thumb_bcc", 8, 8, MCFixupKindInfo::FKF_IsPCRel}, + // movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 + // - 19. + {"fixup_arm_movt_hi16", 12, 20, 0}, + {"fixup_arm_movw_lo16", 12, 20, 0}, + {"fixup_t2_movt_hi16", 12, 20, 0}, + {"fixup_t2_movw_lo16", 12, 20, 0}, + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return (IsLittleEndian ? InfosLE : InfosBE)[Kind - FirstTargetFixupKind]; +} + +void ARMAsmBackend::handleAssemblerFlag(MCAssemblerFlag Flag) { + switch (Flag) { + default: + break; + case MCAF_Code16: + setIsThumb(true); + break; + case MCAF_Code32: + setIsThumb(false); + break; + } +} +} // end anonymous namespace + +unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op) const { + bool HasThumb2 = STI->getFeatureBits()[ARM::FeatureThumb2]; + + switch (Op) { + default: + return Op; + case ARM::tBcc: + return HasThumb2 ? (unsigned)ARM::t2Bcc : Op; + case ARM::tLDRpci: + return HasThumb2 ? (unsigned)ARM::t2LDRpci : Op; + case ARM::tADR: + return HasThumb2 ? (unsigned)ARM::t2ADR : Op; + case ARM::tB: + return HasThumb2 ? 
(unsigned)ARM::t2B : Op; + case ARM::tCBZ: + return ARM::tHINT; + case ARM::tCBNZ: + return ARM::tHINT; + } +} + +bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst) const { + if (getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode()) + return true; + return false; +} + +const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup, + uint64_t Value) const { + switch ((unsigned)Fixup.getKind()) { + case ARM::fixup_arm_thumb_br: { + // Relaxing tB to t2B. tB has a signed 12-bit displacement with the + // low bit being an implied zero. There's an implied +4 offset for the + // branch, so we adjust the other way here to determine what's + // encodable. + // + // Relax if the value is too big for a (signed) i12. + int64_t Offset = int64_t(Value) - 4; + if (Offset > 2046 || Offset < -2048) + return "out of range pc-relative fixup value"; + break; + } + case ARM::fixup_arm_thumb_bcc: { + // Relaxing tBcc to t2Bcc. tBcc has a signed 9-bit displacement with the + // low bit being an implied zero. There's an implied +4 offset for the + // branch, so we adjust the other way here to determine what's + // encodable. + // + // Relax if the value is too big for a (signed) i9. + int64_t Offset = int64_t(Value) - 4; + if (Offset > 254 || Offset < -256) + return "out of range pc-relative fixup value"; + break; + } + case ARM::fixup_thumb_adr_pcrel_10: + case ARM::fixup_arm_thumb_cp: { + // If the immediate is negative, greater than 1020, or not a multiple + // of four, the wide version of the instruction must be used. + int64_t Offset = int64_t(Value) - 4; + if (Offset & 3) + return "misaligned pc-relative fixup value"; + else if (Offset > 1020 || Offset < 0) + return "out of range pc-relative fixup value"; + break; + } + case ARM::fixup_arm_thumb_cb: { + // If we have a Thumb CBZ or CBNZ instruction and its target is the next + // instruction, it is actually out of range for the instruction. + // It will be changed to a NOP. + int64_t Offset = (Value & ~1); + if (Offset == 2) + return "will be converted to nop"; + break; + } + default: + llvm_unreachable("Unexpected fixup kind in reasonForFixupRelaxation()!"); + } + return nullptr; +} + +bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + return reasonForFixupRelaxation(Fixup, Value); +} + +void ARMAsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { + unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode()); + + // Sanity check w/ diagnostic if we get here w/ a bogus instruction. + if (RelaxedOp == Inst.getOpcode()) { + SmallString<256> Tmp; + raw_svector_ostream OS(Tmp); + Inst.dump_pretty(OS); + OS << "\n"; + report_fatal_error("unexpected instruction to relax: " + OS.str()); + } + + // If we are changing Thumb CBZ or CBNZ instruction to a NOP, aka tHINT, we + // have to change the operands too. + if ((Inst.getOpcode() == ARM::tCBZ || Inst.getOpcode() == ARM::tCBNZ) && + RelaxedOp == ARM::tHINT) { + Res.setOpcode(RelaxedOp); + Res.addOperand(MCOperand::createImm(0)); + Res.addOperand(MCOperand::createImm(14)); + Res.addOperand(MCOperand::createReg(0)); + return; + } + + // The rest of the instructions we're relaxing have the same operands. + // We just need to update to the proper opcode.
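// Editorial worked example (values assumed): a tBcc whose target is 300
// bytes past the fixup has Offset = 300 - 4 = 296, beyond the +254 limit in
// reasonForFixupRelaxation(), so (given Thumb2) the 16-bit tBcc is rewritten
// here as the 32-bit t2Bcc.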
+ Res = Inst; + Res.setOpcode(RelaxedOp); +} + +bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8 + const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP + const uint32_t ARMv4_NopEncoding = 0xe1a00000; // using MOV r0,r0 + const uint32_t ARMv6T2_NopEncoding = 0xe320f000; // NOP + if (isThumb()) { + const uint16_t nopEncoding = + hasNOP() ? Thumb2_16bitNopEncoding : Thumb1_16bitNopEncoding; + uint64_t NumNops = Count / 2; + for (uint64_t i = 0; i != NumNops; ++i) + OW->write16(nopEncoding); + if (Count & 1) + OW->write8(0); + return true; + } + // ARM mode + const uint32_t nopEncoding = + hasNOP() ? ARMv6T2_NopEncoding : ARMv4_NopEncoding; + uint64_t NumNops = Count / 4; + for (uint64_t i = 0; i != NumNops; ++i) + OW->write32(nopEncoding); + // FIXME: should this function return false when unable to write exactly + // 'Count' bytes with NOP encodings? + switch (Count % 4) { + default: + break; // No leftover bytes to write + case 1: + OW->write8(0); + break; + case 2: + OW->write16(0); + break; + case 3: + OW->write16(0); + OW->write8(0xa0); + break; + } + + return true; +} + +static uint32_t swapHalfWords(uint32_t Value, bool IsLittleEndian) { + if (IsLittleEndian) { + // Note that the halfwords are stored high first and low second in thumb; + // so we need to swap the fixup value here to map properly. + uint32_t Swapped = (Value & 0xFFFF0000) >> 16; + Swapped |= (Value & 0x0000FFFF) << 16; + return Swapped; + } else + return Value; +} + +static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf, + bool IsLittleEndian) { + uint32_t Value; + + if (IsLittleEndian) { + Value = (SecondHalf & 0xFFFF) << 16; + Value |= (FirstHalf & 0xFFFF); + } else { + Value = (SecondHalf & 0xFFFF); + Value |= (FirstHalf & 0xFFFF) << 16; + } + + return Value; +} + +unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + bool IsPCRel, MCContext *Ctx, + bool IsLittleEndian, + bool IsResolved) const { + unsigned Kind = Fixup.getKind(); + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: + return Value; + case FK_SecRel_2: + return Value; + case FK_SecRel_4: + return Value; + case ARM::fixup_arm_movt_hi16: + if (!IsPCRel) + Value >>= 16; + // Fallthrough + case ARM::fixup_arm_movw_lo16: { + unsigned Hi4 = (Value & 0xF000) >> 12; + unsigned Lo12 = Value & 0x0FFF; + // inst{19-16} = Hi4; + // inst{11-0} = Lo12; + Value = (Hi4 << 16) | (Lo12); + return Value; + } + case ARM::fixup_t2_movt_hi16: + if (!IsPCRel) + Value >>= 16; + // Fallthrough + case ARM::fixup_t2_movw_lo16: { + unsigned Hi4 = (Value & 0xF000) >> 12; + unsigned i = (Value & 0x800) >> 11; + unsigned Mid3 = (Value & 0x700) >> 8; + unsigned Lo8 = Value & 0x0FF; + // inst{19-16} = Hi4; + // inst{26} = i; + // inst{14-12} = Mid3; + // inst{7-0} = Lo8; + Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8); + return swapHalfWords(Value, IsLittleEndian); + } + case ARM::fixup_arm_ldst_pcrel_12: + // ARM PC-relative values are offset by 8. + Value -= 4; + // FALLTHROUGH + case ARM::fixup_t2_ldst_pcrel_12: { + // Offset by 4, adjusted by two due to the half-word ordering of thumb. 
+ Value -= 4; + bool isAdd = true; + if ((int64_t)Value < 0) { + Value = -Value; + isAdd = false; + } + if (Ctx && Value >= 4096) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } + Value |= isAdd << 23; + + // Same addressing mode as fixup_arm_pcrel_10, + // but with 16-bit halfwords swapped. + if (Kind == ARM::fixup_t2_ldst_pcrel_12) + return swapHalfWords(Value, IsLittleEndian); + + return Value; + } + case ARM::fixup_arm_adr_pcrel_12: { + // ARM PC-relative values are offset by 8. + Value -= 8; + unsigned opc = 4; // bits {24-21}. Default to add: 0b0100 + if ((int64_t)Value < 0) { + Value = -Value; + opc = 2; // 0b0010 + } + if (Ctx && ARM_AM::getSOImmVal(Value) == -1) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } + // Encode the immediate and shift the opcode into place. + return ARM_AM::getSOImmVal(Value) | (opc << 21); + } + + case ARM::fixup_t2_adr_pcrel_12: { + Value -= 4; + unsigned opc = 0; + if ((int64_t)Value < 0) { + Value = -Value; + opc = 5; + } + + uint32_t out = (opc << 21); + out |= (Value & 0x800) << 15; + out |= (Value & 0x700) << 4; + out |= (Value & 0x0FF); + + return swapHalfWords(out, IsLittleEndian); + } + + case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: + case ARM::fixup_arm_uncondbl: + case ARM::fixup_arm_condbl: + case ARM::fixup_arm_blx: + // These values don't encode the low two bits since they're always zero. + // Offset by 8 just as above. + if (const MCSymbolRefExpr *SRE = + dyn_cast<MCSymbolRefExpr>(Fixup.getValue())) + if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_TLSCALL) + return 0; + return 0xffffff & ((Value - 8) >> 2); + case ARM::fixup_t2_uncondbranch: { + Value = Value - 4; + Value >>= 1; // Low bit is not encoded. + + uint32_t out = 0; + bool I = Value & 0x800000; + bool J1 = Value & 0x400000; + bool J2 = Value & 0x200000; + J1 ^= I; + J2 ^= I; + + out |= I << 26; // S bit + out |= !J1 << 13; // J1 bit + out |= !J2 << 11; // J2 bit + out |= (Value & 0x1FF800) << 5; // imm10 field + out |= (Value & 0x0007FF); // imm11 field + + return swapHalfWords(out, IsLittleEndian); + } + case ARM::fixup_t2_condbranch: { + Value = Value - 4; + Value >>= 1; // Low bit is not encoded. + + uint64_t out = 0; + out |= (Value & 0x80000) << 7; // S bit + out |= (Value & 0x40000) >> 7; // J2 bit + out |= (Value & 0x20000) >> 4; // J1 bit + out |= (Value & 0x1F800) << 5; // imm6 field + out |= (Value & 0x007FF); // imm11 field + + return swapHalfWords(out, IsLittleEndian); + } + case ARM::fixup_arm_thumb_bl: { + // The value doesn't encode the low bit (always zero) and is offset by + // four. The 32-bit immediate value is encoded as + // imm32 = SignExtend(S:I1:I2:imm10:imm11:0) + // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S). + // The value is encoded into disjoint bit positions in the destination + // opcode. x = unchanged, I = immediate value bit, S = sign extension bit, + // J = either J1 or J2 bit + // + // BL: xxxxxSIIIIIIIIII xxJxJIIIIIIIIIII + // + // Note that the halfwords are stored high first, low second; so we need + // to transpose the fixup value here to map properly.
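// Editorial worked example (Value assumed for illustration): for
// Value = 0x101C, offset = (0x101C - 4) >> 1 = 0x80C, so S = I1 = I2 = 0 and
// J1 = J2 = 1; imm10 = 0x001 and imm11 = 0x00C. That yields
// FirstHalf = 0x0001 and SecondHalf = 0x280C, which joinHalfWords() packs
// into 0x280C0001 for little-endian output.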
+ uint32_t offset = (Value - 4) >> 1; + uint32_t signBit = (offset & 0x800000) >> 23; + uint32_t I1Bit = (offset & 0x400000) >> 22; + uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit; + uint32_t I2Bit = (offset & 0x200000) >> 21; + uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit; + uint32_t imm10Bits = (offset & 0x1FF800) >> 11; + uint32_t imm11Bits = (offset & 0x000007FF); + + uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits); + uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) | + (uint16_t)imm11Bits); + return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian); + } + case ARM::fixup_arm_thumb_blx: { + // The value doesn't encode the low two bits (always zero) and is offset by + // four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as + // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00) + // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S). + // The value is encoded into disjoint bit positions in the destination + // opcode. x = unchanged, I = immediate value bit, S = sign extension bit, + // J = either J1 or J2 bit, 0 = zero. + // + // BLX: xxxxxSIIIIIIIIII xxJxJIIIIIIIIII0 + // + // Note that the halfwords are stored high first, low second; so we need + // to transpose the fixup value here to map properly. + uint32_t offset = (Value - 2) >> 2; + if (const MCSymbolRefExpr *SRE = + dyn_cast<MCSymbolRefExpr>(Fixup.getValue())) + if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_TLSCALL) + offset = 0; + uint32_t signBit = (offset & 0x400000) >> 22; + uint32_t I1Bit = (offset & 0x200000) >> 21; + uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit; + uint32_t I2Bit = (offset & 0x100000) >> 20; + uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit; + uint32_t imm10HBits = (offset & 0xFFC00) >> 10; + uint32_t imm10LBits = (offset & 0x3FF); + + uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits); + uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) | + ((uint16_t)imm10LBits) << 1); + return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian); + } + case ARM::fixup_thumb_adr_pcrel_10: + case ARM::fixup_arm_thumb_cp: + // On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we + // could have an error on our hands. + if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx->reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + } + // Offset by 4, and don't encode the low two bits. + return ((Value - 4) >> 2) & 0xff; + case ARM::fixup_arm_thumb_cb: { + // Offset by 4 and don't encode the lower bit, which is always 0. + // FIXME: diagnose if no Thumb2 + uint32_t Binary = (Value - 4) >> 1; + return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3); + } + case ARM::fixup_arm_thumb_br: + // Offset by 4 and don't encode the lower bit, which is always 0. + if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx->reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + } + return ((Value - 4) >> 1) & 0x7ff; + case ARM::fixup_arm_thumb_bcc: + // Offset by 4 and don't encode the lower bit, which is always 0. 
+ if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx->reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + } + return ((Value - 4) >> 1) & 0xff; + case ARM::fixup_arm_pcrel_10_unscaled: { + Value = Value - 8; // ARM fixups offset by an additional word and don't + // need to adjust for the half-word ordering. + bool isAdd = true; + if ((int64_t)Value < 0) { + Value = -Value; + isAdd = false; + } + // The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8]. + if (Ctx && Value >= 256) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } + Value = (Value & 0xf) | ((Value & 0xf0) << 4); + return Value | (isAdd << 23); + } + case ARM::fixup_arm_pcrel_10: + Value = Value - 4; // ARM fixups offset by an additional word and don't + // need to adjust for the half-word ordering. + // Fall through. + case ARM::fixup_t2_pcrel_10: { + // Offset by 4, adjusted by two due to the half-word ordering of thumb. + Value = Value - 4; + bool isAdd = true; + if ((int64_t)Value < 0) { + Value = -Value; + isAdd = false; + } + // These values don't encode the low two bits since they're always zero. + Value >>= 2; + if (Ctx && Value >= 256) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } + Value |= isAdd << 23; + + // Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords + // swapped. + if (Kind == ARM::fixup_t2_pcrel_10) + return swapHalfWords(Value, IsLittleEndian); + + return Value; + } + } +} + +void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFixup &Fixup, + const MCFragment *DF, + const MCValue &Target, uint64_t &Value, + bool &IsResolved) { + const MCSymbolRefExpr *A = Target.getSymA(); + const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; + // Some fixups to thumb function symbols need the low bit (thumb bit) + // twiddled. + if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_t2_ldst_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_arm_adr_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 && + (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) { + if (Sym) { + if (Asm.isThumbFunc(Sym)) + Value |= 1; + } + } + if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { + assert(Sym && "How did we resolve this?"); + + // If the symbol is external the linker will handle it. + // FIXME: Should we handle it as an optimization? + + // If the symbol is out of range, produce a relocation and hope the + // linker can handle it. GNU AS produces an error in this case. + if (Sym->isExternal() || Value >= 0x400004) + IsResolved = false; + } + // We must always generate a relocation for BL/BLX instructions if we have + // a symbol to reference, as the linker relies on knowing the destination + // symbol's thumb-ness to get interworking right. + if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx || + (unsigned)Fixup.getKind() == ARM::fixup_arm_blx || + (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl || + (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl)) + IsResolved = false; + + // Try to get the encoded value for the fixup as if we're mapping it into + // the instruction. This allows adjustFixupValue() to issue a diagnostic + // if the value is invalid.
+ (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(), + IsLittleEndian, IsResolved); +} + +/// getFixupKindNumBytes - The number of bytes the fixup may change. +static unsigned getFixupKindNumBytes(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + + case FK_Data_1: + case ARM::fixup_arm_thumb_bcc: + case ARM::fixup_arm_thumb_cp: + case ARM::fixup_thumb_adr_pcrel_10: + return 1; + + case FK_Data_2: + case ARM::fixup_arm_thumb_br: + case ARM::fixup_arm_thumb_cb: + return 2; + + case ARM::fixup_arm_pcrel_10_unscaled: + case ARM::fixup_arm_ldst_pcrel_12: + case ARM::fixup_arm_pcrel_10: + case ARM::fixup_arm_adr_pcrel_12: + case ARM::fixup_arm_uncondbl: + case ARM::fixup_arm_condbl: + case ARM::fixup_arm_blx: + case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: + return 3; + + case FK_Data_4: + case ARM::fixup_t2_ldst_pcrel_12: + case ARM::fixup_t2_condbranch: + case ARM::fixup_t2_uncondbranch: + case ARM::fixup_t2_pcrel_10: + case ARM::fixup_t2_adr_pcrel_12: + case ARM::fixup_arm_thumb_bl: + case ARM::fixup_arm_thumb_blx: + case ARM::fixup_arm_movt_hi16: + case ARM::fixup_arm_movw_lo16: + case ARM::fixup_t2_movt_hi16: + case ARM::fixup_t2_movw_lo16: + return 4; + + case FK_SecRel_2: + return 2; + case FK_SecRel_4: + return 4; + } +} + +/// getFixupKindContainerSizeBytes - The number of bytes of the +/// container involved in big endian. +static unsigned getFixupKindContainerSizeBytes(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + + case FK_Data_1: + return 1; + case FK_Data_2: + return 2; + case FK_Data_4: + return 4; + + case ARM::fixup_arm_thumb_bcc: + case ARM::fixup_arm_thumb_cp: + case ARM::fixup_thumb_adr_pcrel_10: + case ARM::fixup_arm_thumb_br: + case ARM::fixup_arm_thumb_cb: + // Instruction size is 2 bytes. + return 2; + + case ARM::fixup_arm_pcrel_10_unscaled: + case ARM::fixup_arm_ldst_pcrel_12: + case ARM::fixup_arm_pcrel_10: + case ARM::fixup_arm_adr_pcrel_12: + case ARM::fixup_arm_uncondbl: + case ARM::fixup_arm_condbl: + case ARM::fixup_arm_blx: + case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: + case ARM::fixup_t2_ldst_pcrel_12: + case ARM::fixup_t2_condbranch: + case ARM::fixup_t2_uncondbranch: + case ARM::fixup_t2_pcrel_10: + case ARM::fixup_t2_adr_pcrel_12: + case ARM::fixup_arm_thumb_bl: + case ARM::fixup_arm_thumb_blx: + case ARM::fixup_arm_movt_hi16: + case ARM::fixup_arm_movw_lo16: + case ARM::fixup_t2_movt_hi16: + case ARM::fixup_t2_movw_lo16: + // Instruction size is 4 bytes. + return 4; + } +} + +void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); + Value = + adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian, true); + if (!Value) + return; // Doesn't change encoding. + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // Used to point to big endian bytes. + unsigned FullSizeBytes; + if (!IsLittleEndian) { + FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind()); + assert((Offset + FullSizeBytes) <= DataSize && "Invalid fixup size!"); + assert(NumBytes <= FullSizeBytes && "Invalid fixup size!"); + } + + // For each byte of the fragment that the fixup touches, mask in the bits from + // the fixup value. The Value has been "split up" into the appropriate + // bitfields above. 
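// Editorial worked example: a little-endian fixup_arm_uncondbranch has
// NumBytes = 3, so with Value = 0x00ABCDEF the loop below ORs 0xEF, 0xCD and
// 0xAB into Data[Offset], Data[Offset+1] and Data[Offset+2], leaving the
// fourth byte (the condition/opcode field) untouched.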
+ for (unsigned i = 0; i != NumBytes; ++i) { + unsigned Idx = IsLittleEndian ? i : (FullSizeBytes - 1 - i); + Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + } +} + +namespace CU { + +/// \brief Compact unwind encoding values. +enum CompactUnwindEncodings { + UNWIND_ARM_MODE_MASK = 0x0F000000, + UNWIND_ARM_MODE_FRAME = 0x01000000, + UNWIND_ARM_MODE_FRAME_D = 0x02000000, + UNWIND_ARM_MODE_DWARF = 0x04000000, + + UNWIND_ARM_FRAME_STACK_ADJUST_MASK = 0x00C00000, + + UNWIND_ARM_FRAME_FIRST_PUSH_R4 = 0x00000001, + UNWIND_ARM_FRAME_FIRST_PUSH_R5 = 0x00000002, + UNWIND_ARM_FRAME_FIRST_PUSH_R6 = 0x00000004, + + UNWIND_ARM_FRAME_SECOND_PUSH_R8 = 0x00000008, + UNWIND_ARM_FRAME_SECOND_PUSH_R9 = 0x00000010, + UNWIND_ARM_FRAME_SECOND_PUSH_R10 = 0x00000020, + UNWIND_ARM_FRAME_SECOND_PUSH_R11 = 0x00000040, + UNWIND_ARM_FRAME_SECOND_PUSH_R12 = 0x00000080, + + UNWIND_ARM_FRAME_D_REG_COUNT_MASK = 0x00000F00, + + UNWIND_ARM_DWARF_SECTION_OFFSET = 0x00FFFFFF +}; + +} // end CU namespace + +/// Generate compact unwind encoding for the function based on the CFI +/// instructions. If the CFI instructions describe a frame that cannot be +/// encoded in compact unwind, the method returns UNWIND_ARM_MODE_DWARF which +/// tells the runtime to fall back and unwind using DWARF. +uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const { + DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "generateCU()\n"); + // Only armv7k uses CFI-based unwinding. + if (Subtype != MachO::CPU_SUBTYPE_ARM_V7K) + return 0; + // No .cfi directives means no frame. + if (Instrs.empty()) + return 0; + // Start off assuming CFA is at SP+0. + int CFARegister = ARM::SP; + int CFARegisterOffset = 0; + // Mark savable registers as initially unsaved. + DenseMap<unsigned, int> RegOffsets; + int FloatRegCount = 0; + // Process each .cfi directive and build up compact unwind info. + for (size_t i = 0, e = Instrs.size(); i != e; ++i) { + int Reg; + const MCCFIInstruction &Inst = Instrs[i]; + switch (Inst.getOperation()) { + case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa + CFARegisterOffset = -Inst.getOffset(); + CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + break; + case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset + CFARegisterOffset = -Inst.getOffset(); + break; + case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register + CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + break; + case MCCFIInstruction::OpOffset: // DW_CFA_offset + Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) + RegOffsets[Reg] = Inst.getOffset(); + else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) { + RegOffsets[Reg] = Inst.getOffset(); + ++FloatRegCount; + } else { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << ".cfi_offset on unknown register=" + << Inst.getRegister() << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + break; + case MCCFIInstruction::OpRelOffset: // DW_CFA_advance_loc + // Ignore + break; + default: + // Directive not convertible to compact unwind, bail out. + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() + << "CFI directive not compatible with compact " + "unwind encoding, opcode=" << Inst.getOperation() + << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + break; + } + } + + // If no frame set up, return no unwind info. + if ((CFARegister == ARM::SP) && (CFARegisterOffset == 0)) + return 0; + + // Verify standard frame (lr/r7) was used.
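// Editorial worked example (assumed prologue, not from the patch): for
//   push {r4-r7, lr}
//   add  r7, sp, #12
// the CFI describes CFA = r7+8 with lr at -4, r7 at -8 and r4-r6 at
// -20/-16/-12, so StackAdjust = 0, the checks below all pass, and the
// function returns UNWIND_ARM_MODE_FRAME | FIRST_PUSH_R4|R5|R6 = 0x01000007.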
+ if (CFARegister != ARM::R7) { + DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "frame register is " + << CFARegister + << " instead of r7\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + int StackAdjust = CFARegisterOffset - 8; + if (RegOffsets.lookup(ARM::LR) != (-4 - StackAdjust)) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() + << "LR not saved as standard frame, StackAdjust=" + << StackAdjust + << ", CFARegisterOffset=" << CFARegisterOffset + << ", lr save at offset=" << RegOffsets[14] << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + if (RegOffsets.lookup(ARM::R7) != (-8 - StackAdjust)) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << "r7 not saved as standard frame\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + uint32_t CompactUnwindEncoding = CU::UNWIND_ARM_MODE_FRAME; + + // If var-args are used, there may be a stack adjust required. + switch (StackAdjust) { + case 0: + break; + case 4: + CompactUnwindEncoding |= 0x00400000; + break; + case 8: + CompactUnwindEncoding |= 0x00800000; + break; + case 12: + CompactUnwindEncoding |= 0x00C00000; + break; + default: + DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() + << ".cfi_def_cfa stack adjust (" + << StackAdjust << ") out of range\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + + // If r6 is saved, it must be right below r7. + static struct { + unsigned Reg; + unsigned Encoding; + } GPRCSRegs[] = {{ARM::R6, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R6}, + {ARM::R5, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R5}, + {ARM::R4, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R4}, + {ARM::R12, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R12}, + {ARM::R11, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R11}, + {ARM::R10, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R10}, + {ARM::R9, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R9}, + {ARM::R8, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R8}}; + + int CurOffset = -8 - StackAdjust; + for (auto CSReg : GPRCSRegs) { + auto Offset = RegOffsets.find(CSReg.Reg); + if (Offset == RegOffsets.end()) + continue; + + int RegOffset = Offset->second; + if (RegOffset != CurOffset - 4) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << MRI.getName(CSReg.Reg) << " saved at " + << RegOffset << " but only supported at " + << CurOffset << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + CompactUnwindEncoding |= CSReg.Encoding; + CurOffset -= 4; + } + + // If no floats saved, we are done. + if (FloatRegCount == 0) + return CompactUnwindEncoding; + + // Switch mode to include D register saving. + CompactUnwindEncoding &= ~CU::UNWIND_ARM_MODE_MASK; + CompactUnwindEncoding |= CU::UNWIND_ARM_MODE_FRAME_D; + + // FIXME: supporting more than 4 saved D-registers compactly would be trivial, + // but needs coordination with the linker and libunwind. + if (FloatRegCount > 4) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << "unsupported number of D registers saved (" + << FloatRegCount << ")\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + + // Floating point registers must either be saved sequentially, or we defer to + // DWARF. No gaps allowed here so check that each saved d-register is + // precisely where it should be. 
+ static unsigned FPRCSRegs[] = { ARM::D8, ARM::D10, ARM::D12, ARM::D14 }; + for (int Idx = FloatRegCount - 1; Idx >= 0; --Idx) { + auto Offset = RegOffsets.find(FPRCSRegs[Idx]); + if (Offset == RegOffsets.end()) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << FloatRegCount << " D-regs saved, but " + << MRI.getName(FPRCSRegs[Idx]) + << " not saved\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } else if (Offset->second != CurOffset - 8) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << FloatRegCount << " D-regs saved, but " + << MRI.getName(FPRCSRegs[Idx]) + << " saved at " << Offset->second + << ", expected at " << CurOffset - 8 + << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + CurOffset -= 8; + } + + return CompactUnwindEncoding | ((FloatRegCount - 1) << 8); +} + +static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) { + unsigned AK = ARM::parseArch(Arch); + switch (AK) { + default: + return MachO::CPU_SUBTYPE_ARM_V7; + case ARM::AK_ARMV4T: + return MachO::CPU_SUBTYPE_ARM_V4T; + case ARM::AK_ARMV5T: + case ARM::AK_ARMV5TE: + case ARM::AK_ARMV5TEJ: + return MachO::CPU_SUBTYPE_ARM_V5; + case ARM::AK_ARMV6: + case ARM::AK_ARMV6K: + return MachO::CPU_SUBTYPE_ARM_V6; + case ARM::AK_ARMV7A: + return MachO::CPU_SUBTYPE_ARM_V7; + case ARM::AK_ARMV7S: + return MachO::CPU_SUBTYPE_ARM_V7S; + case ARM::AK_ARMV7K: + return MachO::CPU_SUBTYPE_ARM_V7K; + case ARM::AK_ARMV6M: + return MachO::CPU_SUBTYPE_ARM_V6M; + case ARM::AK_ARMV7M: + return MachO::CPU_SUBTYPE_ARM_V7M; + case ARM::AK_ARMV7EM: + return MachO::CPU_SUBTYPE_ARM_V7EM; + } +} + +MCAsmBackend *llvm::createARMAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TheTriple, StringRef CPU, + bool isLittle) { + switch (TheTriple.getObjectFormat()) { + default: + llvm_unreachable("unsupported object format"); + case Triple::MachO: { + MachO::CPUSubTypeARM CS = getMachOSubTypeFromArch(TheTriple.getArchName()); + return new ARMAsmBackendDarwin(T, TheTriple, MRI, CS); + } + case Triple::COFF: + assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported"); + return new ARMAsmBackendWinCOFF(T, TheTriple); + case Triple::ELF: + assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target"); + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + return new ARMAsmBackendELF(T, TheTriple, OSABI, isLittle); + } +} + +MCAsmBackend *llvm::createARMLEAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU) { + return createARMAsmBackend(T, MRI, TT, CPU, true); +} + +MCAsmBackend *llvm::createARMBEAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU) { + return createARMAsmBackend(T, MRI, TT, CPU, false); +} + +MCAsmBackend *llvm::createThumbLEAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU) { + return createARMAsmBackend(T, MRI, TT, CPU, true); +} + +MCAsmBackend *llvm::createThumbBEAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU) { + return createARMAsmBackend(T, MRI, TT, CPU, false); +} diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h new file mode 100644 index 0000000..28a6213 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -0,0 +1,79 @@ +//===-- ARMAsmBackend.h - ARM Assembler Backend -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the 
University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKEND_H +#define LLVM_LIB_TARGET_ARM_ARMASMBACKEND_H + +#include "MCTargetDesc/ARMFixupKinds.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCSubtargetInfo.h" + +using namespace llvm; + +namespace { + +class ARMAsmBackend : public MCAsmBackend { + const MCSubtargetInfo *STI; + bool isThumbMode; // Currently emitting Thumb code. + bool IsLittleEndian; // Big or little endian. +public: + ARMAsmBackend(const Target &T, const Triple &TT, bool IsLittle) + : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")), + isThumbMode(TT.getArchName().startswith("thumb")), + IsLittleEndian(IsLittle) {} + + ~ARMAsmBackend() override { delete STI; } + + unsigned getNumFixupKinds() const override { + return ARM::NumTargetFixupKinds; + } + + bool hasNOP() const { return STI->getFeatureBits()[ARM::HasV6T2Ops]; } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; + + /// processFixupValue - Target hook to process the literal value of a fixup + /// if necessary. + void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFixup &Fixup, const MCFragment *DF, + const MCValue &Target, uint64_t &Value, + bool &IsResolved) override; + + unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel, + MCContext *Ctx, bool IsLittleEndian, + bool IsResolved) const; + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + + unsigned getRelaxedOpcode(unsigned Op) const; + + bool mayNeedRelaxation(const MCInst &Inst) const override; + + const char *reasonForFixupRelaxation(const MCFixup &Fixup, + uint64_t Value) const; + + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override; + + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; + + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; + + void handleAssemblerFlag(MCAssemblerFlag Flag) override; + + unsigned getPointerSize() const { return 4; } + bool isThumb() const { return isThumbMode; } + void setIsThumb(bool it) { isThumbMode = it; } + bool isLittle() const { return IsLittleEndian; } +}; +} // end anonymous namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h new file mode 100644 index 0000000..995dd0f --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h @@ -0,0 +1,38 @@ +//===-- ARMAsmBackendDarwin.h ARM Asm Backend Darwin ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H +#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H + +#include "llvm/Support/MachO.h" + +using namespace llvm; + +namespace { +class ARMAsmBackendDarwin : public ARMAsmBackend { + const MCRegisterInfo &MRI; +public: + const MachO::CPUSubTypeARM Subtype; + ARMAsmBackendDarwin(const Target &T, const Triple &TT, + const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st) + : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), MRI(MRI), Subtype(st) { + HasDataInCodeSupport = true; + } + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createARMMachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPU_TYPE_ARM, + Subtype); + } + + uint32_t generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const override; +}; +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h new file mode 100644 index 0000000..68b12ed --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h @@ -0,0 +1,28 @@ +//===-- ARMAsmBackendELF.h ARM Asm Backend ELF -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ELFARMASMBACKEND_H +#define LLVM_LIB_TARGET_ARM_ELFARMASMBACKEND_H + +using namespace llvm; +namespace { +class ARMAsmBackendELF : public ARMAsmBackend { +public: + uint8_t OSABI; + ARMAsmBackendELF(const Target &T, const Triple &TT, uint8_t OSABI, + bool IsLittle) + : ARMAsmBackend(T, TT, IsLittle), OSABI(OSABI) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createARMELFObjectWriter(OS, OSABI, isLittle()); + } +}; +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h new file mode 100644 index 0000000..170f59a --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h @@ -0,0 +1,26 @@ +//===-- ARMAsmBackendWinCOFF.h - ARM Asm Backend WinCOFF --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
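For reference, the packed word returned by generateCompactUnwindEncoding() above can be unpacked with a few shifts. This is an editorial sketch — dumpEncoding is a made-up helper, with the field layout copied from the CU:: enum earlier in this patch:

#include <cstdint>
#include <cstdio>

// Field layout mirrored from CU::CompactUnwindEncodings above.
static const uint32_t ModeMask        = 0x0F000000;
static const uint32_t ModeFrameD      = 0x02000000;
static const uint32_t ModeDwarf       = 0x04000000;
static const uint32_t StackAdjustMask = 0x00C00000;
static const uint32_t DRegCountMask   = 0x00000F00;

// Print the fields of one compact unwind encoding word.
static void dumpEncoding(uint32_t E) {
  if ((E & ModeMask) == ModeDwarf) {
    std::puts("DWARF fallback");
    return;
  }
  std::printf("mode=%s stack-adjust=%u GPR-push-mask=0x%02x D-reg-count=%u\n",
              (E & ModeMask) == ModeFrameD ? "frame+D" : "frame",
              ((E & StackAdjustMask) >> 22) * 4, // vararg adjust: 0/4/8/12
              E & 0xFFu,                         // r4-r6 / r8-r12 push bits
              (E & ModeMask) == ModeFrameD ? ((E & DRegCountMask) >> 8) + 1
                                           : 0u);
}

int main() {
  dumpEncoding(0x01000007); // standard r7 frame saving r4-r6
  return 0;
}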
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDWINCOFF_H +#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDWINCOFF_H + +using namespace llvm; + +namespace { +class ARMAsmBackendWinCOFF : public ARMAsmBackend { +public: + ARMAsmBackendWinCOFF(const Target &T, const Triple &TheTriple) + : ARMAsmBackend(T, TheTriple, true) {} + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createARMWinCOFFObjectWriter(OS, /*Is64Bit=*/false); + } +}; +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h new file mode 100644 index 0000000..4289a73 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -0,0 +1,464 @@ +//===-- ARMBaseInfo.h - Top level definitions for ARM -------- --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the ARM target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMBASEINFO_H +#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMBASEINFO_H + +#include "ARMMCTargetDesc.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + +// Enums corresponding to ARM condition codes +namespace ARMCC { + // The CondCodes constants map directly to the 4-bit encoding of the + // condition field for predicated instructions. 
+ enum CondCodes { // Meaning (integer) Meaning (floating-point) + EQ, // Equal Equal + NE, // Not equal Not equal, or unordered + HS, // Carry set >, ==, or unordered + LO, // Carry clear Less than + MI, // Minus, negative Less than + PL, // Plus, positive or zero >, ==, or unordered + VS, // Overflow Unordered + VC, // No overflow Not unordered + HI, // Unsigned higher Greater than, or unordered + LS, // Unsigned lower or same Less than or equal + GE, // Greater than or equal Greater than or equal + LT, // Less than Less than, or unordered + GT, // Greater than Greater than + LE, // Less than or equal <, ==, or unordered + AL // Always (unconditional) Always (unconditional) + }; + + inline static CondCodes getOppositeCondition(CondCodes CC) { + switch (CC) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return NE; + case NE: return EQ; + case HS: return LO; + case LO: return HS; + case MI: return PL; + case PL: return MI; + case VS: return VC; + case VC: return VS; + case HI: return LS; + case LS: return HI; + case GE: return LT; + case LT: return GE; + case GT: return LE; + case LE: return GT; + } + } +} // namespace ARMCC + +inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { + switch (CC) { + case ARMCC::EQ: return "eq"; + case ARMCC::NE: return "ne"; + case ARMCC::HS: return "hs"; + case ARMCC::LO: return "lo"; + case ARMCC::MI: return "mi"; + case ARMCC::PL: return "pl"; + case ARMCC::VS: return "vs"; + case ARMCC::VC: return "vc"; + case ARMCC::HI: return "hi"; + case ARMCC::LS: return "ls"; + case ARMCC::GE: return "ge"; + case ARMCC::LT: return "lt"; + case ARMCC::GT: return "gt"; + case ARMCC::LE: return "le"; + case ARMCC::AL: return "al"; + } + llvm_unreachable("Unknown condition code"); +} + +namespace ARM_PROC { + enum IMod { + IE = 2, + ID = 3 + }; + + enum IFlags { + F = 1, + I = 2, + A = 4 + }; + + inline static const char *IFlagsToString(unsigned val) { + switch (val) { + default: llvm_unreachable("Unknown iflags operand"); + case F: return "f"; + case I: return "i"; + case A: return "a"; + } + } + + inline static const char *IModToString(unsigned val) { + switch (val) { + default: llvm_unreachable("Unknown imod operand"); + case IE: return "ie"; + case ID: return "id"; + } + } +} + +namespace ARM_MB { + // The Memory Barrier Option constants map directly to the 4-bit encoding of + // the option field for memory barrier operations. + enum MemBOpt { + RESERVED_0 = 0, + OSHLD = 1, + OSHST = 2, + OSH = 3, + RESERVED_4 = 4, + NSHLD = 5, + NSHST = 6, + NSH = 7, + RESERVED_8 = 8, + ISHLD = 9, + ISHST = 10, + ISH = 11, + RESERVED_12 = 12, + LD = 13, + ST = 14, + SY = 15 + }; + + inline static const char *MemBOptToString(unsigned val, bool HasV8) { + switch (val) { + default: llvm_unreachable("Unknown memory operation"); + case SY: return "sy"; + case ST: return "st"; + case LD: return HasV8 ? "ld" : "#0xd"; + case RESERVED_12: return "#0xc"; + case ISH: return "ish"; + case ISHST: return "ishst"; + case ISHLD: return HasV8 ? "ishld" : "#0x9"; + case RESERVED_8: return "#0x8"; + case NSH: return "nsh"; + case NSHST: return "nshst"; + case NSHLD: return HasV8 ? "nshld" : "#0x5"; + case RESERVED_4: return "#0x4"; + case OSH: return "osh"; + case OSHST: return "oshst"; + case OSHLD: return HasV8 ? 
"oshld" : "#0x1"; + case RESERVED_0: return "#0x0"; + } + } +} // namespace ARM_MB + +namespace ARM_ISB { + enum InstSyncBOpt { + RESERVED_0 = 0, + RESERVED_1 = 1, + RESERVED_2 = 2, + RESERVED_3 = 3, + RESERVED_4 = 4, + RESERVED_5 = 5, + RESERVED_6 = 6, + RESERVED_7 = 7, + RESERVED_8 = 8, + RESERVED_9 = 9, + RESERVED_10 = 10, + RESERVED_11 = 11, + RESERVED_12 = 12, + RESERVED_13 = 13, + RESERVED_14 = 14, + SY = 15 + }; + + inline static const char *InstSyncBOptToString(unsigned val) { + switch (val) { + default: + llvm_unreachable("Unknown memory operation"); + case RESERVED_0: return "#0x0"; + case RESERVED_1: return "#0x1"; + case RESERVED_2: return "#0x2"; + case RESERVED_3: return "#0x3"; + case RESERVED_4: return "#0x4"; + case RESERVED_5: return "#0x5"; + case RESERVED_6: return "#0x6"; + case RESERVED_7: return "#0x7"; + case RESERVED_8: return "#0x8"; + case RESERVED_9: return "#0x9"; + case RESERVED_10: return "#0xa"; + case RESERVED_11: return "#0xb"; + case RESERVED_12: return "#0xc"; + case RESERVED_13: return "#0xd"; + case RESERVED_14: return "#0xe"; + case SY: return "sy"; + } + } +} // namespace ARM_ISB + +/// isARMLowRegister - Returns true if the register is a low register (r0-r7). +/// +static inline bool isARMLowRegister(unsigned Reg) { + using namespace ARM; + switch (Reg) { + case R0: case R1: case R2: case R3: + case R4: case R5: case R6: case R7: + return true; + default: + return false; + } +} + +/// ARMII - This namespace holds all of the target specific flags that +/// instruction info tracks. +/// +namespace ARMII { + + /// ARM Index Modes + enum IndexMode { + IndexModeNone = 0, + IndexModePre = 1, + IndexModePost = 2, + IndexModeUpd = 3 + }; + + /// ARM Addressing Modes + enum AddrMode { + AddrModeNone = 0, + AddrMode1 = 1, + AddrMode2 = 2, + AddrMode3 = 3, + AddrMode4 = 4, + AddrMode5 = 5, + AddrMode6 = 6, + AddrModeT1_1 = 7, + AddrModeT1_2 = 8, + AddrModeT1_4 = 9, + AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data + AddrModeT2_i12 = 11, + AddrModeT2_i8 = 12, + AddrModeT2_so = 13, + AddrModeT2_pc = 14, // +/- i12 for pc relative data + AddrModeT2_i8s4 = 15, // i8 * 4 + AddrMode_i12 = 16 + }; + + inline static const char *AddrModeToString(AddrMode addrmode) { + switch (addrmode) { + case AddrModeNone: return "AddrModeNone"; + case AddrMode1: return "AddrMode1"; + case AddrMode2: return "AddrMode2"; + case AddrMode3: return "AddrMode3"; + case AddrMode4: return "AddrMode4"; + case AddrMode5: return "AddrMode5"; + case AddrMode6: return "AddrMode6"; + case AddrModeT1_1: return "AddrModeT1_1"; + case AddrModeT1_2: return "AddrModeT1_2"; + case AddrModeT1_4: return "AddrModeT1_4"; + case AddrModeT1_s: return "AddrModeT1_s"; + case AddrModeT2_i12: return "AddrModeT2_i12"; + case AddrModeT2_i8: return "AddrModeT2_i8"; + case AddrModeT2_so: return "AddrModeT2_so"; + case AddrModeT2_pc: return "AddrModeT2_pc"; + case AddrModeT2_i8s4: return "AddrModeT2_i8s4"; + case AddrMode_i12: return "AddrMode_i12"; + } + } + + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // ARM Specific MachineOperand flags. + + MO_NO_FLAG = 0, + + /// MO_LO16 - On a symbol operand, this represents a relocation containing + /// lower 16 bit of the address. Used only via movw instruction. + MO_LO16 = 0x1, + + /// MO_HI16 - On a symbol operand, this represents a relocation containing + /// higher 16 bit of the address. Used only via movt instruction. 
+ MO_HI16 = 0x2, + + /// MO_PLT - On a symbol operand, this represents an ELF PLT reference on a + /// call operand. + MO_PLT = 0x3, + + /// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects + /// just that part of the flag set. + MO_OPTION_MASK = 0x3f, + + /// MO_DLLIMPORT - On a symbol operand, this represents that the reference + /// to the symbol is for an import stub. This is used for DLL import + /// storage class indication on Windows. + MO_DLLIMPORT = 0x40, + + /// MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it + /// represents a symbol which, if indirect, will get special Darwin mangling + /// as a non-lazy-ptr indirect symbol (i.e. "L_FOO$non_lazy_ptr"). Can be + /// combined with MO_LO16, MO_HI16 or MO_NO_FLAG (in a constant-pool, for + /// example). + MO_NONLAZY = 0x80, + + // It's undefined behaviour if an enum overflows the range between its + // smallest and largest values, but since these are |ed together, it can + // happen. Put a sentinel in (values of this enum are stored as "unsigned + // char"). + MO_UNUSED_MAXIMUM = 0xff + }; + + enum { + //===------------------------------------------------------------------===// + // Instruction Flags. + + //===------------------------------------------------------------------===// + // This four-bit field describes the addressing mode used. + AddrModeMask = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h + + // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load + // and store ops only. Generic "updating" flag is used for ld/st multiple. + // The index mode enums are declared in ARMBaseInfo.h + IndexModeShift = 5, + IndexModeMask = 3 << IndexModeShift, + + //===------------------------------------------------------------------===// + // Instruction encoding formats. 
+ // + FormShift = 7, + FormMask = 0x3f << FormShift, + + // Pseudo instructions + Pseudo = 0 << FormShift, + + // Multiply instructions + MulFrm = 1 << FormShift, + + // Branch instructions + BrFrm = 2 << FormShift, + BrMiscFrm = 3 << FormShift, + + // Data Processing instructions + DPFrm = 4 << FormShift, + DPSoRegFrm = 5 << FormShift, + + // Load and Store + LdFrm = 6 << FormShift, + StFrm = 7 << FormShift, + LdMiscFrm = 8 << FormShift, + StMiscFrm = 9 << FormShift, + LdStMulFrm = 10 << FormShift, + + LdStExFrm = 11 << FormShift, + + // Miscellaneous arithmetic instructions + ArithMiscFrm = 12 << FormShift, + SatFrm = 13 << FormShift, + + // Extend instructions + ExtFrm = 14 << FormShift, + + // VFP formats + VFPUnaryFrm = 15 << FormShift, + VFPBinaryFrm = 16 << FormShift, + VFPConv1Frm = 17 << FormShift, + VFPConv2Frm = 18 << FormShift, + VFPConv3Frm = 19 << FormShift, + VFPConv4Frm = 20 << FormShift, + VFPConv5Frm = 21 << FormShift, + VFPLdStFrm = 22 << FormShift, + VFPLdStMulFrm = 23 << FormShift, + VFPMiscFrm = 24 << FormShift, + + // Thumb format + ThumbFrm = 25 << FormShift, + + // Miscellaneous format + MiscFrm = 26 << FormShift, + + // NEON formats + NGetLnFrm = 27 << FormShift, + NSetLnFrm = 28 << FormShift, + NDupFrm = 29 << FormShift, + NLdStFrm = 30 << FormShift, + N1RegModImmFrm= 31 << FormShift, + N2RegFrm = 32 << FormShift, + NVCVTFrm = 33 << FormShift, + NVDupLnFrm = 34 << FormShift, + N2RegVShLFrm = 35 << FormShift, + N2RegVShRFrm = 36 << FormShift, + N3RegFrm = 37 << FormShift, + N3RegVShFrm = 38 << FormShift, + NVExtFrm = 39 << FormShift, + NVMulSLFrm = 40 << FormShift, + NVTBLFrm = 41 << FormShift, + + //===------------------------------------------------------------------===// + // Misc flags. + + // UnaryDP - Indicates this is a unary data processing instruction, i.e. + // it doesn't have a Rn operand. + UnaryDP = 1 << 13, + + // Xform16Bit - Indicates this Thumb2 instruction may be transformed into + // a 16-bit Thumb instruction if certain conditions are met. + Xform16Bit = 1 << 14, + + // ThumbArithFlagSetting - The instruction is a 16-bit flag setting Thumb + // instruction. Used by the parser to determine whether to require the 'S' + // suffix on the mnemonic (when not in an IT block) or preclude it (when + // in an IT block). + ThumbArithFlagSetting = 1 << 18, + + //===------------------------------------------------------------------===// + // Code domain. + DomainShift = 15, + DomainMask = 7 << DomainShift, + DomainGeneral = 0 << DomainShift, + DomainVFP = 1 << DomainShift, + DomainNEON = 2 << DomainShift, + DomainNEONA8 = 4 << DomainShift, + + //===------------------------------------------------------------------===// + // Field shifts - such shifts are used to set field while generating + // machine instructions. + // + // FIXME: This list will need adjusting/fixing as the MC code emitter + // takes shape and the ARMCodeEmitter.cpp bits go away.
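// Editorial example: these fields pack into MCInstrDesc::TSFlags. An
// instruction tagged DPFrm stores 4 << FormShift = 0x200 there, and
// (TSFlags & FormMask) >> FormShift recovers the format value 4.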
+ ShiftTypeShift = 4, + + M_BitShift = 5, + ShiftImmShift = 5, + ShiftShift = 7, + N_BitShift = 7, + ImmHiShift = 8, + SoRotImmShift = 8, + RegRsShift = 8, + ExtRotImmShift = 10, + RegRdLoShift = 12, + RegRdShift = 12, + RegRdHiShift = 16, + RegRnShift = 16, + S_BitShift = 20, + W_BitShift = 21, + AM3_I_BitShift = 22, + D_BitShift = 22, + U_BitShift = 23, + P_BitShift = 24, + I_BitShift = 25, + CondShift = 28 + }; + +} // end namespace ARMII + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp new file mode 100644 index 0000000..52eba8be --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -0,0 +1,259 @@ +//===-- ARMELFObjectWriter.cpp - ARM ELF Writer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/ARMMCTargetDesc.h" +#include "MCTargetDesc/ARMFixupKinds.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + class ARMELFObjectWriter : public MCELFObjectTargetWriter { + enum { DefaultEABIVersion = 0x05000000U }; + unsigned GetRelocTypeInner(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const; + + + public: + ARMELFObjectWriter(uint8_t OSABI); + + ~ARMELFObjectWriter() override; + + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override; + + bool needsRelocateWithSymbol(const MCSymbol &Sym, + unsigned Type) const override; + }; +} + ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI) : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_ARM, /*HasRelocationAddend*/ false) {} + ARMELFObjectWriter::~ARMELFObjectWriter() {} + bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, unsigned Type) const { + // FIXME: This is extremely conservative. This really needs to use a + // whitelist with a clear explanation for why each relocation needs to + // point to the symbol, not to the section.
+ switch (Type) { + default: + return true; + + case ELF::R_ARM_PREL31: + case ELF::R_ARM_ABS32: + return false; + } +} + +// Need to examine the Fixup when determining whether to +// emit the relocation as an explicit symbol or as a section relative +// offset +unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + return GetRelocTypeInner(Target, Fixup, IsPCRel); +} + +unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); + + unsigned Type = 0; + if (IsPCRel) { + switch ((unsigned)Fixup.getKind()) { + default: + report_fatal_error("unsupported relocation on symbol"); + return ELF::R_ARM_NONE; + case FK_Data_4: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_ARM_REL32; + break; + case MCSymbolRefExpr::VK_TLSGD: + llvm_unreachable("unimplemented"); + case MCSymbolRefExpr::VK_GOTTPOFF: + Type = ELF::R_ARM_TLS_IE32; + break; + case MCSymbolRefExpr::VK_ARM_GOT_PREL: + Type = ELF::R_ARM_GOT_PREL; + break; + } + break; + case ARM::fixup_arm_blx: + case ARM::fixup_arm_uncondbl: + switch (Modifier) { + case MCSymbolRefExpr::VK_PLT: + Type = ELF::R_ARM_CALL; + break; + case MCSymbolRefExpr::VK_ARM_TLSCALL: + Type = ELF::R_ARM_TLS_CALL; + break; + default: + Type = ELF::R_ARM_CALL; + break; + } + break; + case ARM::fixup_arm_condbl: + case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: + Type = ELF::R_ARM_JUMP24; + break; + case ARM::fixup_t2_condbranch: + case ARM::fixup_t2_uncondbranch: + Type = ELF::R_ARM_THM_JUMP24; + break; + case ARM::fixup_arm_movt_hi16: + Type = ELF::R_ARM_MOVT_PREL; + break; + case ARM::fixup_arm_movw_lo16: + Type = ELF::R_ARM_MOVW_PREL_NC; + break; + case ARM::fixup_t2_movt_hi16: + Type = ELF::R_ARM_THM_MOVT_PREL; + break; + case ARM::fixup_t2_movw_lo16: + Type = ELF::R_ARM_THM_MOVW_PREL_NC; + break; + case ARM::fixup_arm_thumb_bl: + case ARM::fixup_arm_thumb_blx: + switch (Modifier) { + case MCSymbolRefExpr::VK_ARM_TLSCALL: + Type = ELF::R_ARM_THM_TLS_CALL; + break; + default: + Type = ELF::R_ARM_THM_CALL; + break; + } + break; + } + } else { + switch ((unsigned)Fixup.getKind()) { + default: + report_fatal_error("unsupported relocation on symbol"); + return ELF::R_ARM_NONE; + case FK_Data_1: + switch (Modifier) { + default: llvm_unreachable("unsupported Modifier"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_ARM_ABS8; + break; + } + break; + case FK_Data_2: + switch (Modifier) { + default: llvm_unreachable("unsupported modifier"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_ARM_ABS16; + break; + } + break; + case FK_Data_4: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_ARM_NONE: + Type = ELF::R_ARM_NONE; + break; + case MCSymbolRefExpr::VK_GOT: + Type = ELF::R_ARM_GOT_BREL; + break; + case MCSymbolRefExpr::VK_TLSGD: + Type = ELF::R_ARM_TLS_GD32; + break; + case MCSymbolRefExpr::VK_TPOFF: + Type = ELF::R_ARM_TLS_LE32; + break; + case MCSymbolRefExpr::VK_GOTTPOFF: + Type = ELF::R_ARM_TLS_IE32; + break; + case MCSymbolRefExpr::VK_None: + Type = ELF::R_ARM_ABS32; + break; + case MCSymbolRefExpr::VK_GOTOFF: + Type = ELF::R_ARM_GOTOFF32; + break; + case MCSymbolRefExpr::VK_ARM_GOT_PREL: + Type = ELF::R_ARM_GOT_PREL; + break; + case MCSymbolRefExpr::VK_ARM_TARGET1: + Type = ELF::R_ARM_TARGET1; + break; + case 
MCSymbolRefExpr::VK_ARM_TARGET2: + Type = ELF::R_ARM_TARGET2; + break; + case MCSymbolRefExpr::VK_ARM_PREL31: + Type = ELF::R_ARM_PREL31; + break; + case MCSymbolRefExpr::VK_ARM_SBREL: + Type = ELF::R_ARM_SBREL32; + break; + case MCSymbolRefExpr::VK_ARM_TLSLDO: + Type = ELF::R_ARM_TLS_LDO32; + break; + case MCSymbolRefExpr::VK_ARM_TLSCALL: + Type = ELF::R_ARM_TLS_CALL; + break; + case MCSymbolRefExpr::VK_ARM_TLSDESC: + Type = ELF::R_ARM_TLS_GOTDESC; + break; + case MCSymbolRefExpr::VK_ARM_TLSDESCSEQ: + Type = ELF::R_ARM_TLS_DESCSEQ; + break; + } + break; + case ARM::fixup_arm_ldst_pcrel_12: + case ARM::fixup_arm_pcrel_10: + case ARM::fixup_arm_adr_pcrel_12: + case ARM::fixup_arm_thumb_bl: + case ARM::fixup_arm_thumb_cb: + case ARM::fixup_arm_thumb_cp: + case ARM::fixup_arm_thumb_br: + llvm_unreachable("Unimplemented"); + case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: + Type = ELF::R_ARM_JUMP24; + break; + case ARM::fixup_arm_movt_hi16: + Type = ELF::R_ARM_MOVT_ABS; + break; + case ARM::fixup_arm_movw_lo16: + Type = ELF::R_ARM_MOVW_ABS_NC; + break; + case ARM::fixup_t2_movt_hi16: + Type = ELF::R_ARM_THM_MOVT_ABS; + break; + case ARM::fixup_t2_movw_lo16: + Type = ELF::R_ARM_THM_MOVW_ABS_NC; + break; + } + } + + return Type; +} + +MCObjectWriter *llvm::createARMELFObjectWriter(raw_pwrite_stream &OS, + uint8_t OSABI, + bool IsLittleEndian) { + MCELFObjectTargetWriter *MOTW = new ARMELFObjectWriter(OSABI); + return createELFObjectWriter(MOTW, OS, IsLittleEndian); +} diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp new file mode 100644 index 0000000..6084f22 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -0,0 +1,1377 @@ +//===- lib/MC/ARMELFStreamer.cpp - ELF Object Output for ARM --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file assembles .s files and emits ARM ELF .o object files. Different +// from generic ELF streamer in emitting mapping symbols ($a, $t and $d) to +// delimit regions of data and code. 
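+// As an illustrative sketch (func and .Lpool are hypothetical labels), a +// Thumb function followed by its literal pool would be delimited as: +// +// $t +// func: ldr r0, .Lpool +// bx lr +// $d +// .Lpool: .word 0x12345678 +// +// so that consumers know to decode the first region as Thumb code and the +// second as data.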
+// +//===----------------------------------------------------------------------===// + +#include "ARMRegisterInfo.h" +#include "ARMUnwindOpAsm.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ARMBuildAttributes.h" +#include "llvm/Support/ARMEHABI.h" +#include "llvm/Support/TargetParser.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/LEB128.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> + +using namespace llvm; + +static std::string GetAEABIUnwindPersonalityName(unsigned Index) { + assert(Index < ARM::EHABI::NUM_PERSONALITY_INDEX && + "Invalid personality index"); + return (Twine("__aeabi_unwind_cpp_pr") + Twine(Index)).str(); +} + +namespace { + +class ARMELFStreamer; + +class ARMTargetAsmStreamer : public ARMTargetStreamer { + formatted_raw_ostream &OS; + MCInstPrinter &InstPrinter; + bool IsVerboseAsm; + + void emitFnStart() override; + void emitFnEnd() override; + void emitCantUnwind() override; + void emitPersonality(const MCSymbol *Personality) override; + void emitPersonalityIndex(unsigned Index) override; + void emitHandlerData() override; + void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0) override; + void emitMovSP(unsigned Reg, int64_t Offset = 0) override; + void emitPad(int64_t Offset) override; + void emitRegSave(const SmallVectorImpl<unsigned> &RegList, + bool isVector) override; + void emitUnwindRaw(int64_t Offset, + const SmallVectorImpl<uint8_t> &Opcodes) override; + + void switchVendor(StringRef Vendor) override; + void emitAttribute(unsigned Attribute, unsigned Value) override; + void emitTextAttribute(unsigned Attribute, StringRef String) override; + void emitIntTextAttribute(unsigned Attribute, unsigned IntValue, + StringRef StringValue) override; + void emitArch(unsigned Arch) override; + void emitArchExtension(unsigned ArchExt) override; + void emitObjectArch(unsigned Arch) override; + void emitFPU(unsigned FPU) override; + void emitInst(uint32_t Inst, char Suffix = '\0') override; + void finishAttributeSection() override; + + void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override; + void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override; + +public: + ARMTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS, + MCInstPrinter &InstPrinter, bool VerboseAsm); +}; + +ARMTargetAsmStreamer::ARMTargetAsmStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter &InstPrinter, + bool VerboseAsm) + : ARMTargetStreamer(S), OS(OS), InstPrinter(InstPrinter), + IsVerboseAsm(VerboseAsm) {} +void ARMTargetAsmStreamer::emitFnStart() { OS << "\t.fnstart\n"; } +void ARMTargetAsmStreamer::emitFnEnd() { OS << "\t.fnend\n"; } +void ARMTargetAsmStreamer::emitCantUnwind() { OS << "\t.cantunwind\n"; } +void ARMTargetAsmStreamer::emitPersonality(const MCSymbol *Personality) { + OS << "\t.personality " << 
Personality->getName() << '\n'; +} +void ARMTargetAsmStreamer::emitPersonalityIndex(unsigned Index) { + OS << "\t.personalityindex " << Index << '\n'; +} +void ARMTargetAsmStreamer::emitHandlerData() { OS << "\t.handlerdata\n"; } +void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg, + int64_t Offset) { + OS << "\t.setfp\t"; + InstPrinter.printRegName(OS, FpReg); + OS << ", "; + InstPrinter.printRegName(OS, SpReg); + if (Offset) + OS << ", #" << Offset; + OS << '\n'; +} +void ARMTargetAsmStreamer::emitMovSP(unsigned Reg, int64_t Offset) { + assert((Reg != ARM::SP && Reg != ARM::PC) && + "the operand of .movsp cannot be either sp or pc"); + + OS << "\t.movsp\t"; + InstPrinter.printRegName(OS, Reg); + if (Offset) + OS << ", #" << Offset; + OS << '\n'; +} +void ARMTargetAsmStreamer::emitPad(int64_t Offset) { + OS << "\t.pad\t#" << Offset << '\n'; +} +void ARMTargetAsmStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList, + bool isVector) { + assert(RegList.size() && "RegList should not be empty"); + if (isVector) + OS << "\t.vsave\t{"; + else + OS << "\t.save\t{"; + + InstPrinter.printRegName(OS, RegList[0]); + + for (unsigned i = 1, e = RegList.size(); i != e; ++i) { + OS << ", "; + InstPrinter.printRegName(OS, RegList[i]); + } + + OS << "}\n"; +} +void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) { +} +void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) { + OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value); + if (IsVerboseAsm) { + StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute); + if (!Name.empty()) + OS << "\t@ " << Name; + } + OS << "\n"; +} +void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute, + StringRef String) { + switch (Attribute) { + case ARMBuildAttrs::CPU_name: + OS << "\t.cpu\t" << String.lower(); + break; + default: + OS << "\t.eabi_attribute\t" << Attribute << ", \"" << String << "\""; + if (IsVerboseAsm) { + StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute); + if (!Name.empty()) + OS << "\t@ " << Name; + } + break; + } + OS << "\n"; +} +void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute, + unsigned IntValue, + StringRef StringValue) { + switch (Attribute) { + default: llvm_unreachable("unsupported multi-value attribute in asm mode"); + case ARMBuildAttrs::compatibility: + OS << "\t.eabi_attribute\t" << Attribute << ", " << IntValue; + if (!StringValue.empty()) + OS << ", \"" << StringValue << "\""; + if (IsVerboseAsm) + OS << "\t@ " << ARMBuildAttrs::AttrTypeAsString(Attribute); + break; + } + OS << "\n"; +} +void ARMTargetAsmStreamer::emitArch(unsigned Arch) { + OS << "\t.arch\t" << ARM::getArchName(Arch) << "\n"; +} +void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) { + OS << "\t.arch_extension\t" << ARM::getArchExtName(ArchExt) << "\n"; +} +void ARMTargetAsmStreamer::emitObjectArch(unsigned Arch) { + OS << "\t.object_arch\t" << ARM::getArchName(Arch) << '\n'; +} +void ARMTargetAsmStreamer::emitFPU(unsigned FPU) { + OS << "\t.fpu\t" << ARM::getFPUName(FPU) << "\n"; +} +void ARMTargetAsmStreamer::finishAttributeSection() { +} +void +ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) { + OS << "\t.tlsdescseq\t" << S->getSymbol().getName(); +} + +void ARMTargetAsmStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) { + const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo(); + + OS << "\t.thumb_set\t"; + Symbol->print(OS, MAI); + OS << ", "; + Value->print(OS, MAI); + OS << '\n'; +} + +void 
ARMTargetAsmStreamer::emitInst(uint32_t Inst, char Suffix) { + OS << "\t.inst"; + if (Suffix) + OS << "." << Suffix; + OS << "\t0x" << Twine::utohexstr(Inst) << "\n"; +} + +void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset, + const SmallVectorImpl<uint8_t> &Opcodes) { + OS << "\t.unwind_raw " << Offset; + for (SmallVectorImpl<uint8_t>::const_iterator OCI = Opcodes.begin(), + OCE = Opcodes.end(); + OCI != OCE; ++OCI) + OS << ", 0x" << Twine::utohexstr(*OCI); + OS << '\n'; +} + +class ARMTargetELFStreamer : public ARMTargetStreamer { +private: + // This structure holds all attributes, accounting for + // their string/numeric value, so we can later emit them + // in declaration order, keeping all in the same vector + struct AttributeItem { + enum { + HiddenAttribute = 0, + NumericAttribute, + TextAttribute, + NumericAndTextAttributes + } Type; + unsigned Tag; + unsigned IntValue; + std::string StringValue; + + static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) { + // The conformance tag must be emitted first when serialised + // into an object file. Specifically, the addenda to the ARM ABI + // states that (2.3.7.4): + // + // "To simplify recognition by consumers in the common case of + // claiming conformity for the whole file, this tag should be + // emitted first in a file-scope sub-subsection of the first + // public subsection of the attributes section." + // + // So it is special-cased in this comparison predicate when the + // attributes are sorted in finishAttributeSection(). + return (RHS.Tag != ARMBuildAttrs::conformance) && + ((LHS.Tag == ARMBuildAttrs::conformance) || (LHS.Tag < RHS.Tag)); + } + }; + + StringRef CurrentVendor; + unsigned FPU; + unsigned Arch; + unsigned EmittedArch; + SmallVector<AttributeItem, 64> Contents; + + MCSection *AttributeSection; + + AttributeItem *getAttributeItem(unsigned Attribute) { + for (size_t i = 0; i < Contents.size(); ++i) + if (Contents[i].Tag == Attribute) + return &Contents[i]; + return nullptr; + } + + void setAttributeItem(unsigned Attribute, unsigned Value, + bool OverwriteExisting) { + // Look for existing attribute item + if (AttributeItem *Item = getAttributeItem(Attribute)) { + if (!OverwriteExisting) + return; + Item->Type = AttributeItem::NumericAttribute; + Item->IntValue = Value; + return; + } + + // Create new attribute item + AttributeItem Item = { + AttributeItem::NumericAttribute, + Attribute, + Value, + StringRef("") + }; + Contents.push_back(Item); + } + + void setAttributeItem(unsigned Attribute, StringRef Value, + bool OverwriteExisting) { + // Look for existing attribute item + if (AttributeItem *Item = getAttributeItem(Attribute)) { + if (!OverwriteExisting) + return; + Item->Type = AttributeItem::TextAttribute; + Item->StringValue = Value; + return; + } + + // Create new attribute item + AttributeItem Item = { + AttributeItem::TextAttribute, + Attribute, + 0, + Value + }; + Contents.push_back(Item); + } + + void setAttributeItems(unsigned Attribute, unsigned IntValue, + StringRef StringValue, bool OverwriteExisting) { + // Look for existing attribute item + if (AttributeItem *Item = getAttributeItem(Attribute)) { + if (!OverwriteExisting) + return; + Item->Type = AttributeItem::NumericAndTextAttributes; + Item->IntValue = IntValue; + Item->StringValue = StringValue; + return; + } + + // Create new attribute item + AttributeItem Item = { + AttributeItem::NumericAndTextAttributes, + Attribute, + IntValue, + StringValue + }; + Contents.push_back(Item); + } + + void emitArchDefaultAttributes(); 
+ void emitFPUDefaultAttributes(); + + ARMELFStreamer &getStreamer(); + + void emitFnStart() override; + void emitFnEnd() override; + void emitCantUnwind() override; + void emitPersonality(const MCSymbol *Personality) override; + void emitPersonalityIndex(unsigned Index) override; + void emitHandlerData() override; + void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0) override; + void emitMovSP(unsigned Reg, int64_t Offset = 0) override; + void emitPad(int64_t Offset) override; + void emitRegSave(const SmallVectorImpl<unsigned> &RegList, + bool isVector) override; + void emitUnwindRaw(int64_t Offset, + const SmallVectorImpl<uint8_t> &Opcodes) override; + + void switchVendor(StringRef Vendor) override; + void emitAttribute(unsigned Attribute, unsigned Value) override; + void emitTextAttribute(unsigned Attribute, StringRef String) override; + void emitIntTextAttribute(unsigned Attribute, unsigned IntValue, + StringRef StringValue) override; + void emitArch(unsigned Arch) override; + void emitObjectArch(unsigned Arch) override; + void emitFPU(unsigned FPU) override; + void emitInst(uint32_t Inst, char Suffix = '\0') override; + void finishAttributeSection() override; + void emitLabel(MCSymbol *Symbol) override; + + void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override; + void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override; + + size_t calculateContentSize() const; + +public: + ARMTargetELFStreamer(MCStreamer &S) + : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::FK_INVALID), + Arch(ARM::AK_INVALID), EmittedArch(ARM::AK_INVALID), + AttributeSection(nullptr) {} +}; + +/// Extend the generic ELFStreamer class so that it can emit mapping symbols at +/// the appropriate points in the object files. These symbols are defined in the +/// ARM ELF ABI: infocenter.arm.com/help/topic/com.arm.../IHI0044D_aaelf.pdf. +/// +/// In brief: $a, $t or $d should be emitted at the start of each contiguous +/// region of ARM code, Thumb code or data in a section. In practice, this +/// emission does not rely on explicit assembler directives but on inherent +/// properties of the directives doing the emission (e.g. ".byte" is data, "add +/// r0, r0, r0" an instruction). +/// +/// As a result this system is orthogonal to the DataRegion infrastructure used +/// by MachO. Beware! +class ARMELFStreamer : public MCELFStreamer { +public: + friend class ARMTargetELFStreamer; + + ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool IsThumb) + : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb), + MappingSymbolCounter(0), LastEMS(EMS_None) { + Reset(); + } + + ~ARMELFStreamer() {} + + void FinishImpl() override; + + // ARM exception handling directives + void emitFnStart(); + void emitFnEnd(); + void emitCantUnwind(); + void emitPersonality(const MCSymbol *Per); + void emitPersonalityIndex(unsigned index); + void emitHandlerData(); + void emitSetFP(unsigned NewFpReg, unsigned NewSpReg, int64_t Offset = 0); + void emitMovSP(unsigned Reg, int64_t Offset = 0); + void emitPad(int64_t Offset); + void emitRegSave(const SmallVectorImpl<unsigned> &RegList, bool isVector); + void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes); + + void ChangeSection(MCSection *Section, const MCExpr *Subsection) override { + // We have to keep track of the mapping symbol state of any sections we + // use. 
Each one should start off as EMS_None, which is provided as the + // default constructor by DenseMap::lookup. + LastMappingSymbols[getPreviousSection().first] = LastEMS; + LastEMS = LastMappingSymbols.lookup(Section); + + MCELFStreamer::ChangeSection(Section, Subsection); + } + + /// This function is the one used to emit instruction data into the ELF + /// streamer. We override it to add the appropriate mapping symbol if + /// necessary. + void EmitInstruction(const MCInst& Inst, + const MCSubtargetInfo &STI) override { + if (IsThumb) + EmitThumbMappingSymbol(); + else + EmitARMMappingSymbol(); + + MCELFStreamer::EmitInstruction(Inst, STI); + } + + void emitInst(uint32_t Inst, char Suffix) { + unsigned Size; + char Buffer[4]; + const bool LittleEndian = getContext().getAsmInfo()->isLittleEndian(); + + switch (Suffix) { + case '\0': + Size = 4; + + assert(!IsThumb); + EmitARMMappingSymbol(); + for (unsigned II = 0, IE = Size; II != IE; II++) { + const unsigned I = LittleEndian ? (Size - II - 1) : II; + Buffer[Size - II - 1] = uint8_t(Inst >> I * CHAR_BIT); + } + + break; + case 'n': + case 'w': + Size = (Suffix == 'n' ? 2 : 4); + + assert(IsThumb); + EmitThumbMappingSymbol(); + for (unsigned II = 0, IE = Size; II != IE; II = II + 2) { + const unsigned I0 = LittleEndian ? II + 0 : (Size - II - 1); + const unsigned I1 = LittleEndian ? II + 1 : (Size - II - 2); + Buffer[Size - II - 2] = uint8_t(Inst >> I0 * CHAR_BIT); + Buffer[Size - II - 1] = uint8_t(Inst >> I1 * CHAR_BIT); + } + + break; + default: + llvm_unreachable("Invalid Suffix"); + } + + MCELFStreamer::EmitBytes(StringRef(Buffer, Size)); + } + + /// This is one of the functions used to emit data into an ELF section, so the + /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if + /// necessary. + void EmitBytes(StringRef Data) override { + EmitDataMappingSymbol(); + MCELFStreamer::EmitBytes(Data); + } + + /// This is one of the functions used to emit data into an ELF section, so the + /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if + /// necessary. + void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { + if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value)) + if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) { + getContext().reportError(Loc, "relocated expression must be 32-bit"); + return; + } + + EmitDataMappingSymbol(); + MCELFStreamer::EmitValueImpl(Value, Size, Loc); + } + + void EmitAssemblerFlag(MCAssemblerFlag Flag) override { + MCELFStreamer::EmitAssemblerFlag(Flag); + + switch (Flag) { + case MCAF_SyntaxUnified: + return; // no-op here. + case MCAF_Code16: + IsThumb = true; + return; // Change to Thumb mode + case MCAF_Code32: + IsThumb = false; + return; // Change to ARM mode + case MCAF_Code64: + return; + case MCAF_SubsectionsViaSymbols: + return; + } + } + +private: + enum ElfMappingSymbol { + EMS_None, + EMS_ARM, + EMS_Thumb, + EMS_Data + }; + + void EmitDataMappingSymbol() { + if (LastEMS == EMS_Data) return; + EmitMappingSymbol("$d"); + LastEMS = EMS_Data; + } + + void EmitThumbMappingSymbol() { + if (LastEMS == EMS_Thumb) return; + EmitMappingSymbol("$t"); + LastEMS = EMS_Thumb; + } + + void EmitARMMappingSymbol() { + if (LastEMS == EMS_ARM) return; + EmitMappingSymbol("$a"); + LastEMS = EMS_ARM; + } + + void EmitMappingSymbol(StringRef Name) { + auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol( + Name + "." 
+ Twine(MappingSymbolCounter++))); + EmitLabel(Symbol); + + Symbol->setType(ELF::STT_NOTYPE); + Symbol->setBinding(ELF::STB_LOCAL); + Symbol->setExternal(false); + } + + void EmitThumbFunc(MCSymbol *Func) override { + getAssembler().setIsThumbFunc(Func); + EmitSymbolAttribute(Func, MCSA_ELF_TypeFunction); + } + + // Helper functions for ARM exception handling directives + void Reset(); + + void EmitPersonalityFixup(StringRef Name); + void FlushPendingOffset(); + void FlushUnwindOpcodes(bool NoHandlerData); + + void SwitchToEHSection(const char *Prefix, unsigned Type, unsigned Flags, + SectionKind Kind, const MCSymbol &Fn); + void SwitchToExTabSection(const MCSymbol &FnStart); + void SwitchToExIdxSection(const MCSymbol &FnStart); + + void EmitFixup(const MCExpr *Expr, MCFixupKind Kind); + + bool IsThumb; + int64_t MappingSymbolCounter; + + DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols; + ElfMappingSymbol LastEMS; + + // ARM Exception Handling Frame Information + MCSymbol *ExTab; + MCSymbol *FnStart; + const MCSymbol *Personality; + unsigned PersonalityIndex; + unsigned FPReg; // Frame pointer register + int64_t FPOffset; // Offset: (final frame pointer) - (initial $sp) + int64_t SPOffset; // Offset: (final $sp) - (initial $sp) + int64_t PendingOffset; // Offset: (final $sp) - (emitted $sp) + bool UsedFP; + bool CantUnwind; + SmallVector<uint8_t, 64> Opcodes; + UnwindOpcodeAssembler UnwindOpAsm; +}; +} // end anonymous namespace + +ARMELFStreamer &ARMTargetELFStreamer::getStreamer() { + return static_cast<ARMELFStreamer &>(Streamer); +} + +void ARMTargetELFStreamer::emitFnStart() { getStreamer().emitFnStart(); } +void ARMTargetELFStreamer::emitFnEnd() { getStreamer().emitFnEnd(); } +void ARMTargetELFStreamer::emitCantUnwind() { getStreamer().emitCantUnwind(); } +void ARMTargetELFStreamer::emitPersonality(const MCSymbol *Personality) { + getStreamer().emitPersonality(Personality); +} +void ARMTargetELFStreamer::emitPersonalityIndex(unsigned Index) { + getStreamer().emitPersonalityIndex(Index); +} +void ARMTargetELFStreamer::emitHandlerData() { + getStreamer().emitHandlerData(); +} +void ARMTargetELFStreamer::emitSetFP(unsigned FpReg, unsigned SpReg, + int64_t Offset) { + getStreamer().emitSetFP(FpReg, SpReg, Offset); +} +void ARMTargetELFStreamer::emitMovSP(unsigned Reg, int64_t Offset) { + getStreamer().emitMovSP(Reg, Offset); +} +void ARMTargetELFStreamer::emitPad(int64_t Offset) { + getStreamer().emitPad(Offset); +} +void ARMTargetELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList, + bool isVector) { + getStreamer().emitRegSave(RegList, isVector); +} +void ARMTargetELFStreamer::emitUnwindRaw(int64_t Offset, + const SmallVectorImpl<uint8_t> &Opcodes) { + getStreamer().emitUnwindRaw(Offset, Opcodes); +} +void ARMTargetELFStreamer::switchVendor(StringRef Vendor) { + assert(!Vendor.empty() && "Vendor cannot be empty."); + + if (CurrentVendor == Vendor) + return; + + if (!CurrentVendor.empty()) + finishAttributeSection(); + + assert(Contents.empty() && + ".ARM.attributes should be flushed before changing vendor"); + CurrentVendor = Vendor; + +} +void ARMTargetELFStreamer::emitAttribute(unsigned Attribute, unsigned Value) { + setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true); +} +void ARMTargetELFStreamer::emitTextAttribute(unsigned Attribute, + StringRef Value) { + setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true); +} +void ARMTargetELFStreamer::emitIntTextAttribute(unsigned Attribute, + unsigned IntValue, + StringRef 
StringValue) { + setAttributeItems(Attribute, IntValue, StringValue, + /* OverwriteExisting= */ true); +} +void ARMTargetELFStreamer::emitArch(unsigned Value) { + Arch = Value; +} +void ARMTargetELFStreamer::emitObjectArch(unsigned Value) { + EmittedArch = Value; +} +void ARMTargetELFStreamer::emitArchDefaultAttributes() { + using namespace ARMBuildAttrs; + + setAttributeItem(CPU_name, + ARM::getCPUAttr(Arch), + false); + + if (EmittedArch == ARM::AK_INVALID) + setAttributeItem(CPU_arch, + ARM::getArchAttr(Arch), + false); + else + setAttributeItem(CPU_arch, + ARM::getArchAttr(EmittedArch), + false); + + switch (Arch) { + case ARM::AK_ARMV2: + case ARM::AK_ARMV2A: + case ARM::AK_ARMV3: + case ARM::AK_ARMV3M: + case ARM::AK_ARMV4: + setAttributeItem(ARM_ISA_use, Allowed, false); + break; + + case ARM::AK_ARMV4T: + case ARM::AK_ARMV5T: + case ARM::AK_ARMV5TE: + case ARM::AK_ARMV6: + setAttributeItem(ARM_ISA_use, Allowed, false); + setAttributeItem(THUMB_ISA_use, Allowed, false); + break; + + case ARM::AK_ARMV6T2: + setAttributeItem(ARM_ISA_use, Allowed, false); + setAttributeItem(THUMB_ISA_use, AllowThumb32, false); + break; + + case ARM::AK_ARMV6K: + case ARM::AK_ARMV6KZ: + setAttributeItem(ARM_ISA_use, Allowed, false); + setAttributeItem(THUMB_ISA_use, Allowed, false); + setAttributeItem(Virtualization_use, AllowTZ, false); + break; + + case ARM::AK_ARMV6M: + setAttributeItem(THUMB_ISA_use, Allowed, false); + break; + + case ARM::AK_ARMV7A: + setAttributeItem(CPU_arch_profile, ApplicationProfile, false); + setAttributeItem(ARM_ISA_use, Allowed, false); + setAttributeItem(THUMB_ISA_use, AllowThumb32, false); + break; + + case ARM::AK_ARMV7R: + setAttributeItem(CPU_arch_profile, RealTimeProfile, false); + setAttributeItem(ARM_ISA_use, Allowed, false); + setAttributeItem(THUMB_ISA_use, AllowThumb32, false); + break; + + case ARM::AK_ARMV7M: + setAttributeItem(CPU_arch_profile, MicroControllerProfile, false); + setAttributeItem(THUMB_ISA_use, AllowThumb32, false); + break; + + case ARM::AK_ARMV8A: + case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: + setAttributeItem(CPU_arch_profile, ApplicationProfile, false); + setAttributeItem(ARM_ISA_use, Allowed, false); + setAttributeItem(THUMB_ISA_use, AllowThumb32, false); + setAttributeItem(MPextension_use, Allowed, false); + setAttributeItem(Virtualization_use, AllowTZVirtualization, false); + break; + + case ARM::AK_IWMMXT: + setAttributeItem(ARM_ISA_use, Allowed, false); + setAttributeItem(THUMB_ISA_use, Allowed, false); + setAttributeItem(WMMX_arch, AllowWMMXv1, false); + break; + + case ARM::AK_IWMMXT2: + setAttributeItem(ARM_ISA_use, Allowed, false); + setAttributeItem(THUMB_ISA_use, Allowed, false); + setAttributeItem(WMMX_arch, AllowWMMXv2, false); + break; + + default: + report_fatal_error("Unknown Arch: " + Twine(Arch)); + break; + } +} +void ARMTargetELFStreamer::emitFPU(unsigned Value) { + FPU = Value; +} +void ARMTargetELFStreamer::emitFPUDefaultAttributes() { + switch (FPU) { + case ARM::FK_VFP: + case ARM::FK_VFPV2: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPv2, + /* OverwriteExisting= */ false); + break; + + case ARM::FK_VFPV3: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPv3A, + /* OverwriteExisting= */ false); + break; + + case ARM::FK_VFPV3_FP16: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPv3A, + /* OverwriteExisting= */ false); + setAttributeItem(ARMBuildAttrs::FP_HP_extension, + ARMBuildAttrs::AllowHPFP, + /* OverwriteExisting= */ false); + break; + + case 
ARM::FK_VFPV3_D16: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPv3B, + /* OverwriteExisting= */ false); + break; + + case ARM::FK_VFPV3_D16_FP16: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPv3B, + /* OverwriteExisting= */ false); + setAttributeItem(ARMBuildAttrs::FP_HP_extension, + ARMBuildAttrs::AllowHPFP, + /* OverwriteExisting= */ false); + break; + + case ARM::FK_VFPV3XD: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPv3B, + /* OverwriteExisting= */ false); + break; + case ARM::FK_VFPV3XD_FP16: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPv3B, + /* OverwriteExisting= */ false); + setAttributeItem(ARMBuildAttrs::FP_HP_extension, + ARMBuildAttrs::AllowHPFP, + /* OverwriteExisting= */ false); + break; + + case ARM::FK_VFPV4: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPv4A, + /* OverwriteExisting= */ false); + break; + + // ABI_HardFP_use is handled in ARMAsmPrinter, so _SP_D16 is treated the same + // as _D16 here. + case ARM::FK_FPV4_SP_D16: + case ARM::FK_VFPV4_D16: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPv4B, + /* OverwriteExisting= */ false); + break; + + case ARM::FK_FP_ARMV8: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPARMv8A, + /* OverwriteExisting= */ false); + break; + + // FPV5_D16 is identical to FP_ARMV8 except for the number of D registers, so + // uses the FP_ARMV8_D16 build attribute. + case ARM::FK_FPV5_SP_D16: + case ARM::FK_FPV5_D16: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPARMv8B, + /* OverwriteExisting= */ false); + break; + + case ARM::FK_NEON: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPv3A, + /* OverwriteExisting= */ false); + setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch, + ARMBuildAttrs::AllowNeon, + /* OverwriteExisting= */ false); + break; + + case ARM::FK_NEON_FP16: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPv3A, + /* OverwriteExisting= */ false); + setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch, + ARMBuildAttrs::AllowNeon, + /* OverwriteExisting= */ false); + setAttributeItem(ARMBuildAttrs::FP_HP_extension, + ARMBuildAttrs::AllowHPFP, + /* OverwriteExisting= */ false); + break; + + case ARM::FK_NEON_VFPV4: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPv4A, + /* OverwriteExisting= */ false); + setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch, + ARMBuildAttrs::AllowNeon2, + /* OverwriteExisting= */ false); + break; + + case ARM::FK_NEON_FP_ARMV8: + case ARM::FK_CRYPTO_NEON_FP_ARMV8: + setAttributeItem(ARMBuildAttrs::FP_arch, + ARMBuildAttrs::AllowFPARMv8A, + /* OverwriteExisting= */ false); + // 'Advanced_SIMD_arch' must be emitted not here, but within + // ARMAsmPrinter::emitAttributes(), depending on hasV8Ops() and hasV8_1a() + break; + + case ARM::FK_SOFTVFP: + case ARM::FK_NONE: + break; + + default: + report_fatal_error("Unknown FPU: " + Twine(FPU)); + break; + } +} +size_t ARMTargetELFStreamer::calculateContentSize() const { + size_t Result = 0; + for (size_t i = 0; i < Contents.size(); ++i) { + AttributeItem item = Contents[i]; + switch (item.Type) { + case AttributeItem::HiddenAttribute: + break; + case AttributeItem::NumericAttribute: + Result += getULEB128Size(item.Tag); + Result += getULEB128Size(item.IntValue); + break; + case AttributeItem::TextAttribute: + Result += getULEB128Size(item.Tag); + Result += item.StringValue.size() + 1; // string + '\0' + break; + case 
AttributeItem::NumericAndTextAttributes: + Result += getULEB128Size(item.Tag); + Result += getULEB128Size(item.IntValue); + Result += item.StringValue.size() + 1; // string + '\0'; + break; + } + } + return Result; +} +void ARMTargetELFStreamer::finishAttributeSection() { + // <format-version> + // [ <section-length> "vendor-name" + // [ <file-tag> <size> <attribute>* + // | <section-tag> <size> <section-number>* 0 <attribute>* + // | <symbol-tag> <size> <symbol-number>* 0 <attribute>* + // ]+ + // ]* + + if (FPU != ARM::FK_INVALID) + emitFPUDefaultAttributes(); + + if (Arch != ARM::AK_INVALID) + emitArchDefaultAttributes(); + + if (Contents.empty()) + return; + + std::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag); + + ARMELFStreamer &Streamer = getStreamer(); + + // Switch to .ARM.attributes section + if (AttributeSection) { + Streamer.SwitchSection(AttributeSection); + } else { + AttributeSection = Streamer.getContext().getELFSection( + ".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0); + Streamer.SwitchSection(AttributeSection); + + // Format version + Streamer.EmitIntValue(0x41, 1); + } + + // Vendor size + Vendor name + '\0' + const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1; + + // Tag + Tag Size + const size_t TagHeaderSize = 1 + 4; + + const size_t ContentsSize = calculateContentSize(); + + Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4); + Streamer.EmitBytes(CurrentVendor); + Streamer.EmitIntValue(0, 1); // '\0' + + Streamer.EmitIntValue(ARMBuildAttrs::File, 1); + Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4); + + // Size should have been accounted for already, now + // emit each field as its type (ULEB or String) + for (size_t i = 0; i < Contents.size(); ++i) { + AttributeItem item = Contents[i]; + Streamer.EmitULEB128IntValue(item.Tag); + switch (item.Type) { + default: llvm_unreachable("Invalid attribute type"); + case AttributeItem::NumericAttribute: + Streamer.EmitULEB128IntValue(item.IntValue); + break; + case AttributeItem::TextAttribute: + Streamer.EmitBytes(item.StringValue); + Streamer.EmitIntValue(0, 1); // '\0' + break; + case AttributeItem::NumericAndTextAttributes: + Streamer.EmitULEB128IntValue(item.IntValue); + Streamer.EmitBytes(item.StringValue); + Streamer.EmitIntValue(0, 1); // '\0' + break; + } + } + + Contents.clear(); + FPU = ARM::FK_INVALID; +} + +void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) { + ARMELFStreamer &Streamer = getStreamer(); + if (!Streamer.IsThumb) + return; + + Streamer.getAssembler().registerSymbol(*Symbol); + unsigned Type = cast<MCSymbolELF>(Symbol)->getType(); + if (Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC) + Streamer.EmitThumbFunc(Symbol); +} + +void +ARMTargetELFStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) { + getStreamer().EmitFixup(S, FK_Data_4); +} + +void ARMTargetELFStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) { + if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Value)) { + const MCSymbol &Sym = SRE->getSymbol(); + if (!Sym.isDefined()) { + getStreamer().EmitAssignment(Symbol, Value); + return; + } + } + + getStreamer().EmitThumbFunc(Symbol); + getStreamer().EmitAssignment(Symbol, Value); +} + +void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) { + getStreamer().emitInst(Inst, Suffix); +} + +void ARMELFStreamer::FinishImpl() { + MCTargetStreamer &TS = *getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); + ATS.finishAttributeSection(); + + 
MCELFStreamer::FinishImpl(); +} + +inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix, + unsigned Type, + unsigned Flags, + SectionKind Kind, + const MCSymbol &Fn) { + const MCSectionELF &FnSection = + static_cast<const MCSectionELF &>(Fn.getSection()); + + // Create the name for new section + StringRef FnSecName(FnSection.getSectionName()); + SmallString<128> EHSecName(Prefix); + if (FnSecName != ".text") { + EHSecName += FnSecName; + } + + // Get .ARM.extab or .ARM.exidx section + const MCSymbolELF *Group = FnSection.getGroup(); + if (Group) + Flags |= ELF::SHF_GROUP; + MCSectionELF *EHSection = + getContext().getELFSection(EHSecName, Type, Flags, 0, Group, + FnSection.getUniqueID(), nullptr, &FnSection); + + assert(EHSection && "Failed to get the required EH section"); + + // Switch to .ARM.extab or .ARM.exidx section + SwitchSection(EHSection); + EmitCodeAlignment(4); +} + +inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) { + SwitchToEHSection(".ARM.extab", ELF::SHT_PROGBITS, ELF::SHF_ALLOC, + SectionKind::getData(), FnStart); +} + +inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) { + SwitchToEHSection(".ARM.exidx", ELF::SHT_ARM_EXIDX, + ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER, + SectionKind::getData(), FnStart); +} +void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) { + MCDataFragment *Frag = getOrCreateDataFragment(); + Frag->getFixups().push_back(MCFixup::create(Frag->getContents().size(), Expr, + Kind)); +} + +void ARMELFStreamer::Reset() { + ExTab = nullptr; + FnStart = nullptr; + Personality = nullptr; + PersonalityIndex = ARM::EHABI::NUM_PERSONALITY_INDEX; + FPReg = ARM::SP; + FPOffset = 0; + SPOffset = 0; + PendingOffset = 0; + UsedFP = false; + CantUnwind = false; + + Opcodes.clear(); + UnwindOpAsm.Reset(); +} + +void ARMELFStreamer::emitFnStart() { + assert(FnStart == nullptr); + FnStart = getContext().createTempSymbol(); + EmitLabel(FnStart); +} + +void ARMELFStreamer::emitFnEnd() { + assert(FnStart && ".fnstart must precede .fnend"); + + // Emit unwind opcodes if there is no .handlerdata directive + if (!ExTab && !CantUnwind) + FlushUnwindOpcodes(true); + + // Emit the exception index table entry + SwitchToExIdxSection(*FnStart); + + if (PersonalityIndex < ARM::EHABI::NUM_PERSONALITY_INDEX) + EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex)); + + const MCSymbolRefExpr *FnStartRef = + MCSymbolRefExpr::create(FnStart, + MCSymbolRefExpr::VK_ARM_PREL31, + getContext()); + + EmitValue(FnStartRef, 4); + + if (CantUnwind) { + EmitIntValue(ARM::EHABI::EXIDX_CANTUNWIND, 4); + } else if (ExTab) { + // Emit a reference to the unwind opcodes in the ".ARM.extab" section. + const MCSymbolRefExpr *ExTabEntryRef = + MCSymbolRefExpr::create(ExTab, + MCSymbolRefExpr::VK_ARM_PREL31, + getContext()); + EmitValue(ExTabEntryRef, 4); + } else { + // For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in + // the second word of exception index table entry. The size of the unwind + // opcodes should always be 4 bytes.
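+ // As an illustrative sketch: if the finalized buffer were + // {0xB0, 0xB0, 0xB0, 0x80} -- EHABI FINISH padding with the compact pr0 + // header in the top byte -- the packed word would be 0x80B0B0B0.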
+ assert(PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0 && + "Compact model must use __aeabi_unwind_cpp_pr0 as personality"); + assert(Opcodes.size() == 4u && + "Unwind opcode size for __aeabi_unwind_cpp_pr0 must be equal to 4"); + uint64_t Intval = Opcodes[0] | + Opcodes[1] << 8 | + Opcodes[2] << 16 | + Opcodes[3] << 24; + EmitIntValue(Intval, Opcodes.size()); + } + + // Switch to the section containing FnStart + SwitchSection(&FnStart->getSection()); + + // Clean exception handling frame information + Reset(); +} + +void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; } + +// Add the R_ARM_NONE fixup at the same position +void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) { + const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name); + + const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::create( + PersonalitySym, MCSymbolRefExpr::VK_ARM_NONE, getContext()); + + visitUsedExpr(*PersonalityRef); + MCDataFragment *DF = getOrCreateDataFragment(); + DF->getFixups().push_back(MCFixup::create(DF->getContents().size(), + PersonalityRef, + MCFixup::getKindForSize(4, false))); +} + +void ARMELFStreamer::FlushPendingOffset() { + if (PendingOffset != 0) { + UnwindOpAsm.EmitSPOffset(-PendingOffset); + PendingOffset = 0; + } +} + +void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) { + // Emit the unwind opcode to restore $sp. + if (UsedFP) { + const MCRegisterInfo *MRI = getContext().getRegisterInfo(); + int64_t LastRegSaveSPOffset = SPOffset - PendingOffset; + UnwindOpAsm.EmitSPOffset(LastRegSaveSPOffset - FPOffset); + UnwindOpAsm.EmitSetSP(MRI->getEncodingValue(FPReg)); + } else { + FlushPendingOffset(); + } + + // Finalize the unwind opcode sequence + UnwindOpAsm.Finalize(PersonalityIndex, Opcodes); + + // For compact model 0, we have to emit the unwind opcodes in the .ARM.exidx + // section. Thus, we don't have to create an entry in the .ARM.extab + // section. + if (NoHandlerData && PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0) + return; + + // Switch to .ARM.extab section. + SwitchToExTabSection(*FnStart); + + // Create .ARM.extab label for offset in .ARM.exidx + assert(!ExTab); + ExTab = getContext().createTempSymbol(); + EmitLabel(ExTab); + + // Emit personality + if (Personality) { + const MCSymbolRefExpr *PersonalityRef = + MCSymbolRefExpr::create(Personality, + MCSymbolRefExpr::VK_ARM_PREL31, + getContext()); + + EmitValue(PersonalityRef, 4); + } + + // Emit unwind opcodes + assert((Opcodes.size() % 4) == 0 && + "Unwind opcode size for __aeabi_unwind_cpp_pr0 must be a multiple of 4"); + for (unsigned I = 0; I != Opcodes.size(); I += 4) { + uint64_t Intval = Opcodes[I] | + Opcodes[I + 1] << 8 | + Opcodes[I + 2] << 16 | + Opcodes[I + 3] << 24; + EmitIntValue(Intval, 4); + } + + // According to ARM EHABI section 9.2, if the __aeabi_unwind_cpp_pr1() or + // __aeabi_unwind_cpp_pr2() is used, then the handler data must be emitted + // after the unwind opcodes. The handler data consists of several 32-bit + // words, and should be terminated by zero. + // + // In case the .handlerdata directive is not specified by the + // programmer, we should emit zero to terminate the handler data.
+ if (NoHandlerData && !Personality) + EmitIntValue(0, 4); +} + +void ARMELFStreamer::emitHandlerData() { FlushUnwindOpcodes(false); } + +void ARMELFStreamer::emitPersonality(const MCSymbol *Per) { + Personality = Per; + UnwindOpAsm.setPersonality(Per); +} + +void ARMELFStreamer::emitPersonalityIndex(unsigned Index) { + assert(Index < ARM::EHABI::NUM_PERSONALITY_INDEX && "invalid index"); + PersonalityIndex = Index; +} + +void ARMELFStreamer::emitSetFP(unsigned NewFPReg, unsigned NewSPReg, + int64_t Offset) { + assert((NewSPReg == ARM::SP || NewSPReg == FPReg) && + "the operand of .setfp directive should be either $sp or $fp"); + + UsedFP = true; + FPReg = NewFPReg; + + if (NewSPReg == ARM::SP) + FPOffset = SPOffset + Offset; + else + FPOffset += Offset; +} + +void ARMELFStreamer::emitMovSP(unsigned Reg, int64_t Offset) { + assert((Reg != ARM::SP && Reg != ARM::PC) && + "the operand of .movsp cannot be either sp or pc"); + assert(FPReg == ARM::SP && "current FP must be SP"); + + FlushPendingOffset(); + + FPReg = Reg; + FPOffset = SPOffset + Offset; + + const MCRegisterInfo *MRI = getContext().getRegisterInfo(); + UnwindOpAsm.EmitSetSP(MRI->getEncodingValue(FPReg)); +} + +void ARMELFStreamer::emitPad(int64_t Offset) { + // Track the change of the $sp offset + SPOffset -= Offset; + + // To squash multiple .pad directives, we should delay the unwind opcode + // until the .save, .vsave, .handlerdata, or .fnend directives. + PendingOffset -= Offset; +} + +void ARMELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList, + bool IsVector) { + // Collect the registers in the register list + unsigned Count = 0; + uint32_t Mask = 0; + const MCRegisterInfo *MRI = getContext().getRegisterInfo(); + for (size_t i = 0; i < RegList.size(); ++i) { + unsigned Reg = MRI->getEncodingValue(RegList[i]); + assert(Reg < (IsVector ? 32U : 16U) && "Register out of range"); + unsigned Bit = (1u << Reg); + if ((Mask & Bit) == 0) { + Mask |= Bit; + ++Count; + } + } + + // Track the change of the $sp offset: For the .save directive, the + // corresponding push instruction will decrease the $sp by (4 * Count). + // For the .vsave directive, the corresponding vpush instruction will + // decrease $sp by (8 * Count). + SPOffset -= Count * (IsVector ? 8 : 4); + + // Emit the opcode + FlushPendingOffset(); + if (IsVector) + UnwindOpAsm.EmitVFPRegSave(Mask); + else + UnwindOpAsm.EmitRegSave(Mask); +} + +void ARMELFStreamer::emitUnwindRaw(int64_t Offset, + const SmallVectorImpl<uint8_t> &Opcodes) { + FlushPendingOffset(); + SPOffset = SPOffset - Offset; + UnwindOpAsm.EmitRaw(Opcodes); +} + +namespace llvm { + +MCTargetStreamer *createARMTargetAsmStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm) { + return new ARMTargetAsmStreamer(S, OS, *InstPrint, isVerboseAsm); +} + +MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S) { + return new ARMTargetStreamer(S); +} + +MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S, + const MCSubtargetInfo &STI) { + const Triple &TT = STI.getTargetTriple(); + if (TT.isOSBinFormatELF()) + return new ARMTargetELFStreamer(S); + return new ARMTargetStreamer(S); +} + +MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool IsThumb) { + ARMELFStreamer *S = new ARMELFStreamer(Context, TAB, OS, Emitter, IsThumb); + // FIXME: This should eventually end up somewhere else where more + // intelligent flag decisions can be made.
For now we are just maintaining + // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default. + S->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5); + + if (RelaxAll) + S->getAssembler().setRelaxAll(true); + return S; + } + +} + + diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h new file mode 100644 index 0000000..46ba571 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h @@ -0,0 +1,110 @@ +//===-- ARMFixupKinds.h - ARM Specific Fixup Entries ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMFIXUPKINDS_H +#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMFIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace ARM { +enum Fixups { + // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol + // addresses + fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind, + + // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with + // the 16-bit halfwords reordered. + fixup_t2_ldst_pcrel_12, + + // fixup_arm_pcrel_10_unscaled - 10-bit PC relative relocation for symbol + // addresses used in LDRD/LDRH/LDRB/etc. instructions. All bits are encoded. + fixup_arm_pcrel_10_unscaled, + // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses + // used in VFP instructions where the lower 2 bits are not encoded + // (so it's encoded as an 8-bit immediate). + fixup_arm_pcrel_10, + // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for + // the short-swapped encoding of Thumb2 instructions. + fixup_t2_pcrel_10, + // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol + // addresses where the lower 2 bits are not encoded (so it's encoded as an + // 8-bit immediate). + fixup_thumb_adr_pcrel_10, + // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR + // instruction. + fixup_arm_adr_pcrel_12, + // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR + // instruction. + fixup_t2_adr_pcrel_12, + // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch + // instructions. + fixup_arm_condbranch, + // fixup_arm_uncondbranch - 24-bit PC relative relocation for unconditional + // branch instructions. + fixup_arm_uncondbranch, + // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct + // conditional branch instructions. + fixup_t2_condbranch, + // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct + // unconditional branch instructions. + fixup_t2_uncondbranch, + + // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions. + fixup_arm_thumb_br, + + // The following fixups handle the ARM BL instructions. These can be + // conditionalised; however, the ARM ELF ABI requires a different relocation + // in that case: R_ARM_JUMP24 instead of R_ARM_CALL. The difference is that + // R_ARM_CALL is allowed to change the instruction to a BLX inline, which has + // no conditional version; R_ARM_JUMP24 would have to insert a veneer. + // + // MachO does not draw a distinction between the two cases, so it will treat + // fixup_arm_uncondbl and fixup_arm_condbl as identical fixups. + + // fixup_arm_uncondbl - Fixup for unconditional ARM BL instructions.
+ fixup_arm_uncondbl, + + // fixup_arm_condbl - Fixup for ARM BL instructions with nontrivial + // conditionalisation. + fixup_arm_condbl, + + // fixup_arm_blx - Fixup for ARM BLX instructions. + fixup_arm_blx, + + // fixup_arm_thumb_bl - Fixup for Thumb BL instructions. + fixup_arm_thumb_bl, + + // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions. + fixup_arm_thumb_blx, + + // fixup_arm_thumb_cb - Fixup for Thumb branch instructions. + fixup_arm_thumb_cb, + + // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs. + fixup_arm_thumb_cp, + + // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions. + fixup_arm_thumb_bcc, + + // The next four are for the movt/movw pairs; + // the 16-bit imm field is split into imm{15-12} and imm{11-0} + fixup_arm_movt_hi16, // :upper16: + fixup_arm_movw_lo16, // :lower16: + fixup_t2_movt_hi16, // :upper16: + fixup_t2_movw_lo16, // :lower16: + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp new file mode 100644 index 0000000..bda37f6 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -0,0 +1,115 @@ +//===-- ARMMCAsmInfo.cpp - ARM asm properties -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the ARMMCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "ARMMCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +void ARMMCAsmInfoDarwin::anchor() { } + +ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) { + if ((TheTriple.getArch() == Triple::armeb) || + (TheTriple.getArch() == Triple::thumbeb)) + IsLittleEndian = false; + + Data64bitsDirective = nullptr; + CommentString = "@"; + Code16Directive = ".code\t16"; + Code32Directive = ".code\t32"; + UseDataRegionDirectives = true; + + SupportsDebugInformation = true; + + // Exceptions handling + ExceptionsType = TheTriple.isOSDarwin() && !TheTriple.isWatchOS() + ? ExceptionHandling::SjLj + : ExceptionHandling::DwarfCFI; + + UseIntegratedAssembler = true; +} + +void ARMELFMCAsmInfo::anchor() { } + +ARMELFMCAsmInfo::ARMELFMCAsmInfo(const Triple &TheTriple) { + if ((TheTriple.getArch() == Triple::armeb) || + (TheTriple.getArch() == Triple::thumbeb)) + IsLittleEndian = false; + + // .comm alignment is in bytes, but .align is pow-2.
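+ // For example, with AlignmentIsInBytes = false a request for 8-byte + // alignment is printed as ".align 3" (a power-of-two exponent), matching + // gas behaviour on ARM.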
+ AlignmentIsInBytes = false; + + Data64bitsDirective = nullptr; + CommentString = "@"; + Code16Directive = ".code\t16"; + Code32Directive = ".code\t32"; + + SupportsDebugInformation = true; + + // Exceptions handling + switch (TheTriple.getOS()) { + case Triple::Bitrig: + case Triple::NetBSD: + ExceptionsType = ExceptionHandling::DwarfCFI; + break; + default: + ExceptionsType = ExceptionHandling::ARM; + break; + } + + // foo(plt) instead of foo@plt + UseParensForSymbolVariant = true; + + UseIntegratedAssembler = true; +} + +void ARMELFMCAsmInfo::setUseIntegratedAssembler(bool Value) { + UseIntegratedAssembler = Value; + if (!UseIntegratedAssembler) { + // gas doesn't handle VFP register names in cfi directives, + // so don't use register names with external assembler. + // See https://sourceware.org/bugzilla/show_bug.cgi?id=16694 + DwarfRegNumForCFI = true; + } +} + +void ARMCOFFMCAsmInfoMicrosoft::anchor() { } + +ARMCOFFMCAsmInfoMicrosoft::ARMCOFFMCAsmInfoMicrosoft() { + AlignmentIsInBytes = false; + + PrivateGlobalPrefix = "$M"; + PrivateLabelPrefix = "$M"; +} + +void ARMCOFFMCAsmInfoGNU::anchor() { } + +ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() { + AlignmentIsInBytes = false; + HasSingleParameterDotFile = true; + + CommentString = "@"; + Code16Directive = ".code\t16"; + Code32Directive = ".code\t32"; + PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; + + SupportsDebugInformation = true; + ExceptionsType = ExceptionHandling::None; + UseParensForSymbolVariant = true; + + UseIntegratedAssembler = false; + DwarfRegNumForCFI = true; +} + diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h new file mode 100644 index 0000000..5e54816 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h @@ -0,0 +1,56 @@ +//===-- ARMMCAsmInfo.h - ARM asm properties --------------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the ARMMCAsmInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H +#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H + +#include "llvm/MC/MCAsmInfoCOFF.h" +#include "llvm/MC/MCAsmInfoDarwin.h" +#include "llvm/MC/MCAsmInfoELF.h" + +namespace llvm { +class Triple; + +class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit ARMMCAsmInfoDarwin(const Triple &TheTriple); +}; + +class ARMELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit ARMELFMCAsmInfo(const Triple &TT); + + void setUseIntegratedAssembler(bool Value) override; +}; + +class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + void anchor() override; + +public: + explicit ARMCOFFMCAsmInfoMicrosoft(); +}; + +class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF { + void anchor() override; + +public: + explicit ARMCOFFMCAsmInfoGNU(); +}; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp new file mode 100644 index 0000000..b885783 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -0,0 +1,1695 @@ +//===-- ARM/ARMMCCodeEmitter.cpp - Convert ARM code to machine code -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARMMCCodeEmitter class. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/ARMMCTargetDesc.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMFixupKinds.h" +#include "MCTargetDesc/ARMMCExpr.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "mccodeemitter" + +STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); +STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created."); + +namespace { +class ARMMCCodeEmitter : public MCCodeEmitter { + ARMMCCodeEmitter(const ARMMCCodeEmitter &) = delete; + void operator=(const ARMMCCodeEmitter &) = delete; + const MCInstrInfo &MCII; + const MCContext &CTX; + bool IsLittleEndian; + +public: + ARMMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx, bool IsLittle) + : MCII(mcii), CTX(ctx), IsLittleEndian(IsLittle) { + } + + ~ARMMCCodeEmitter() override {} + + bool isThumb(const MCSubtargetInfo &STI) const { + return STI.getFeatureBits()[ARM::ModeThumb]; + } + bool isThumb2(const MCSubtargetInfo &STI) const { + return isThumb(STI) && STI.getFeatureBits()[ARM::FeatureThumb2]; + } + bool isTargetMachO(const MCSubtargetInfo &STI) const { + const Triple &TT = STI.getTargetTriple(); + return TT.isOSBinFormatMachO(); + } + + unsigned getMachineSoImmOpValue(unsigned SoImm) const; + + // getBinaryCodeForInstr - TableGen'erated function for getting the + // binary encoding for an instruction. 
+ uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getHiLo16ImmOpValue - Return the encoding for the hi / low 16-bit of + /// the specified operand. This is used for operands with :lower16: and + /// :upper16: prefixes. + uint32_t getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + bool EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, + unsigned &Reg, unsigned &Imm, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getThumbBLTargetOpValue - Return encoding info for Thumb immediate + /// BL branch target. + uint32_t getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate + /// BLX branch target. + uint32_t getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getThumbBRTargetOpValue - Return encoding info for Thumb branch target. + uint32_t getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target. + uint32_t getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getThumbCBTargetOpValue - Return encoding info for Thumb branch target. + uint32_t getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getBranchTargetOpValue - Return encoding info for 24-bit immediate + /// branch target. + uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit + /// immediate Thumb2 direct branch target. + uint32_t getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate + /// branch target. + uint32_t getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getAdrLabelOpValue - Return encoding info for 12-bit immediate + /// ADR label target. 
+ uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + + /// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' + /// operand. + uint32_t getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getThumbAddrModeRegRegOpValue - Return encoding for 'reg + reg' operand. + uint32_t getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getT2AddrModeImm8s4OpValue - Return encoding info for 'reg +/- imm8<<2' + /// operand. + uint32_t getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getT2AddrModeImm0_1020s4OpValue - Return encoding info for 'reg + imm8<<2' + /// operand. + uint32_t getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getT2Imm8s4OpValue - Return encoding info for '+/- imm8<<2' + /// operand. + uint32_t getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + + /// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm' + /// operand as needed by load/store instructions. + uint32_t getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getLdStmModeOpValue - Return encoding for load/store multiple mode. + uint32_t getLdStmModeOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + ARM_AM::AMSubMode Mode = (ARM_AM::AMSubMode)MI.getOperand(OpIdx).getImm(); + switch (Mode) { + default: llvm_unreachable("Unknown addressing sub-mode!"); + case ARM_AM::da: return 0; + case ARM_AM::ia: return 1; + case ARM_AM::db: return 2; + case ARM_AM::ib: return 3; + } + } + /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value. + /// + unsigned getShiftOp(ARM_AM::ShiftOpc ShOpc) const { + switch (ShOpc) { + case ARM_AM::no_shift: + case ARM_AM::lsl: return 0; + case ARM_AM::lsr: return 1; + case ARM_AM::asr: return 2; + case ARM_AM::ror: + case ARM_AM::rrx: return 3; + } + llvm_unreachable("Invalid ShiftOpc!"); + } + + /// getAddrMode2OpValue - Return encoding for addrmode2 operands. + uint32_t getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getAddrMode2OffsetOpValue - Return encoding for am2offset operands. + uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getPostIdxRegOpValue - Return encoding for postidx_reg operands. + uint32_t getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getAddrMode3OffsetOpValue - Return encoding for am3offset operands. 
+  uint32_t getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  /// getAddrMode3OpValue - Return encoding for addrmode3 operands.
+  uint32_t getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getAddrModeThumbSPOpValue - Return encoding info for 'reg +/- imm12'
+  /// operand.
+  uint32_t getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  /// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
+  uint32_t getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
+  uint32_t getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getAddrMode5OpValue - Return encoding info for 'reg +/- imm8' operand.
+  uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getCCOutOpValue - Return encoding of the 's' bit.
+  unsigned getCCOutOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+    // The operand is either reg0 or CPSR. The 's' bit is encoded as '0' or
+    // '1' respectively.
+    return MI.getOperand(Op).getReg() == ARM::CPSR;
+  }
+
+  /// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value.
+  unsigned getSOImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+
+    const MCOperand &MO = MI.getOperand(Op);
+
+    // We expect MO to be an immediate or an expression; if it is an
+    // immediate, just encode the value. Otherwise, create a fixup.
+    if (MO.isExpr()) {
+      const MCExpr *Expr = MO.getExpr();
+      // In the instruction encoding this value always occupies the lowest
+      // 12 bits, so we don't have to perform any specific adjustments.
+      // Due to the requirements of relocatable records we have to use
+      // FK_Data_4. See ARMELFObjectWriter::ExplicitRelSym and
+      //     ARMELFObjectWriter::GetRelocTypeInner for more details.
+      MCFixupKind Kind = MCFixupKind(FK_Data_4);
+      Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+      return 0;
+    }
+
+    unsigned SoImm = MO.getImm();
+    int SoImmVal = ARM_AM::getSOImmVal(SoImm);
+    assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+    // Encode rotate_imm.
+    unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
+      << ARMII::SoRotImmShift;
+
+    // Encode immed_8.
+    Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
+    return Binary;
+  }
+
+  unsigned getModImmOpValue(const MCInst &MI, unsigned Op,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &ST) const {
+    const MCOperand &MO = MI.getOperand(Op);
+
+    // Support for fixups (MCFixup).
+    if (MO.isExpr()) {
+      const MCExpr *Expr = MO.getExpr();
+      // In the instruction encoding this value always occupies the lowest
+      // 12 bits, so we don't have to perform any specific adjustments.
+      // Due to the requirements of relocatable records we have to use
+      // FK_Data_4. See ARMELFObjectWriter::ExplicitRelSym and
+      //     ARMELFObjectWriter::GetRelocTypeInner for more details.
+ MCFixupKind Kind = MCFixupKind(FK_Data_4); + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + return 0; + } + + // Immediate is already in its encoded format + return MO.getImm(); + } + + /// getT2SOImmOpValue - Return an encoded 12-bit shifted-immediate value. + unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + unsigned SoImm = MI.getOperand(Op).getImm(); + unsigned Encoded = ARM_AM::getT2SOImmVal(SoImm); + assert(Encoded != ~0U && "Not a Thumb2 so_imm value?"); + return Encoded; + } + + unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getSORegOpValue - Return an encoded so_reg shifted register value. + unsigned getSORegRegOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getSORegImmOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getT2SORegOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + unsigned getNEONVcvtImm32OpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 64 - MI.getOperand(Op).getImm(); + } + + unsigned getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + unsigned getShiftRight8Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getShiftRight16Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getShiftRight32Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getShiftRight64Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + unsigned NEONThumb2DataIPostEncoder(const MCInst &MI, + unsigned EncodedValue, + const MCSubtargetInfo &STI) const; + unsigned NEONThumb2LoadStorePostEncoder(const 
MCInst &MI,
+                                            unsigned EncodedValue,
+                                            const MCSubtargetInfo &STI) const;
+  unsigned NEONThumb2DupPostEncoder(const MCInst &MI,
+                                    unsigned EncodedValue,
+                                    const MCSubtargetInfo &STI) const;
+  unsigned NEONThumb2V8PostEncoder(const MCInst &MI,
+                                   unsigned EncodedValue,
+                                   const MCSubtargetInfo &STI) const;
+
+  unsigned VFPThumb2PostEncoder(const MCInst &MI,
+                                unsigned EncodedValue,
+                                const MCSubtargetInfo &STI) const;
+
+  void EmitByte(unsigned char C, raw_ostream &OS) const {
+    OS << (char)C;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
+    // Output the constant in the target's byte order, one byte at a time.
+    for (unsigned i = 0; i != Size; ++i) {
+      unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
+      EmitByte((Val >> Shift) & 0xff, OS);
+    }
+  }
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              MCContext &Ctx) {
+  return new ARMMCCodeEmitter(MCII, Ctx, true);
+}
+
+MCCodeEmitter *llvm::createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              MCContext &Ctx) {
+  return new ARMMCCodeEmitter(MCII, Ctx, false);
+}
+
+/// NEONThumb2DataIPostEncoder - Post-process encoded NEON data-processing
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2DataIPostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue,
+                                                 const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    // NEON Thumb2 data-processing encodings are very simple: bit 24 is moved
+    // to bit 12 of the high half-word (i.e. bit 28), and bits 27-24 are
+    // set to 1111.
+    unsigned Bit24 = EncodedValue & 0x01000000;
+    unsigned Bit28 = Bit24 << 4;
+    EncodedValue &= 0xEFFFFFFF;
+    EncodedValue |= Bit28;
+    EncodedValue |= 0x0F000000;
+  }
+
+  return EncodedValue;
+}
+
+/// NEONThumb2LoadStorePostEncoder - Post-process encoded NEON load/store
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+                                                  unsigned EncodedValue,
+                                                  const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    EncodedValue &= 0xF0FFFFFF;
+    EncodedValue |= 0x09000000;
+  }
+
+  return EncodedValue;
+}
+
+/// NEONThumb2DupPostEncoder - Post-process encoded NEON vdup
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI,
+                                                  unsigned EncodedValue,
+                                                  const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    EncodedValue &= 0x00FFFFFF;
+    EncodedValue |= 0xEE000000;
+  }
+
+  return EncodedValue;
+}
+
+/// Post-process encoded NEON v8 instructions, and rewrite them to Thumb2 form
+/// if we are in Thumb2.
+unsigned ARMMCCodeEmitter::NEONThumb2V8PostEncoder(const MCInst &MI,
+                                                   unsigned EncodedValue,
+                                                   const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    EncodedValue |= 0xC000000; // Set bits 27-26
+  }
+
+  return EncodedValue;
+}
+
+/// VFPThumb2PostEncoder - Post-process encoded VFP instructions and rewrite
+/// them to their Thumb2 form if we are currently in Thumb2 mode.
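+/// For example (illustrative): an ARM-mode VFP encoding carrying the AL
+/// condition (0b1110) in bits 31-28 is unchanged by the rewrite below, since
+/// (Encoded & 0x0FFFFFFF) | 0xE0000000 re-inserts 0b1110 in the top nibble.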
+unsigned ARMMCCodeEmitter:: +VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const { + if (isThumb2(STI)) { + EncodedValue &= 0x0FFFFFFF; + EncodedValue |= 0xE0000000; + } + return EncodedValue; +} + +/// getMachineOpValue - Return binary encoding of operand. If the machine +/// operand requires relocation, record the relocation and return zero. +unsigned ARMMCCodeEmitter:: +getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg); + + // Q registers are encoded as 2x their register number. + switch (Reg) { + default: + return RegNo; + case ARM::Q0: case ARM::Q1: case ARM::Q2: case ARM::Q3: + case ARM::Q4: case ARM::Q5: case ARM::Q6: case ARM::Q7: + case ARM::Q8: case ARM::Q9: case ARM::Q10: case ARM::Q11: + case ARM::Q12: case ARM::Q13: case ARM::Q14: case ARM::Q15: + return 2 * RegNo; + } + } else if (MO.isImm()) { + return static_cast<unsigned>(MO.getImm()); + } else if (MO.isFPImm()) { + return static_cast<unsigned>(APFloat(MO.getFPImm()) + .bitcastToAPInt().getHiBits(32).getLimitedValue()); + } + + llvm_unreachable("Unable to encode MCOperand!"); +} + +/// getAddrModeImmOpValue - Return encoding info for 'reg +/- imm' operand. +bool ARMMCCodeEmitter:: +EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg, + unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + + Reg = CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); + + int32_t SImm = MO1.getImm(); + bool isAdd = true; + + // Special value for #-0 + if (SImm == INT32_MIN) { + SImm = 0; + isAdd = false; + } + + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (SImm < 0) { + SImm = -SImm; + isAdd = false; + } + + Imm = SImm; + return isAdd; +} + +/// getBranchTargetOpValue - Helper function to get the branch target operand, +/// which is either an immediate or requires a fixup. +static uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + unsigned FixupKind, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) return MO.getImm(); + assert(MO.isExpr() && "Unexpected branch target type!"); + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = MCFixupKind(FixupKind); + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + + // All of the information is in the fixup. + return 0; +} + +// Thumb BL and BLX use a strange offset encoding where bits 22 and 21 are +// determined by negating them and XOR'ing them with bit 23. +static int32_t encodeThumbBLOffset(int32_t offset) { + offset >>= 1; + uint32_t S = (offset & 0x800000) >> 23; + uint32_t J1 = (offset & 0x400000) >> 22; + uint32_t J2 = (offset & 0x200000) >> 21; + J1 = (~J1 & 0x1); + J2 = (~J2 & 0x1); + J1 ^= S; + J2 ^= S; + + offset &= ~0x600000; + offset |= J1 << 22; + offset |= J2 << 21; + + return offset; +} + +/// getThumbBLTargetOpValue - Return encoding info for immediate branch target. 
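+/// For example (illustrative): a byte offset of +4096 becomes the halfword
+/// offset 0x800; encodeThumbBLOffset() above then yields J1 = J2 = 1 (S is 0
+/// and both bits are inverted), giving 0x600800.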
+uint32_t ARMMCCodeEmitter:: +getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl, + Fixups, STI); + return encodeThumbBLOffset(MO.getImm()); +} + +/// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate +/// BLX branch target. +uint32_t ARMMCCodeEmitter:: +getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx, + Fixups, STI); + return encodeThumbBLOffset(MO.getImm()); +} + +/// getThumbBRTargetOpValue - Return encoding info for Thumb branch target. +uint32_t ARMMCCodeEmitter:: +getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br, + Fixups, STI); + return (MO.getImm() >> 1); +} + +/// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target. +uint32_t ARMMCCodeEmitter:: +getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc, + Fixups, STI); + return (MO.getImm() >> 1); +} + +/// getThumbCBTargetOpValue - Return encoding info for Thumb branch target. +uint32_t ARMMCCodeEmitter:: +getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups, STI); + return (MO.getImm() >> 1); +} + +/// Return true if this branch has a non-always predication +static bool HasConditionalBranch(const MCInst &MI) { + int NumOp = MI.getNumOperands(); + if (NumOp >= 2) { + for (int i = 0; i < NumOp-1; ++i) { + const MCOperand &MCOp1 = MI.getOperand(i); + const MCOperand &MCOp2 = MI.getOperand(i + 1); + if (MCOp1.isImm() && MCOp2.isReg() && + (MCOp2.getReg() == 0 || MCOp2.getReg() == ARM::CPSR)) { + if (ARMCC::CondCodes(MCOp1.getImm()) != ARMCC::AL) + return true; + } + } + } + return false; +} + +/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch +/// target. +uint32_t ARMMCCodeEmitter:: +getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // FIXME: This really, really shouldn't use TargetMachine. We don't want + // coupling between MC and TM anywhere we can help it. + if (isThumb2(STI)) + return + ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_condbranch, Fixups, STI); + return getARMBranchTargetOpValue(MI, OpIdx, Fixups, STI); +} + +/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch +/// target. 
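+/// (Immediate ARM-mode branch targets arrive as byte offsets and are encoded
+/// as word offsets, hence the '>> 2' in the definitions below.)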
+uint32_t ARMMCCodeEmitter:: +getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) { + if (HasConditionalBranch(MI)) + return ::getBranchTargetOpValue(MI, OpIdx, + ARM::fixup_arm_condbranch, Fixups, STI); + return ::getBranchTargetOpValue(MI, OpIdx, + ARM::fixup_arm_uncondbranch, Fixups, STI); + } + + return MO.getImm() >> 2; +} + +uint32_t ARMMCCodeEmitter:: +getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) { + if (HasConditionalBranch(MI)) + return ::getBranchTargetOpValue(MI, OpIdx, + ARM::fixup_arm_condbl, Fixups, STI); + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_uncondbl, Fixups, STI); + } + + return MO.getImm() >> 2; +} + +uint32_t ARMMCCodeEmitter:: +getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_blx, Fixups, STI); + + return MO.getImm() >> 1; +} + +/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit +/// immediate branch target. +uint32_t ARMMCCodeEmitter:: +getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + unsigned Val = 0; + const MCOperand MO = MI.getOperand(OpIdx); + + if(MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups, STI); + else + Val = MO.getImm() >> 1; + + bool I = (Val & 0x800000); + bool J1 = (Val & 0x400000); + bool J2 = (Val & 0x200000); + if (I ^ J1) + Val &= ~0x400000; + else + Val |= 0x400000; + + if (I ^ J2) + Val &= ~0x200000; + else + Val |= 0x200000; + + return Val; +} + +/// getAdrLabelOpValue - Return encoding info for 12-bit shifted-immediate +/// ADR label target. +uint32_t ARMMCCodeEmitter:: +getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12, + Fixups, STI); + int64_t offset = MO.getImm(); + uint32_t Val = 0x2000; + + int SoImmVal; + if (offset == INT32_MIN) { + Val = 0x1000; + SoImmVal = 0; + } else if (offset < 0) { + Val = 0x1000; + offset *= -1; + SoImmVal = ARM_AM::getSOImmVal(offset); + if(SoImmVal == -1) { + Val = 0x2000; + offset *= -1; + SoImmVal = ARM_AM::getSOImmVal(offset); + } + } else { + SoImmVal = ARM_AM::getSOImmVal(offset); + if(SoImmVal == -1) { + Val = 0x1000; + offset *= -1; + SoImmVal = ARM_AM::getSOImmVal(offset); + } + } + + assert(SoImmVal != -1 && "Not a valid so_imm value!"); + + Val |= SoImmVal; + return Val; +} + +/// getT2AdrLabelOpValue - Return encoding info for 12-bit immediate ADR label +/// target. 
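+/// (Negative offsets are encoded as their magnitude with bit 12 set, the
+/// same add/sub convention the ARM-mode getAdrLabelOpValue above expresses
+/// through its 0x1000/0x2000 values.)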
+uint32_t ARMMCCodeEmitter:: +getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12, + Fixups, STI); + int32_t Val = MO.getImm(); + if (Val == INT32_MIN) + Val = 0x1000; + else if (Val < 0) { + Val *= -1; + Val |= 0x1000; + } + return Val; +} + +/// getThumbAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label +/// target. +uint32_t ARMMCCodeEmitter:: +getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10, + Fixups, STI); + return MO.getImm(); +} + +/// getThumbAddrModeRegRegOpValue - Return encoding info for 'reg + reg' +/// operand. +uint32_t ARMMCCodeEmitter:: +getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &, + const MCSubtargetInfo &STI) const { + // [Rn, Rm] + // {5-3} = Rm + // {2-0} = Rn + const MCOperand &MO1 = MI.getOperand(OpIdx); + const MCOperand &MO2 = MI.getOperand(OpIdx + 1); + unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg()); + unsigned Rm = CTX.getRegisterInfo()->getEncodingValue(MO2.getReg()); + return (Rm << 3) | Rn; +} + +/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand. +uint32_t ARMMCCodeEmitter:: +getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // {17-13} = reg + // {12} = (U)nsigned (add == '1', sub == '0') + // {11-0} = imm12 + unsigned Reg, Imm12; + bool isAdd = true; + // If The first operand isn't a register, we have a label reference. + const MCOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) { + Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC. + Imm12 = 0; + + if (MO.isExpr()) { + const MCExpr *Expr = MO.getExpr(); + isAdd = false ; // 'U' bit is set as part of the fixup. + + MCFixupKind Kind; + if (isThumb2(STI)) + Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12); + else + Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12); + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + + ++MCNumCPRelocations; + } else { + Reg = ARM::PC; + int32_t Offset = MO.getImm(); + if (Offset == INT32_MIN) { + Offset = 0; + isAdd = false; + } else if (Offset < 0) { + Offset *= -1; + isAdd = false; + } + Imm12 = Offset; + } + } else + isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups, STI); + + uint32_t Binary = Imm12 & 0xfff; + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (isAdd) + Binary |= (1 << 12); + Binary |= (Reg << 13); + return Binary; +} + +/// getT2Imm8s4OpValue - Return encoding info for +/// '+/- imm8<<2' operand. +uint32_t ARMMCCodeEmitter:: +getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // FIXME: The immediate operand should have already been encoded like this + // before ever getting here. The encoder method should just need to combine + // the MI operands for the register and the offset into a single + // representation for the complex operand in the .td file. This isn't just + // style, unfortunately. As-is, we can't represent the distinct encoding + // for #-0. 
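+  //
+  // Example (illustrative): an offset of -20 encodes as U = 0, imm8 = 5;
+  // +20 encodes as U = 1, imm8 = 5 (both scaled down by 4).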
+ + // {8} = (U)nsigned (add == '1', sub == '0') + // {7-0} = imm8 + int32_t Imm8 = MI.getOperand(OpIdx).getImm(); + bool isAdd = Imm8 >= 0; + + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (Imm8 < 0) + Imm8 = -(uint32_t)Imm8; + + // Scaled by 4. + Imm8 /= 4; + + uint32_t Binary = Imm8 & 0xff; + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (isAdd) + Binary |= (1 << 8); + return Binary; +} + +/// getT2AddrModeImm8s4OpValue - Return encoding info for +/// 'reg +/- imm8<<2' operand. +uint32_t ARMMCCodeEmitter:: +getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // {12-9} = reg + // {8} = (U)nsigned (add == '1', sub == '0') + // {7-0} = imm8 + unsigned Reg, Imm8; + bool isAdd = true; + // If The first operand isn't a register, we have a label reference. + const MCOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) { + Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC. + Imm8 = 0; + isAdd = false ; // 'U' bit is set as part of the fixup. + + assert(MO.isExpr() && "Unexpected machine operand type!"); + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = MCFixupKind(ARM::fixup_t2_pcrel_10); + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + + ++MCNumCPRelocations; + } else + isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI); + + // FIXME: The immediate operand should have already been encoded like this + // before ever getting here. The encoder method should just need to combine + // the MI operands for the register and the offset into a single + // representation for the complex operand in the .td file. This isn't just + // style, unfortunately. As-is, we can't represent the distinct encoding + // for #-0. + uint32_t Binary = (Imm8 >> 2) & 0xff; + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (isAdd) + Binary |= (1 << 8); + Binary |= (Reg << 9); + return Binary; +} + +/// getT2AddrModeImm0_1020s4OpValue - Return encoding info for +/// 'reg + imm8<<2' operand. +uint32_t ARMMCCodeEmitter:: +getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // {11-8} = reg + // {7-0} = imm8 + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + unsigned Reg = CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); + unsigned Imm8 = MO1.getImm(); + return (Reg << 8) | Imm8; +} + +uint32_t +ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // {20-16} = imm{15-12} + // {11-0} = imm{11-0} + const MCOperand &MO = MI.getOperand(OpIdx); + if (MO.isImm()) + // Hi / lo 16 bits already extracted during earlier passes. + return static_cast<unsigned>(MO.getImm()); + + // Handle :upper16: and :lower16: assembly prefixes. 
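+  // E.g. (illustrative): for 'movw r0, :lower16:foo' the operand is an
+  // ARMMCExpr wrapping the hypothetical symbol 'foo'; if 'foo' folds to a
+  // constant the 16 bits are returned directly, otherwise a movw fixup is
+  // recorded.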
+  const MCExpr *E = MO.getExpr();
+  MCFixupKind Kind;
+  if (E->getKind() == MCExpr::Target) {
+    const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E);
+    E = ARM16Expr->getSubExpr();
+
+    if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(E)) {
+      const int64_t Value = MCE->getValue();
+      if (Value > UINT32_MAX)
+        report_fatal_error("constant value truncated (limited to 32-bit)");
+
+      switch (ARM16Expr->getKind()) {
+      case ARMMCExpr::VK_ARM_HI16:
+        return (int32_t(Value) & 0xffff0000) >> 16;
+      case ARMMCExpr::VK_ARM_LO16:
+        return (int32_t(Value) & 0x0000ffff);
+      default: llvm_unreachable("Unsupported ARMFixup");
+      }
+    }
+
+    switch (ARM16Expr->getKind()) {
+    default: llvm_unreachable("Unsupported ARMFixup");
+    case ARMMCExpr::VK_ARM_HI16:
+      Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movt_hi16
+                                       : ARM::fixup_arm_movt_hi16);
+      break;
+    case ARMMCExpr::VK_ARM_LO16:
+      Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movw_lo16
+                                       : ARM::fixup_arm_movw_lo16);
+      break;
+    }
+
+    Fixups.push_back(MCFixup::create(0, E, Kind, MI.getLoc()));
+    return 0;
+  }
+  // If the expression doesn't have :upper16: or :lower16: on it, it's just a
+  // plain immediate expression. Previously such expressions evaluated to the
+  // lower 16 bits regardless of whether we had a movt or a movw, but that
+  // led to misleading results. This is now disallowed in the AsmParser in
+  // validateInstruction(), so it should never happen here.
+  llvm_unreachable("expression without :upper16: or :lower16:");
+}
+
+uint32_t ARMMCCodeEmitter::
+getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  unsigned Rm = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+  unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm());
+  bool isAdd = ARM_AM::getAM2Op(MO2.getImm()) == ARM_AM::add;
+  ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm());
+  unsigned SBits = getShiftOp(ShOp);
+
+  // While "lsr #32" and "asr #32" exist, they are encoded with a 0 in the
+  // shift amount. However, that would be an easy mistake to make, so check
+  // for it here.
+ assert((ShImm & ~0x1f) == 0 && "Out of range shift amount"); + + // {16-13} = Rn + // {12} = isAdd + // {11-0} = shifter + // {3-0} = Rm + // {4} = 0 + // {6-5} = type + // {11-7} = imm + uint32_t Binary = Rm; + Binary |= Rn << 13; + Binary |= SBits << 5; + Binary |= ShImm << 7; + if (isAdd) + Binary |= 1 << 12; + return Binary; +} + +uint32_t ARMMCCodeEmitter:: +getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // {17-14} Rn + // {13} 1 == imm12, 0 == Rm + // {12} isAdd + // {11-0} imm12/Rm + const MCOperand &MO = MI.getOperand(OpIdx); + unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); + uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups, STI); + Binary |= Rn << 14; + return Binary; +} + +uint32_t ARMMCCodeEmitter:: +getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // {13} 1 == imm12, 0 == Rm + // {12} isAdd + // {11-0} imm12/Rm + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + unsigned Imm = MO1.getImm(); + bool isAdd = ARM_AM::getAM2Op(Imm) == ARM_AM::add; + bool isReg = MO.getReg() != 0; + uint32_t Binary = ARM_AM::getAM2Offset(Imm); + // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm12 + if (isReg) { + ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(Imm); + Binary <<= 7; // Shift amount is bits [11:7] + Binary |= getShiftOp(ShOp) << 5; // Shift type is bits [6:5] + Binary |= CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); // Rm is bits [3:0] + } + return Binary | (isAdd << 12) | (isReg << 13); +} + +uint32_t ARMMCCodeEmitter:: +getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // {4} isAdd + // {3-0} Rm + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + bool isAdd = MO1.getImm() != 0; + return CTX.getRegisterInfo()->getEncodingValue(MO.getReg()) | (isAdd << 4); +} + +uint32_t ARMMCCodeEmitter:: +getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // {9} 1 == imm8, 0 == Rm + // {8} isAdd + // {7-4} imm7_4/zero + // {3-0} imm3_0/Rm + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + unsigned Imm = MO1.getImm(); + bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add; + bool isImm = MO.getReg() == 0; + uint32_t Imm8 = ARM_AM::getAM3Offset(Imm); + // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8 + if (!isImm) + Imm8 = CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); + return Imm8 | (isAdd << 8) | (isImm << 9); +} + +uint32_t ARMMCCodeEmitter:: +getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // {13} 1 == imm8, 0 == Rm + // {12-9} Rn + // {8} isAdd + // {7-4} imm7_4/zero + // {3-0} imm3_0/Rm + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + const MCOperand &MO2 = MI.getOperand(OpIdx+2); + + // If The first operand isn't a register, we have a label reference. + if (!MO.isReg()) { + unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC. 
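+
+    // (The label's offset is materialized later by the fixup pushed below;
+    // only the PC base and the imm-form bit are encoded here.)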
+ + assert(MO.isExpr() && "Unexpected machine operand type!"); + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_pcrel_10_unscaled); + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + + ++MCNumCPRelocations; + return (Rn << 9) | (1 << 13); + } + unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); + unsigned Imm = MO2.getImm(); + bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add; + bool isImm = MO1.getReg() == 0; + uint32_t Imm8 = ARM_AM::getAM3Offset(Imm); + // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8 + if (!isImm) + Imm8 = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg()); + return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13); +} + +/// getAddrModeThumbSPOpValue - Encode the t_addrmode_sp operands. +uint32_t ARMMCCodeEmitter:: +getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // [SP, #imm] + // {7-0} = imm8 + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + assert(MI.getOperand(OpIdx).getReg() == ARM::SP && + "Unexpected base register!"); + + // The immediate is already shifted for the implicit zeroes, so no change + // here. + return MO1.getImm() & 0xff; +} + +/// getAddrModeISOpValue - Encode the t_addrmode_is# operands. +uint32_t ARMMCCodeEmitter:: +getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // [Rn, #imm] + // {7-3} = imm5 + // {2-0} = Rn + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); + unsigned Imm5 = MO1.getImm(); + return ((Imm5 & 0x1f) << 3) | Rn; +} + +/// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands. +uint32_t ARMMCCodeEmitter:: +getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups, STI); + return (MO.getImm() >> 2); +} + +/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm10' operand. +uint32_t ARMMCCodeEmitter:: +getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // {12-9} = reg + // {8} = (U)nsigned (add == '1', sub == '0') + // {7-0} = imm8 + unsigned Reg, Imm8; + bool isAdd; + // If The first operand isn't a register, we have a label reference. + const MCOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) { + Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC); // Rn is PC. + Imm8 = 0; + isAdd = false; // 'U' bit is handled as part of the fixup. + + assert(MO.isExpr() && "Unexpected machine operand type!"); + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind; + if (isThumb2(STI)) + Kind = MCFixupKind(ARM::fixup_t2_pcrel_10); + else + Kind = MCFixupKind(ARM::fixup_arm_pcrel_10); + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + + ++MCNumCPRelocations; + } else { + EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI); + isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add; + } + + uint32_t Binary = ARM_AM::getAM5Offset(Imm8); + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. 
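+  // E.g. (illustrative): [r2, #-24] encodes as Reg = 2, offset 6 words
+  // (24 >> 2), U = 0, i.e. Binary = (2 << 9) | 6.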
+ if (isAdd) + Binary |= (1 << 8); + Binary |= (Reg << 9); + return Binary; +} + +unsigned ARMMCCodeEmitter:: +getSORegRegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Sub-operands are [reg, reg, imm]. The first register is Rm, the reg to be + // shifted. The second is Rs, the amount to shift by, and the third specifies + // the type of the shift. + // + // {3-0} = Rm. + // {4} = 1 + // {6-5} = type + // {11-8} = Rs + // {7} = 0 + + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + const MCOperand &MO2 = MI.getOperand(OpIdx + 2); + ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm()); + + // Encode Rm. + unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); + + // Encode the shift opcode. + unsigned SBits = 0; + unsigned Rs = MO1.getReg(); + if (Rs) { + // Set shift operand (bit[7:4]). + // LSL - 0001 + // LSR - 0011 + // ASR - 0101 + // ROR - 0111 + switch (SOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::lsl: SBits = 0x1; break; + case ARM_AM::lsr: SBits = 0x3; break; + case ARM_AM::asr: SBits = 0x5; break; + case ARM_AM::ror: SBits = 0x7; break; + } + } + + Binary |= SBits << 4; + + // Encode the shift operation Rs. + // Encode Rs bit[11:8]. + assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0); + return Binary | (CTX.getRegisterInfo()->getEncodingValue(Rs) << ARMII::RegRsShift); +} + +unsigned ARMMCCodeEmitter:: +getSORegImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Sub-operands are [reg, imm]. The first register is Rm, the reg to be + // shifted. The second is the amount to shift by. + // + // {3-0} = Rm. + // {4} = 0 + // {6-5} = type + // {11-7} = imm + + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm()); + + // Encode Rm. + unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); + + // Encode the shift opcode. + unsigned SBits = 0; + + // Set shift operand (bit[6:4]). + // LSL - 000 + // LSR - 010 + // ASR - 100 + // ROR - 110 + // RRX - 110 and bit[11:8] clear. + switch (SOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::lsl: SBits = 0x0; break; + case ARM_AM::lsr: SBits = 0x2; break; + case ARM_AM::asr: SBits = 0x4; break; + case ARM_AM::ror: SBits = 0x6; break; + case ARM_AM::rrx: + Binary |= 0x60; + return Binary; + } + + // Encode shift_imm bit[11:7]. + Binary |= SBits << 4; + unsigned Offset = ARM_AM::getSORegOffset(MO1.getImm()); + assert(Offset < 32 && "Offset must be in range 0-31!"); + return Binary | (Offset << 7); +} + + +unsigned ARMMCCodeEmitter:: +getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO1 = MI.getOperand(OpNum); + const MCOperand &MO2 = MI.getOperand(OpNum+1); + const MCOperand &MO3 = MI.getOperand(OpNum+2); + + // Encoded as [Rn, Rm, imm]. + // FIXME: Needs fixup support. 
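+  // The packed result below is Rn << 6 | Rm << 2 | imm, with imm the 2-bit
+  // left-shift amount (0-3).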
+ unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg()); + Value <<= 4; + Value |= CTX.getRegisterInfo()->getEncodingValue(MO2.getReg()); + Value <<= 2; + Value |= MO3.getImm(); + + return Value; +} + +unsigned ARMMCCodeEmitter:: +getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO1 = MI.getOperand(OpNum); + const MCOperand &MO2 = MI.getOperand(OpNum+1); + + // FIXME: Needs fixup support. + unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg()); + + // Even though the immediate is 8 bits long, we need 9 bits in order + // to represent the (inverse of the) sign bit. + Value <<= 9; + int32_t tmp = (int32_t)MO2.getImm(); + if (tmp < 0) + tmp = abs(tmp); + else + Value |= 256; // Set the ADD bit + Value |= tmp & 255; + return Value; +} + +unsigned ARMMCCodeEmitter:: +getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO1 = MI.getOperand(OpNum); + + // FIXME: Needs fixup support. + unsigned Value = 0; + int32_t tmp = (int32_t)MO1.getImm(); + if (tmp < 0) + tmp = abs(tmp); + else + Value |= 256; // Set the ADD bit + Value |= tmp & 255; + return Value; +} + +unsigned ARMMCCodeEmitter:: +getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO1 = MI.getOperand(OpNum); + + // FIXME: Needs fixup support. + unsigned Value = 0; + int32_t tmp = (int32_t)MO1.getImm(); + if (tmp < 0) + tmp = abs(tmp); + else + Value |= 4096; // Set the ADD bit + Value |= tmp & 4095; + return Value; +} + +unsigned ARMMCCodeEmitter:: +getT2SORegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Sub-operands are [reg, imm]. The first register is Rm, the reg to be + // shifted. The second is the amount to shift by. + // + // {3-0} = Rm. + // {4} = 0 + // {6-5} = type + // {11-7} = imm + + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm()); + + // Encode Rm. + unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); + + // Encode the shift opcode. + unsigned SBits = 0; + // Set shift operand (bit[6:4]). + // LSL - 000 + // LSR - 010 + // ASR - 100 + // ROR - 110 + switch (SOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::lsl: SBits = 0x0; break; + case ARM_AM::lsr: SBits = 0x2; break; + case ARM_AM::asr: SBits = 0x4; break; + case ARM_AM::rrx: // FALLTHROUGH + case ARM_AM::ror: SBits = 0x6; break; + } + + Binary |= SBits << 4; + if (SOpc == ARM_AM::rrx) + return Binary; + + // Encode shift_imm bit[11:7]. + return Binary | ARM_AM::getSORegOffset(MO1.getImm()) << 7; +} + +unsigned ARMMCCodeEmitter:: +getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // 10 bits. lower 5 bits are are the lsb of the mask, high five bits are the + // msb of the mask. 
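+  // E.g. (illustrative): clearing a 4-bit field at bit 8 uses the inverted
+  // mask ~0x00000F00; v recovers 0xF00, so lsb = 8, msb = 11, and the
+  // returned value is (11 << 5) | 8.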
+ const MCOperand &MO = MI.getOperand(Op); + uint32_t v = ~MO.getImm(); + uint32_t lsb = countTrailingZeros(v); + uint32_t msb = (32 - countLeadingZeros (v)) - 1; + assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!"); + return lsb | (msb << 5); +} + +unsigned ARMMCCodeEmitter:: +getRegisterListOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // VLDM/VSTM: + // {12-8} = Vd + // {7-0} = Number of registers + // + // LDM/STM: + // {15-0} = Bitfield of GPRs. + unsigned Reg = MI.getOperand(Op).getReg(); + bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg); + bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg); + + unsigned Binary = 0; + + if (SPRRegs || DPRRegs) { + // VLDM/VSTM + unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg); + unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff; + Binary |= (RegNo & 0x1f) << 8; + if (SPRRegs) + Binary |= NumRegs; + else + Binary |= NumRegs * 2; + } else { + for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) { + unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(MI.getOperand(I).getReg()); + Binary |= 1 << RegNo; + } + } + + return Binary; +} + +/// getAddrMode6AddressOpValue - Encode an addrmode6 register number along +/// with the alignment operand. +unsigned ARMMCCodeEmitter:: +getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &Reg = MI.getOperand(Op); + const MCOperand &Imm = MI.getOperand(Op + 1); + + unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg()); + unsigned Align = 0; + + switch (Imm.getImm()) { + default: break; + case 2: + case 4: + case 8: Align = 0x01; break; + case 16: Align = 0x02; break; + case 32: Align = 0x03; break; + } + + return RegNo | (Align << 4); +} + +/// getAddrMode6OneLane32AddressOpValue - Encode an addrmode6 register number +/// along with the alignment operand for use in VST1 and VLD1 with size 32. +unsigned ARMMCCodeEmitter:: +getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &Reg = MI.getOperand(Op); + const MCOperand &Imm = MI.getOperand(Op + 1); + + unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg()); + unsigned Align = 0; + + switch (Imm.getImm()) { + default: break; + case 8: + case 16: + case 32: // Default '0' value for invalid alignments of 8, 16, 32 bytes. + case 2: Align = 0x00; break; + case 4: Align = 0x03; break; + } + + return RegNo | (Align << 4); +} + + +/// getAddrMode6DupAddressOpValue - Encode an addrmode6 register number and +/// alignment operand for use in VLD-dup instructions. This is the same as +/// getAddrMode6AddressOpValue except for the alignment encoding, which is +/// different for VLD4-dup. 
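+/// (Relative to getAddrMode6AddressOpValue above: a 16-byte alignment
+/// encodes as 0x03 here instead of 0x02, and there is no 32-byte case.)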
+unsigned ARMMCCodeEmitter:: +getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &Reg = MI.getOperand(Op); + const MCOperand &Imm = MI.getOperand(Op + 1); + + unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg()); + unsigned Align = 0; + + switch (Imm.getImm()) { + default: break; + case 2: + case 4: + case 8: Align = 0x01; break; + case 16: Align = 0x03; break; + } + + return RegNo | (Align << 4); +} + +unsigned ARMMCCodeEmitter:: +getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(Op); + if (MO.getReg() == 0) return 0x0D; + return CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight8Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 8 - MI.getOperand(Op).getImm(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight16Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 16 - MI.getOperand(Op).getImm(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight32Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 32 - MI.getOperand(Op).getImm(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight64Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 64 - MI.getOperand(Op).getImm(); +} + +void ARMMCCodeEmitter:: +encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Pseudo instructions don't get encoded. + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + if ((TSFlags & ARMII::FormMask) == ARMII::Pseudo) + return; + + int Size; + if (Desc.getSize() == 2 || Desc.getSize() == 4) + Size = Desc.getSize(); + else + llvm_unreachable("Unexpected instruction size!"); + + uint32_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); + // Thumb 32-bit wide instructions need to emit the high order halfword + // first. + if (isThumb(STI) && Size == 4) { + EmitConstant(Binary >> 16, 2, OS); + EmitConstant(Binary & 0xffff, 2, OS); + } else + EmitConstant(Binary, Size, OS); + ++MCNumEmitted; // Keep track of the # of mi's emitted. +} + +#include "ARMGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp new file mode 100644 index 0000000..2063ca6 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp @@ -0,0 +1,41 @@ +//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "ARMMCExpr.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" +using namespace llvm; + +#define DEBUG_TYPE "armmcexpr" + +const ARMMCExpr* +ARMMCExpr::create(VariantKind Kind, const MCExpr *Expr, + MCContext &Ctx) { + return new (Ctx) ARMMCExpr(Kind, Expr); +} + +void ARMMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { + switch (Kind) { + default: llvm_unreachable("Invalid kind!"); + case VK_ARM_HI16: OS << ":upper16:"; break; + case VK_ARM_LO16: OS << ":lower16:"; break; + } + + const MCExpr *Expr = getSubExpr(); + if (Expr->getKind() != MCExpr::SymbolRef) + OS << '('; + Expr->print(OS, MAI); + if (Expr->getKind() != MCExpr::SymbolRef) + OS << ')'; +} + +void ARMMCExpr::visitUsedExpr(MCStreamer &Streamer) const { + Streamer.visitUsedExpr(*getSubExpr()); +} diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h new file mode 100644 index 0000000..75dde80 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h @@ -0,0 +1,79 @@ +//===-- ARMMCExpr.h - ARM specific MC expression classes --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCEXPR_H +#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCEXPR_H + +#include "llvm/MC/MCExpr.h" + +namespace llvm { + +class ARMMCExpr : public MCTargetExpr { +public: + enum VariantKind { + VK_ARM_None, + VK_ARM_HI16, // The R_ARM_MOVT_ABS relocation (:upper16: in the .s file) + VK_ARM_LO16 // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file) + }; + +private: + const VariantKind Kind; + const MCExpr *Expr; + + explicit ARMMCExpr(VariantKind Kind, const MCExpr *Expr) + : Kind(Kind), Expr(Expr) {} + +public: + /// @name Construction + /// @{ + + static const ARMMCExpr *create(VariantKind Kind, const MCExpr *Expr, + MCContext &Ctx); + + static const ARMMCExpr *createUpper16(const MCExpr *Expr, MCContext &Ctx) { + return create(VK_ARM_HI16, Expr, Ctx); + } + + static const ARMMCExpr *createLower16(const MCExpr *Expr, MCContext &Ctx) { + return create(VK_ARM_LO16, Expr, Ctx); + } + + /// @} + /// @name Accessors + /// @{ + + /// getOpcode - Get the kind of this expression. + VariantKind getKind() const { return Kind; } + + /// getSubExpr - Get the child of this expression. + const MCExpr *getSubExpr() const { return Expr; } + + /// @} + + void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; + bool evaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout, + const MCFixup *Fixup) const override { + return false; + } + void visitUsedExpr(MCStreamer &Streamer) const override; + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); + } + + // There are no TLS ARMMCExprs at the moment. 
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {} + + static bool classof(const MCExpr *E) { + return E->getKind() == MCExpr::Target; + } +}; +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp new file mode 100644 index 0000000..8c8c249 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -0,0 +1,344 @@ +//===-- ARMMCTargetDesc.cpp - ARM Target Descriptions ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides ARM specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "ARMBaseInfo.h" +#include "ARMMCAsmInfo.h" +#include "ARMMCTargetDesc.h" +#include "InstPrinter/ARMInstPrinter.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetParser.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define GET_REGINFO_MC_DESC +#include "ARMGenRegisterInfo.inc" + +static bool getMCRDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, + std::string &Info) { + if (STI.getFeatureBits()[llvm::ARM::HasV7Ops] && + (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 15) && + (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) && + // Checks for the deprecated CP15ISB encoding: + // mcr p15, #0, rX, c7, c5, #4 + (MI.getOperand(3).isImm() && MI.getOperand(3).getImm() == 7)) { + if ((MI.getOperand(5).isImm() && MI.getOperand(5).getImm() == 4)) { + if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 5) { + Info = "deprecated since v7, use 'isb'"; + return true; + } + + // Checks for the deprecated CP15DSB encoding: + // mcr p15, #0, rX, c7, c10, #4 + if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 10) { + Info = "deprecated since v7, use 'dsb'"; + return true; + } + } + // Checks for the deprecated CP15DMB encoding: + // mcr p15, #0, rX, c7, c10, #5 + if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 10 && + (MI.getOperand(5).isImm() && MI.getOperand(5).getImm() == 5)) { + Info = "deprecated since v7, use 'dmb'"; + return true; + } + } + return false; +} + +static bool getITDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, + std::string &Info) { + if (STI.getFeatureBits()[llvm::ARM::HasV8Ops] && MI.getOperand(1).isImm() && + MI.getOperand(1).getImm() != 8) { + Info = "applying IT instruction to more than one subsequent instruction is " + "deprecated"; + return true; + } + + return false; +} + +static bool getARMStoreDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, + std::string &Info) { + assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] && + "cannot predicate thumb instructions"); + + assert(MI.getNumOperands() >= 4 && "expected >= 4 arguments"); + for (unsigned OI = 4, OE = MI.getNumOperands(); OI < OE; ++OI) { + assert(MI.getOperand(OI).isReg() && "expected register"); + if (MI.getOperand(OI).getReg() == ARM::SP || + MI.getOperand(OI).getReg() == 
ARM::PC) { + Info = "use of SP or PC in the list is deprecated"; + return true; + } + } + return false; +} + +static bool getARMLoadDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, + std::string &Info) { + assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] && + "cannot predicate thumb instructions"); + + assert(MI.getNumOperands() >= 4 && "expected >= 4 arguments"); + bool ListContainsPC = false, ListContainsLR = false; + for (unsigned OI = 4, OE = MI.getNumOperands(); OI < OE; ++OI) { + assert(MI.getOperand(OI).isReg() && "expected register"); + switch (MI.getOperand(OI).getReg()) { + default: + break; + case ARM::LR: + ListContainsLR = true; + break; + case ARM::PC: + ListContainsPC = true; + break; + case ARM::SP: + Info = "use of SP in the list is deprecated"; + return true; + } + } + + if (ListContainsPC && ListContainsLR) { + Info = "use of LR and PC simultaneously in the list is deprecated"; + return true; + } + + return false; +} + +#define GET_INSTRINFO_MC_DESC +#include "ARMGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "ARMGenSubtargetInfo.inc" + +std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) { + bool isThumb = + TT.getArch() == Triple::thumb || TT.getArch() == Triple::thumbeb; + + std::string ARMArchFeature; + + unsigned ArchID = ARM::parseArch(TT.getArchName()); + if (ArchID != ARM::AK_INVALID && (CPU.empty() || CPU == "generic")) + ARMArchFeature = (ARMArchFeature + "+" + ARM::getArchName(ArchID)).str(); + + if (isThumb) { + if (ARMArchFeature.empty()) + ARMArchFeature = "+thumb-mode"; + else + ARMArchFeature += ",+thumb-mode"; + } + + if (TT.isOSNaCl()) { + if (ARMArchFeature.empty()) + ARMArchFeature = "+nacl-trap"; + else + ARMArchFeature += ",+nacl-trap"; + } + + return ARMArchFeature; +} + +MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT, + StringRef CPU, StringRef FS) { + std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU); + if (!FS.empty()) { + if (!ArchFS.empty()) + ArchFS = (Twine(ArchFS) + "," + FS).str(); + else + ArchFS = FS; + } + + return createARMMCSubtargetInfoImpl(TT, CPU, ArchFS); +} + +static MCInstrInfo *createARMMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitARMMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitARMMCRegisterInfo(X, ARM::LR, 0, 0, ARM::PC); + return X; +} + +static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, + const Triple &TheTriple) { + MCAsmInfo *MAI; + if (TheTriple.isOSDarwin() || TheTriple.isOSBinFormatMachO()) + MAI = new ARMMCAsmInfoDarwin(TheTriple); + else if (TheTriple.isWindowsMSVCEnvironment()) + MAI = new ARMCOFFMCAsmInfoMicrosoft(); + else if (TheTriple.isOSWindows()) + MAI = new ARMCOFFMCAsmInfoGNU(); + else + MAI = new ARMELFMCAsmInfo(TheTriple); + + unsigned Reg = MRI.getDwarfRegNum(ARM::SP, true); + MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(nullptr, Reg, 0)); + + return MAI; +} + +static MCCodeGenInfo *createARMMCCodeGenInfo(const Triple &TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + if (RM == Reloc::Default) { + // Default relocation model on Darwin is PIC, not DynamicNoPIC. + RM = TT.isOSDarwin() ? 
Reloc::PIC_ : Reloc::DynamicNoPIC; + } + X->initMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx, + MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { + return createARMELFStreamer(Ctx, MAB, OS, Emitter, false, + T.getArch() == Triple::thumb); +} + +static MCStreamer *createARMMachOStreamer(MCContext &Ctx, MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool DWARFMustBeAtTheEnd) { + return createMachOStreamer(Ctx, MAB, OS, Emitter, false, DWARFMustBeAtTheEnd); +} + +static MCInstPrinter *createARMMCInstPrinter(const Triple &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + if (SyntaxVariant == 0) + return new ARMInstPrinter(MAI, MII, MRI); + return nullptr; +} + +static MCRelocationInfo *createARMMCRelocationInfo(const Triple &TT, + MCContext &Ctx) { + if (TT.isOSBinFormatMachO()) + return createARMMachORelocationInfo(Ctx); + // Default to the stock relocation info. + return llvm::createMCRelocationInfo(TT, Ctx); +} + +namespace { + +class ARMMCInstrAnalysis : public MCInstrAnalysis { +public: + ARMMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {} + + bool isUnconditionalBranch(const MCInst &Inst) const override { + // BCCs with the "always" predicate are unconditional branches. + if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL) + return true; + return MCInstrAnalysis::isUnconditionalBranch(Inst); + } + + bool isConditionalBranch(const MCInst &Inst) const override { + // BCCs with the "always" predicate are unconditional branches. + if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL) + return false; + return MCInstrAnalysis::isConditionalBranch(Inst); + } + + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, + uint64_t Size, uint64_t &Target) const override { + // We only handle PCRel branches for now. + if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL) + return false; + + int64_t Imm = Inst.getOperand(0).getImm(); + // FIXME: This is not right for thumb. + Target = Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes. + return true; + } +}; + +} + +static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) { + return new ARMMCInstrAnalysis(Info); +} + +// Force static initialization. +extern "C" void LLVMInitializeARMTargetMC() { + for (Target *T : {&TheARMLETarget, &TheARMBETarget, &TheThumbLETarget, + &TheThumbBETarget}) { + // Register the MC asm info. + RegisterMCAsmInfoFn X(*T, createARMMCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(*T, createARMMCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createARMMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createARMMCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, + ARM_MC::createARMMCSubtargetInfo); + + // Register the MC instruction analyzer. + TargetRegistry::RegisterMCInstrAnalysis(*T, createARMMCInstrAnalysis); + + TargetRegistry::RegisterELFStreamer(*T, createELFStreamer); + TargetRegistry::RegisterCOFFStreamer(*T, createARMWinCOFFStreamer); + TargetRegistry::RegisterMachOStreamer(*T, createARMMachOStreamer); + + // Register the obj target streamer. 
+    TargetRegistry::RegisterObjectTargetStreamer(*T,
+                                                 createARMObjectTargetStreamer);
+
+    // Register the asm streamer.
+    TargetRegistry::RegisterAsmTargetStreamer(*T, createARMTargetAsmStreamer);
+
+    // Register the null TargetStreamer.
+    TargetRegistry::RegisterNullTargetStreamer(*T, createARMNullTargetStreamer);
+
+    // Register the MCInstPrinter.
+    TargetRegistry::RegisterMCInstPrinter(*T, createARMMCInstPrinter);
+
+    // Register the MC relocation info.
+    TargetRegistry::RegisterMCRelocationInfo(*T, createARMMCRelocationInfo);
+  }
+
+  // Register the MC code emitter.
+  for (Target *T : {&TheARMLETarget, &TheThumbLETarget})
+    TargetRegistry::RegisterMCCodeEmitter(*T, createARMLEMCCodeEmitter);
+  for (Target *T : {&TheARMBETarget, &TheThumbBETarget})
+    TargetRegistry::RegisterMCCodeEmitter(*T, createARMBEMCCodeEmitter);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheARMLETarget, createARMLEAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheARMBETarget, createARMBEAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheThumbLETarget,
+                                       createThumbLEAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheThumbBETarget,
+                                       createThumbBEAsmBackend);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
new file mode 100644
index 0000000..c2bbc8e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -0,0 +1,123 @@
+//===-- ARMMCTargetDesc.h - ARM Target Descriptions -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCTARGETDESC_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class formatted_raw_ostream;
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCInstPrinter;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCStreamer;
+class MCRelocationInfo;
+class MCTargetStreamer;
+class StringRef;
+class Target;
+class Triple;
+class raw_ostream;
+class raw_pwrite_stream;
+
+extern Target TheARMLETarget, TheThumbLETarget;
+extern Target TheARMBETarget, TheThumbBETarget;
+
+namespace ARM_MC {
+std::string ParseARMTriple(const Triple &TT, StringRef CPU);
+
+/// Create an ARM MCSubtargetInfo instance. This is exposed so Asm parser, etc.
+/// do not need to go through TargetRegistry.
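+/// For example (illustrative only), an assembler front end could build a
+/// Cortex-A9 subtarget directly:
+///   createARMMCSubtargetInfo(Triple("armv7--linux-gnueabi"), "cortex-a9", "");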
+MCSubtargetInfo *createARMMCSubtargetInfo(const Triple &TT, StringRef CPU,
+                                          StringRef FS);
+}
+
+MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S);
+MCTargetStreamer *createARMTargetAsmStreamer(MCStreamer &S,
+                                             formatted_raw_ostream &OS,
+                                             MCInstPrinter *InstPrint,
+                                             bool isVerboseAsm);
+MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S,
+                                                const MCSubtargetInfo &STI);
+
+MCCodeEmitter *createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        MCContext &Ctx);
+
+MCCodeEmitter *createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        MCContext &Ctx);
+
+MCAsmBackend *createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  const Triple &TT, StringRef CPU,
+                                  bool IsLittleEndian);
+
+MCAsmBackend *createARMLEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                    const Triple &TT, StringRef CPU);
+
+MCAsmBackend *createARMBEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                    const Triple &TT, StringRef CPU);
+
+MCAsmBackend *createThumbLEAsmBackend(const Target &T,
+                                      const MCRegisterInfo &MRI,
+                                      const Triple &TT, StringRef CPU);
+
+MCAsmBackend *createThumbBEAsmBackend(const Target &T,
+                                      const MCRegisterInfo &MRI,
+                                      const Triple &TT, StringRef CPU);
+
+// Construct a PE/COFF machine code streamer which will generate a PE/COFF
+// object file.
+MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     raw_pwrite_stream &OS,
+                                     MCCodeEmitter *Emitter, bool RelaxAll,
+                                     bool IncrementalLinkerCompatible);
+
+/// Construct an ARM ELF object writer.
+MCObjectWriter *createARMELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
+                                         bool IsLittleEndian);
+
+/// Construct an ARM Mach-O object writer.
+MCObjectWriter *createARMMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
+
+/// Construct an ARM PE/COFF object writer.
+MCObjectWriter *createARMWinCOFFObjectWriter(raw_pwrite_stream &OS,
+                                             bool Is64Bit);
+
+/// Construct ARM Mach-O relocation info.
+MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx);
+} // End llvm namespace
+
+// Defines symbolic names for ARM registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "ARMGenRegisterInfo.inc"
+
+// Defines symbolic names for the ARM instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "ARMGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "ARMGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
new file mode 100644
index 0000000..4468132
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -0,0 +1,43 @@
+//===-- ARMMachORelocationInfo.cpp ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/ARMMCTargetDesc.h" +#include "ARMMCExpr.h" +#include "llvm-c/Disassembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCRelocationInfo.h" + +using namespace llvm; +using namespace object; + +namespace { +class ARMMachORelocationInfo : public MCRelocationInfo { +public: + ARMMachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {} + + const MCExpr *createExprForCAPIVariantKind(const MCExpr *SubExpr, + unsigned VariantKind) override { + switch(VariantKind) { + case LLVMDisassembler_VariantKind_ARM_HI16: + return ARMMCExpr::createUpper16(SubExpr, Ctx); + case LLVMDisassembler_VariantKind_ARM_LO16: + return ARMMCExpr::createLower16(SubExpr, Ctx); + default: + return MCRelocationInfo::createExprForCAPIVariantKind(SubExpr, + VariantKind); + } + } +}; +} // End unnamed namespace + +/// createARMMachORelocationInfo - Construct an ARM Mach-O RelocationInfo. +MCRelocationInfo *llvm::createARMMachORelocationInfo(MCContext &Ctx) { + return new ARMMachORelocationInfo(Ctx); +} diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp new file mode 100644 index 0000000..cfd504e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -0,0 +1,483 @@ +//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/ARMMCTargetDesc.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMFixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" +using namespace llvm; + +namespace { +class ARMMachObjectWriter : public MCMachObjectTargetWriter { + void RecordARMScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + unsigned Type, + unsigned Log2Size, + uint64_t &FixedValue); + void RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue); + + bool requiresExternRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCFragment &Fragment, unsigned RelocType, + const MCSymbol &S, uint64_t FixedValue); + +public: + ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) + : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {} + + void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm, + const MCAsmLayout &Layout, const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) override; +}; +} + +static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType, + unsigned &Log2Size) { + RelocType = unsigned(MachO::ARM_RELOC_VANILLA); + Log2Size = 
~0U; + + switch (Kind) { + default: + return false; + + case FK_Data_1: + Log2Size = llvm::Log2_32(1); + return true; + case FK_Data_2: + Log2Size = llvm::Log2_32(2); + return true; + case FK_Data_4: + Log2Size = llvm::Log2_32(4); + return true; + case FK_Data_8: + Log2Size = llvm::Log2_32(8); + return true; + + // These fixups are expected to always be resolvable at assembly time and + // have no relocations supported. + case ARM::fixup_arm_ldst_pcrel_12: + case ARM::fixup_arm_pcrel_10: + case ARM::fixup_arm_adr_pcrel_12: + case ARM::fixup_arm_thumb_br: + return false; + + // Handle 24-bit branch kinds. + case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: + case ARM::fixup_arm_uncondbl: + case ARM::fixup_arm_condbl: + case ARM::fixup_arm_blx: + RelocType = unsigned(MachO::ARM_RELOC_BR24); + // Report as 'long', even though that is not quite accurate. + Log2Size = llvm::Log2_32(4); + return true; + + case ARM::fixup_t2_uncondbranch: + case ARM::fixup_arm_thumb_bl: + case ARM::fixup_arm_thumb_blx: + RelocType = unsigned(MachO::ARM_THUMB_RELOC_BR22); + Log2Size = llvm::Log2_32(4); + return true; + + // For movw/movt r_type relocations they always have a pair following them and + // the r_length bits are used differently. The encoding of the r_length is as + // follows: + // low bit of r_length: + // 0 - :lower16: for movw instructions + // 1 - :upper16: for movt instructions + // high bit of r_length: + // 0 - arm instructions + // 1 - thumb instructions + case ARM::fixup_arm_movt_hi16: + RelocType = unsigned(MachO::ARM_RELOC_HALF); + Log2Size = 1; + return true; + case ARM::fixup_t2_movt_hi16: + RelocType = unsigned(MachO::ARM_RELOC_HALF); + Log2Size = 3; + return true; + + case ARM::fixup_arm_movw_lo16: + RelocType = unsigned(MachO::ARM_RELOC_HALF); + Log2Size = 0; + return true; + case ARM::fixup_t2_movw_lo16: + RelocType = unsigned(MachO::ARM_RELOC_HALF); + Log2Size = 2; + return true; + } +} + +void ARMMachObjectWriter:: +RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned Type = MachO::ARM_RELOC_HALF; + + // See <reloc.h>. + const MCSymbol *A = &Target.getSymA()->getSymbol(); + + if (!A->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), + "symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + return; + } + + uint32_t Value = Writer->getSymbolAddress(*A, Layout); + uint32_t Value2 = 0; + uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); + FixedValue += SecAddr; + + if (const MCSymbolRefExpr *B = Target.getSymB()) { + const MCSymbol *SB = &B->getSymbol(); + + if (!SB->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), + "symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + return; + } + + // Select the appropriate difference relocation type. + Type = MachO::ARM_RELOC_HALF_SECTDIFF; + Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout); + FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent()); + } + + // Relocations are written out in reverse order, so the PAIR comes first. 
+ // ARM_RELOC_HALF and ARM_RELOC_HALF_SECTDIFF abuse the r_length field: + // + // For these two r_type relocations they always have a pair following them and + // the r_length bits are used differently. The encoding of the r_length is as + // follows: + // low bit of r_length: + // 0 - :lower16: for movw instructions + // 1 - :upper16: for movt instructions + // high bit of r_length: + // 0 - arm instructions + // 1 - thumb instructions + // the other half of the relocated expression is in the following pair + // relocation entry in the low 16 bits of r_address field. + unsigned ThumbBit = 0; + unsigned MovtBit = 0; + switch ((unsigned)Fixup.getKind()) { + default: break; + case ARM::fixup_arm_movt_hi16: + MovtBit = 1; + // The thumb bit shouldn't be set in the 'other-half' bit of the + // relocation, but it will be set in FixedValue if the base symbol + // is a thumb function. Clear it out here. + if (Asm.isThumbFunc(A)) + FixedValue &= 0xfffffffe; + break; + case ARM::fixup_t2_movt_hi16: + if (Asm.isThumbFunc(A)) + FixedValue &= 0xfffffffe; + MovtBit = 1; + // Fallthrough + case ARM::fixup_t2_movw_lo16: + ThumbBit = 1; + break; + } + + if (Type == MachO::ARM_RELOC_HALF_SECTDIFF) { + uint32_t OtherHalf = MovtBit + ? (FixedValue & 0xffff) : ((FixedValue & 0xffff0000) >> 16); + + MachO::any_relocation_info MRE; + MRE.r_word0 = ((OtherHalf << 0) | + (MachO::ARM_RELOC_PAIR << 24) | + (MovtBit << 28) | + (ThumbBit << 29) | + (IsPCRel << 30) | + MachO::R_SCATTERED); + MRE.r_word1 = Value2; + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); + } + + MachO::any_relocation_info MRE; + MRE.r_word0 = ((FixupOffset << 0) | + (Type << 24) | + (MovtBit << 28) | + (ThumbBit << 29) | + (IsPCRel << 30) | + MachO::R_SCATTERED); + MRE.r_word1 = Value; + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); +} + +void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + unsigned Type, + unsigned Log2Size, + uint64_t &FixedValue) { + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + + // See <reloc.h>. + const MCSymbol *A = &Target.getSymA()->getSymbol(); + + if (!A->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), + "symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + return; + } + + uint32_t Value = Writer->getSymbolAddress(*A, Layout); + uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); + FixedValue += SecAddr; + uint32_t Value2 = 0; + + if (const MCSymbolRefExpr *B = Target.getSymB()) { + assert(Type == MachO::ARM_RELOC_VANILLA && "invalid reloc for 2 symbols"); + const MCSymbol *SB = &B->getSymbol(); + + if (!SB->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), + "symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + return; + } + + // Select the appropriate difference relocation type. + Type = MachO::ARM_RELOC_SECTDIFF; + Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout); + FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent()); + } + + // Relocations are written out in reverse order, so the PAIR comes first. 
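+  // For example, with a fixup on "_a - _b" (symbol names illustrative), the
+  // PAIR entry carrying _b's address (Value2) is emitted before the SECTDIFF
+  // entry carrying _a's address (Value), and the linker consumes the two
+  // entries as a single unit.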
+ if (Type == MachO::ARM_RELOC_SECTDIFF || + Type == MachO::ARM_RELOC_LOCAL_SECTDIFF) { + MachO::any_relocation_info MRE; + MRE.r_word0 = ((0 << 0) | + (MachO::ARM_RELOC_PAIR << 24) | + (Log2Size << 28) | + (IsPCRel << 30) | + MachO::R_SCATTERED); + MRE.r_word1 = Value2; + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); + } + + MachO::any_relocation_info MRE; + MRE.r_word0 = ((FixupOffset << 0) | + (Type << 24) | + (Log2Size << 28) | + (IsPCRel << 30) | + MachO::R_SCATTERED); + MRE.r_word1 = Value; + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); +} + +bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCFragment &Fragment, + unsigned RelocType, + const MCSymbol &S, + uint64_t FixedValue) { + // Most cases can be identified purely from the symbol. + if (Writer->doesSymbolRequireExternRelocation(S)) + return true; + int64_t Value = (int64_t)FixedValue; // The displacement is signed. + int64_t Range; + switch (RelocType) { + default: + return false; + case MachO::ARM_RELOC_BR24: + // PC pre-adjustment of 8 for these instructions. + Value -= 8; + // ARM BL/BLX has a 25-bit offset. + Range = 0x1ffffff; + break; + case MachO::ARM_THUMB_RELOC_BR22: + // PC pre-adjustment of 4 for these instructions. + Value -= 4; + // Thumb BL/BLX has a 24-bit offset. + Range = 0xffffff; + } + // BL/BLX also use external relocations when an internal relocation + // would result in the target being out of range. This gives the linker + // enough information to generate a branch island. + Value += Writer->getSectionAddress(&S.getSection()); + Value -= Writer->getSectionAddress(Fragment.getParent()); + // If the resultant value would be out of range for an internal relocation, + // use an external instead. + if (Value > Range || Value < -(Range + 1)) + return true; + return false; +} + +void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, + MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned Log2Size; + unsigned RelocType = MachO::ARM_RELOC_VANILLA; + if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) { + // If we failed to get fixup kind info, it's because there's no legal + // relocation type for the fixup kind. This happens when it's a fixup that's + // expected to always be resolvable at assembly time and not have any + // relocations needed. + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation on symbol"); + return; + } + + // If this is a difference or a defined symbol plus an offset, then we need a + // scattered relocation entry. Differences always require scattered + // relocations. + if (Target.getSymB()) { + if (RelocType == MachO::ARM_RELOC_HALF) + return RecordARMScatteredHalfRelocation(Writer, Asm, Layout, Fragment, + Fixup, Target, FixedValue); + return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, RelocType, Log2Size, + FixedValue); + } + + // Get the symbol data, if any. + const MCSymbol *A = nullptr; + if (Target.getSymA()) + A = &Target.getSymA()->getSymbol(); + + // FIXME: For other platforms, we need to use scattered relocations for + // internal relocations with offsets. If this is an internal relocation with + // an offset, it also needs a scattered relocation entry. + // + // Is this right for ARM? 
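+  //
+  // Illustrative case: a fixup on "sym + 8", where "sym" is defined in this
+  // object, yields Offset == 8 below and takes the scattered path, so the
+  // linker still sees the addend separately from the symbol's address.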
+ uint32_t Offset = Target.getConstant(); + if (IsPCRel && RelocType == MachO::ARM_RELOC_VANILLA) + Offset += 1 << Log2Size; + if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A)) + return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, RelocType, Log2Size, + FixedValue); + + // See <reloc.h>. + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned Index = 0; + unsigned Type = 0; + const MCSymbol *RelSymbol = nullptr; + + if (Target.isAbsolute()) { // constant + // FIXME! + report_fatal_error("FIXME: relocations to absolute targets " + "not yet implemented"); + } else { + // Resolve constant variables. + if (A->isVariable()) { + int64_t Res; + if (A->getVariableValue()->evaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + } + + // Check whether we need an external or internal relocation. + if (requiresExternRelocation(Writer, Asm, *Fragment, RelocType, *A, + FixedValue)) { + RelSymbol = A; + + // For external relocations, make sure to offset the fixup value to + // compensate for the addend of the symbol address, if it was + // undefined. This occurs with weak definitions, for example. + if (!A->isUndefined()) + FixedValue -= Layout.getSymbolOffset(*A); + } else { + // The index is the section ordinal (1-based). + const MCSection &Sec = A->getSection(); + Index = Sec.getOrdinal() + 1; + FixedValue += Writer->getSectionAddress(&Sec); + } + if (IsPCRel) + FixedValue -= Writer->getSectionAddress(Fragment->getParent()); + + // The type is determined by the fixup kind. + Type = RelocType; + } + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + + // Even when it's not a scattered relocation, movw/movt always uses + // a PAIR relocation. + if (Type == MachO::ARM_RELOC_HALF) { + // The other-half value only gets populated for the movt and movw + // relocation entries. + uint32_t Value = 0; + switch ((unsigned)Fixup.getKind()) { + default: break; + case ARM::fixup_arm_movw_lo16: + case ARM::fixup_t2_movw_lo16: + Value = (FixedValue >> 16) & 0xffff; + break; + case ARM::fixup_arm_movt_hi16: + case ARM::fixup_t2_movt_hi16: + Value = FixedValue & 0xffff; + break; + } + MachO::any_relocation_info MREPair; + MREPair.r_word0 = Value; + MREPair.r_word1 = ((0xffffff << 0) | + (Log2Size << 25) | + (MachO::ARM_RELOC_PAIR << 28)); + + Writer->addRelocation(nullptr, Fragment->getParent(), MREPair); + } + + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); +} + +MCObjectWriter *llvm::createARMMachObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter(new ARMMachObjectWriter(Is64Bit, + CPUType, + CPUSubtype), + OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp new file mode 100644 index 0000000..dad50f2 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -0,0 +1,74 @@ +//===- ARMTargetStreamer.cpp - ARMTargetStreamer class --*- C++ -*---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMTargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/MapVector.h"
+#include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+//
+// ARMTargetStreamer Implementation
+//
+ARMTargetStreamer::ARMTargetStreamer(MCStreamer &S)
+    : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {}
+
+ARMTargetStreamer::~ARMTargetStreamer() {}
+
+// The constant pool handling is shared by all ARMTargetStreamer
+// implementations.
+const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr, SMLoc Loc) {
+  return ConstantPools->addEntry(Streamer, Expr, 4, Loc);
+}
+
+void ARMTargetStreamer::emitCurrentConstantPool() {
+  ConstantPools->emitForCurrentSection(Streamer);
+}
+
+// finish() - write out any non-empty assembler constant pools.
+void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
+
+// The remaining callbacks should be handled separately by each
+// streamer.
+void ARMTargetStreamer::emitFnStart() {}
+void ARMTargetStreamer::emitFnEnd() {}
+void ARMTargetStreamer::emitCantUnwind() {}
+void ARMTargetStreamer::emitPersonality(const MCSymbol *Personality) {}
+void ARMTargetStreamer::emitPersonalityIndex(unsigned Index) {}
+void ARMTargetStreamer::emitHandlerData() {}
+void ARMTargetStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+                                  int64_t Offset) {}
+void ARMTargetStreamer::emitMovSP(unsigned Reg, int64_t Offset) {}
+void ARMTargetStreamer::emitPad(int64_t Offset) {}
+void ARMTargetStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                    bool isVector) {}
+void ARMTargetStreamer::emitUnwindRaw(int64_t StackOffset,
+                                      const SmallVectorImpl<uint8_t> &Opcodes) {
+}
+void ARMTargetStreamer::switchVendor(StringRef Vendor) {}
+void ARMTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {}
+void ARMTargetStreamer::emitTextAttribute(unsigned Attribute,
+                                          StringRef String) {}
+void ARMTargetStreamer::emitIntTextAttribute(unsigned Attribute,
+                                             unsigned IntValue,
+                                             StringRef StringValue) {}
+void ARMTargetStreamer::emitArch(unsigned Arch) {}
+void ARMTargetStreamer::emitArchExtension(unsigned ArchExt) {}
+void ARMTargetStreamer::emitObjectArch(unsigned Arch) {}
+void ARMTargetStreamer::emitFPU(unsigned FPU) {}
+void ARMTargetStreamer::finishAttributeSection() {}
+void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {}
+void
+ARMTargetStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) {}
+
+void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
new file mode 100644
index 0000000..173cc93
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -0,0 +1,196 @@
+//===-- ARMUnwindOpAsm.cpp - ARM Unwind Opcodes Assembler -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the unwind opcode assembler for the ARM exception
+// handling table.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMUnwindOpAsm.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+
+using namespace llvm;
+
+namespace {
+  /// UnwindOpcodeStreamer - A simple wrapper over SmallVector to emit bytes
+  /// with MSB to LSB per uint32_t ordering. For example, the first byte will
+  /// be placed in Vec[3], and the following bytes will be placed in 2, 1, 0,
+  /// 7, 6, 5, 4, 11, 10, 9, 8, and so on.
+  class UnwindOpcodeStreamer {
+  private:
+    SmallVectorImpl<uint8_t> &Vec;
+    size_t Pos;
+
+  public:
+    UnwindOpcodeStreamer(SmallVectorImpl<uint8_t> &V) : Vec(V), Pos(3) {
+    }
+
+    /// Emit the byte in MSB to LSB per uint32_t order.
+    inline void EmitByte(uint8_t elem) {
+      Vec[Pos] = elem;
+      Pos = (((Pos ^ 0x3u) + 1) ^ 0x3u);
+    }
+
+    /// Emit the size prefix.
+    inline void EmitSize(size_t Size) {
+      size_t SizeInWords = (Size + 3) / 4;
+      assert(SizeInWords <= 0x100u &&
+             "Only 256 additional words are allowed for unwind opcodes");
+      EmitByte(static_cast<uint8_t>(SizeInWords - 1));
+    }
+
+    /// Emit the personality index prefix.
+    inline void EmitPersonalityIndex(unsigned PI) {
+      assert(PI < ARM::EHABI::NUM_PERSONALITY_INDEX &&
+             "Invalid personality prefix");
+      EmitByte(ARM::EHABI::EHT_COMPACT | PI);
+    }
+
+    /// Fill the remaining bytes with the FINISH opcode.
+    inline void FillFinishOpcode() {
+      while (Pos < Vec.size())
+        EmitByte(ARM::EHABI::UNWIND_OPCODE_FINISH);
+    }
+  };
+}
+
+void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) {
+  if (RegSave == 0u)
+    return;
+
+  // One-byte opcode to save registers r14 and r11-r4.
+  if (RegSave & (1u << 4)) {
+    // The one-byte opcode will always save r4, thus we can't use the one-byte
+    // opcode when r4 is not in the .save directive.
+
+    // Compute the consecutive registers from r4 to r11.
+    uint32_t Mask = RegSave & 0xff0u;
+    uint32_t Range = countTrailingOnes(Mask >> 5); // Exclude r4.
+    // Mask off non-consecutive registers. Keep r4.
+    Mask &= ~(0xffffffe0u << Range);
+
+    // Emit this opcode only when the mask covers all of these registers.
+    uint32_t UnmaskedReg = RegSave & 0xfff0u & (~Mask);
+    if (UnmaskedReg == 0u) {
+      // Pop r[4 : (4 + n)]
+      EmitInt8(ARM::EHABI::UNWIND_OPCODE_POP_REG_RANGE_R4 | Range);
+      RegSave &= 0x000fu;
+    } else if (UnmaskedReg == (1u << 14)) {
+      // Pop r[14] + r[4 : (4 + n)]
+      EmitInt8(ARM::EHABI::UNWIND_OPCODE_POP_REG_RANGE_R4_R14 | Range);
+      RegSave &= 0x000fu;
+    }
+  }
+
+  // Two-byte opcode to save registers r15-r4.
+  if ((RegSave & 0xfff0u) != 0)
+    EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_REG_MASK_R4 | (RegSave >> 4));
+
+  // Opcode to save registers r3-r0.
+  if ((RegSave & 0x000fu) != 0)
+    EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_REG_MASK | (RegSave & 0x000fu));
+}
+
+/// Emit unwind opcodes for .vsave directives
+void UnwindOpcodeAssembler::EmitVFPRegSave(uint32_t VFPRegSave) {
+  // We only have 4 bits to save the offset in the opcode so look at the lower
+  // and upper 16 bits separately.
+  for (uint32_t Regs : {VFPRegSave & 0xffff0000u, VFPRegSave & 0x0000ffffu}) {
+    while (Regs) {
+      // Now look for a run of set bits. Remember the MSB and LSB of the run.
+      auto RangeMSB = 32 - countLeadingZeros(Regs);
+      auto RangeLen = countLeadingOnes(Regs << (32 - RangeMSB));
+      auto RangeLSB = RangeMSB - RangeLen;
+
+      int Opcode = RangeLSB >= 16
+                       ? ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16
+                       : ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD;
+
+      EmitInt16(Opcode | ((RangeLSB % 16) << 4) | (RangeLen - 1));
+
+      // Zero out bits we're done with.
+      Regs &= ~(-1u << RangeLSB);
+    }
+  }
+}
+
+/// Emit unwind opcodes to copy address from source register to $sp.
+void UnwindOpcodeAssembler::EmitSetSP(uint16_t Reg) {
+  EmitInt8(ARM::EHABI::UNWIND_OPCODE_SET_VSP | Reg);
+}
+
+/// Emit unwind opcodes to add $sp with an offset.
+void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) {
+  if (Offset > 0x200) {
+    uint8_t Buff[16];
+    Buff[0] = ARM::EHABI::UNWIND_OPCODE_INC_VSP_ULEB128;
+    size_t ULEBSize = encodeULEB128((Offset - 0x204) >> 2, Buff + 1);
+    EmitBytes(Buff, ULEBSize + 1);
+  } else if (Offset > 0) {
+    if (Offset > 0x100) {
+      EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP | 0x3fu);
+      Offset -= 0x100;
+    }
+    EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP |
+             static_cast<uint8_t>((Offset - 4) >> 2));
+  } else if (Offset < 0) {
+    while (Offset < -0x100) {
+      EmitInt8(ARM::EHABI::UNWIND_OPCODE_DEC_VSP | 0x3fu);
+      Offset += 0x100;
+    }
+    EmitInt8(ARM::EHABI::UNWIND_OPCODE_DEC_VSP |
+             static_cast<uint8_t>(((-Offset) - 4) >> 2));
+  }
+}
+
+void UnwindOpcodeAssembler::Finalize(unsigned &PersonalityIndex,
+                                     SmallVectorImpl<uint8_t> &Result) {
+
+  UnwindOpcodeStreamer OpStreamer(Result);
+
+  if (HasPersonality) {
+    // User-specified personality routine: [ SIZE , OP1 , OP2 , ... ]
+    PersonalityIndex = ARM::EHABI::NUM_PERSONALITY_INDEX;
+    size_t TotalSize = Ops.size() + 1;
+    size_t RoundUpSize = (TotalSize + 3) / 4 * 4;
+    Result.resize(RoundUpSize);
+    OpStreamer.EmitSize(RoundUpSize);
+  } else {
+    // If no personality index is specified, select one.
+    if (PersonalityIndex == ARM::EHABI::NUM_PERSONALITY_INDEX)
+      PersonalityIndex = (Ops.size() <= 3) ? ARM::EHABI::AEABI_UNWIND_CPP_PR0
+                                           : ARM::EHABI::AEABI_UNWIND_CPP_PR1;
+    if (PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0) {
+      // __aeabi_unwind_cpp_pr0: [ 0x80 , OP1 , OP2 , OP3 ]
+      assert(Ops.size() <= 3 && "too many opcodes for __aeabi_unwind_cpp_pr0");
+      Result.resize(4);
+      OpStreamer.EmitPersonalityIndex(PersonalityIndex);
+    } else {
+      // __aeabi_unwind_cpp_pr{1,2}: [ {0x81,0x82} , SIZE , OP1 , OP2 , ... ]
+      size_t TotalSize = Ops.size() + 2;
+      size_t RoundUpSize = (TotalSize + 3) / 4 * 4;
+      Result.resize(RoundUpSize);
+      OpStreamer.EmitPersonalityIndex(PersonalityIndex);
+      OpStreamer.EmitSize(RoundUpSize);
+    }
+  }
+
+  // Copy the unwind opcodes
+  for (size_t i = OpBegins.size() - 1; i > 0; --i)
+    for (size_t j = OpBegins[i - 1], end = OpBegins[i]; j < end; ++j)
+      OpStreamer.EmitByte(Ops[j]);
+
+  // Emit the padding finish opcodes if the size is not a multiple of 4.
+  OpStreamer.FillFinishOpcode();
+
+  // Reset the assembler state
+  Reset();
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
new file mode 100644
index 0000000..e0c113e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -0,0 +1,93 @@
+//===-- ARMUnwindOpAsm.h - ARM Unwind Opcodes Assembler ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the unwind opcode assembler for the ARM exception
+// handling table.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H +#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/ARMEHABI.h" +#include "llvm/Support/DataTypes.h" + +namespace llvm { + +class MCSymbol; + +class UnwindOpcodeAssembler { +private: + llvm::SmallVector<uint8_t, 32> Ops; + llvm::SmallVector<unsigned, 8> OpBegins; + bool HasPersonality; + +public: + UnwindOpcodeAssembler() + : HasPersonality(0) { + OpBegins.push_back(0); + } + + /// Reset the unwind opcode assembler. + void Reset() { + Ops.clear(); + OpBegins.clear(); + OpBegins.push_back(0); + HasPersonality = 0; + } + + /// Set the personality + void setPersonality(const MCSymbol *Per) { + HasPersonality = 1; + } + + /// Emit unwind opcodes for .save directives + void EmitRegSave(uint32_t RegSave); + + /// Emit unwind opcodes for .vsave directives + void EmitVFPRegSave(uint32_t VFPRegSave); + + /// Emit unwind opcodes to copy address from source register to $sp. + void EmitSetSP(uint16_t Reg); + + /// Emit unwind opcodes to add $sp with an offset. + void EmitSPOffset(int64_t Offset); + + /// Emit unwind raw opcodes + void EmitRaw(const SmallVectorImpl<uint8_t> &Opcodes) { + Ops.insert(Ops.end(), Opcodes.begin(), Opcodes.end()); + OpBegins.push_back(OpBegins.back() + Opcodes.size()); + } + + /// Finalize the unwind opcode sequence for EmitBytes() + void Finalize(unsigned &PersonalityIndex, + SmallVectorImpl<uint8_t> &Result); + +private: + void EmitInt8(unsigned Opcode) { + Ops.push_back(Opcode & 0xff); + OpBegins.push_back(OpBegins.back() + 1); + } + + void EmitInt16(unsigned Opcode) { + Ops.push_back((Opcode >> 8) & 0xff); + Ops.push_back(Opcode & 0xff); + OpBegins.push_back(OpBegins.back() + 2); + } + + void EmitBytes(const uint8_t *Opcode, size_t Size) { + Ops.insert(Ops.end(), Opcode, Opcode + Size); + OpBegins.push_back(OpBegins.back() + Size); + } +}; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp new file mode 100644 index 0000000..166c04b --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -0,0 +1,91 @@ +//===-- ARMWinCOFFObjectWriter.cpp - ARM Windows COFF Object Writer -- C++ -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/ARMFixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCValue.h" +#include "llvm/MC/MCWinCOFFObjectWriter.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace { +class ARMWinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { +public: + ARMWinCOFFObjectWriter(bool Is64Bit) + : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARMNT) { + assert(!Is64Bit && "AArch64 support not yet implemented"); + } + ~ARMWinCOFFObjectWriter() override {} + + unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsCrossSection, + const MCAsmBackend &MAB) const override; + + bool recordRelocation(const MCFixup &) const override; +}; + +unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsCrossSection, + const MCAsmBackend &MAB) const { + assert(getMachine() == COFF::IMAGE_FILE_MACHINE_ARMNT && + "AArch64 support not yet implemented"); + + MCSymbolRefExpr::VariantKind Modifier = + Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); + + switch (static_cast<unsigned>(Fixup.getKind())) { + default: { + const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); + report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); + } + case FK_Data_4: + switch (Modifier) { + case MCSymbolRefExpr::VK_COFF_IMGREL32: + return COFF::IMAGE_REL_ARM_ADDR32NB; + case MCSymbolRefExpr::VK_SECREL: + return COFF::IMAGE_REL_ARM_SECREL; + default: + return COFF::IMAGE_REL_ARM_ADDR32; + } + case FK_SecRel_2: + return COFF::IMAGE_REL_ARM_SECTION; + case FK_SecRel_4: + return COFF::IMAGE_REL_ARM_SECREL; + case ARM::fixup_t2_condbranch: + return COFF::IMAGE_REL_ARM_BRANCH20T; + case ARM::fixup_t2_uncondbranch: + return COFF::IMAGE_REL_ARM_BRANCH24T; + case ARM::fixup_arm_thumb_bl: + case ARM::fixup_arm_thumb_blx: + return COFF::IMAGE_REL_ARM_BLX23T; + case ARM::fixup_t2_movw_lo16: + case ARM::fixup_t2_movt_hi16: + return COFF::IMAGE_REL_ARM_MOV32T; + } +} + +bool ARMWinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { + return static_cast<unsigned>(Fixup.getKind()) != ARM::fixup_t2_movt_hi16; +} +} + +namespace llvm { +MCObjectWriter *createARMWinCOFFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit) { + MCWinCOFFObjectTargetWriter *MOTW = new ARMWinCOFFObjectWriter(Is64Bit); + return createWinCOFFObjectWriter(MOTW, OS); +} +} + diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp new file mode 100644 index 0000000..83fa084 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -0,0 +1,47 @@ +//===-- ARMWinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCTargetDesc.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+class ARMWinCOFFStreamer : public MCWinCOFFStreamer {
+public:
+  ARMWinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE,
+                     raw_pwrite_stream &OS)
+      : MCWinCOFFStreamer(C, AB, CE, OS) {}
+
+  void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
+  void EmitThumbFunc(MCSymbol *Symbol) override;
+};
+
+void ARMWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+  switch (Flag) {
+  default: llvm_unreachable("not implemented");
+  case MCAF_SyntaxUnified:
+  case MCAF_Code16:
+    break;
+  }
+}
+
+void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) {
+  getAssembler().setIsThumbFunc(Symbol);
+}
+}
+
+MCStreamer *llvm::createARMWinCOFFStreamer(
+    MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
+    MCCodeEmitter *Emitter, bool RelaxAll, bool IncrementalLinkerCompatible) {
+  auto *S = new ARMWinCOFFStreamer(Context, MAB, *Emitter, OS);
+  S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
+  return S;
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
new file mode 100644
index 0000000..ed2deea
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
@@ -0,0 +1,400 @@
+//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of
+// multiply and add / sub instructions) when special VMLx hazards are detected.
+// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" +using namespace llvm; + +#define DEBUG_TYPE "mlx-expansion" + +static cl::opt<bool> +ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden); +static cl::opt<unsigned> +ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden); + +STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded"); + +namespace { + struct MLxExpansion : public MachineFunctionPass { + static char ID; + MLxExpansion() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "ARM MLA / MLS expansion pass"; + } + + private: + const ARMBaseInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + + bool isLikeA9; + bool isSwift; + unsigned MIIdx; + MachineInstr* LastMIs[4]; + SmallPtrSet<MachineInstr*, 4> IgnoreStall; + + void clearStack(); + void pushStack(MachineInstr *MI); + MachineInstr *getAccDefMI(MachineInstr *MI) const; + unsigned getDefReg(MachineInstr *MI) const; + bool hasLoopHazard(MachineInstr *MI) const; + bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const; + bool FindMLxHazard(MachineInstr *MI); + void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, + unsigned MulOpc, unsigned AddSubOpc, + bool NegAcc, bool HasLane); + bool ExpandFPMLxInstructions(MachineBasicBlock &MBB); + }; + char MLxExpansion::ID = 0; +} + +void MLxExpansion::clearStack() { + std::fill(LastMIs, LastMIs + 4, nullptr); + MIIdx = 0; +} + +void MLxExpansion::pushStack(MachineInstr *MI) { + LastMIs[MIIdx] = MI; + if (++MIIdx == 4) + MIIdx = 0; +} + +MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { + // Look past COPY and INSERT_SUBREG instructions to find the + // real definition MI. This is important for _sfp instructions. 
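+  //
+  // For example (virtual register numbers are illustrative):
+  //   %vreg1 = VMULS ...
+  //   %vreg2 = COPY %vreg1
+  //   %vreg3 = VMLAS %vreg2, ...
+  // Called on the VMLAS, this returns the VMULS rather than the COPY.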
+ unsigned Reg = MI->getOperand(1).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return nullptr; + + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *DefMI = MRI->getVRegDef(Reg); + while (true) { + if (DefMI->getParent() != MBB) + break; + if (DefMI->isCopyLike()) { + Reg = DefMI->getOperand(1).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + DefMI = MRI->getVRegDef(Reg); + continue; + } + } else if (DefMI->isInsertSubreg()) { + Reg = DefMI->getOperand(2).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + DefMI = MRI->getVRegDef(Reg); + continue; + } + } + break; + } + return DefMI; +} + +unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { + unsigned Reg = MI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || + !MRI->hasOneNonDBGUse(Reg)) + return Reg; + + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *UseMI = &*MRI->use_instr_nodbg_begin(Reg); + if (UseMI->getParent() != MBB) + return Reg; + + while (UseMI->isCopy() || UseMI->isInsertSubreg()) { + Reg = UseMI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || + !MRI->hasOneNonDBGUse(Reg)) + return Reg; + UseMI = &*MRI->use_instr_nodbg_begin(Reg); + if (UseMI->getParent() != MBB) + return Reg; + } + + return Reg; +} + +/// hasLoopHazard - Check whether an MLx instruction is chained to itself across +/// a single-MBB loop. +bool MLxExpansion::hasLoopHazard(MachineInstr *MI) const { + unsigned Reg = MI->getOperand(1).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return false; + + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *DefMI = MRI->getVRegDef(Reg); + while (true) { +outer_continue: + if (DefMI->getParent() != MBB) + break; + + if (DefMI->isPHI()) { + for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) { + if (DefMI->getOperand(i + 1).getMBB() == MBB) { + unsigned SrcReg = DefMI->getOperand(i).getReg(); + if (TargetRegisterInfo::isVirtualRegister(SrcReg)) { + DefMI = MRI->getVRegDef(SrcReg); + goto outer_continue; + } + } + } + } else if (DefMI->isCopyLike()) { + Reg = DefMI->getOperand(1).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + DefMI = MRI->getVRegDef(Reg); + continue; + } + } else if (DefMI->isInsertSubreg()) { + Reg = DefMI->getOperand(2).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + DefMI = MRI->getVRegDef(Reg); + continue; + } + } + + break; + } + + return DefMI == MI; +} + +bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const { + // FIXME: Detect integer instructions properly. 
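+  // Only VFP / NEON instructions that read 'Reg' count as hazards here;
+  // stores and the VMOVRS / VMOVRRD moves out of the FP register file are
+  // exempted below. Illustrative case:
+  //   %vreg0 = VMLAS ...       ; candidate MLx defines %vreg0
+  //   VADDS ..., %vreg0, ...   ; reads %vreg0 -> RAW hazard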
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
+  if (MI->mayStore())
+    return false;
+  unsigned Opcode = MCID.getOpcode();
+  if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+    return false;
+  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
+    return MI->readsRegister(Reg, TRI);
+  return false;
+}
+
+static bool isFpMulInstruction(unsigned Opcode) {
+  switch (Opcode) {
+  case ARM::VMULS:
+  case ARM::VMULfd:
+  case ARM::VMULfq:
+  case ARM::VMULD:
+  case ARM::VMULslfd:
+  case ARM::VMULslfq:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
+  if (NumExpand >= ExpandLimit)
+    return false;
+
+  if (ForceExapnd)
+    return true;
+
+  MachineInstr *DefMI = getAccDefMI(MI);
+  if (TII->isFpMLxInstruction(DefMI->getOpcode())) {
+    // r0 = vmla
+    // r3 = vmla r0, r1, r2
+    // takes 16 - 17 cycles
+    //
+    // r0 = vmla
+    // r4 = vmul r1, r2
+    // r3 = vadd r0, r4
+    // takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
+    IgnoreStall.insert(DefMI);
+    return true;
+  }
+
+  // On Swift, we mostly care about hazards from multiplication instructions
+  // writing the accumulator and the pipelining of loop iterations by out-of-
+  // order execution.
+  if (isSwift)
+    return isFpMulInstruction(DefMI->getOpcode()) || hasLoopHazard(MI);
+
+  if (IgnoreStall.count(MI))
+    return false;
+
+  // If a VMLA.F is followed by a VADD.F or VMUL.F with no RAW hazard, the
+  // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall
+  // preserves the in-order retirement of the instructions.
+  // Look at the next few instructions; if *most* of them can cause hazards,
+  // then the scheduler can't *fix* this, and we'd better break up the VMLA.
+  unsigned Limit1 = isLikeA9 ? 1 : 4;
+  unsigned Limit2 = isLikeA9 ? 1 : 4;
+  for (unsigned i = 1; i <= 4; ++i) {
+    int Idx = ((int)MIIdx - i + 4) % 4;
+    MachineInstr *NextMI = LastMIs[Idx];
+    if (!NextMI)
+      continue;
+
+    if (TII->canCauseFpMLxStall(NextMI->getOpcode())) {
+      if (i <= Limit1)
+        return true;
+    }
+
+    // Look for VMLx RAW hazard.
+    if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI))
+      return true;
+  }
+
+  return false;
+}
+
+/// ExpandFPMLxInstruction - Expand an MLA / MLS instruction into a pair
+/// of MUL + ADD / SUB instructions.
+void
+MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
+                                     unsigned MulOpc, unsigned AddSubOpc,
+                                     bool NegAcc, bool HasLane) {
+  unsigned DstReg = MI->getOperand(0).getReg();
+  bool DstDead = MI->getOperand(0).isDead();
+  unsigned AccReg = MI->getOperand(1).getReg();
+  unsigned Src1Reg = MI->getOperand(2).getReg();
+  unsigned Src2Reg = MI->getOperand(3).getReg();
+  bool Src1Kill = MI->getOperand(2).isKill();
+  bool Src2Kill = MI->getOperand(3).isKill();
+  unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0;
+  unsigned NextOp = HasLane ? 
5 : 4; + ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm(); + unsigned PredReg = MI->getOperand(++NextOp).getReg(); + + const MCInstrDesc &MCID1 = TII->get(MulOpc); + const MCInstrDesc &MCID2 = TII->get(AddSubOpc); + const MachineFunction &MF = *MI->getParent()->getParent(); + unsigned TmpReg = MRI->createVirtualRegister( + TII->getRegClass(MCID1, 0, TRI, MF)); + + MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) + .addReg(Src1Reg, getKillRegState(Src1Kill)) + .addReg(Src2Reg, getKillRegState(Src2Kill)); + if (HasLane) + MIB.addImm(LaneImm); + MIB.addImm(Pred).addReg(PredReg); + + MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID2) + .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead)); + + if (NegAcc) { + bool AccKill = MRI->hasOneNonDBGUse(AccReg); + MIB.addReg(TmpReg, getKillRegState(true)) + .addReg(AccReg, getKillRegState(AccKill)); + } else { + MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true)); + } + MIB.addImm(Pred).addReg(PredReg); + + DEBUG({ + dbgs() << "Expanding: " << *MI; + dbgs() << " to:\n"; + MachineBasicBlock::iterator MII = MI; + MII = std::prev(MII); + MachineInstr &MI2 = *MII; + MII = std::prev(MII); + MachineInstr &MI1 = *MII; + dbgs() << " " << MI1; + dbgs() << " " << MI2; + }); + + MI->eraseFromParent(); + ++NumExpand; +} + +bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) { + bool Changed = false; + + clearStack(); + IgnoreStall.clear(); + + unsigned Skip = 0; + MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend(); + while (MII != E) { + MachineInstr *MI = &*MII; + + if (MI->isPosition() || MI->isImplicitDef() || MI->isCopy()) { + ++MII; + continue; + } + + const MCInstrDesc &MCID = MI->getDesc(); + if (MI->isBarrier()) { + clearStack(); + Skip = 0; + ++MII; + continue; + } + + unsigned Domain = MCID.TSFlags & ARMII::DomainMask; + if (Domain == ARMII::DomainGeneral) { + if (++Skip == 2) + // Assume dual issues of non-VFP / NEON instructions. + pushStack(nullptr); + } else { + Skip = 0; + + unsigned MulOpc, AddSubOpc; + bool NegAcc, HasLane; + if (!TII->isFpMLxInstruction(MCID.getOpcode(), + MulOpc, AddSubOpc, NegAcc, HasLane) || + !FindMLxHazard(MI)) + pushStack(MI); + else { + ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane); + E = MBB.rend(); // May have changed if MI was the 1st instruction. + Changed = true; + continue; + } + } + + ++MII; + } + + return Changed; +} + +bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) { + TII = static_cast<const ARMBaseInstrInfo *>(Fn.getSubtarget().getInstrInfo()); + TRI = Fn.getSubtarget().getRegisterInfo(); + MRI = &Fn.getRegInfo(); + const ARMSubtarget *STI = &Fn.getSubtarget<ARMSubtarget>(); + // Only run this for CortexA9. 
diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
new file mode 100644
index 0000000..df73554
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -0,0 +1,28 @@
+//===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheARMLETarget, llvm::TheARMBETarget;
+Target llvm::TheThumbLETarget, llvm::TheThumbBETarget;
+
+extern "C" void LLVMInitializeARMTargetInfo() {
+  RegisterTarget<Triple::arm, /*HasJIT=*/true>
+    X(TheARMLETarget, "arm", "ARM");
+  RegisterTarget<Triple::armeb, /*HasJIT=*/true>
+    Y(TheARMBETarget, "armeb", "ARM (big endian)");
+
+  RegisterTarget<Triple::thumb, /*HasJIT=*/true>
+    A(TheThumbLETarget, "thumb", "Thumb");
+  RegisterTarget<Triple::thumbeb, /*HasJIT=*/true>
+    B(TheThumbBETarget, "thumbeb", "Thumb (big endian)");
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
new file mode 100644
index 0000000..93e0ac4
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -0,0 +1,659 @@
+//===-- Thumb1FrameLowering.cpp - Thumb1 Frame Information ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb1 implementation of the TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb1FrameLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+Thumb1FrameLowering::Thumb1FrameLowering(const ARMSubtarget &sti)
+    : ARMFrameLowering(sti) {}
+
+bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  unsigned CFSize = FFI->getMaxCallFrameSize();
+  // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has small immediate offsets to
+  // address the stack frame. So a large call frame can cause poor codegen
+  // and may even make it impossible to scavenge a register.
+ if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4 + return false; + + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +static void +emitSPUpdate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetInstrInfo &TII, DebugLoc dl, + const ThumbRegisterInfo &MRI, + int NumBytes, unsigned MIFlags = MachineInstr::NoFlags) { + emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII, + MRI, MIFlags); +} + + +void Thumb1FrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const Thumb1InstrInfo &TII = + *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo()); + const ThumbRegisterInfo *RegInfo = + static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo()); + if (!hasReservedCallFrame(MF)) { + // If we have alloca, convert as follows: + // ADJCALLSTACKDOWN -> sub, sp, sp, amount + // ADJCALLSTACKUP -> add, sp, sp, amount + MachineInstr *Old = I; + DebugLoc dl = Old->getDebugLoc(); + unsigned Amount = Old->getOperand(0).getImm(); + if (Amount != 0) { + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. + unsigned Align = getStackAlignment(); + Amount = (Amount+Align-1)/Align*Align; + + // Replace the pseudo instruction with a new instruction... + unsigned Opc = Old->getOpcode(); + if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { + emitSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount); + } else { + assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); + emitSPUpdate(MBB, I, TII, dl, *RegInfo, Amount); + } + } + } + MBB.erase(I); +} + +void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + MachineModuleInfo &MMI = MF.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); + const ThumbRegisterInfo *RegInfo = + static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo()); + const Thumb1InstrInfo &TII = + *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo()); + + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); + unsigned NumBytes = MFI->getStackSize(); + assert(NumBytes >= ArgRegsSaveSize && + "ArgRegsSaveSize is included in NumBytes"); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; + + unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned BasePtr = RegInfo->getBaseRegister(); + int CFAOffset = 0; + + // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4. + NumBytes = (NumBytes + 3) & ~3; + MFI->setStackSize(NumBytes); + + // Determine the sizes of each callee-save spill areas and record which frame + // belongs to which callee-save spill areas. 
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+
+  if (ArgRegsSaveSize) {
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
+                 MachineInstr::FrameSetup);
+    CFAOffset -= ArgRegsSaveSize;
+    unsigned CFIIndex = MMI.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes - ArgRegsSaveSize != 0) {
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -(NumBytes - ArgRegsSaveSize),
+                   MachineInstr::FrameSetup);
+      CFAOffset -= NumBytes - ArgRegsSaveSize;
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+    return;
+  }
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+      if (STI.isTargetMachO()) {
+        GPRCS2Size += 4;
+        break;
+      }
+      // fallthrough
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      GPRCS1Size += 4;
+      break;
+    default:
+      DPRCSSize += 8;
+    }
+  }
+
+  if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
+    ++MBBI;
+  }
+
+  // Determine starting offsets of spill areas.
+  unsigned DPRCSOffset = NumBytes - ArgRegsSaveSize -
+                         (GPRCS1Size + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  bool HasFP = hasFP(MF);
+  if (HasFP)
+    AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
+                                NumBytes);
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+  NumBytes = DPRCSOffset;
+
+  int FramePtrOffsetInBlock = 0;
+  unsigned adjustedGPRCS1Size = GPRCS1Size;
+  if (tryFoldSPUpdateIntoPushPop(STI, MF, std::prev(MBBI), NumBytes)) {
+    FramePtrOffsetInBlock = NumBytes;
+    adjustedGPRCS1Size += NumBytes;
+    NumBytes = 0;
+  }
+
+  if (adjustedGPRCS1Size) {
+    CFAOffset -= adjustedGPRCS1Size;
+    unsigned CFIIndex = MMI.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+  for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+       E = CSI.end(); I != E; ++I) {
+    unsigned Reg = I->getReg();
+    int FI = I->getFrameIdx();
+    switch (Reg) {
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+    case ARM::R12:
+      if (STI.isTargetMachO())
+        break;
+      // fallthrough
+    case ARM::R0:
+    case ARM::R1:
+    case ARM::R2:
+    case ARM::R3:
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+          nullptr, MRI->getDwarfRegNum(Reg, true), MFI->getObjectOffset(FI)));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    }
+  }
+
+  // Adjust FP so it points to the stack slot that contains the previous FP.
+  if (HasFP) {
+    FramePtrOffsetInBlock +=
+        MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size + ArgRegsSaveSize;
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+                       .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4)
+                       .setMIFlags(MachineInstr::FrameSetup));
+    if (FramePtrOffsetInBlock) {
+      CFAOffset += FramePtrOffsetInBlock;
+      unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa(
+          nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    } else {
+      unsigned CFIIndex =
+          MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+              nullptr, MRI->getDwarfRegNum(FramePtr, true)));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+    if (NumBytes > 508)
+      // If offset is > 508 then sp cannot be adjusted in a single instruction,
+      // try restoring from fp instead.
+      AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  if (NumBytes) {
+    // Insert it after all the callee-save spills.
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
+                 MachineInstr::FrameSetup);
+    if (!HasFP) {
+      CFAOffset -= NumBytes;
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+
+  if (STI.isTargetELF() && HasFP)
+    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+                             AFI->getFramePtrSpillOffset());
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+
+  // Thumb1 does not currently support dynamic stack realignment. Report a
+  // fatal error rather than silently generate bad code.
+  if (RegInfo->needsStackRealignment(MF))
+    report_fatal_error("Dynamic stack realignment not supported for thumb1.");
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  if (RegInfo->hasBasePointer(MF))
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), BasePtr)
+                       .addReg(ARM::SP));
+
+  // If the frame has variable sized objects then the epilogue must restore
+  // the sp from fp. We can assume there's an FP here since hasFP already
+  // checks for hasVarSizedObjects.
+  if (MFI->hasVarSizedObjects())
+    AFI->setShouldRestoreSPFromFP(true);
+}
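// The bookkeeping in emitPrologue above lays the three callee-save areas out
// from the incoming SP downward: GPRCS1, then GPRCS2, then the DPR area, with
// locals below. A worked example with assumed sizes (a sketch, not code from
// the patch), plus the two Thumb1 encoding limits behind the "> 508" check
// above and the "half of imm8 * 4" cap in hasReservedCallFrame:

constexpr unsigned NumBytes = 56;       // aligned stack size
constexpr unsigned ArgRegsSaveSize = 0; // no vararg register save
constexpr unsigned GPRCS1Size = 20;     // r4-r7 + lr pushed
constexpr unsigned GPRCS2Size = 0;      // no MachO high-register area
constexpr unsigned DPRCSSize = 8;       // one D register spilled

constexpr unsigned DPRCSOffset =
    NumBytes - ArgRegsSaveSize - (GPRCS1Size + GPRCS2Size + DPRCSSize);
constexpr unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
constexpr unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
static_assert(DPRCSOffset == 28 && GPRCS2Offset == 36 && GPRCS1Offset == 36,
              "callee-save areas sit between the locals and the push");

static_assert(((1u << 7) - 1) * 4 == 508,
              "largest single tADDspi/tSUBspi adjustment (imm7, scaled by 4)");
static_assert(((1u << 8) - 1) * 4 / 2 == 510,
              "hasReservedCallFrame cap: half of the imm8-times-4 range");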
+
+static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) {
+  if (MI->getOpcode() == ARM::tLDRspi &&
+      MI->getOperand(1).isFI() &&
+      isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs))
+    return true;
+  else if (MI->getOpcode() == ARM::tPOP) {
+    // The first two operands are predicates. The last two are
+    // imp-def and imp-use of SP. Check everything in between.
+    for (int i = 2, e = MI->getNumOperands() - 2; i != e; ++i)
+      if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs))
+        return false;
+    return true;
+  }
+  return false;
+}
+
+void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  DebugLoc dl = MBBI != MBB.end() ?
MBBI->getDebugLoc() : DebugLoc(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const ThumbRegisterInfo *RegInfo = + static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo()); + const Thumb1InstrInfo &TII = + *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo()); + + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); + int NumBytes = (int)MFI->getStackSize(); + assert((unsigned)NumBytes >= ArgRegsSaveSize && + "ArgRegsSaveSize is included in NumBytes"); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + if (!AFI->hasStackFrame()) { + if (NumBytes - ArgRegsSaveSize != 0) + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes - ArgRegsSaveSize); + } else { + // Unwind MBBI to point to first LDR / VLDRD. + if (MBBI != MBB.begin()) { + do + --MBBI; + while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs)); + if (!isCSRestore(MBBI, CSRegs)) + ++MBBI; + } + + // Move SP to start of FP callee save spill area. + NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + + AFI->getGPRCalleeSavedArea2Size() + + AFI->getDPRCalleeSavedAreaSize() + + ArgRegsSaveSize); + + if (AFI->shouldRestoreSPFromFP()) { + NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; + // Reset SP based on frame pointer only if the stack frame extends beyond + // frame pointer stack slot, the target is ELF and the function has FP, or + // the target uses var sized objects. + if (NumBytes) { + assert(!MFI->getPristineRegs(MF).test(ARM::R4) && + "No scratch register to restore SP from FP!"); + emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, + TII, *RegInfo); + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), + ARM::SP) + .addReg(ARM::R4)); + } else + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), + ARM::SP) + .addReg(FramePtr)); + } else { + if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET && + &MBB.front() != MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) { + MachineBasicBlock::iterator PMBBI = std::prev(MBBI); + if (!tryFoldSPUpdateIntoPushPop(STI, MF, PMBBI, NumBytes)) + emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes); + } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes)) + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes); + } + } + + if (needPopSpecialFixUp(MF)) { + bool Done = emitPopSpecialFixUp(MBB, /* DoIt */ true); + (void)Done; + assert(Done && "Emission of the special fixup failed!?"); + } +} + +bool Thumb1FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { + if (!needPopSpecialFixUp(*MBB.getParent())) + return true; + + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); + return emitPopSpecialFixUp(*TmpMBB, /* DoIt */ false); +} + +bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const { + ARMFunctionInfo *AFI = + const_cast<MachineFunction *>(&MF)->getInfo<ARMFunctionInfo>(); + if (AFI->getArgRegsSaveSize()) + return true; + + // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up. 
+  for (const CalleeSavedInfo &CSI : MF.getFrameInfo()->getCalleeSavedInfo())
+    if (CSI.getReg() == ARM::LR)
+      return true;
+
+  return false;
+}
+
+bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
+                                              bool DoIt) const {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  const ThumbRegisterInfo *RegInfo =
+      static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+
+  // If MBBI is a return instruction, or is a tPOP followed by a return
+  // instruction in the successor BB, we may be able to directly restore
+  // LR in the PC.
+  // This is only possible with v5T ops (v4T can't change the Thumb bit via
+  // a POP PC instruction), and only if we do not need to emit any SP update.
+  // Otherwise, we need a temporary register to pop the value
+  // and copy that value into LR.
+  auto MBBI = MBB.getFirstTerminator();
+  bool CanRestoreDirectly = STI.hasV5TOps() && !ArgRegsSaveSize;
+  if (CanRestoreDirectly) {
+    if (MBBI != MBB.end() && MBBI->getOpcode() != ARM::tB)
+      CanRestoreDirectly = (MBBI->getOpcode() == ARM::tBX_RET ||
+                            MBBI->getOpcode() == ARM::tPOP_RET);
+    else {
+      auto MBBI_prev = MBBI;
+      MBBI_prev--;
+      assert(MBBI_prev->getOpcode() == ARM::tPOP);
+      assert(MBB.succ_size() == 1);
+      if ((*MBB.succ_begin())->begin()->getOpcode() == ARM::tBX_RET)
+        MBBI = MBBI_prev; // Replace the final tPOP with a tPOP_RET.
+      else
+        CanRestoreDirectly = false;
+    }
+  }
+
+  if (CanRestoreDirectly) {
+    if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET)
+      return true;
+    MachineInstrBuilder MIB =
+        AddDefaultPred(
+            BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET)));
+    // Copy implicit ops and popped registers, if any.
+    for (auto MO: MBBI->operands())
+      if (MO.isReg() && (MO.isImplicit() || MO.isDef()))
+        MIB.addOperand(MO);
+    MIB.addReg(ARM::PC, RegState::Define);
+    // Erase the old instruction (tBX_RET or tPOP).
+    MBB.erase(MBBI);
+    return true;
+  }
+
+  // Look for a temporary register to use.
+  // First, compute the liveness information.
+  LivePhysRegs UsedRegs(STI.getRegisterInfo());
+  UsedRegs.addLiveOuts(&MBB, /*AddPristines*/ true);
+  // The semantics of pristines changed recently: the callee-saved registers
+  // that are touched in the function are not part of the pristines set
+  // anymore. Add those callee-saved registers now.
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    UsedRegs.addReg(CSRegs[i]);
+
+  DebugLoc dl = DebugLoc();
+  if (MBBI != MBB.end()) {
+    dl = MBBI->getDebugLoc();
+    auto InstUpToMBBI = MBB.end();
+    while (InstUpToMBBI != MBBI)
+      // The pre-decrement is on purpose here.
+      // We want to have the liveness right before MBBI.
+      UsedRegs.stepBackward(*--InstUpToMBBI);
+  }
+
+  // Look for a register that can be directly used in the POP.
+  unsigned PopReg = 0;
+  // And some temporary register, just in case.
+  unsigned TemporaryReg = 0;
+  BitVector PopFriendly =
+      TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::tGPRRegClassID));
+  assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
+  // Rebuild the GPRs from the high registers because they are removed
+  // from the GPR reg class for thumb1.
+ BitVector GPRsNoLRSP = + TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::hGPRRegClassID)); + GPRsNoLRSP |= PopFriendly; + GPRsNoLRSP.reset(ARM::LR); + GPRsNoLRSP.reset(ARM::SP); + GPRsNoLRSP.reset(ARM::PC); + for (int Register = GPRsNoLRSP.find_first(); Register != -1; + Register = GPRsNoLRSP.find_next(Register)) { + if (!UsedRegs.contains(Register)) { + // Remember the first pop-friendly register and exit. + if (PopFriendly.test(Register)) { + PopReg = Register; + TemporaryReg = 0; + break; + } + // Otherwise, remember that the register will be available to + // save a pop-friendly register. + TemporaryReg = Register; + } + } + + if (!DoIt && !PopReg && !TemporaryReg) + return false; + + assert((PopReg || TemporaryReg) && "Cannot get LR"); + + if (TemporaryReg) { + assert(!PopReg && "Unnecessary MOV is about to be inserted"); + PopReg = PopFriendly.find_first(); + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) + .addReg(TemporaryReg, RegState::Define) + .addReg(PopReg, RegState::Kill)); + } + + if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPOP_RET) { + // We couldn't use the direct restoration above, so + // perform the opposite conversion: tPOP_RET to tPOP. + MachineInstrBuilder MIB = + AddDefaultPred( + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP))); + bool Popped = false; + for (auto MO: MBBI->operands()) + if (MO.isReg() && (MO.isImplicit() || MO.isDef()) && + MO.getReg() != ARM::PC) { + MIB.addOperand(MO); + if (!MO.isImplicit()) + Popped = true; + } + // Is there anything left to pop? + if (!Popped) + MBB.erase(MIB.getInstr()); + // Erase the old instruction. + MBB.erase(MBBI); + MBBI = AddDefaultPred(BuildMI(MBB, MBB.end(), dl, TII.get(ARM::tBX_RET))); + } + + assert(PopReg && "Do not know how to get LR"); + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) + .addReg(PopReg, RegState::Define); + + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize); + + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) + .addReg(ARM::LR, RegState::Define) + .addReg(PopReg, RegState::Kill)); + + if (TemporaryReg) + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) + .addReg(PopReg, RegState::Define) + .addReg(TemporaryReg, RegState::Kill)); + + return true; +} + +bool Thumb1FrameLowering:: +spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + DebugLoc DL; + const TargetInstrInfo &TII = *STI.getInstrInfo(); + + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)); + AddDefaultPred(MIB); + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + bool isKill = true; + + // Add the callee-saved register as live-in unless it's LR and + // @llvm.returnaddress is called. If LR is returned for @llvm.returnaddress + // then it's already added to the function and entry block live-in sets. 
+ if (Reg == ARM::LR) { + MachineFunction &MF = *MBB.getParent(); + if (MF.getFrameInfo()->isReturnAddressTaken() && + MF.getRegInfo().isLiveIn(Reg)) + isKill = false; + } + + if (isKill) + MBB.addLiveIn(Reg); + + MIB.addReg(Reg, getKillRegState(isKill)); + } + MIB.setMIFlags(MachineInstr::FrameSetup); + return true; +} + +bool Thumb1FrameLowering:: +restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + + bool isVarArg = AFI->getArgRegsSaveSize() > 0; + DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); + MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP)); + AddDefaultPred(MIB); + + bool NeedsPop = false; + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (Reg == ARM::LR) { + if (MBB.succ_empty()) { + // Special epilogue for vararg functions. See emitEpilogue + if (isVarArg) + continue; + // ARMv4T requires BX, see emitEpilogue + if (!STI.hasV5TOps()) + continue; + Reg = ARM::PC; + (*MIB).setDesc(TII.get(ARM::tPOP_RET)); + if (MI != MBB.end()) + MIB.copyImplicitOps(&*MI); + MI = MBB.erase(MI); + } else + // LR may only be popped into PC, as part of return sequence. + // If this isn't the return sequence, we'll need emitPopSpecialFixUp + // to restore LR the hard way. + continue; + } + MIB.addReg(Reg, getDefRegState(true)); + NeedsPop = true; + } + + // It's illegal to emit pop instruction without operands. + if (NeedsPop) + MBB.insert(MI, &*MIB); + else + MF.DeleteMachineInstr(MIB); + + return true; +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h new file mode 100644 index 0000000..812f983 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h @@ -0,0 +1,88 @@ +//===-- Thumb1FrameLowering.h - Thumb1-specific frame info stuff --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_THUMB1FRAMELOWERING_H +#define LLVM_LIB_TARGET_ARM_THUMB1FRAMELOWERING_H + +#include "ARMFrameLowering.h" +#include "Thumb1InstrInfo.h" +#include "ThumbRegisterInfo.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +class Thumb1FrameLowering : public ARMFrameLowering { +public: + explicit Thumb1FrameLowering(const ARMSubtarget &sti); + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. 
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const override;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+  void
+  eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const override;
+
+  /// Check whether or not the given \p MBB can be used as an epilogue
+  /// for the target.
+  /// The epilogue will be inserted before the first terminator of that block.
+  /// This method is used by the shrink-wrapping pass to decide if
+  /// \p MBB will be correctly handled by the target.
+  bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+
+private:
+  /// Check if the frame lowering of \p MF needs a special fixup
+  /// code sequence for the epilogue.
+  /// Unlike T2 and ARM mode, the T1 pop instruction cannot restore
+  /// to LR, and we can't pop the value directly to the PC when
+  /// we need to update the SP after popping the value. So instead
+  /// we have to emit:
+  ///   POP {r3}
+  ///   ADD sp, #offset
+  ///   BX r3
+  /// If this would clobber a return value, then generate this sequence instead:
+  ///   MOV ip, r3
+  ///   POP {r3}
+  ///   ADD sp, #offset
+  ///   MOV lr, r3
+  ///   MOV r3, ip
+  ///   BX lr
+  bool needPopSpecialFixUp(const MachineFunction &MF) const;
+
+  /// Emit the special fixup code sequence for the epilogue.
+  /// \see needPopSpecialFixUp for more details.
+  /// \p DoIt tells this method whether or not to actually insert
+  /// the code sequence in \p MBB. I.e., when \p DoIt is false,
+  /// \p MBB is left untouched.
+  /// \returns For \p DoIt == true: True when the emission succeeded,
+  /// false otherwise. For \p DoIt == false: True when the emission
+  /// would have been possible, false otherwise.
+  bool emitPopSpecialFixUp(MachineBasicBlock &MBB, bool DoIt) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
new file mode 100644
index 0000000..530e1d3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -0,0 +1,128 @@
+//===-- Thumb1InstrInfo.cpp - Thumb-1 Instruction Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMSubtarget.h"
+#include "Thumb1InstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
+    : ARMBaseInstrInfo(STI), RI() {}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void Thumb1InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + NopInst.setOpcode(ARM::tMOVr); + NopInst.addOperand(MCOperand::createReg(ARM::R8)); + NopInst.addOperand(MCOperand::createReg(ARM::R8)); + NopInst.addOperand(MCOperand::createImm(ARMCC::AL)); + NopInst.addOperand(MCOperand::createReg(0)); +} + +unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const { + return 0; +} + +void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + // Need to check the arch. + MachineFunction &MF = *MBB.getParent(); + const ARMSubtarget &st = MF.getSubtarget<ARMSubtarget>(); + + assert(ARM::GPRRegClass.contains(DestReg, SrcReg) && + "Thumb1 can only copy GPR registers"); + + if (st.hasV6Ops() || ARM::hGPRRegClass.contains(SrcReg) + || !ARM::tGPRRegClass.contains(DestReg)) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc))); + else { + // FIXME: The performance consequences of this are going to be atrocious. + // Some things to try that should be better: + // * 'mov hi, $src; mov $dst, hi', with hi as either r10 or r11 + // * 'movs $dst, $src' if cpsr isn't live + // See: http://lists.llvm.org/pipermail/llvm-dev/2014-August/075998.html + + // 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPUSH))) + .addReg(SrcReg, getKillRegState(KillSrc)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPOP))) + .addReg(DestReg, getDefRegState(true)); + } +} + +void Thumb1InstrInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + assert((RC == &ARM::tGPRRegClass || + (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + isARMLowRegister(SrcReg))) && "Unknown regclass!"); + + if (RC == &ARM::tGPRRegClass || + (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + isARMLowRegister(SrcReg))) { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSTRspi)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } +} + +void Thumb1InstrInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + assert((RC == &ARM::tGPRRegClass || + (TargetRegisterInfo::isPhysicalRegister(DestReg) && + isARMLowRegister(DestReg))) && "Unknown regclass!"); + + if (RC == &ARM::tGPRRegClass || + (TargetRegisterInfo::isPhysicalRegister(DestReg) && + isARMLowRegister(DestReg))) { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } +} + +void 
+Thumb1InstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI, + Reloc::Model RM) const { + if (RM == Reloc::PIC_) + expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_pcrel, ARM::tLDRi, RM); + else + expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::tLDRi, RM); +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h new file mode 100644 index 0000000..f3f493d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -0,0 +1,63 @@ +//===-- Thumb1InstrInfo.h - Thumb-1 Instruction Information -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-1 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_THUMB1INSTRINFO_H +#define LLVM_LIB_TARGET_ARM_THUMB1INSTRINFO_H + +#include "ARMBaseInstrInfo.h" +#include "ThumbRegisterInfo.h" + +namespace llvm { + class ARMSubtarget; + +class Thumb1InstrInfo : public ARMBaseInstrInfo { + ThumbRegisterInfo RI; +public: + explicit Thumb1InstrInfo(const ARMSubtarget &STI); + + /// getNoopForMachoTarget - Return the noop instruction to use for a noop. + void getNoopForMachoTarget(MCInst &NopInst) const override; + + // Return the non-pre/post incrementing version of 'Opc'. Return 0 + // if there is not such an opcode. + unsigned getUnindexedOpcode(unsigned Opc) const override; + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + +private: + void expandLoadStackGuard(MachineBasicBlock::iterator MI, + Reloc::Model RM) const override; +}; +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp new file mode 100644 index 0000000..bf0498d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -0,0 +1,299 @@ +//===-- Thumb2ITBlockPass.cpp - Insert Thumb-2 IT blocks ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMMachineFunctionInfo.h" +#include "Thumb2InstrInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +using namespace llvm; + +#define DEBUG_TYPE "thumb2-it" + +STATISTIC(NumITs, "Number of IT blocks inserted"); +STATISTIC(NumMovedInsts, "Number of predicated instructions moved"); + +namespace { + class Thumb2ITBlockPass : public MachineFunctionPass { + public: + static char ID; + Thumb2ITBlockPass() : MachineFunctionPass(ID) {} + + bool restrictIT; + const Thumb2InstrInfo *TII; + const TargetRegisterInfo *TRI; + ARMFunctionInfo *AFI; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "Thumb IT blocks insertion pass"; + } + + private: + bool MoveCopyOutOfITBlock(MachineInstr *MI, + ARMCC::CondCodes CC, ARMCC::CondCodes OCC, + SmallSet<unsigned, 4> &Defs, + SmallSet<unsigned, 4> &Uses); + bool InsertITInstructions(MachineBasicBlock &MBB); + }; + char Thumb2ITBlockPass::ID = 0; +} + +/// TrackDefUses - Tracking what registers are being defined and used by +/// instructions in the IT block. This also tracks "dependencies", i.e. uses +/// in the IT block that are defined before the IT instruction. +static void TrackDefUses(MachineInstr *MI, + SmallSet<unsigned, 4> &Defs, + SmallSet<unsigned, 4> &Uses, + const TargetRegisterInfo *TRI) { + SmallVector<unsigned, 4> LocalDefs; + SmallVector<unsigned, 4> LocalUses; + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP) + continue; + if (MO.isUse()) + LocalUses.push_back(Reg); + else + LocalDefs.push_back(Reg); + } + + for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) { + unsigned Reg = LocalUses[i]; + for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true); + Subreg.isValid(); ++Subreg) + Uses.insert(*Subreg); + } + + for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { + unsigned Reg = LocalDefs[i]; + for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true); + Subreg.isValid(); ++Subreg) + Defs.insert(*Subreg); + if (Reg == ARM::CPSR) + continue; + } +} + +/// Clear kill flags for any uses in the given set. This will likely +/// conservatively remove more kill flags than are necessary, but removing them +/// is safer than incorrect kill flags remaining on instructions. +static void ClearKillFlags(MachineInstr *MI, SmallSet<unsigned, 4> &Uses) { + for (MachineOperand &MO : MI->operands()) { + if (!MO.isReg() || MO.isDef() || !MO.isKill()) + continue; + if (!Uses.count(MO.getReg())) + continue; + MO.setIsKill(false); + } +} + +static bool isCopy(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + return false; + case ARM::MOVr: + case ARM::MOVr_TC: + case ARM::tMOVr: + case ARM::t2MOVr: + return true; + } +} + +bool +Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI, + ARMCC::CondCodes CC, ARMCC::CondCodes OCC, + SmallSet<unsigned, 4> &Defs, + SmallSet<unsigned, 4> &Uses) { + if (!isCopy(MI)) + return false; + // llvm models select's as two-address instructions. That means a copy + // is inserted before a t2MOVccr, etc. 
If the copy is scheduled in + // between selects we would end up creating multiple IT blocks. + assert(MI->getOperand(0).getSubReg() == 0 && + MI->getOperand(1).getSubReg() == 0 && + "Sub-register indices still around?"); + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + + // First check if it's safe to move it. + if (Uses.count(DstReg) || Defs.count(SrcReg)) + return false; + + // If the CPSR is defined by this copy, then we don't want to move it. E.g., + // if we have: + // + // movs r1, r1 + // rsb r1, 0 + // movs r2, r2 + // rsb r2, 0 + // + // we don't want this to be converted to: + // + // movs r1, r1 + // movs r2, r2 + // itt mi + // rsb r1, 0 + // rsb r2, 0 + // + const MCInstrDesc &MCID = MI->getDesc(); + if (MI->hasOptionalDef() && + MI->getOperand(MCID.getNumOperands() - 1).getReg() == ARM::CPSR) + return false; + + // Then peek at the next instruction to see if it's predicated on CC or OCC. + // If not, then there is nothing to be gained by moving the copy. + MachineBasicBlock::iterator I = MI; ++I; + MachineBasicBlock::iterator E = MI->getParent()->end(); + while (I != E && I->isDebugValue()) + ++I; + if (I != E) { + unsigned NPredReg = 0; + ARMCC::CondCodes NCC = getITInstrPredicate(I, NPredReg); + if (NCC == CC || NCC == OCC) + return true; + } + return false; +} + +bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { + bool Modified = false; + + SmallSet<unsigned, 4> Defs; + SmallSet<unsigned, 4> Uses; + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineInstr *MI = &*MBBI; + DebugLoc dl = MI->getDebugLoc(); + unsigned PredReg = 0; + ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg); + if (CC == ARMCC::AL) { + ++MBBI; + continue; + } + + Defs.clear(); + Uses.clear(); + TrackDefUses(MI, Defs, Uses, TRI); + + // Insert an IT instruction. + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT)) + .addImm(CC); + + // Add implicit use of ITSTATE to IT block instructions. + MI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/, + true/*isImp*/, false/*isKill*/)); + + MachineInstr *LastITMI = MI; + MachineBasicBlock::iterator InsertPos = MIB.getInstr(); + ++MBBI; + + // Form IT block. + ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC); + unsigned Mask = 0, Pos = 3; + + // v8 IT blocks are limited to one conditional op unless -arm-no-restrict-it + // is set: skip the loop + if (!restrictIT) { + // Branches, including tricky ones like LDM_RET, need to end an IT + // block so check the instruction we just put in the block. + for (; MBBI != E && Pos && + (!MI->isBranch() && !MI->isReturn()) ; ++MBBI) { + if (MBBI->isDebugValue()) + continue; + + MachineInstr *NMI = &*MBBI; + MI = NMI; + + unsigned NPredReg = 0; + ARMCC::CondCodes NCC = getITInstrPredicate(NMI, NPredReg); + if (NCC == CC || NCC == OCC) { + Mask |= (NCC & 1) << Pos; + // Add implicit use of ITSTATE. + NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/, + true/*isImp*/, false/*isKill*/)); + LastITMI = NMI; + } else { + if (NCC == ARMCC::AL && + MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) { + --MBBI; + MBB.remove(NMI); + MBB.insert(InsertPos, NMI); + ClearKillFlags(MI, Uses); + ++NumMovedInsts; + continue; + } + break; + } + TrackDefUses(NMI, Defs, Uses, TRI); + --Pos; + } + } + + // Finalize IT mask. + Mask |= (1 << Pos); + // Tag along (firstcond[0] << 4) with the mask. 
+ Mask |= (CC & 1) << 4; + MIB.addImm(Mask); + + // Last instruction in IT block kills ITSTATE. + LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill(); + + // Finalize the bundle. + finalizeBundle(MBB, InsertPos.getInstrIterator(), + ++LastITMI->getIterator()); + + Modified = true; + ++NumITs; + } + + return Modified; +} + +bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) { + const ARMSubtarget &STI = + static_cast<const ARMSubtarget &>(Fn.getSubtarget()); + if (!STI.isThumb2()) + return false; + AFI = Fn.getInfo<ARMFunctionInfo>(); + TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); + TRI = STI.getRegisterInfo(); + restrictIT = STI.restrictIT(); + + if (!AFI->isThumbFunction()) + return false; + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ) { + MachineBasicBlock &MBB = *MFI; + ++MFI; + Modified |= InsertITInstructions(MBB); + } + + if (Modified) + AFI->setHasITBlocks(true); + + return Modified; +} + +/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks +/// insertion pass. +FunctionPass *llvm::createThumb2ITBlockPass() { + return new Thumb2ITBlockPass(); +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp new file mode 100644 index 0000000..4da769f --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -0,0 +1,636 @@ +//===-- Thumb2InstrInfo.cpp - Thumb-2 Instruction Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-2 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "Thumb2InstrInfo.h" +#include "ARMConstantPoolValue.h" +#include "ARMMachineFunctionInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +static cl::opt<bool> +OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden, + cl::desc("Use old-style Thumb2 if-conversion heuristics"), + cl::init(false)); + +Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI), RI() {} + +/// getNoopForMachoTarget - Return the noop instruction to use for a noop. +void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + NopInst.setOpcode(ARM::tHINT); + NopInst.addOperand(MCOperand::createImm(0)); + NopInst.addOperand(MCOperand::createImm(ARMCC::AL)); + NopInst.addOperand(MCOperand::createReg(0)); +} + +unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const { + // FIXME + return 0; +} + +void +Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, + MachineBasicBlock *NewDest) const { + MachineBasicBlock *MBB = Tail->getParent(); + ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>(); + if (!AFI->hasITBlocks()) { + TargetInstrInfo::ReplaceTailWithBranchTo(Tail, NewDest); + return; + } + + // If the first instruction of Tail is predicated, we may have to update + // the IT instruction. 
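// The IT-block construction above and the fixup just below share one
// invariant: in the t2IT mask operand, bit 4 mirrors firstcond[0], the bits
// under it hold one then/else bit per extra predicated instruction, and a
// final 1-bit terminates the pattern. A self-contained sketch of both
// computations (our names; illustrative only, not code from the patch):

#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors the mask construction in InsertITInstructions: CC is the block's
// first condition; ExtraCCBits holds, per later instruction, the low bit of
// its condition (equal to CC's low bit for "then", flipped for "else").
uint32_t computeITMask(uint32_t CC, const std::vector<uint32_t> &ExtraCCBits) {
  uint32_t Mask = 0, Pos = 3;
  for (uint32_t Bit : ExtraCCBits) // at most three extra instructions
    Mask |= (Bit & 1) << Pos--;
  Mask |= 1u << Pos;               // terminator encodes the block length
  Mask |= (CC & 1) << 4;           // tag along firstcond[0]
  return Mask;
}

// Mirrors the update in ReplaceTailWithBranchTo below: drop the trailing
// slots of a shortened block by moving the terminator up to bit Count.
uint32_t shrinkITMask(uint32_t Mask, unsigned Count) {
  uint32_t MaskOn = 1u << Count;
  uint32_t MaskOff = ~(MaskOn - 1);
  return (Mask & MaskOff) | MaskOn;
}

int main() {
  // "itte eq": three instructions predicated eq, eq, ne (ARMCC: EQ=0, NE=1).
  uint32_t Mask = computeITMask(/*CC=*/0, {0, 1});
  assert(Mask == 0x6);
  // Shrink to a single-instruction block: only the terminator at bit 3.
  assert(shrinkITMask(Mask, 3) == 0x8);
  return 0;
}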
+ unsigned PredReg = 0; + ARMCC::CondCodes CC = getInstrPredicate(Tail, PredReg); + MachineBasicBlock::iterator MBBI = Tail; + if (CC != ARMCC::AL) + // Expecting at least the t2IT instruction before it. + --MBBI; + + // Actually replace the tail. + TargetInstrInfo::ReplaceTailWithBranchTo(Tail, NewDest); + + // Fix up IT. + if (CC != ARMCC::AL) { + MachineBasicBlock::iterator E = MBB->begin(); + unsigned Count = 4; // At most 4 instructions in an IT block. + while (Count && MBBI != E) { + if (MBBI->isDebugValue()) { + --MBBI; + continue; + } + if (MBBI->getOpcode() == ARM::t2IT) { + unsigned Mask = MBBI->getOperand(1).getImm(); + if (Count == 4) + MBBI->eraseFromParent(); + else { + unsigned MaskOn = 1 << Count; + unsigned MaskOff = ~(MaskOn - 1); + MBBI->getOperand(1).setImm((Mask & MaskOff) | MaskOn); + } + return; + } + --MBBI; + --Count; + } + + // Ctrl flow can reach here if branch folding is run before IT block + // formation pass. + } +} + +bool +Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const { + while (MBBI->isDebugValue()) { + ++MBBI; + if (MBBI == MBB.end()) + return false; + } + + unsigned PredReg = 0; + return getITInstrPredicate(MBBI, PredReg) == ARMCC::AL; +} + +void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + // Handle SPR, DPR, and QPR copies. + if (!ARM::GPRRegClass.contains(DestReg, SrcReg)) + return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc); + + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc))); +} + +void Thumb2InstrInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + + if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || + RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || + RC == &ARM::GPRnopcRegClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + return; + } + + if (ARM::GPRPairRegClass.hasSubClassEq(RC)) { + // Thumb2 STRD expects its dest-registers to be in rGPR. Not a problem for + // gsub_0, but needs an extra constraint for gsub_1 (which could be sp + // otherwise). 
+ MachineRegisterInfo *MRI = &MF.getRegInfo(); + MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass); + + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8)); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO); + AddDefaultPred(MIB); + return; + } + + ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI); +} + +void Thumb2InstrInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || + RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || + RC == &ARM::GPRnopcRegClass) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + return; + } + + if (ARM::GPRPairRegClass.hasSubClassEq(RC)) { + // Thumb2 LDRD expects its dest-registers to be in rGPR. Not a problem for + // gsub_0, but needs an extra constraint for gsub_1 (which could be sp + // otherwise). + MachineRegisterInfo *MRI = &MF.getRegInfo(); + MRI->constrainRegClass(DestReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass); + + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8)); + AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); + AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO); + AddDefaultPred(MIB); + + if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + MIB.addReg(DestReg, RegState::ImplicitDefine); + return; + } + + ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI); +} + +void +Thumb2InstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI, + Reloc::Model RM) const { + if (RM == Reloc::PIC_) + expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12, RM); + else + expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12, RM); +} + +void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const ARMBaseInstrInfo &TII, unsigned MIFlags) { + if (NumBytes == 0 && DestReg != BaseReg) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg) + .addReg(BaseReg, RegState::Kill) + .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags); + return; + } + + bool isSub = NumBytes < 0; + if (isSub) NumBytes = -NumBytes; + + // If profitable, use a movw or movt to materialize the offset. + // FIXME: Use the scavenger to grab a scratch register. + if (DestReg != ARM::SP && DestReg != BaseReg && + NumBytes >= 4096 && + ARM_AM::getT2SOImmVal(NumBytes) == -1) { + bool Fits = false; + if (NumBytes < 65536) { + // Use a movw to materialize the 16-bit constant. + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg) + .addImm(NumBytes) + .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags); + Fits = true; + } else if ((NumBytes & 0xffff) == 0) { + // Use a movt to materialize the 32-bit constant. 
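// Taken together, the movw branch above and the movt branch just below pick
// the cheapest materialization for a large offset that is not a Thumb-2
// modified immediate; anything else falls through to the chunked add/sub loop
// further down. A compact restatement (our names; a sketch that assumes the
// guards already checked above: DestReg is neither SP nor BaseReg, and
// NumBytes >= 4096):

enum class T2OffsetMat { ModifiedImm, MovW, MovT, PeelLoop };

T2OffsetMat classifyT2Offset(unsigned NumBytes, bool IsModifiedImm) {
  if (IsModifiedImm)
    return T2OffsetMat::ModifiedImm; // single add/sub with a so_imm
  if (NumBytes < 65536)
    return T2OffsetMat::MovW;        // movw tmp, #NumBytes
  if ((NumBytes & 0xffff) == 0)
    return T2OffsetMat::MovT;        // movt tmp, #(NumBytes >> 16)
  return T2OffsetMat::PeelLoop;      // split into so_imm chunks
}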
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg)
+        .addReg(DestReg)
+        .addImm(NumBytes >> 16)
+        .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+      Fits = true;
+    }
+
+    if (Fits) {
+      if (isSub) {
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg)
+          .addReg(BaseReg)
+          .addReg(DestReg, RegState::Kill)
+          .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+          .setMIFlags(MIFlags);
+      } else {
+        // Here we know that DestReg is not SP but we do not
+        // know anything about BaseReg. t2ADDrr is an invalid
+        // instruction if SP is used as the second argument, but
+        // is fine if SP is the first argument. To be sure we
+        // do not generate an invalid encoding, put BaseReg first.
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg)
+          .addReg(BaseReg)
+          .addReg(DestReg, RegState::Kill)
+          .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+          .setMIFlags(MIFlags);
+      }
+      return;
+    }
+  }
+
+  while (NumBytes) {
+    unsigned ThisVal = NumBytes;
+    unsigned Opc = 0;
+    if (DestReg == ARM::SP && BaseReg != ARM::SP) {
+      // mov sp, rn. Note t2MOVr cannot be used.
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+                         .addReg(BaseReg).setMIFlags(MIFlags));
+      BaseReg = ARM::SP;
+      continue;
+    }
+
+    bool HasCCOut = true;
+    if (BaseReg == ARM::SP) {
+      // sub sp, sp, #imm7
+      if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
+        assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
+        Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+                           .addReg(BaseReg).addImm(ThisVal/4)
+                           .setMIFlags(MIFlags));
+        NumBytes = 0;
+        continue;
+      }
+
+      // sub rd, sp, so_imm
+      Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+      if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+        NumBytes = 0;
+      } else {
+        // FIXME: Move this to ARMAddressingModes.h?
+        unsigned RotAmt = countLeadingZeros(ThisVal);
+        ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+        NumBytes &= ~ThisVal;
+        assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+               "Bit extraction didn't work?");
+      }
+    } else {
+      assert(DestReg != ARM::SP && BaseReg != ARM::SP);
+      Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+      if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+        NumBytes = 0;
+      } else if (ThisVal < 4096) {
+        Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+        HasCCOut = false;
+        NumBytes = 0;
+      } else {
+        // FIXME: Move this to ARMAddressingModes.h?
+        unsigned RotAmt = countLeadingZeros(ThisVal);
+        ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+        NumBytes &= ~ThisVal;
+        assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+               "Bit extraction didn't work?");
+      }
+    }
+
+    // Build the new ADD / SUB.
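// Both "bit extraction" branches above peel the most significant run of up
// to eight bits (aligned by rotation) off the remaining offset, so every
// add/sub emitted by this loop carries a valid Thumb-2 modified immediate.
// A standalone sketch of one peel step, with a portable stand-in for
// countLeadingZeros (illustrative only, not code from the patch):

#include <cassert>
#include <cstdint>

static uint32_t rotr32(uint32_t V, unsigned Amt) {
  Amt &= 31;
  return Amt == 0 ? V : (V >> Amt) | (V << (32 - Amt));
}

static unsigned clz32(uint32_t V) { // precondition: V != 0
  unsigned N = 0;
  for (uint32_t Bit = 0x80000000U; !(V & Bit); Bit >>= 1)
    ++N;
  return N;
}

// Returns the next encodable chunk and clears it from Remaining.
static uint32_t peelT2SOImmChunk(uint32_t &Remaining) {
  unsigned RotAmt = clz32(Remaining);
  uint32_t Chunk = Remaining & rotr32(0xff000000U, RotAmt);
  Remaining &= ~Chunk;
  return Chunk;
}

int main() {
  uint32_t Remaining = 0xF00F0; // not a modified immediate as a whole
  assert(peelT2SOImmChunk(Remaining) == 0xF0000 && Remaining == 0xF0);
  assert(peelT2SOImmChunk(Remaining) == 0xF0 && Remaining == 0);
  return 0;
}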
+ MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) + .addReg(BaseReg, RegState::Kill) + .addImm(ThisVal)).setMIFlags(MIFlags); + if (HasCCOut) + AddDefaultCC(MIB); + + BaseReg = DestReg; + } +} + +static unsigned +negativeOffsetOpcode(unsigned opcode) +{ + switch (opcode) { + case ARM::t2LDRi12: return ARM::t2LDRi8; + case ARM::t2LDRHi12: return ARM::t2LDRHi8; + case ARM::t2LDRBi12: return ARM::t2LDRBi8; + case ARM::t2LDRSHi12: return ARM::t2LDRSHi8; + case ARM::t2LDRSBi12: return ARM::t2LDRSBi8; + case ARM::t2STRi12: return ARM::t2STRi8; + case ARM::t2STRBi12: return ARM::t2STRBi8; + case ARM::t2STRHi12: return ARM::t2STRHi8; + case ARM::t2PLDi12: return ARM::t2PLDi8; + + case ARM::t2LDRi8: + case ARM::t2LDRHi8: + case ARM::t2LDRBi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRSBi8: + case ARM::t2STRi8: + case ARM::t2STRBi8: + case ARM::t2STRHi8: + case ARM::t2PLDi8: + return opcode; + + default: + break; + } + + return 0; +} + +static unsigned +positiveOffsetOpcode(unsigned opcode) +{ + switch (opcode) { + case ARM::t2LDRi8: return ARM::t2LDRi12; + case ARM::t2LDRHi8: return ARM::t2LDRHi12; + case ARM::t2LDRBi8: return ARM::t2LDRBi12; + case ARM::t2LDRSHi8: return ARM::t2LDRSHi12; + case ARM::t2LDRSBi8: return ARM::t2LDRSBi12; + case ARM::t2STRi8: return ARM::t2STRi12; + case ARM::t2STRBi8: return ARM::t2STRBi12; + case ARM::t2STRHi8: return ARM::t2STRHi12; + case ARM::t2PLDi8: return ARM::t2PLDi12; + + case ARM::t2LDRi12: + case ARM::t2LDRHi12: + case ARM::t2LDRBi12: + case ARM::t2LDRSHi12: + case ARM::t2LDRSBi12: + case ARM::t2STRi12: + case ARM::t2STRBi12: + case ARM::t2STRHi12: + case ARM::t2PLDi12: + return opcode; + + default: + break; + } + + return 0; +} + +static unsigned +immediateOffsetOpcode(unsigned opcode) +{ + switch (opcode) { + case ARM::t2LDRs: return ARM::t2LDRi12; + case ARM::t2LDRHs: return ARM::t2LDRHi12; + case ARM::t2LDRBs: return ARM::t2LDRBi12; + case ARM::t2LDRSHs: return ARM::t2LDRSHi12; + case ARM::t2LDRSBs: return ARM::t2LDRSBi12; + case ARM::t2STRs: return ARM::t2STRi12; + case ARM::t2STRBs: return ARM::t2STRBi12; + case ARM::t2STRHs: return ARM::t2STRHi12; + case ARM::t2PLDs: return ARM::t2PLDi12; + + case ARM::t2LDRi12: + case ARM::t2LDRHi12: + case ARM::t2LDRBi12: + case ARM::t2LDRSHi12: + case ARM::t2LDRSBi12: + case ARM::t2STRi12: + case ARM::t2STRBi12: + case ARM::t2STRHi12: + case ARM::t2PLDi12: + case ARM::t2LDRi8: + case ARM::t2LDRHi8: + case ARM::t2LDRBi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRSBi8: + case ARM::t2STRi8: + case ARM::t2STRBi8: + case ARM::t2STRHi8: + case ARM::t2PLDi8: + return opcode; + + default: + break; + } + + return 0; +} + +bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) { + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MI.getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + bool isSub = false; + + // Memory operands in inline assembly always use AddrModeT2_i12. + if (Opcode == ARM::INLINEASM) + AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2? + + if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) { + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + + unsigned PredReg; + if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) { + // Turn it into a move. + MI.setDesc(TII.get(ARM::tMOVr)); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + // Remove offset and remaining explicit predicate operands. 
+ do MI.RemoveOperand(FrameRegIdx+1); + while (MI.getNumOperands() > FrameRegIdx+1); + MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI); + AddDefaultPred(MIB); + return true; + } + + bool HasCCOut = Opcode != ARM::t2ADDri12; + + if (Offset < 0) { + Offset = -Offset; + isSub = true; + MI.setDesc(TII.get(ARM::t2SUBri)); + } else { + MI.setDesc(TII.get(ARM::t2ADDri)); + } + + // Common case: small offset, fits into instruction. + if (ARM_AM::getT2SOImmVal(Offset) != -1) { + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); + // Add cc_out operand if the original instruction did not have one. + if (!HasCCOut) + MI.addOperand(MachineOperand::CreateReg(0, false)); + Offset = 0; + return true; + } + // Another common case: imm12. + if (Offset < 4096 && + (!HasCCOut || MI.getOperand(MI.getNumOperands()-1).getReg() == 0)) { + unsigned NewOpc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12; + MI.setDesc(TII.get(NewOpc)); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); + // Remove the cc_out operand. + if (HasCCOut) + MI.RemoveOperand(MI.getNumOperands()-1); + Offset = 0; + return true; + } + + // Otherwise, extract 8 adjacent bits from the immediate into this + // t2ADDri/t2SUBri. + unsigned RotAmt = countLeadingZeros<unsigned>(Offset); + unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt); + + // We will handle these bits from offset, clear them. + Offset &= ~ThisImmVal; + + assert(ARM_AM::getT2SOImmVal(ThisImmVal) != -1 && + "Bit extraction didn't work?"); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal); + // Add cc_out operand if the original instruction did not have one. + if (!HasCCOut) + MI.addOperand(MachineOperand::CreateReg(0, false)); + + } else { + + // AddrMode4 and AddrMode6 cannot handle any offset. + if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6) + return false; + + // AddrModeT2_so cannot handle any offset. If there is no offset + // register then we change to an immediate version. + unsigned NewOpc = Opcode; + if (AddrMode == ARMII::AddrModeT2_so) { + unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg(); + if (OffsetReg != 0) { + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + return Offset == 0; + } + + MI.RemoveOperand(FrameRegIdx+1); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(0); + NewOpc = immediateOffsetOpcode(Opcode); + AddrMode = ARMII::AddrModeT2_i12; + } + + unsigned NumBits = 0; + unsigned Scale = 1; + if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) { + // i8 supports only negative, and i12 supports only positive, so + // based on Offset sign convert Opcode to the appropriate + // instruction + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + if (Offset < 0) { + NewOpc = negativeOffsetOpcode(Opcode); + NumBits = 8; + isSub = true; + Offset = -Offset; + } else { + NewOpc = positiveOffsetOpcode(Opcode); + NumBits = 12; + } + } else if (AddrMode == ARMII::AddrMode5) { + // VFP address mode. 
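+ // (AddrMode5 packs an add/sub flag and an 8-bit word count into a single
+ // immediate operand, so e.g. a "#-12" offset arrives here as (sub, 3) and
+ // contributes InstrOffs * 4 = -12 bytes to Offset.)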
+ const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1);
+ int InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm());
+ if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub)
+ InstrOffs *= -1;
+ NumBits = 8;
+ Scale = 4;
+ Offset += InstrOffs * 4;
+ assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+ if (Offset < 0) {
+ Offset = -Offset;
+ isSub = true;
+ }
+ } else if (AddrMode == ARMII::AddrModeT2_i8s4) {
+ Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
+ NumBits = 10; // 8 bits scaled by 4
+ // The MCInst operand expects an already-scaled value.
+ Scale = 1;
+ assert((Offset & 3) == 0 && "Can't encode this offset!");
+ } else {
+ llvm_unreachable("Unsupported addressing mode!");
+ }
+
+ if (NewOpc != Opcode)
+ MI.setDesc(TII.get(NewOpc));
+
+ MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1);
+
+ // Attempt to fold the address computation.
+ // Common case: small offset, fits into instruction.
+ int ImmedOffset = Offset / Scale;
+ unsigned Mask = (1 << NumBits) - 1;
+ if ((unsigned)Offset <= Mask * Scale) {
+ // Replace the FrameIndex with fp/sp.
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ if (isSub) {
+ if (AddrMode == ARMII::AddrMode5)
+ // FIXME: Not consistent.
+ ImmedOffset |= 1 << NumBits;
+ else
+ ImmedOffset = -ImmedOffset;
+ }
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset = 0;
+ return true;
+ }
+
+ // Otherwise, the offset doesn't fit. Pull in what we can to simplify it.
+ ImmedOffset = ImmedOffset & Mask;
+ if (isSub) {
+ if (AddrMode == ARMII::AddrMode5)
+ // FIXME: Not consistent.
+ ImmedOffset |= 1 << NumBits;
+ else {
+ ImmedOffset = -ImmedOffset;
+ if (ImmedOffset == 0)
+ // Change the opcode back if the encoded offset is zero.
+ MI.setDesc(TII.get(positiveOffsetOpcode(NewOpc)));
+ }
+ }
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset &= ~(Mask*Scale);
+ }
+
+ Offset = (isSub) ? -Offset : Offset;
+ return Offset == 0;
+}
+
+ARMCC::CondCodes
+llvm::getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc == ARM::tBcc || Opc == ARM::t2Bcc)
+ return ARMCC::AL;
+ return getInstrPredicate(MI, PredReg);
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
new file mode 100644
index 0000000..916ab06
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -0,0 +1,78 @@
+//===-- Thumb2InstrInfo.h - Thumb-2 Instruction Information -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_THUMB2INSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_THUMB2INSTRINFO_H
+
+#include "ARMBaseInstrInfo.h"
+#include "ThumbRegisterInfo.h"
+
+namespace llvm {
+class ARMSubtarget;
+class ScheduleHazardRecognizer;
+
+class Thumb2InstrInfo : public ARMBaseInstrInfo {
+ ThumbRegisterInfo RI;
+public:
+ explicit Thumb2InstrInfo(const ARMSubtarget &STI);
+
+ /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+ void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+ // Return the non-pre/post incrementing version of 'Opc'. Return 0
+ // if there is no such opcode.
+ unsigned getUnindexedOpcode(unsigned Opc) const override; + + void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, + MachineBasicBlock *NewDest) const override; + + bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const override; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + +private: + void expandLoadStackGuard(MachineBasicBlock::iterator MI, + Reloc::Model RM) const override; +}; + +/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical +/// to llvm::getInstrPredicate except it returns AL for conditional branch +/// instructions which are "predicated", but are not in IT blocks. +ARMCC::CondCodes getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg); + + +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp new file mode 100644 index 0000000..bcd0e57 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -0,0 +1,1054 @@ +//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "Thumb2InstrInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/IR/Function.h" // To access Function attributes +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; + +#define DEBUG_TYPE "t2-reduce-size" + +STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones"); +STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones"); +STATISTIC(NumLdSts, "Number of 32-bit load / store reduced to 16-bit ones"); + +static cl::opt<int> ReduceLimit("t2-reduce-limit", + cl::init(-1), cl::Hidden); +static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2", + cl::init(-1), cl::Hidden); +static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3", + cl::init(-1), cl::Hidden); + +namespace { + /// ReduceTable - A static table with information on mapping from wide + /// opcodes to narrow + struct ReduceEntry { + uint16_t WideOpc; // Wide opcode + uint16_t NarrowOpc1; // Narrow opcode to transform to + uint16_t NarrowOpc2; // Narrow opcode when it's two-address + uint8_t Imm1Limit; // Limit of immediate field (bits) + uint8_t Imm2Limit; // Limit of immediate field when it's two-address + unsigned LowRegs1 : 1; // Only possible if low-registers are used + unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr) + unsigned PredCC1 : 2; // 0 - If predicated, cc is on and vice versa. + // 1 - No cc field. + // 2 - Always set CPSR. + unsigned PredCC2 : 2; + unsigned PartFlag : 1; // 16-bit instruction does partial flag update + unsigned Special : 1; // Needs to be dealt with specially + unsigned AvoidMovs: 1; // Avoid movs with shifter operand (for Swift) + }; + + static const ReduceEntry ReduceTable[] = { + // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C,PF,S,AM + { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0,0,0 }, + { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0,1,0 }, + { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0,0,0 }, + { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 0,1,0 }, + { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 0,1,0 }, + { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 1,0,0 }, + { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 1,0,1 }, + { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 1,0,1 }, + { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 1,0,0 }, + //FIXME: Disable CMN, as CCodes are backwards from compare expectations + //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0,0,0 }, + { ARM::t2CMNzrr, ARM::tCMNz, 0, 0, 0, 1, 0, 2,0, 0,0,0 }, + { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0,0,0 }, + { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0,1,0 }, + { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 1,0,0 }, + // FIXME: adr.n immediate offset must be multiple of 4. 
+ //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0,0,0 }, + { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 1,0,1 }, + { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 1,0,1 }, + { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 1,0,1 }, + { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 1,0,1 }, + { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1,0,0 }, + { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1,1,0 }, + // FIXME: Do we need the 16-bit 'S' variant? + { ARM::t2MOVr,ARM::tMOVr, 0, 0, 0, 0, 0, 1,0, 0,0,0 }, + { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 1,0,0 }, + { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0,0,0 }, + { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 1,0,0 }, + { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0,0,0 }, + { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0,0,0 }, + { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0,0,0 }, + { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 1,0,0 }, + { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 0,1,0 }, + { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0,0,0 }, + { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0,0,0 }, + { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0,0,0 }, + { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0,0,0 }, + { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0,0,0 }, + { ARM::t2SXTB, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,1,0 }, + { ARM::t2SXTH, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,1,0 }, + { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0,0,0 }, + { ARM::t2UXTB, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,1,0 }, + { ARM::t2UXTH, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,1,0 }, + + // FIXME: Clean this up after splitting each Thumb load / store opcode + // into multiple ones. + { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 0,1,0 }, + { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 0,1,0 }, + { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 0,1,0 }, + { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, + + { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, + { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 }, + { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 }, + // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent. + // tSTMIA_UPD is a change in semantics which can only be used if the base + // register is killed. This difference is correctly handled elsewhere. 
+ { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, + { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, + { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 } + }; + + class Thumb2SizeReduce : public MachineFunctionPass { + public: + static char ID; + Thumb2SizeReduce(std::function<bool(const Function &)> Ftor); + + const Thumb2InstrInfo *TII; + const ARMSubtarget *STI; + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "Thumb2 instruction size reduction pass"; + } + + private: + /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable. + DenseMap<unsigned, unsigned> ReduceOpcodeMap; + + bool canAddPseudoFlagDep(MachineInstr *Use, bool IsSelfLoop); + + bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, + bool is2Addr, ARMCC::CondCodes Pred, + bool LiveCPSR, bool &HasCC, bool &CCDead); + + bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry); + + bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, bool LiveCPSR, bool IsSelfLoop); + + /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address + /// instruction. + bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, bool LiveCPSR, + bool IsSelfLoop); + + /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit + /// non-two-address instruction. + bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, bool LiveCPSR, + bool IsSelfLoop); + + /// ReduceMI - Attempt to reduce MI, return true on success. + bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, + bool LiveCPSR, bool IsSelfLoop); + + /// ReduceMBB - Reduce width of instructions in the specified basic block. + bool ReduceMBB(MachineBasicBlock &MBB); + + bool OptimizeSize; + bool MinimizeSize; + + // Last instruction to define CPSR in the current block. + MachineInstr *CPSRDef; + // Was CPSR last defined by a high latency instruction? + // When CPSRDef is null, this refers to CPSR defs in predecessors. + bool HighLatencyCPSR; + + struct MBBInfo { + // The flags leaving this block have high latency. + bool HighLatencyCPSR; + // Has this block been visited yet? + bool Visited; + + MBBInfo() : HighLatencyCPSR(false), Visited(false) {} + }; + + SmallVector<MBBInfo, 8> BlockInfo; + + std::function<bool(const Function &)> PredicateFtor; + }; + char Thumb2SizeReduce::ID = 0; +} + +Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor) + : MachineFunctionPass(ID), PredicateFtor(Ftor) { + OptimizeSize = MinimizeSize = false; + for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) { + unsigned FromOpc = ReduceTable[i].WideOpc; + if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second) + llvm_unreachable("Duplicated entries?"); + } +} + +static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) { + for (const MCPhysReg *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) + if (*Regs == ARM::CPSR) + return true; + return false; +} + +// Check for a likely high-latency flag def. +static bool isHighLatencyCPSR(MachineInstr *Def) { + switch(Def->getOpcode()) { + case ARM::FMSTAT: + case ARM::tMUL: + return true; + } + return false; +} + +/// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations, +/// the 's' 16-bit instruction partially update CPSR. 
Abort the
+/// transformation to avoid adding a false dependency on the last CPSR-setting
+/// instruction, which hurts the out-of-order execution engine's ability to do
+/// register renaming.
+/// This function checks if there is a read-after-write dependency between the
+/// last instruction that defines the CPSR and the current instruction. If there
+/// is one, then there is no harm done, since the instruction cannot be retired
+/// before the CPSR-setting instruction anyway.
+/// Note that, for the sake of compile time, we are not doing full dependency
+/// analysis here. We're not looking for cases like:
+/// r0 = muls ...
+/// r1 = add.w r0, ...
+/// ...
+/// = mul.w r1
+/// In this case it would have been OK to narrow the mul.w to muls, since there
+/// is an indirect RAW dependency between the muls and the mul.w.
+bool
+Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
+ // Disable the check for -Oz (aka OptimizeForSizeHarder).
+ if (MinimizeSize || !STI->avoidCPSRPartialUpdate())
+ return false;
+
+ if (!CPSRDef)
+ // If this BB loops back to itself, conservatively avoid narrowing the
+ // first instruction that does partial flag update.
+ return HighLatencyCPSR || FirstInSelfLoop;
+
+ SmallSet<unsigned, 2> Defs;
+ for (const MachineOperand &MO : CPSRDef->operands()) {
+ if (!MO.isReg() || MO.isUndef() || MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0 || Reg == ARM::CPSR)
+ continue;
+ Defs.insert(Reg);
+ }
+
+ for (const MachineOperand &MO : Use->operands()) {
+ if (!MO.isReg() || MO.isUndef() || MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Defs.count(Reg))
+ return false;
+ }
+
+ // If the current CPSR has high latency, try to avoid the false dependency.
+ if (HighLatencyCPSR)
+ return true;
+
+ // tMOVi8 usually doesn't start long dependency chains, and there are a lot
+ // of them, so always shrink them when CPSR doesn't have high latency.
+ if (Use->getOpcode() == ARM::t2MOVi ||
+ Use->getOpcode() == ARM::t2MOVi16)
+ return false;
+
+ // No read-after-write dependency. The narrowing will add a false dependency.
+ return true;
+}
+
+bool
+Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+ bool is2Addr, ARMCC::CondCodes Pred,
+ bool LiveCPSR, bool &HasCC, bool &CCDead) {
+ if ((is2Addr && Entry.PredCC2 == 0) ||
+ (!is2Addr && Entry.PredCC1 == 0)) {
+ if (Pred == ARMCC::AL) {
+ // Not predicated, must set CPSR.
+ if (!HasCC) {
+ // Original instruction was not setting CPSR, but CPSR is not
+ // currently live anyway. It's ok to set it. The CPSR def is
+ // dead though.
+ if (!LiveCPSR) {
+ HasCC = true;
+ CCDead = true;
+ return true;
+ }
+ return false;
+ }
+ } else {
+ // Predicated, must not set CPSR.
+ if (HasCC)
+ return false;
+ }
+ } else if ((is2Addr && Entry.PredCC2 == 2) ||
+ (!is2Addr && Entry.PredCC1 == 2)) {
+ // Old opcode has an optional def of CPSR.
+ if (HasCC)
+ return true;
+ // If the old opcode does not implicitly define CPSR, then it's not ok,
+ // since these new opcodes' CPSR def is not meant to be thrown away,
+ // e.g. CMP.
+ if (!HasImplicitCPSRDef(MI->getDesc()))
+ return false;
+ HasCC = true;
+ } else {
+ // 16-bit instruction does not set CPSR.
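+ // (This is the PredCC == 1 case: the 16-bit form, e.g. tMOVr, has no cc
+ // field at all, so a wide instruction that sets CPSR cannot use it.)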
+ if (HasCC) + return false; + } + + return true; +} + +static bool VerifyLowRegs(MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + bool isPCOk = (Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA_UPD); + bool isLROk = (Opc == ARM::t2STMDB_UPD); + bool isSPOk = isPCOk || isLROk; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || MO.isImplicit()) + continue; + unsigned Reg = MO.getReg(); + if (Reg == 0 || Reg == ARM::CPSR) + continue; + if (isPCOk && Reg == ARM::PC) + continue; + if (isLROk && Reg == ARM::LR) + continue; + if (Reg == ARM::SP) { + if (isSPOk) + continue; + if (i == 1 && (Opc == ARM::t2LDRi12 || Opc == ARM::t2STRi12)) + // Special case for these ldr / str with sp as base register. + continue; + } + if (!isARMLowRegister(Reg)) + return false; + } + return true; +} + +bool +Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry) { + if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt)) + return false; + + unsigned Scale = 1; + bool HasImmOffset = false; + bool HasShift = false; + bool HasOffReg = true; + bool isLdStMul = false; + unsigned Opc = Entry.NarrowOpc1; + unsigned OpNum = 3; // First 'rest' of operands. + uint8_t ImmLimit = Entry.Imm1Limit; + + switch (Entry.WideOpc) { + default: + llvm_unreachable("Unexpected Thumb2 load / store opcode!"); + case ARM::t2LDRi12: + case ARM::t2STRi12: + if (MI->getOperand(1).getReg() == ARM::SP) { + Opc = Entry.NarrowOpc2; + ImmLimit = Entry.Imm2Limit; + } + + Scale = 4; + HasImmOffset = true; + HasOffReg = false; + break; + case ARM::t2LDRBi12: + case ARM::t2STRBi12: + HasImmOffset = true; + HasOffReg = false; + break; + case ARM::t2LDRHi12: + case ARM::t2STRHi12: + Scale = 2; + HasImmOffset = true; + HasOffReg = false; + break; + case ARM::t2LDRs: + case ARM::t2LDRBs: + case ARM::t2LDRHs: + case ARM::t2LDRSBs: + case ARM::t2LDRSHs: + case ARM::t2STRs: + case ARM::t2STRBs: + case ARM::t2STRHs: + HasShift = true; + OpNum = 4; + break; + case ARM::t2LDMIA: { + unsigned BaseReg = MI->getOperand(0).getReg(); + assert(isARMLowRegister(BaseReg)); + + // For the non-writeback version (this one), the base register must be + // one of the registers being loaded. + bool isOK = false; + for (unsigned i = 3; i < MI->getNumOperands(); ++i) { + if (MI->getOperand(i).getReg() == BaseReg) { + isOK = true; + break; + } + } + + if (!isOK) + return false; + + OpNum = 0; + isLdStMul = true; + break; + } + case ARM::t2STMIA: { + // If the base register is killed, we don't care what its value is after the + // instruction, so we can use an updating STMIA. 
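+ // (e.g. "stm r0, {r1, r2}" with r0 killed becomes "stm r0!, {r1, r2}";
+ // the writeback to r0 is harmless because nothing reads r0 afterwards.)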
+ if (!MI->getOperand(0).isKill()) + return false; + + break; + } + case ARM::t2LDMIA_RET: { + unsigned BaseReg = MI->getOperand(1).getReg(); + if (BaseReg != ARM::SP) + return false; + Opc = Entry.NarrowOpc2; // tPOP_RET + OpNum = 2; + isLdStMul = true; + break; + } + case ARM::t2LDMIA_UPD: + case ARM::t2STMIA_UPD: + case ARM::t2STMDB_UPD: { + OpNum = 0; + + unsigned BaseReg = MI->getOperand(1).getReg(); + if (BaseReg == ARM::SP && + (Entry.WideOpc == ARM::t2LDMIA_UPD || + Entry.WideOpc == ARM::t2STMDB_UPD)) { + Opc = Entry.NarrowOpc2; // tPOP or tPUSH + OpNum = 2; + } else if (!isARMLowRegister(BaseReg) || + (Entry.WideOpc != ARM::t2LDMIA_UPD && + Entry.WideOpc != ARM::t2STMIA_UPD)) { + return false; + } + + isLdStMul = true; + break; + } + } + + unsigned OffsetReg = 0; + bool OffsetKill = false; + bool OffsetInternal = false; + if (HasShift) { + OffsetReg = MI->getOperand(2).getReg(); + OffsetKill = MI->getOperand(2).isKill(); + OffsetInternal = MI->getOperand(2).isInternalRead(); + + if (MI->getOperand(3).getImm()) + // Thumb1 addressing mode doesn't support shift. + return false; + } + + unsigned OffsetImm = 0; + if (HasImmOffset) { + OffsetImm = MI->getOperand(2).getImm(); + unsigned MaxOffset = ((1 << ImmLimit) - 1) * Scale; + + if ((OffsetImm & (Scale - 1)) || OffsetImm > MaxOffset) + // Make sure the immediate field fits. + return false; + } + + // Add the 16-bit load / store instruction. + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc)); + + // tSTMIA_UPD takes a defining register operand. We've already checked that + // the register is killed, so mark it as dead here. + if (Entry.WideOpc == ARM::t2STMIA) + MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead); + + if (!isLdStMul) { + MIB.addOperand(MI->getOperand(0)); + MIB.addOperand(MI->getOperand(1)); + + if (HasImmOffset) + MIB.addImm(OffsetImm / Scale); + + assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!"); + + if (HasOffReg) + MIB.addReg(OffsetReg, getKillRegState(OffsetKill) | + getInternalReadRegState(OffsetInternal)); + } + + // Transfer the rest of operands. + for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum) + MIB.addOperand(MI->getOperand(OpNum)); + + // Transfer memoperands. + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + // Transfer MI flags. + MIB.setMIFlags(MI->getFlags()); + + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); + + MBB.erase_instr(MI); + ++NumLdSts; + return true; +} + +bool +Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR, bool IsSelfLoop) { + unsigned Opc = MI->getOpcode(); + if (Opc == ARM::t2ADDri) { + // If the source register is SP, try to reduce to tADDrSPi, otherwise + // it's a normal reduce. + if (MI->getOperand(1).getReg() != ARM::SP) { + if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop)) + return true; + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); + } + // Try to reduce to tADDrSPi. + unsigned Imm = MI->getOperand(2).getImm(); + // The immediate must be in range, the destination register must be a low + // reg, the predicate must be "always" and the condition flags must not + // be being set. 
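+ // (tADDrSPi encodes an 8-bit immediate scaled by 4, so the largest
+ // encodable offset is 255 * 4 = 1020: "add r1, sp, #508" narrows, while
+ // "add r1, sp, #1024" or any offset that isn't a multiple of 4 does not.)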
+ if (Imm & 3 || Imm > 1020) + return false; + if (!isARMLowRegister(MI->getOperand(0).getReg())) + return false; + if (MI->getOperand(3).getImm() != ARMCC::AL) + return false; + const MCInstrDesc &MCID = MI->getDesc(); + if (MCID.hasOptionalDef() && + MI->getOperand(MCID.getNumOperands()-1).getReg() == ARM::CPSR) + return false; + + MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), + TII->get(ARM::tADDrSPi)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(Imm / 4); // The tADDrSPi has an implied scale by four. + AddDefaultPred(MIB); + + // Transfer MI flags. + MIB.setMIFlags(MI->getFlags()); + + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " <<*MIB); + + MBB.erase_instr(MI); + ++NumNarrows; + return true; + } + + if (Entry.LowRegs1 && !VerifyLowRegs(MI)) + return false; + + if (MI->mayLoadOrStore()) + return ReduceLoadStore(MBB, MI, Entry); + + switch (Opc) { + default: break; + case ARM::t2ADDSri: + case ARM::t2ADDSrr: { + unsigned PredReg = 0; + if (getInstrPredicate(MI, PredReg) == ARMCC::AL) { + switch (Opc) { + default: break; + case ARM::t2ADDSri: { + if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop)) + return true; + // fallthrough + } + case ARM::t2ADDSrr: + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); + } + } + break; + } + case ARM::t2RSBri: + case ARM::t2RSBSri: + case ARM::t2SXTB: + case ARM::t2SXTH: + case ARM::t2UXTB: + case ARM::t2UXTH: + if (MI->getOperand(2).getImm() == 0) + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); + break; + case ARM::t2MOVi16: + // Can convert only 'pure' immediate operands, not immediates obtained as + // globals' addresses. + if (MI->getOperand(1).isImm()) + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); + break; + case ARM::t2CMPrr: { + // Try to reduce to the lo-reg only version first. Why there are two + // versions of the instruction is a mystery. + // It would be nice to just have two entries in the master table that + // are prioritized, but the table assumes a unique entry for each + // source insn opcode. So for now, we hack a local entry record to use. + static const ReduceEntry NarrowEntry = + { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1,0 }; + if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, IsSelfLoop)) + return true; + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); + } + } + return false; +} + +bool +Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR, bool IsSelfLoop) { + + if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr)) + return false; + + if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand()) + // Don't issue movs with shifter operand for some CPUs unless we + // are optimizing for size. + return false; + + unsigned Reg0 = MI->getOperand(0).getReg(); + unsigned Reg1 = MI->getOperand(1).getReg(); + // t2MUL is "special". The tied source operand is second, not first. + if (MI->getOpcode() == ARM::t2MUL) { + unsigned Reg2 = MI->getOperand(2).getReg(); + // Early exit if the regs aren't all low regs. + if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1) + || !isARMLowRegister(Reg2)) + return false; + if (Reg0 != Reg2) { + // If the other operand also isn't the same as the destination, we + // can't reduce. + if (Reg1 != Reg0) + return false; + // Try to commute the operands to make it a 2-address instruction. 
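+ // (tMUL ties the destination to its second source operand, so e.g.
+ // "mul r0, r0, r1" is commuted to "mul r0, r1, r0" before narrowing.)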
+ MachineInstr *CommutedMI = TII->commuteInstruction(MI); + if (!CommutedMI) + return false; + } + } else if (Reg0 != Reg1) { + // Try to commute the operands to make it a 2-address instruction. + unsigned CommOpIdx1 = 1; + unsigned CommOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex; + if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) || + MI->getOperand(CommOpIdx2).getReg() != Reg0) + return false; + MachineInstr *CommutedMI = + TII->commuteInstruction(MI, false, CommOpIdx1, CommOpIdx2); + if (!CommutedMI) + return false; + } + if (Entry.LowRegs2 && !isARMLowRegister(Reg0)) + return false; + if (Entry.Imm2Limit) { + unsigned Imm = MI->getOperand(2).getImm(); + unsigned Limit = (1 << Entry.Imm2Limit) - 1; + if (Imm > Limit) + return false; + } else { + unsigned Reg2 = MI->getOperand(2).getReg(); + if (Entry.LowRegs2 && !isARMLowRegister(Reg2)) + return false; + } + + // Check if it's possible / necessary to transfer the predicate. + const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); + bool SkipPred = false; + if (Pred != ARMCC::AL) { + if (!NewMCID.isPredicable()) + // Can't transfer predicate, fail. + return false; + } else { + SkipPred = !NewMCID.isPredicable(); + } + + bool HasCC = false; + bool CCDead = false; + const MCInstrDesc &MCID = MI->getDesc(); + if (MCID.hasOptionalDef()) { + unsigned NumOps = MCID.getNumOperands(); + HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR); + if (HasCC && MI->getOperand(NumOps-1).isDead()) + CCDead = true; + } + if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead)) + return false; + + // Avoid adding a false dependency on partial flag update by some 16-bit + // instructions which has the 's' bit set. + if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC && + canAddPseudoFlagDep(MI, IsSelfLoop)) + return false; + + // Add the 16-bit instruction. + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID); + MIB.addOperand(MI->getOperand(0)); + if (NewMCID.hasOptionalDef()) { + if (HasCC) + AddDefaultT1CC(MIB, CCDead); + else + AddNoT1CC(MIB); + } + + // Transfer the rest of operands. + unsigned NumOps = MCID.getNumOperands(); + for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { + if (i < NumOps && MCID.OpInfo[i].isOptionalDef()) + continue; + if (SkipPred && MCID.OpInfo[i].isPredicate()) + continue; + MIB.addOperand(MI->getOperand(i)); + } + + // Transfer MI flags. + MIB.setMIFlags(MI->getFlags()); + + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); + + MBB.erase_instr(MI); + ++Num2Addrs; + return true; +} + +bool +Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR, bool IsSelfLoop) { + if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit)) + return false; + + if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand()) + // Don't issue movs with shifter operand for some CPUs unless we + // are optimizing for size. 
+ return false; + + unsigned Limit = ~0U; + if (Entry.Imm1Limit) + Limit = (1 << Entry.Imm1Limit) - 1; + + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) { + if (MCID.OpInfo[i].isPredicate()) + continue; + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + if (!Reg || Reg == ARM::CPSR) + continue; + if (Entry.LowRegs1 && !isARMLowRegister(Reg)) + return false; + } else if (MO.isImm() && + !MCID.OpInfo[i].isPredicate()) { + if (((unsigned)MO.getImm()) > Limit) + return false; + } + } + + // Check if it's possible / necessary to transfer the predicate. + const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); + bool SkipPred = false; + if (Pred != ARMCC::AL) { + if (!NewMCID.isPredicable()) + // Can't transfer predicate, fail. + return false; + } else { + SkipPred = !NewMCID.isPredicable(); + } + + bool HasCC = false; + bool CCDead = false; + if (MCID.hasOptionalDef()) { + unsigned NumOps = MCID.getNumOperands(); + HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR); + if (HasCC && MI->getOperand(NumOps-1).isDead()) + CCDead = true; + } + if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead)) + return false; + + // Avoid adding a false dependency on partial flag update by some 16-bit + // instructions which has the 's' bit set. + if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC && + canAddPseudoFlagDep(MI, IsSelfLoop)) + return false; + + // Add the 16-bit instruction. + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID); + MIB.addOperand(MI->getOperand(0)); + if (NewMCID.hasOptionalDef()) { + if (HasCC) + AddDefaultT1CC(MIB, CCDead); + else + AddNoT1CC(MIB); + } + + // Transfer the rest of operands. + unsigned NumOps = MCID.getNumOperands(); + for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { + if (i < NumOps && MCID.OpInfo[i].isOptionalDef()) + continue; + if ((MCID.getOpcode() == ARM::t2RSBSri || + MCID.getOpcode() == ARM::t2RSBri || + MCID.getOpcode() == ARM::t2SXTB || + MCID.getOpcode() == ARM::t2SXTH || + MCID.getOpcode() == ARM::t2UXTB || + MCID.getOpcode() == ARM::t2UXTH) && i == 2) + // Skip the zero immediate operand, it's now implicit. + continue; + bool isPred = (i < NumOps && MCID.OpInfo[i].isPredicate()); + if (SkipPred && isPred) + continue; + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR) + // Skip implicit def of CPSR. Either it's modeled as an optional + // def now or it's already an implicit def on the new instruction. + continue; + MIB.addOperand(MO); + } + if (!MCID.isPredicable() && NewMCID.isPredicable()) + AddDefaultPred(MIB); + + // Transfer MI flags. 
+ MIB.setMIFlags(MI->getFlags()); + + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); + + MBB.erase_instr(MI); + ++NumNarrows; + return true; +} + +static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) { + bool HasDef = false; + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || MO.isUndef() || MO.isUse()) + continue; + if (MO.getReg() != ARM::CPSR) + continue; + + DefCPSR = true; + if (!MO.isDead()) + HasDef = true; + } + + return HasDef || LiveCPSR; +} + +static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || MO.isUndef() || MO.isDef()) + continue; + if (MO.getReg() != ARM::CPSR) + continue; + assert(LiveCPSR && "CPSR liveness tracking is wrong!"); + if (MO.isKill()) { + LiveCPSR = false; + break; + } + } + + return LiveCPSR; +} + +bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, + bool LiveCPSR, bool IsSelfLoop) { + unsigned Opcode = MI->getOpcode(); + DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode); + if (OPI == ReduceOpcodeMap.end()) + return false; + const ReduceEntry &Entry = ReduceTable[OPI->second]; + + // Don't attempt normal reductions on "special" cases for now. + if (Entry.Special) + return ReduceSpecial(MBB, MI, Entry, LiveCPSR, IsSelfLoop); + + // Try to transform to a 16-bit two-address instruction. + if (Entry.NarrowOpc2 && + ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop)) + return true; + + // Try to transform to a 16-bit non-two-address instruction. + if (Entry.NarrowOpc1 && + ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop)) + return true; + + return false; +} + +bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + // Yes, CPSR could be livein. + bool LiveCPSR = MBB.isLiveIn(ARM::CPSR); + MachineInstr *BundleMI = nullptr; + + CPSRDef = nullptr; + HighLatencyCPSR = false; + + // Check predecessors for the latest CPSRDef. + for (auto *Pred : MBB.predecessors()) { + const MBBInfo &PInfo = BlockInfo[Pred->getNumber()]; + if (!PInfo.Visited) { + // Since blocks are visited in RPO, this must be a back-edge. + continue; + } + if (PInfo.HighLatencyCPSR) { + HighLatencyCPSR = true; + break; + } + } + + // If this BB loops back to itself, conservatively avoid narrowing the + // first instruction that does partial flag update. + bool IsSelfLoop = MBB.isSuccessor(&MBB); + MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),E = MBB.instr_end(); + MachineBasicBlock::instr_iterator NextMII; + for (; MII != E; MII = NextMII) { + NextMII = std::next(MII); + + MachineInstr *MI = &*MII; + if (MI->isBundle()) { + BundleMI = MI; + continue; + } + if (MI->isDebugValue()) + continue; + + LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR); + + // Does NextMII belong to the same bundle as MI? + bool NextInSameBundle = NextMII != E && NextMII->isBundledWithPred(); + + if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop)) { + Modified = true; + MachineBasicBlock::instr_iterator I = std::prev(NextMII); + MI = &*I; + // Removing and reinserting the first instruction in a bundle will break + // up the bundle. Fix the bundling if it was broken. + if (NextInSameBundle && !NextMII->isBundledWithPred()) + NextMII->bundleWithPred(); + } + + if (!NextInSameBundle && MI->isInsideBundle()) { + // FIXME: Since post-ra scheduler operates on bundles, the CPSR kill + // marker is only on the BUNDLE instruction. 
Process the BUNDLE + // instruction as we finish with the bundled instruction to work around + // the inconsistency. + if (BundleMI->killsRegister(ARM::CPSR)) + LiveCPSR = false; + MachineOperand *MO = BundleMI->findRegisterDefOperand(ARM::CPSR); + if (MO && !MO->isDead()) + LiveCPSR = true; + MO = BundleMI->findRegisterUseOperand(ARM::CPSR); + if (MO && !MO->isKill()) + LiveCPSR = true; + } + + bool DefCPSR = false; + LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR); + if (MI->isCall()) { + // Calls don't really set CPSR. + CPSRDef = nullptr; + HighLatencyCPSR = false; + IsSelfLoop = false; + } else if (DefCPSR) { + // This is the last CPSR defining instruction. + CPSRDef = MI; + HighLatencyCPSR = isHighLatencyCPSR(CPSRDef); + IsSelfLoop = false; + } + } + + MBBInfo &Info = BlockInfo[MBB.getNumber()]; + Info.HighLatencyCPSR = HighLatencyCPSR; + Info.Visited = true; + return Modified; +} + +bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { + if (PredicateFtor && !PredicateFtor(*MF.getFunction())) + return false; + + STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget()); + if (STI->isThumb1Only() || STI->prefers32BitThumb()) + return false; + + TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo()); + + // Optimizing / minimizing size? Minimizing size implies optimizing for size. + OptimizeSize = MF.getFunction()->optForSize(); + MinimizeSize = MF.getFunction()->optForMinSize(); + + BlockInfo.clear(); + BlockInfo.resize(MF.getNumBlockIDs()); + + // Visit blocks in reverse post-order so LastCPSRDef is known for all + // predecessors. + ReversePostOrderTraversal<MachineFunction*> RPOT(&MF); + bool Modified = false; + for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator + I = RPOT.begin(), E = RPOT.end(); I != E; ++I) + Modified |= ReduceMBB(**I); + return Modified; +} + +/// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size +/// reduction pass. +FunctionPass *llvm::createThumb2SizeReductionPass( + std::function<bool(const Function &)> Ftor) { + return new Thumb2SizeReduce(Ftor); +} diff --git a/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp new file mode 100644 index 0000000..b5f9d7e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -0,0 +1,623 @@ +//===-- ThumbRegisterInfo.cpp - Thumb-1 Register Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-1 implementation of the TargetRegisterInfo +// class. 
+// +//===----------------------------------------------------------------------===// + +#include "ThumbRegisterInfo.h" +#include "ARMBaseInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMSubtarget.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +extern cl::opt<bool> ReuseFrameIndexVals; +} + +using namespace llvm; + +ThumbRegisterInfo::ThumbRegisterInfo() : ARMBaseRegisterInfo() {} + +const TargetRegisterClass * +ThumbRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const { + if (!MF.getSubtarget<ARMSubtarget>().isThumb1Only()) + return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC, MF); + + if (ARM::tGPRRegClass.hasSubClassEq(RC)) + return &ARM::tGPRRegClass; + return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC, MF); +} + +const TargetRegisterClass * +ThumbRegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + if (!MF.getSubtarget<ARMSubtarget>().isThumb1Only()) + return ARMBaseRegisterInfo::getPointerRegClass(MF, Kind); + return &ARM::tGPRRegClass; +} + +static void emitThumb1LoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, unsigned DestReg, + unsigned SubIdx, int Val, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned MIFlags) { + MachineFunction &MF = *MBB.getParent(); + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + MachineConstantPool *ConstantPool = MF.getConstantPool(); + const Constant *C = ConstantInt::get( + Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg) + .setMIFlags(MIFlags); +} + +static void emitThumb2LoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, unsigned DestReg, + unsigned SubIdx, int Val, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned MIFlags) { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + MachineConstantPool *ConstantPool = MF.getConstantPool(); + const Constant *C = ConstantInt::get( + Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0) + .setMIFlags(MIFlags); +} + +/// emitLoadConstPool - Emits a load from constpool to materialize the +/// specified immediate. 
+void ThumbRegisterInfo::emitLoadConstPool(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ unsigned DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred,
+ unsigned PredReg, unsigned MIFlags) const {
+ MachineFunction &MF = *MBB.getParent();
+ const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+ if (STI.isThumb1Only()) {
+ assert((isARMLowRegister(DestReg) || isVirtualRegister(DestReg)) &&
+ "Thumb1 does not have ldr to high register");
+ return emitThumb1LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred,
+ PredReg, MIFlags);
+ }
+ return emitThumb2LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred,
+ PredReg, MIFlags);
+}
+
+/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code. Materializes the immediate
+/// in a register using mov / mvn sequences, or loads it from a constpool
+/// entry.
+static
+void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ DebugLoc dl,
+ unsigned DestReg, unsigned BaseReg,
+ int NumBytes, bool CanChangeCC,
+ const TargetInstrInfo &TII,
+ const ARMBaseRegisterInfo& MRI,
+ unsigned MIFlags = MachineInstr::NoFlags) {
+ MachineFunction &MF = *MBB.getParent();
+ bool isHigh = !isARMLowRegister(DestReg) ||
+ (BaseReg != 0 && !isARMLowRegister(BaseReg));
+ bool isSub = false;
+ // Subtract doesn't have a high register version. Load the negative value
+ // if either the base or dest register is a high register. Also, do not
+ // issue sub as part of the sequence if the condition register is to be
+ // preserved.
+ if (NumBytes < 0 && !isHigh && CanChangeCC) {
+ isSub = true;
+ NumBytes = -NumBytes;
+ }
+ unsigned LdReg = DestReg;
+ if (DestReg == ARM::SP)
+ assert(BaseReg == ARM::SP && "Unexpected!");
+ if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg))
+ LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
+
+ if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ .addImm(NumBytes).setMIFlags(MIFlags);
+ } else if (NumBytes < 0 && NumBytes >= -255 && CanChangeCC) {
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ .addImm(NumBytes).setMIFlags(MIFlags);
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
+ .addReg(LdReg, RegState::Kill).setMIFlags(MIFlags);
+ } else
+ MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes,
+ ARMCC::AL, 0, MIFlags);
+
+ // Emit add / sub.
+ int Opc = (isSub) ? ARM::tSUBrr : ((isHigh || !CanChangeCC) ? ARM::tADDhirr
+ : ARM::tADDrr);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+ if (Opc != ARM::tADDhirr)
+ MIB = AddDefaultT1CC(MIB);
+ if (DestReg == ARM::SP || isSub)
+ MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
+ else
+ MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
+ AddDefaultPred(MIB);
+}
+
+/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code. Tries a series of ADDs or
+/// SUBs first, and uses a constant pool value if the instruction sequence would
+/// be too long. This is allowed to modify the condition flags.
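+/// For example, "sp -= 1024" costs three tSUBspi instructions (each one
+/// reaches at most 127 * 4 = 508 bytes), which is still within the threshold
+/// of three for SP, so it is emitted inline as
+///   sub sp, #508; sub sp, #508; sub sp, #8
+/// Anything needing more instructions than the threshold falls back to
+/// emitThumbRegPlusImmInReg and a constant-pool load.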
+void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned BaseReg, + int NumBytes, const TargetInstrInfo &TII, + const ARMBaseRegisterInfo& MRI, + unsigned MIFlags) { + bool isSub = NumBytes < 0; + unsigned Bytes = (unsigned)NumBytes; + if (isSub) Bytes = -NumBytes; + + int CopyOpc = 0; + unsigned CopyBits = 0; + unsigned CopyScale = 1; + bool CopyNeedsCC = false; + int ExtraOpc = 0; + unsigned ExtraBits = 0; + unsigned ExtraScale = 1; + bool ExtraNeedsCC = false; + + // Strategy: + // We need to select two types of instruction, maximizing the available + // immediate range of each. The instructions we use will depend on whether + // DestReg and BaseReg are low, high or the stack pointer. + // * CopyOpc - DestReg = BaseReg + imm + // This will be emitted once if DestReg != BaseReg, and never if + // DestReg == BaseReg. + // * ExtraOpc - DestReg = DestReg + imm + // This will be emitted as many times as necessary to add the + // full immediate. + // If the immediate ranges of these instructions are not large enough to cover + // NumBytes with a reasonable number of instructions, we fall back to using a + // value loaded from a constant pool. + if (DestReg == ARM::SP) { + if (BaseReg == ARM::SP) { + // sp -> sp + // Already in right reg, no copy needed + } else { + // low -> sp or high -> sp + CopyOpc = ARM::tMOVr; + CopyBits = 0; + } + ExtraOpc = isSub ? ARM::tSUBspi : ARM::tADDspi; + ExtraBits = 7; + ExtraScale = 4; + } else if (isARMLowRegister(DestReg)) { + if (BaseReg == ARM::SP) { + // sp -> low + assert(!isSub && "Thumb1 does not have tSUBrSPi"); + CopyOpc = ARM::tADDrSPi; + CopyBits = 8; + CopyScale = 4; + } else if (DestReg == BaseReg) { + // low -> same low + // Already in right reg, no copy needed + } else if (isARMLowRegister(BaseReg)) { + // low -> different low + CopyOpc = isSub ? ARM::tSUBi3 : ARM::tADDi3; + CopyBits = 3; + CopyNeedsCC = true; + } else { + // high -> low + CopyOpc = ARM::tMOVr; + CopyBits = 0; + } + ExtraOpc = isSub ? ARM::tSUBi8 : ARM::tADDi8; + ExtraBits = 8; + ExtraNeedsCC = true; + } else /* DestReg is high */ { + if (DestReg == BaseReg) { + // high -> same high + // Already in right reg, no copy needed + } else { + // {low,high,sp} -> high + CopyOpc = ARM::tMOVr; + CopyBits = 0; + } + ExtraOpc = 0; + } + + // We could handle an unaligned immediate with an unaligned copy instruction + // and an aligned extra instruction, but this case is not currently needed. + assert(((Bytes & 3) == 0 || ExtraScale == 1) && + "Unaligned offset, but all instructions require alignment"); + + unsigned CopyRange = ((1 << CopyBits) - 1) * CopyScale; + // If we would emit the copy with an immediate of 0, just use tMOVr. + if (CopyOpc && Bytes < CopyScale) { + CopyOpc = ARM::tMOVr; + CopyScale = 1; + CopyNeedsCC = false; + CopyRange = 0; + } + unsigned ExtraRange = ((1 << ExtraBits) - 1) * ExtraScale; // per instruction + unsigned RequiredCopyInstrs = CopyOpc ? 1 : 0; + unsigned RangeAfterCopy = (CopyRange > Bytes) ? 0 : (Bytes - CopyRange); + + // We could handle this case when the copy instruction does not require an + // aligned immediate, but we do not currently do this. 
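+ // (With the values computed above: an sp -> low copy via tADDrSPi reaches
+ // 255 * 4 = 1020 bytes, a low -> low copy via tADDi3 reaches 7, and each
+ // tADDi8/tSUBi8 extra instruction for a low DestReg covers up to 255 more.)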
+ assert(RangeAfterCopy % ExtraScale == 0 && + "Extra instruction requires immediate to be aligned"); + + unsigned RequiredExtraInstrs; + if (ExtraRange) + RequiredExtraInstrs = RoundUpToAlignment(RangeAfterCopy, ExtraRange) / ExtraRange; + else if (RangeAfterCopy > 0) + // We need an extra instruction but none is available + RequiredExtraInstrs = 1000000; + else + RequiredExtraInstrs = 0; + unsigned RequiredInstrs = RequiredCopyInstrs + RequiredExtraInstrs; + unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2; + + // Use a constant pool, if the sequence of ADDs/SUBs is too expensive. + if (RequiredInstrs > Threshold) { + emitThumbRegPlusImmInReg(MBB, MBBI, dl, + DestReg, BaseReg, NumBytes, true, + TII, MRI, MIFlags); + return; + } + + // Emit zero or one copy instructions + if (CopyOpc) { + unsigned CopyImm = std::min(Bytes, CopyRange) / CopyScale; + Bytes -= CopyImm * CopyScale; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(CopyOpc), DestReg); + if (CopyNeedsCC) + MIB = AddDefaultT1CC(MIB); + MIB.addReg(BaseReg, RegState::Kill); + if (CopyOpc != ARM::tMOVr) { + MIB.addImm(CopyImm); + } + AddDefaultPred(MIB.setMIFlags(MIFlags)); + + BaseReg = DestReg; + } + + // Emit zero or more in-place add/sub instructions + while (Bytes) { + unsigned ExtraImm = std::min(Bytes, ExtraRange) / ExtraScale; + Bytes -= ExtraImm * ExtraScale; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg); + if (ExtraNeedsCC) + MIB = AddDefaultT1CC(MIB); + MIB.addReg(BaseReg).addImm(ExtraImm); + MIB = AddDefaultPred(MIB); + MIB.setMIFlags(MIFlags); + } +} + +static void removeOperands(MachineInstr &MI, unsigned i) { + unsigned Op = i; + for (unsigned e = MI.getNumOperands(); i != e; ++i) + MI.RemoveOperand(Op); +} + +/// convertToNonSPOpcode - Change the opcode to the non-SP version, because +/// we're replacing the frame index with a non-SP register. +static unsigned convertToNonSPOpcode(unsigned Opcode) { + switch (Opcode) { + case ARM::tLDRspi: + return ARM::tLDRi; + + case ARM::tSTRspi: + return ARM::tSTRi; + } + + return Opcode; +} + +bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II, + unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) const { + MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); + assert(MBB.getParent()->getSubtarget<ARMSubtarget>().isThumb1Only() && + "This isn't needed for thumb2!"); + DebugLoc dl = MI.getDebugLoc(); + MachineInstrBuilder MIB(*MBB.getParent(), &MI); + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MI.getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + + if (Opcode == ARM::tADDframe) { + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + unsigned DestReg = MI.getOperand(0).getReg(); + + emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII, + *this); + MBB.erase(II); + return true; + } else { + if (AddrMode != ARMII::AddrModeT1_s) + llvm_unreachable("Unsupported addressing mode!"); + + unsigned ImmIdx = FrameRegIdx + 1; + int InstrOffs = MI.getOperand(ImmIdx).getImm(); + unsigned NumBits = (FrameReg == ARM::SP) ? 8 : 5; + unsigned Scale = 4; + + Offset += InstrOffs * Scale; + assert((Offset & (Scale - 1)) == 0 && "Can't encode this offset!"); + + // Common case: small offset, fits into instruction. 
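+ // (With FrameReg == SP, NumBits == 8 and Scale == 4, so offsets up to
+ // 255 * 4 = 1020 fit directly; with another base register the 5-bit form
+ // reaches only 31 * 4 = 124 bytes.)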
+ MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+ int ImmedOffset = Offset / Scale;
+ unsigned Mask = (1 << NumBits) - 1;
+
+ if ((unsigned)Offset <= Mask * Scale) {
+ // Replace the FrameIndex with the frame register (e.g., sp).
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+ ImmOp.ChangeToImmediate(ImmedOffset);
+
+ // If we're using a register where sp was stored, convert the instruction
+ // to the non-SP version.
+ unsigned NewOpc = convertToNonSPOpcode(Opcode);
+ if (NewOpc != Opcode && FrameReg != ARM::SP)
+ MI.setDesc(TII.get(NewOpc));
+
+ return true;
+ }
+
+ NumBits = 5;
+ Mask = (1 << NumBits) - 1;
+
+ // If this is a thumb spill / restore, we will be using a constpool load to
+ // materialize the offset.
+ if (Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
+ ImmOp.ChangeToImmediate(0);
+ } else {
+ // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+ ImmedOffset = ImmedOffset & Mask;
+ ImmOp.ChangeToImmediate(ImmedOffset);
+ Offset &= ~(Mask * Scale);
+ }
+ }
+
+ return Offset == 0;
+}
+
+void ThumbRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+ if (!STI.isThumb1Only())
+ return ARMBaseRegisterInfo::resolveFrameIndex(MI, BaseReg, Offset);
+
+ const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
+ int Off = Offset; // ARM doesn't need the general 64-bit offsets
+ unsigned i = 0;
+
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+ bool Done = rewriteFrameIndex(MI, i, BaseReg, Off, TII);
+ assert(Done && "Unable to resolve frame index!");
+ (void)Done;
+}
+
+/// saveScavengerRegister - Spill the register so it can be used by the
+/// register scavenger. Return true.
+bool ThumbRegisterInfo::saveScavengerRegister(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &UseMI, const TargetRegisterClass *RC,
+ unsigned Reg) const {
+
+ const ARMSubtarget &STI = MBB.getParent()->getSubtarget<ARMSubtarget>();
+ if (!STI.isThumb1Only())
+ return ARMBaseRegisterInfo::saveScavengerRegister(MBB, I, UseMI, RC, Reg);
+
+ // Thumb1 can't use the emergency spill slot on the stack because
+ // ldr/str immediate offsets must be positive, and if we're referencing
+ // off the frame pointer (if, for example, there are alloca() calls in
+ // the function), the offset will be negative. Use R12 instead since that's
+ // a call-clobbered register that we know won't be used in Thumb1 mode.
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ DebugLoc DL;
+ AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
+ .addReg(ARM::R12, RegState::Define)
+ .addReg(Reg, RegState::Kill));
+
+ // The UseMI is where we would like to restore the register. If there's
+ // interference with R12 before then, however, we'll need to restore it
+ // before that instead and adjust the UseMI.
+ bool done = false;
+ for (MachineBasicBlock::iterator II = I; !done && II != UseMI; ++II) {
+ if (II->isDebugValue())
+ continue;
+ // If this instruction affects R12, adjust our restore point.
+ for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = II->getOperand(i);
+ if (MO.isRegMask() && MO.clobbersPhysReg(ARM::R12)) {
+ UseMI = II;
+ done = true;
+ break;
+ }
+ if (!MO.isReg() || MO.isUndef() || !MO.getReg() ||
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+ if (MO.getReg() == ARM::R12) {
+ UseMI = II;
+ done = true;
+ break;
+ }
+ }
+ }
+ // Restore the register from R12
+ AddDefaultPred(BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr))
+ .addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill));
+
+ return true;
+}
+
+void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+ if (!STI.isThumb1Only())
+ return ARMBaseRegisterInfo::eliminateFrameIndex(II, SPAdj, FIOperandNum,
+ RS);
+
+ unsigned VReg = 0;
+ const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ DebugLoc dl = MI.getDebugLoc();
+ MachineInstrBuilder MIB(*MBB.getParent(), &MI);
+
+ unsigned FrameReg = ARM::SP;
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+ MF.getFrameInfo()->getStackSize() + SPAdj;
+
+ if (MF.getFrameInfo()->hasVarSizedObjects()) {
+ assert(SPAdj == 0 && STI.getFrameLowering()->hasFP(MF) && "Unexpected");
+ // There are alloca()'s in this function; we must reference off the frame
+ // pointer or base pointer instead.
+ if (!hasBasePointer(MF)) {
+ FrameReg = getFrameRegister(MF);
+ Offset -= AFI->getFramePtrSpillOffset();
+ } else
+ FrameReg = BasePtr;
+ }
+
+ // PEI::scavengeFrameVirtualRegs() cannot accurately track SPAdj because the
+ // call frame setup/destroy instructions have already been eliminated. That
+ // means the stack pointer cannot be used to access the emergency spill slot
+ // when !hasReservedCallFrame().
+#ifndef NDEBUG
+ if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)) {
+ assert(STI.getFrameLowering()->hasReservedCallFrame(MF) &&
+ "Cannot use SP to access the emergency spill slot in "
+ "functions without a reserved call frame");
+ assert(!MF.getFrameInfo()->hasVarSizedObjects() &&
+ "Cannot use SP to access the emergency spill slot in "
+ "functions with variable sized frame objects");
+ }
+#endif // NDEBUG
+
+ // Special handling of dbg_value instructions.
+ if (MI.isDebugValue()) {
+ MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
+ MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset);
+ return;
+ }
+
+ // Modify MI as necessary to handle as much of 'Offset' as possible.
+ assert(AFI->isThumbFunction() &&
+ "This eliminateFrameIndex only supports Thumb1!");
+ if (rewriteFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
+ return;
+
+ // If we get here, the immediate doesn't fit into the instruction. We folded
+ // as much as possible above; handle the rest by providing a register that
+ // is SP+LargeImm.
+ assert(Offset && "This code isn't needed if offset already handled!");
+
+ unsigned Opcode = MI.getOpcode();
+
+ // Remove predicate first.
+ int PIdx = MI.findFirstPredOperandIdx();
+ if (PIdx != -1)
+ removeOperands(MI, PIdx);
+
+ if (MI.mayLoad()) {
+ // Use the destination register to materialize sp + offset.
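+ // The load's destination register is dead until the load completes, so
+ // it can double as the scratch register holding the materialized address;
+ // the store path below must scavenge a fresh virtual register instead.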
+ unsigned TmpReg = MI.getOperand(0).getReg();
+ bool UseRR = false;
+ if (Opcode == ARM::tLDRspi) {
+ if (FrameReg == ARM::SP)
+ emitThumbRegPlusImmInReg(MBB, II, dl, TmpReg, FrameReg,
+ Offset, false, TII, *this);
+ else {
+ emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset);
+ UseRR = true;
+ }
+ } else {
+ emitThumbRegPlusImmediate(MBB, II, dl, TmpReg, FrameReg, Offset, TII,
+ *this);
+ }
+
+ MI.setDesc(TII.get(UseRR ? ARM::tLDRr : ARM::tLDRi));
+ MI.getOperand(FIOperandNum).ChangeToRegister(TmpReg, false, false, true);
+ if (UseRR)
+ // Use [reg, reg] addrmode. Replace the immediate operand with the frame
+ // register. The offset is already handled in the vreg value.
+ MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false,
+ false);
+ } else if (MI.mayStore()) {
+ VReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
+ bool UseRR = false;
+
+ if (Opcode == ARM::tSTRspi) {
+ if (FrameReg == ARM::SP)
+ emitThumbRegPlusImmInReg(MBB, II, dl, VReg, FrameReg,
+ Offset, false, TII, *this);
+ else {
+ emitLoadConstPool(MBB, II, dl, VReg, 0, Offset);
+ UseRR = true;
+ }
+ } else
+ emitThumbRegPlusImmediate(MBB, II, dl, VReg, FrameReg, Offset, TII,
+ *this);
+ MI.setDesc(TII.get(UseRR ? ARM::tSTRr : ARM::tSTRi));
+ MI.getOperand(FIOperandNum).ChangeToRegister(VReg, false, false, true);
+ if (UseRR)
+ // Use [reg, reg] addrmode. Replace the immediate operand with the frame
+ // register. The offset is already handled in the vreg value.
+ MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false,
+ false);
+ } else {
+ llvm_unreachable("Unexpected opcode!");
+ }
+
+ // Add predicate back if it's needed.
+ if (MI.isPredicable())
+ AddDefaultPred(MIB);
+} diff --git a/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h new file mode 100644 index 0000000..23aaff3 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h @@ -0,0 +1,65 @@ +//===- ThumbRegisterInfo.h - Thumb Register Information Impl -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb implementation of the TargetRegisterInfo
+// class. With the exception of emitLoadConstPool, Thumb2 tracks
+// ARMBaseRegisterInfo; Thumb1 overloads the functions below.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_THUMB1REGISTERINFO_H
+#define LLVM_LIB_TARGET_ARM_THUMB1REGISTERINFO_H
+
+#include "ARMBaseRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+ class ARMSubtarget;
+ class ARMBaseInstrInfo;
+
+struct ThumbRegisterInfo : public ARMBaseRegisterInfo {
+public:
+ ThumbRegisterInfo();
+
+ const TargetRegisterClass *
+ getLargestLegalSuperClass(const TargetRegisterClass *RC,
+ const MachineFunction &MF) const override;
+
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override;
+
+ /// emitLoadConstPool - Emits a load from constpool to materialize the
+ /// specified immediate.
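+ /// On Thumb1 this expands to a PC-relative tLDRpci load of a
+ /// constant-pool entry.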
+ void
+ emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ DebugLoc dl, unsigned DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0,
+ unsigned MIFlags = MachineInstr::NoFlags) const override;
+
+ // Rewrite MI to access 'Offset' bytes from the FP. Update Offset to be
+ // however much remains to be handled. Return 'true' if no further
+ // work is required.
+ bool rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
+ unsigned FrameReg, int &Offset,
+ const ARMBaseInstrInfo &TII) const;
+ void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ int64_t Offset) const override;
+ bool saveScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &UseMI,
+ const TargetRegisterClass *RC,
+ unsigned Reg) const override;
+ void eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+};
+}
+
+#endif