diff options
Diffstat (limited to 'contrib/llvm/lib/Target/ARM')
88 files changed, 80461 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h new file mode 100644 index 0000000..16d0da3 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARM.h @@ -0,0 +1,53 @@ +//===-- ARM.h - Top-level interface for ARM representation---- --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// ARM back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_ARM_H +#define TARGET_ARM_H + +#include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMMCTargetDesc.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" +#include <cassert> + +namespace llvm { + +class ARMAsmPrinter; +class ARMBaseTargetMachine; +class FunctionPass; +class JITCodeEmitter; +class MachineInstr; +class MCInst; + +FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, + CodeGenOpt::Level OptLevel); + +FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM, + JITCodeEmitter &JCE); + +FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); +FunctionPass *createARMExpandPseudoPass(); +FunctionPass *createARMGlobalMergePass(const TargetLowering* tli); +FunctionPass *createARMConstantIslandPass(); +FunctionPass *createMLxExpansionPass(); +FunctionPass *createThumb2ITBlockPass(); +FunctionPass *createThumb2SizeReductionPass(); + +void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, + ARMAsmPrinter &AP); + +} // end namespace llvm; + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td new file mode 100644 index 0000000..5c727ad --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARM.td @@ -0,0 +1,260 @@ +//===- ARM.td - Describe the ARM Target Machine ------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// ARM Subtarget state. +// + +def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", "true", + "Thumb mode">; + +def ModeNaCl : SubtargetFeature<"nacl-mode", "InNaClMode", "true", + "Native client mode">; + +//===----------------------------------------------------------------------===// +// ARM Subtarget features. +// + +def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", + "Enable VFP2 instructions">; +def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true", + "Enable VFP3 instructions", + [FeatureVFP2]>; +def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", + "Enable NEON instructions", + [FeatureVFP3]>; +def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", + "Enable Thumb2 instructions">; +def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", + "Does not support ARM mode execution">; +def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", + "Enable half-precision floating point">; +def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", + "Restrict VFP3 to 16 double registers">; +def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", + "Enable divide instructions">; +def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true", + "Enable Thumb2 extract and pack instructions">; +def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", + "Has data barrier (dmb / dsb) instructions">; +def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", + "FP compare + branch is slow">; +def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", + "Floating point unit supports single precision only">; + +// Some processors have FP multiply-accumulate instructions that don't +// play nicely with other VFP / NEON instructions, and it's generally better +// to just not use them. +def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", + "Disable VFP / NEON MAC instructions">; + +// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding. +def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", + "HasVMLxForwarding", "true", + "Has multiplier accumulator forwarding">; + +// Some processors benefit from using NEON instructions for scalar +// single-precision FP operations. +def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", + "true", + "Use NEON for single precision FP">; + +// Disable 32-bit to 16-bit narrowing for experimentation. +def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", + "Prefer 32-bit Thumb instrs">; + +/// Some instructions update CPSR partially, which can add false dependency for +/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is +/// mapped to a separate physical register. Avoid partial CPSR update for these +/// processors. +def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr", + "AvoidCPSRPartialUpdate", "true", + "Avoid CPSR partial update for OOO execution">; + +/// Some M architectures don't have the DSP extension (v7E-M vs. v7M) +def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true", + "Supports v7 DSP instructions in Thumb2">; + +// Multiprocessing extension. +def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", + "Supports Multiprocessing extension">; + +// M-series ISA? +def FeatureMClass : SubtargetFeature<"mclass", "IsMClass", "true", + "Is microcontroller profile ('M' series)">; + +// ARM ISAs. +def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true", + "Support ARM v4T instructions">; +def HasV5TOps : SubtargetFeature<"v5t", "HasV5TOps", "true", + "Support ARM v5T instructions", + [HasV4TOps]>; +def HasV5TEOps : SubtargetFeature<"v5te", "HasV5TEOps", "true", + "Support ARM v5TE, v5TEj, and v5TExp instructions", + [HasV5TOps]>; +def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true", + "Support ARM v6 instructions", + [HasV5TEOps]>; +def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true", + "Support ARM v6t2 instructions", + [HasV6Ops, FeatureThumb2]>; +def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true", + "Support ARM v7 instructions", + [HasV6T2Ops]>; + +//===----------------------------------------------------------------------===// +// ARM Processors supported. +// + +include "ARMSchedule.td" + +// ARM processor families. +def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8", + "Cortex-A8 ARM processors", + [FeatureSlowFPBrcc, FeatureNEONForFP, + FeatureHasSlowFPVMLx, FeatureVMLxForwarding, + FeatureT2XtPk]>; +def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9", + "Cortex-A9 ARM processors", + [FeatureVMLxForwarding, + FeatureT2XtPk, FeatureFP16, + FeatureAvoidPartialCPSR]>; + +class ProcNoItin<string Name, list<SubtargetFeature> Features> + : Processor<Name, GenericItineraries, Features>; + +// V4 Processors. +def : ProcNoItin<"generic", []>; +def : ProcNoItin<"arm8", []>; +def : ProcNoItin<"arm810", []>; +def : ProcNoItin<"strongarm", []>; +def : ProcNoItin<"strongarm110", []>; +def : ProcNoItin<"strongarm1100", []>; +def : ProcNoItin<"strongarm1110", []>; + +// V4T Processors. +def : ProcNoItin<"arm7tdmi", [HasV4TOps]>; +def : ProcNoItin<"arm7tdmi-s", [HasV4TOps]>; +def : ProcNoItin<"arm710t", [HasV4TOps]>; +def : ProcNoItin<"arm720t", [HasV4TOps]>; +def : ProcNoItin<"arm9", [HasV4TOps]>; +def : ProcNoItin<"arm9tdmi", [HasV4TOps]>; +def : ProcNoItin<"arm920", [HasV4TOps]>; +def : ProcNoItin<"arm920t", [HasV4TOps]>; +def : ProcNoItin<"arm922t", [HasV4TOps]>; +def : ProcNoItin<"arm940t", [HasV4TOps]>; +def : ProcNoItin<"ep9312", [HasV4TOps]>; + +// V5T Processors. +def : ProcNoItin<"arm10tdmi", [HasV5TOps]>; +def : ProcNoItin<"arm1020t", [HasV5TOps]>; + +// V5TE Processors. +def : ProcNoItin<"arm9e", [HasV5TEOps]>; +def : ProcNoItin<"arm926ej-s", [HasV5TEOps]>; +def : ProcNoItin<"arm946e-s", [HasV5TEOps]>; +def : ProcNoItin<"arm966e-s", [HasV5TEOps]>; +def : ProcNoItin<"arm968e-s", [HasV5TEOps]>; +def : ProcNoItin<"arm10e", [HasV5TEOps]>; +def : ProcNoItin<"arm1020e", [HasV5TEOps]>; +def : ProcNoItin<"arm1022e", [HasV5TEOps]>; +def : ProcNoItin<"xscale", [HasV5TEOps]>; +def : ProcNoItin<"iwmmxt", [HasV5TEOps]>; + +// V6 Processors. +def : Processor<"arm1136j-s", ARMV6Itineraries, [HasV6Ops]>; +def : Processor<"arm1136jf-s", ARMV6Itineraries, [HasV6Ops, FeatureVFP2, + FeatureHasSlowFPVMLx]>; +def : Processor<"arm1176jz-s", ARMV6Itineraries, [HasV6Ops]>; +def : Processor<"arm1176jzf-s", ARMV6Itineraries, [HasV6Ops, FeatureVFP2, + FeatureHasSlowFPVMLx]>; +def : Processor<"mpcorenovfp", ARMV6Itineraries, [HasV6Ops]>; +def : Processor<"mpcore", ARMV6Itineraries, [HasV6Ops, FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +// V6M Processors. +def : Processor<"cortex-m0", ARMV6Itineraries, [HasV6Ops, FeatureNoARM, + FeatureDB, FeatureMClass]>; + +// V6T2 Processors. +def : Processor<"arm1156t2-s", ARMV6Itineraries, [HasV6T2Ops, + FeatureDSPThumb2]>; +def : Processor<"arm1156t2f-s", ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2, + FeatureHasSlowFPVMLx, + FeatureDSPThumb2]>; + +// V7a Processors. +def : Processor<"cortex-a8", CortexA8Itineraries, + [ProcA8, HasV7Ops, FeatureNEON, FeatureDB, + FeatureDSPThumb2]>; +def : Processor<"cortex-a9", CortexA9Itineraries, + [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, + FeatureDSPThumb2]>; +def : Processor<"cortex-a9-mp", CortexA9Itineraries, + [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, + FeatureDSPThumb2, FeatureMP]>; + +// V7M Processors. +def : ProcNoItin<"cortex-m3", [HasV7Ops, + FeatureThumb2, FeatureNoARM, FeatureDB, + FeatureHWDiv, FeatureMClass]>; + +// V7EM Processors. +def : ProcNoItin<"cortex-m4", [HasV7Ops, + FeatureThumb2, FeatureNoARM, FeatureDB, + FeatureHWDiv, FeatureDSPThumb2, + FeatureT2XtPk, FeatureVFP2, + FeatureVFPOnlySP, FeatureMClass]>; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "ARMRegisterInfo.td" + +include "ARMCallingConv.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "ARMInstrInfo.td" + +def ARMInstrInfo : InstrInfo; + + +//===----------------------------------------------------------------------===// +// Assembly printer +//===----------------------------------------------------------------------===// +// ARM Uses the MC printer for asm output, so make sure the TableGen +// AsmWriter bits get associated with the correct class. +def ARMAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + bit isMCAsmWriter = 1; +} + +//===----------------------------------------------------------------------===// +// Declare the target which we are implementing +//===----------------------------------------------------------------------===// + +def ARM : Target { + // Pull in Instruction Info: + let InstructionSet = ARMInstrInfo; + + let AssemblyWriters = [ARMAsmWriter]; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp new file mode 100644 index 0000000..ea3319f --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -0,0 +1,1938 @@ +//===-- ARMAsmPrinter.cpp - Print machine code to an ARM .s file ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to GAS-format ARM assembly language. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "ARM.h" +#include "ARMAsmPrinter.h" +#include "ARMBuildAttrs.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMConstantPoolValue.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMTargetMachine.h" +#include "ARMTargetObjectFile.h" +#include "InstPrinter/ARMInstPrinter.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "MCTargetDesc/ARMMCExpr.h" +#include "llvm/Analysis/DebugInfo.h" +#include "llvm/Constants.h" +#include "llvm/Module.h" +#include "llvm/Type.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Target/Mangler.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include <cctype> +using namespace llvm; + +namespace { + + // Per section and per symbol attributes are not supported. + // To implement them we would need the ability to delay this emission + // until the assembly file is fully parsed/generated as only then do we + // know the symbol and section numbers. + class AttributeEmitter { + public: + virtual void MaybeSwitchVendor(StringRef Vendor) = 0; + virtual void EmitAttribute(unsigned Attribute, unsigned Value) = 0; + virtual void EmitTextAttribute(unsigned Attribute, StringRef String) = 0; + virtual void Finish() = 0; + virtual ~AttributeEmitter() {} + }; + + class AsmAttributeEmitter : public AttributeEmitter { + MCStreamer &Streamer; + + public: + AsmAttributeEmitter(MCStreamer &Streamer_) : Streamer(Streamer_) {} + void MaybeSwitchVendor(StringRef Vendor) { } + + void EmitAttribute(unsigned Attribute, unsigned Value) { + Streamer.EmitRawText("\t.eabi_attribute " + + Twine(Attribute) + ", " + Twine(Value)); + } + + void EmitTextAttribute(unsigned Attribute, StringRef String) { + switch (Attribute) { + case ARMBuildAttrs::CPU_name: + Streamer.EmitRawText(StringRef("\t.cpu ") + LowercaseString(String)); + break; + /* GAS requires .fpu to be emitted regardless of EABI attribute */ + case ARMBuildAttrs::Advanced_SIMD_arch: + case ARMBuildAttrs::VFP_arch: + Streamer.EmitRawText(StringRef("\t.fpu ") + LowercaseString(String)); + break; + default: assert(0 && "Unsupported Text attribute in ASM Mode"); break; + } + } + void Finish() { } + }; + + class ObjectAttributeEmitter : public AttributeEmitter { + // This structure holds all attributes, accounting for + // their string/numeric value, so we can later emmit them + // in declaration order, keeping all in the same vector + struct AttributeItemType { + enum { + HiddenAttribute = 0, + NumericAttribute, + TextAttribute + } Type; + unsigned Tag; + unsigned IntValue; + StringRef StringValue; + } AttributeItem; + + MCObjectStreamer &Streamer; + StringRef CurrentVendor; + SmallVector<AttributeItemType, 64> Contents; + + // Account for the ULEB/String size of each item, + // not just the number of items + size_t ContentsSize; + // FIXME: this should be in a more generic place, but + // getULEBSize() is in MCAsmInfo and will be moved to MCDwarf + size_t getULEBSize(int Value) { + size_t Size = 0; + do { + Value >>= 7; + Size += sizeof(int8_t); // Is this really necessary? + } while (Value); + return Size; + } + + public: + ObjectAttributeEmitter(MCObjectStreamer &Streamer_) : + Streamer(Streamer_), CurrentVendor(""), ContentsSize(0) { } + + void MaybeSwitchVendor(StringRef Vendor) { + assert(!Vendor.empty() && "Vendor cannot be empty."); + + if (CurrentVendor.empty()) + CurrentVendor = Vendor; + else if (CurrentVendor == Vendor) + return; + else + Finish(); + + CurrentVendor = Vendor; + + assert(Contents.size() == 0); + } + + void EmitAttribute(unsigned Attribute, unsigned Value) { + AttributeItemType attr = { + AttributeItemType::NumericAttribute, + Attribute, + Value, + StringRef("") + }; + ContentsSize += getULEBSize(Attribute); + ContentsSize += getULEBSize(Value); + Contents.push_back(attr); + } + + void EmitTextAttribute(unsigned Attribute, StringRef String) { + AttributeItemType attr = { + AttributeItemType::TextAttribute, + Attribute, + 0, + String + }; + ContentsSize += getULEBSize(Attribute); + // String + \0 + ContentsSize += String.size()+1; + + Contents.push_back(attr); + } + + void Finish() { + // Vendor size + Vendor name + '\0' + const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1; + + // Tag + Tag Size + const size_t TagHeaderSize = 1 + 4; + + Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4); + Streamer.EmitBytes(CurrentVendor, 0); + Streamer.EmitIntValue(0, 1); // '\0' + + Streamer.EmitIntValue(ARMBuildAttrs::File, 1); + Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4); + + // Size should have been accounted for already, now + // emit each field as its type (ULEB or String) + for (unsigned int i=0; i<Contents.size(); ++i) { + AttributeItemType item = Contents[i]; + Streamer.EmitULEB128IntValue(item.Tag, 0); + switch (item.Type) { + case AttributeItemType::NumericAttribute: + Streamer.EmitULEB128IntValue(item.IntValue, 0); + break; + case AttributeItemType::TextAttribute: + Streamer.EmitBytes(UppercaseString(item.StringValue), 0); + Streamer.EmitIntValue(0, 1); // '\0' + break; + default: + assert(0 && "Invalid attribute type"); + } + } + + Contents.clear(); + } + }; + +} // end of anonymous namespace + +MachineLocation ARMAsmPrinter:: +getDebugValueLocation(const MachineInstr *MI) const { + MachineLocation Location; + assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); + // Frame address. Currently handles register +- offset only. + if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) + Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); + else { + DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + } + return Location; +} + +/// EmitDwarfRegOp - Emit dwarf register operation. +void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const { + const TargetRegisterInfo *RI = TM.getRegisterInfo(); + if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1) + AsmPrinter::EmitDwarfRegOp(MLoc); + else { + unsigned Reg = MLoc.getReg(); + if (Reg >= ARM::S0 && Reg <= ARM::S31) { + assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering"); + // S registers are described as bit-pieces of a register + // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0) + // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32) + + unsigned SReg = Reg - ARM::S0; + bool odd = SReg & 0x1; + unsigned Rx = 256 + (SReg >> 1); + + OutStreamer.AddComment("DW_OP_regx for S register"); + EmitInt8(dwarf::DW_OP_regx); + + OutStreamer.AddComment(Twine(SReg)); + EmitULEB128(Rx); + + if (odd) { + OutStreamer.AddComment("DW_OP_bit_piece 32 32"); + EmitInt8(dwarf::DW_OP_bit_piece); + EmitULEB128(32); + EmitULEB128(32); + } else { + OutStreamer.AddComment("DW_OP_bit_piece 32 0"); + EmitInt8(dwarf::DW_OP_bit_piece); + EmitULEB128(32); + EmitULEB128(0); + } + } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) { + assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering"); + // Q registers Q0-Q15 are described by composing two D registers together. + // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1) + // DW_OP_piece(8) + + unsigned QReg = Reg - ARM::Q0; + unsigned D1 = 256 + 2 * QReg; + unsigned D2 = D1 + 1; + + OutStreamer.AddComment("DW_OP_regx for Q register: D1"); + EmitInt8(dwarf::DW_OP_regx); + EmitULEB128(D1); + OutStreamer.AddComment("DW_OP_piece 8"); + EmitInt8(dwarf::DW_OP_piece); + EmitULEB128(8); + + OutStreamer.AddComment("DW_OP_regx for Q register: D2"); + EmitInt8(dwarf::DW_OP_regx); + EmitULEB128(D2); + OutStreamer.AddComment("DW_OP_piece 8"); + EmitInt8(dwarf::DW_OP_piece); + EmitULEB128(8); + } + } +} + +void ARMAsmPrinter::EmitFunctionEntryLabel() { + OutStreamer.ForceCodeRegion(); + + if (AFI->isThumbFunction()) { + OutStreamer.EmitAssemblerFlag(MCAF_Code16); + OutStreamer.EmitThumbFunc(CurrentFnSym); + } + + OutStreamer.EmitLabel(CurrentFnSym); +} + +/// runOnMachineFunction - This uses the EmitInstruction() +/// method to print assembly for each instruction. +/// +bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + AFI = MF.getInfo<ARMFunctionInfo>(); + MCP = MF.getConstantPool(); + + return AsmPrinter::runOnMachineFunction(MF); +} + +void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O, const char *Modifier) { + const MachineOperand &MO = MI->getOperand(OpNum); + unsigned TF = MO.getTargetFlags(); + + switch (MO.getType()) { + default: + assert(0 && "<unknown operand type>"); + case MachineOperand::MO_Register: { + unsigned Reg = MO.getReg(); + assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + O << ARMInstPrinter::getRegisterName(Reg); + break; + } + case MachineOperand::MO_Immediate: { + int64_t Imm = MO.getImm(); + O << '#'; + if ((Modifier && strcmp(Modifier, "lo16") == 0) || + (TF == ARMII::MO_LO16)) + O << ":lower16:"; + else if ((Modifier && strcmp(Modifier, "hi16") == 0) || + (TF == ARMII::MO_HI16)) + O << ":upper16:"; + O << Imm; + break; + } + case MachineOperand::MO_MachineBasicBlock: + O << *MO.getMBB()->getSymbol(); + return; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + if ((Modifier && strcmp(Modifier, "lo16") == 0) || + (TF & ARMII::MO_LO16)) + O << ":lower16:"; + else if ((Modifier && strcmp(Modifier, "hi16") == 0) || + (TF & ARMII::MO_HI16)) + O << ":upper16:"; + O << *Mang->getSymbol(GV); + + printOffset(MO.getOffset(), O); + if (TF == ARMII::MO_PLT) + O << "(PLT)"; + break; + } + case MachineOperand::MO_ExternalSymbol: { + O << *GetExternalSymbolSymbol(MO.getSymbolName()); + if (TF == ARMII::MO_PLT) + O << "(PLT)"; + break; + } + case MachineOperand::MO_ConstantPoolIndex: + O << *GetCPISymbol(MO.getIndex()); + break; + case MachineOperand::MO_JumpTableIndex: + O << *GetJTISymbol(MO.getIndex()); + break; + } +} + +//===--------------------------------------------------------------------===// + +MCSymbol *ARMAsmPrinter:: +GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2, + const MachineBasicBlock *MBB) const { + SmallString<60> Name; + raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() + << getFunctionNumber() << '_' << uid << '_' << uid2 + << "_set_" << MBB->getNumber(); + return OutContext.GetOrCreateSymbol(Name.str()); +} + +MCSymbol *ARMAsmPrinter:: +GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const { + SmallString<60> Name; + raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "JTI" + << getFunctionNumber() << '_' << uid << '_' << uid2; + return OutContext.GetOrCreateSymbol(Name.str()); +} + + +MCSymbol *ARMAsmPrinter::GetARMSJLJEHLabel(void) const { + SmallString<60> Name; + raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "SJLJEH" + << getFunctionNumber(); + return OutContext.GetOrCreateSymbol(Name.str()); +} + +bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'a': // Print as a memory address. + if (MI->getOperand(OpNum).isReg()) { + O << "[" + << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg()) + << "]"; + return false; + } + // Fallthrough + case 'c': // Don't print "#" before an immediate operand. + if (!MI->getOperand(OpNum).isImm()) + return true; + O << MI->getOperand(OpNum).getImm(); + return false; + case 'P': // Print a VFP double precision register. + case 'q': // Print a NEON quad precision register. + printOperand(MI, OpNum, O); + return false; + case 'y': // Print a VFP single precision register as indexed double. + // This uses the ordering of the alias table to get the first 'd' register + // that overlaps the 's' register. Also, s0 is an odd register, hence the + // odd modulus check below. + if (MI->getOperand(OpNum).isReg()) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); + O << ARMInstPrinter::getRegisterName(TRI->getAliasSet(Reg)[0]) << + (((Reg % 2) == 1) ? "[0]" : "[1]"); + return false; + } + return true; + case 'B': // Bitwise inverse of integer or symbol without a preceding #. + if (!MI->getOperand(OpNum).isImm()) + return true; + O << ~(MI->getOperand(OpNum).getImm()); + return false; + case 'L': // The low 16 bits of an immediate constant. + if (!MI->getOperand(OpNum).isImm()) + return true; + O << (MI->getOperand(OpNum).getImm() & 0xffff); + return false; + case 'M': { // A register range suitable for LDM/STM. + if (!MI->getOperand(OpNum).isReg()) + return true; + const MachineOperand &MO = MI->getOperand(OpNum); + unsigned RegBegin = MO.getReg(); + // This takes advantage of the 2 operand-ness of ldm/stm and that we've + // already got the operands in registers that are operands to the + // inline asm statement. + + O << "{" << ARMInstPrinter::getRegisterName(RegBegin); + + // FIXME: The register allocator not only may not have given us the + // registers in sequence, but may not be in ascending registers. This + // will require changes in the register allocator that'll need to be + // propagated down here if the operands change. + unsigned RegOps = OpNum + 1; + while (MI->getOperand(RegOps).isReg()) { + O << ", " + << ARMInstPrinter::getRegisterName(MI->getOperand(RegOps).getReg()); + RegOps++; + } + + O << "}"; + + return false; + } + case 'R': // The most significant register of a pair. + case 'Q': { // The least significant register of a pair. + if (OpNum == 0) + return true; + const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1); + if (!FlagsOP.isImm()) + return true; + unsigned Flags = FlagsOP.getImm(); + unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); + if (NumVals != 2) + return true; + unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1; + if (RegOp >= MI->getNumOperands()) + return true; + const MachineOperand &MO = MI->getOperand(RegOp); + if (!MO.isReg()) + return true; + unsigned Reg = MO.getReg(); + O << ARMInstPrinter::getRegisterName(Reg); + return false; + } + + // These modifiers are not yet supported. + case 'p': // The high single-precision register of a VFP double-precision + // register. + case 'e': // The low doubleword register of a NEON quad register. + case 'f': // The high doubleword register of a NEON quad register. + case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1. + case 'H': // The highest-numbered register of a pair. + return true; + } + } + + printOperand(MI, OpNum, O); + return false; +} + +bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNum, unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + case 'A': // A memory operand for a VLD1/VST1 instruction. + default: return true; // Unknown modifier. + case 'm': // The base register of a memory operand. + if (!MI->getOperand(OpNum).isReg()) + return true; + O << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg()); + return false; + } + } + + const MachineOperand &MO = MI->getOperand(OpNum); + assert(MO.isReg() && "unexpected inline asm memory operand"); + O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]"; + return false; +} + +void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { + if (Subtarget->isTargetDarwin()) { + Reloc::Model RelocM = TM.getRelocationModel(); + if (RelocM == Reloc::PIC_ || RelocM == Reloc::DynamicNoPIC) { + // Declare all the text sections up front (before the DWARF sections + // emitted by AsmPrinter::doInitialization) so the assembler will keep + // them together at the beginning of the object file. This helps + // avoid out-of-range branches that are due a fundamental limitation of + // the way symbol offsets are encoded with the current Darwin ARM + // relocations. + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>( + getObjFileLowering()); + OutStreamer.SwitchSection(TLOFMacho.getTextSection()); + OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection()); + OutStreamer.SwitchSection(TLOFMacho.getConstTextCoalSection()); + if (RelocM == Reloc::DynamicNoPIC) { + const MCSection *sect = + OutContext.getMachOSection("__TEXT", "__symbol_stub4", + MCSectionMachO::S_SYMBOL_STUBS, + 12, SectionKind::getText()); + OutStreamer.SwitchSection(sect); + } else { + const MCSection *sect = + OutContext.getMachOSection("__TEXT", "__picsymbolstub4", + MCSectionMachO::S_SYMBOL_STUBS, + 16, SectionKind::getText()); + OutStreamer.SwitchSection(sect); + } + const MCSection *StaticInitSect = + OutContext.getMachOSection("__TEXT", "__StaticInit", + MCSectionMachO::S_REGULAR | + MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, + SectionKind::getText()); + OutStreamer.SwitchSection(StaticInitSect); + } + } + + // Use unified assembler syntax. + OutStreamer.EmitAssemblerFlag(MCAF_SyntaxUnified); + + // Emit ARM Build Attributes + if (Subtarget->isTargetELF()) { + + emitAttributes(); + } +} + + +void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { + if (Subtarget->isTargetDarwin()) { + // All darwin targets use mach-o. + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + MachineModuleInfoMachO &MMIMacho = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + + // Output non-lazy-pointers for external and common global variables. + MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList(); + + if (!Stubs.empty()) { + // Switch with ".non_lazy_symbol_pointer" directive. + OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); + EmitAlignment(2); + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$stub: + OutStreamer.EmitLabel(Stubs[i].first); + // .indirect_symbol _foo + MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second; + OutStreamer.EmitSymbolAttribute(MCSym.getPointer(),MCSA_IndirectSymbol); + + if (MCSym.getInt()) + // External to current translation unit. + OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); + else + // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs; however, sometimes the types are local to the file. + // We need to fill in the value for the NLP in those cases. + OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), + OutContext), + 4/*size*/, 0/*addrspace*/); + } + + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + Stubs = MMIMacho.GetHiddenGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(getObjFileLowering().getDataSection()); + EmitAlignment(2); + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$stub: + OutStreamer.EmitLabel(Stubs[i].first); + // .long _foo + OutStreamer.EmitValue(MCSymbolRefExpr:: + Create(Stubs[i].second.getPointer(), + OutContext), + 4/*size*/, 0/*addrspace*/); + } + + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never + // generates code that does this, it is always safe to set. + OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + } +} + +//===----------------------------------------------------------------------===// +// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile() +// FIXME: +// The following seem like one-off assembler flags, but they actually need +// to appear in the .ARM.attributes section in ELF. +// Instead of subclassing the MCELFStreamer, we do the work here. + +void ARMAsmPrinter::emitAttributes() { + + emitARMAttributeSection(); + + /* GAS expect .fpu to be emitted, regardless of VFP build attribute */ + bool emitFPU = false; + AttributeEmitter *AttrEmitter; + if (OutStreamer.hasRawTextSupport()) { + AttrEmitter = new AsmAttributeEmitter(OutStreamer); + emitFPU = true; + } else { + MCObjectStreamer &O = static_cast<MCObjectStreamer&>(OutStreamer); + AttrEmitter = new ObjectAttributeEmitter(O); + } + + AttrEmitter->MaybeSwitchVendor("aeabi"); + + std::string CPUString = Subtarget->getCPUString(); + + if (CPUString == "cortex-a8" || + Subtarget->isCortexA8()) { + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a8"); + AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7); + AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile, + ARMBuildAttrs::ApplicationProfile); + AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use, + ARMBuildAttrs::Allowed); + AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use, + ARMBuildAttrs::AllowThumb32); + // Fixme: figure out when this is emitted. + //AttrEmitter->EmitAttribute(ARMBuildAttrs::WMMX_arch, + // ARMBuildAttrs::AllowWMMXv1); + // + + /// ADD additional Else-cases here! + } else if (CPUString == "xscale") { + AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TEJ); + AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use, + ARMBuildAttrs::Allowed); + AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use, + ARMBuildAttrs::Allowed); + } else if (CPUString == "generic") { + // FIXME: Why these defaults? + AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T); + AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use, + ARMBuildAttrs::Allowed); + AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use, + ARMBuildAttrs::Allowed); + } + + if (Subtarget->hasNEON() && emitFPU) { + /* NEON is not exactly a VFP architecture, but GAS emit one of + * neon/vfpv3/vfpv2 for .fpu parameters */ + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon"); + /* If emitted for NEON, omit from VFP below, since you can have both + * NEON and VFP in build attributes but only one .fpu */ + emitFPU = false; + } + + /* VFPv3 + .fpu */ + if (Subtarget->hasVFP3()) { + AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch, + ARMBuildAttrs::AllowFPv3A); + if (emitFPU) + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv3"); + + /* VFPv2 + .fpu */ + } else if (Subtarget->hasVFP2()) { + AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch, + ARMBuildAttrs::AllowFPv2); + if (emitFPU) + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv2"); + } + + /* TODO: ARMBuildAttrs::Allowed is not completely accurate, + * since NEON can have 1 (allowed) or 2 (MAC operations) */ + if (Subtarget->hasNEON()) { + AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch, + ARMBuildAttrs::Allowed); + } + + // Signal various FP modes. + if (!UnsafeFPMath) { + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_denormal, + ARMBuildAttrs::Allowed); + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_exceptions, + ARMBuildAttrs::Allowed); + } + + if (NoInfsFPMath && NoNaNsFPMath) + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model, + ARMBuildAttrs::Allowed); + else + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model, + ARMBuildAttrs::AllowIEE754); + + // FIXME: add more flags to ARMBuildAttrs.h + // 8-bytes alignment stuff. + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_needed, 1); + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1); + + // Hard float. Use both S and D registers and conform to AAPCS-VFP. + if (Subtarget->isAAPCS_ABI() && FloatABIType == FloatABI::Hard) { + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_HardFP_use, 3); + AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_VFP_args, 1); + } + // FIXME: Should we signal R9 usage? + + if (Subtarget->hasDivide()) + AttrEmitter->EmitAttribute(ARMBuildAttrs::DIV_use, 1); + + AttrEmitter->Finish(); + delete AttrEmitter; +} + +void ARMAsmPrinter::emitARMAttributeSection() { + // <format-version> + // [ <section-length> "vendor-name" + // [ <file-tag> <size> <attribute>* + // | <section-tag> <size> <section-number>* 0 <attribute>* + // | <symbol-tag> <size> <symbol-number>* 0 <attribute>* + // ]+ + // ]* + + if (OutStreamer.hasRawTextSupport()) + return; + + const ARMElfTargetObjectFile &TLOFELF = + static_cast<const ARMElfTargetObjectFile &> + (getObjFileLowering()); + + OutStreamer.SwitchSection(TLOFELF.getAttributesSection()); + + // Format version + OutStreamer.EmitIntValue(0x41, 1); +} + +//===----------------------------------------------------------------------===// + +static MCSymbol *getPICLabel(const char *Prefix, unsigned FunctionNumber, + unsigned LabelId, MCContext &Ctx) { + + MCSymbol *Label = Ctx.GetOrCreateSymbol(Twine(Prefix) + + "PC" + Twine(FunctionNumber) + "_" + Twine(LabelId)); + return Label; +} + +static MCSymbolRefExpr::VariantKind +getModifierVariantKind(ARMCP::ARMCPModifier Modifier) { + switch (Modifier) { + default: llvm_unreachable("Unknown modifier!"); + case ARMCP::no_modifier: return MCSymbolRefExpr::VK_None; + case ARMCP::TLSGD: return MCSymbolRefExpr::VK_ARM_TLSGD; + case ARMCP::TPOFF: return MCSymbolRefExpr::VK_ARM_TPOFF; + case ARMCP::GOTTPOFF: return MCSymbolRefExpr::VK_ARM_GOTTPOFF; + case ARMCP::GOT: return MCSymbolRefExpr::VK_ARM_GOT; + case ARMCP::GOTOFF: return MCSymbolRefExpr::VK_ARM_GOTOFF; + } + return MCSymbolRefExpr::VK_None; +} + +MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) { + bool isIndirect = Subtarget->isTargetDarwin() && + Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()); + if (!isIndirect) + return Mang->getSymbol(GV); + + // FIXME: Remove this when Darwin transition to @GOT like syntax. + MCSymbol *MCSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoMachO &MMIMachO = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + MachineModuleInfoImpl::StubValueTy &StubSym = + GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym) : + MMIMachO.getGVStubEntry(MCSym); + if (StubSym.getPointer() == 0) + StubSym = MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + return MCSym; +} + +void ARMAsmPrinter:: +EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { + int Size = TM.getTargetData()->getTypeAllocSize(MCPV->getType()); + + ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV); + + MCSymbol *MCSym; + if (ACPV->isLSDA()) { + SmallString<128> Str; + raw_svector_ostream OS(Str); + OS << MAI->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber(); + MCSym = OutContext.GetOrCreateSymbol(OS.str()); + } else if (ACPV->isBlockAddress()) { + const BlockAddress *BA = + cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress(); + MCSym = GetBlockAddressSymbol(BA); + } else if (ACPV->isGlobalValue()) { + const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV(); + MCSym = GetARMGVSymbol(GV); + } else if (ACPV->isMachineBasicBlock()) { + const MachineBasicBlock *MBB = cast<ARMConstantPoolMBB>(ACPV)->getMBB(); + MCSym = MBB->getSymbol(); + } else { + assert(ACPV->isExtSymbol() && "unrecognized constant pool value"); + const char *Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol(); + MCSym = GetExternalSymbolSymbol(Sym); + } + + // Create an MCSymbol for the reference. + const MCExpr *Expr = + MCSymbolRefExpr::Create(MCSym, getModifierVariantKind(ACPV->getModifier()), + OutContext); + + if (ACPV->getPCAdjustment()) { + MCSymbol *PCLabel = getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), + ACPV->getLabelId(), + OutContext); + const MCExpr *PCRelExpr = MCSymbolRefExpr::Create(PCLabel, OutContext); + PCRelExpr = + MCBinaryExpr::CreateAdd(PCRelExpr, + MCConstantExpr::Create(ACPV->getPCAdjustment(), + OutContext), + OutContext); + if (ACPV->mustAddCurrentAddress()) { + // We want "(<expr> - .)", but MC doesn't have a concept of the '.' + // label, so just emit a local label end reference that instead. + MCSymbol *DotSym = OutContext.CreateTempSymbol(); + OutStreamer.EmitLabel(DotSym); + const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext); + PCRelExpr = MCBinaryExpr::CreateSub(PCRelExpr, DotExpr, OutContext); + } + Expr = MCBinaryExpr::CreateSub(Expr, PCRelExpr, OutContext); + } + OutStreamer.EmitValue(Expr, Size); +} + +void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) { + unsigned Opcode = MI->getOpcode(); + int OpNum = 1; + if (Opcode == ARM::BR_JTadd) + OpNum = 2; + else if (Opcode == ARM::BR_JTm) + OpNum = 3; + + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id + unsigned JTI = MO1.getIndex(); + + // Tag the jump table appropriately for precise disassembly. + OutStreamer.EmitJumpTable32Region(); + + // Emit a label for the jump table. + MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); + OutStreamer.EmitLabel(JTISymbol); + + // Emit each entry of the table. + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + + for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { + MachineBasicBlock *MBB = JTBBs[i]; + // Construct an MCExpr for the entry. We want a value of the form: + // (BasicBlockAddr - TableBeginAddr) + // + // For example, a table with entries jumping to basic blocks BB0 and BB1 + // would look like: + // LJTI_0_0: + // .word (LBB0 - LJTI_0_0) + // .word (LBB1 - LJTI_0_0) + const MCExpr *Expr = MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext); + + if (TM.getRelocationModel() == Reloc::PIC_) + Expr = MCBinaryExpr::CreateSub(Expr, MCSymbolRefExpr::Create(JTISymbol, + OutContext), + OutContext); + // If we're generating a table of Thumb addresses in static relocation + // model, we need to add one to keep interworking correctly. + else if (AFI->isThumbFunction()) + Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(1,OutContext), + OutContext); + OutStreamer.EmitValue(Expr, 4); + } +} + +void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) { + unsigned Opcode = MI->getOpcode(); + int OpNum = (Opcode == ARM::t2BR_JT) ? 2 : 1; + const MachineOperand &MO1 = MI->getOperand(OpNum); + const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id + unsigned JTI = MO1.getIndex(); + + // Emit a label for the jump table. + if (MI->getOpcode() == ARM::t2TBB_JT) { + OutStreamer.EmitJumpTable8Region(); + } else if (MI->getOpcode() == ARM::t2TBH_JT) { + OutStreamer.EmitJumpTable16Region(); + } else { + OutStreamer.EmitJumpTable32Region(); + } + + MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm()); + OutStreamer.EmitLabel(JTISymbol); + + // Emit each entry of the table. + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + unsigned OffsetWidth = 4; + if (MI->getOpcode() == ARM::t2TBB_JT) + OffsetWidth = 1; + else if (MI->getOpcode() == ARM::t2TBH_JT) + OffsetWidth = 2; + + for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { + MachineBasicBlock *MBB = JTBBs[i]; + const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::Create(MBB->getSymbol(), + OutContext); + // If this isn't a TBB or TBH, the entries are direct branch instructions. + if (OffsetWidth == 4) { + MCInst BrInst; + BrInst.setOpcode(ARM::t2B); + BrInst.addOperand(MCOperand::CreateExpr(MBBSymbolExpr)); + BrInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + BrInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(BrInst); + continue; + } + // Otherwise it's an offset from the dispatch instruction. Construct an + // MCExpr for the entry. We want a value of the form: + // (BasicBlockAddr - TableBeginAddr) / 2 + // + // For example, a TBB table with entries jumping to basic blocks BB0 and BB1 + // would look like: + // LJTI_0_0: + // .byte (LBB0 - LJTI_0_0) / 2 + // .byte (LBB1 - LJTI_0_0) / 2 + const MCExpr *Expr = + MCBinaryExpr::CreateSub(MBBSymbolExpr, + MCSymbolRefExpr::Create(JTISymbol, OutContext), + OutContext); + Expr = MCBinaryExpr::CreateDiv(Expr, MCConstantExpr::Create(2, OutContext), + OutContext); + OutStreamer.EmitValue(Expr, OffsetWidth); + } +} + +void ARMAsmPrinter::PrintDebugValueComment(const MachineInstr *MI, + raw_ostream &OS) { + unsigned NOps = MI->getNumOperands(); + assert(NOps==4); + OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; + // cast away const; DIetc do not take const operands for some reason. + DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); + OS << V.getName(); + OS << " <- "; + // Frame address. Currently handles register +- offset only. + assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); + OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS); + OS << ']'; + OS << "+"; + printOperand(MI, NOps-2, OS); +} + +static void populateADROperands(MCInst &Inst, unsigned Dest, + const MCSymbol *Label, + unsigned pred, unsigned ccreg, + MCContext &Ctx) { + const MCExpr *SymbolExpr = MCSymbolRefExpr::Create(Label, Ctx); + Inst.addOperand(MCOperand::CreateReg(Dest)); + Inst.addOperand(MCOperand::CreateExpr(SymbolExpr)); + // Add predicate operands. + Inst.addOperand(MCOperand::CreateImm(pred)); + Inst.addOperand(MCOperand::CreateReg(ccreg)); +} + +void ARMAsmPrinter::EmitPatchedInstruction(const MachineInstr *MI, + unsigned Opcode) { + MCInst TmpInst; + + // Emit the instruction as usual, just patch the opcode. + LowerARMMachineInstrToMCInst(MI, TmpInst, *this); + TmpInst.setOpcode(Opcode); + OutStreamer.EmitInstruction(TmpInst); +} + +void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { + assert(MI->getFlag(MachineInstr::FrameSetup) && + "Only instruction which are involved into frame setup code are allowed"); + + const MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>(); + + unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned Opc = MI->getOpcode(); + unsigned SrcReg, DstReg; + + if (Opc == ARM::tPUSH || Opc == ARM::tLDRpci) { + // Two special cases: + // 1) tPUSH does not have src/dst regs. + // 2) for Thumb1 code we sometimes materialize the constant via constpool + // load. Yes, this is pretty fragile, but for now I don't see better + // way... :( + SrcReg = DstReg = ARM::SP; + } else { + SrcReg = MI->getOperand(1).getReg(); + DstReg = MI->getOperand(0).getReg(); + } + + // Try to figure out the unwinding opcode out of src / dst regs. + if (MI->getDesc().mayStore()) { + // Register saves. + assert(DstReg == ARM::SP && + "Only stack pointer as a destination reg is supported"); + + SmallVector<unsigned, 4> RegList; + // Skip src & dst reg, and pred ops. + unsigned StartOp = 2 + 2; + // Use all the operands. + unsigned NumOffset = 0; + + switch (Opc) { + default: + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + case ARM::tPUSH: + // Special case here: no src & dst reg, but two extra imp ops. + StartOp = 2; NumOffset = 2; + case ARM::STMDB_UPD: + case ARM::t2STMDB_UPD: + case ARM::VSTMDDB_UPD: + assert(SrcReg == ARM::SP && + "Only stack pointer as a source reg is supported"); + for (unsigned i = StartOp, NumOps = MI->getNumOperands() - NumOffset; + i != NumOps; ++i) + RegList.push_back(MI->getOperand(i).getReg()); + break; + case ARM::STR_PRE_IMM: + case ARM::STR_PRE_REG: + assert(MI->getOperand(2).getReg() == ARM::SP && + "Only stack pointer as a source reg is supported"); + RegList.push_back(SrcReg); + break; + } + OutStreamer.EmitRegSave(RegList, Opc == ARM::VSTMDDB_UPD); + } else { + // Changes of stack / frame pointer. + if (SrcReg == ARM::SP) { + int64_t Offset = 0; + switch (Opc) { + default: + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + case ARM::MOVr: + Offset = 0; + break; + case ARM::ADDri: + Offset = -MI->getOperand(2).getImm(); + break; + case ARM::SUBri: + Offset = MI->getOperand(2).getImm(); + break; + case ARM::tSUBspi: + Offset = MI->getOperand(2).getImm()*4; + break; + case ARM::tADDspi: + case ARM::tADDrSPi: + Offset = -MI->getOperand(2).getImm()*4; + break; + case ARM::tLDRpci: { + // Grab the constpool index and check, whether it corresponds to + // original or cloned constpool entry. + unsigned CPI = MI->getOperand(1).getIndex(); + const MachineConstantPool *MCP = MF.getConstantPool(); + if (CPI >= MCP->getConstants().size()) + CPI = AFI.getOriginalCPIdx(CPI); + assert(CPI != -1U && "Invalid constpool index"); + + // Derive the actual offset. + const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI]; + assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry"); + // FIXME: Check for user, it should be "add" instruction! + Offset = -cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue(); + break; + } + } + + if (DstReg == FramePtr && FramePtr != ARM::SP) + // Set-up of the frame pointer. Positive values correspond to "add" + // instruction. + OutStreamer.EmitSetFP(FramePtr, ARM::SP, -Offset); + else if (DstReg == ARM::SP) { + // Change of SP by an offset. Positive values correspond to "sub" + // instruction. + OutStreamer.EmitPad(Offset); + } else { + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + } + } else if (DstReg == ARM::SP) { + // FIXME: .movsp goes here + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + } + else { + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + } + } +} + +extern cl::opt<bool> EnableARMEHABI; + +// Simple pseudo-instructions have their lowering (with expansion to real +// instructions) auto-generated. +#include "ARMGenMCPseudoLowering.inc" + +void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { + if (MI->getOpcode() != ARM::CONSTPOOL_ENTRY) + OutStreamer.EmitCodeRegion(); + + // Emit unwinding stuff for frame-related instructions + if (EnableARMEHABI && MI->getFlag(MachineInstr::FrameSetup)) + EmitUnwindingInstruction(MI); + + // Do any auto-generated pseudo lowerings. + if (emitPseudoExpansionLowering(OutStreamer, MI)) + return; + + assert(!convertAddSubFlagsOpcode(MI->getOpcode()) && + "Pseudo flag setting opcode should be expanded early"); + + // Check for manual lowerings. + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case ARM::t2MOVi32imm: assert(0 && "Should be lowered by thumb2it pass"); + case ARM::DBG_VALUE: { + if (isVerbose() && OutStreamer.hasRawTextSupport()) { + SmallString<128> TmpStr; + raw_svector_ostream OS(TmpStr); + PrintDebugValueComment(MI, OS); + OutStreamer.EmitRawText(StringRef(OS.str())); + } + return; + } + case ARM::LEApcrel: + case ARM::tLEApcrel: + case ARM::t2LEApcrel: { + // FIXME: Need to also handle globals and externals + MCInst TmpInst; + TmpInst.setOpcode(MI->getOpcode() == ARM::t2LEApcrel ? ARM::t2ADR + : (MI->getOpcode() == ARM::tLEApcrel ? ARM::tADR + : ARM::ADR)); + populateADROperands(TmpInst, MI->getOperand(0).getReg(), + GetCPISymbol(MI->getOperand(1).getIndex()), + MI->getOperand(2).getImm(), MI->getOperand(3).getReg(), + OutContext); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::LEApcrelJT: + case ARM::tLEApcrelJT: + case ARM::t2LEApcrelJT: { + MCInst TmpInst; + TmpInst.setOpcode(MI->getOpcode() == ARM::t2LEApcrelJT ? ARM::t2ADR + : (MI->getOpcode() == ARM::tLEApcrelJT ? ARM::tADR + : ARM::ADR)); + populateADROperands(TmpInst, MI->getOperand(0).getReg(), + GetARMJTIPICJumpTableLabel2(MI->getOperand(1).getIndex(), + MI->getOperand(2).getImm()), + MI->getOperand(3).getImm(), MI->getOperand(4).getReg(), + OutContext); + OutStreamer.EmitInstruction(TmpInst); + return; + } + // Darwin call instructions are just normal call instructions with different + // clobber semantics (they clobber R9). + case ARM::BXr9_CALL: + case ARM::BX_CALL: { + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::LR)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::BX); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + OutStreamer.EmitInstruction(TmpInst); + } + return; + } + case ARM::tBXr9_CALL: + case ARM::tBX_CALL: { + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::LR)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tBX); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + return; + } + case ARM::BMOVPCRXr9_CALL: + case ARM::BMOVPCRX_CALL: { + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::LR)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + return; + } + case ARM::MOVi16_ga_pcrel: + case ARM::t2MOVi16_ga_pcrel: { + MCInst TmpInst; + TmpInst.setOpcode(Opc == ARM::MOVi16_ga_pcrel? ARM::MOVi16 : ARM::t2MOVi16); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + + unsigned TF = MI->getOperand(1).getTargetFlags(); + bool isPIC = TF == ARMII::MO_LO16_NONLAZY_PIC; + const GlobalValue *GV = MI->getOperand(1).getGlobal(); + MCSymbol *GVSym = GetARMGVSymbol(GV); + const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext); + if (isPIC) { + MCSymbol *LabelSym = getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), + MI->getOperand(2).getImm(), OutContext); + const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext); + unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4; + const MCExpr *PCRelExpr = + ARMMCExpr::CreateLower16(MCBinaryExpr::CreateSub(GVSymExpr, + MCBinaryExpr::CreateAdd(LabelSymExpr, + MCConstantExpr::Create(PCAdj, OutContext), + OutContext), OutContext), OutContext); + TmpInst.addOperand(MCOperand::CreateExpr(PCRelExpr)); + } else { + const MCExpr *RefExpr= ARMMCExpr::CreateLower16(GVSymExpr, OutContext); + TmpInst.addOperand(MCOperand::CreateExpr(RefExpr)); + } + + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::MOVTi16_ga_pcrel: + case ARM::t2MOVTi16_ga_pcrel: { + MCInst TmpInst; + TmpInst.setOpcode(Opc == ARM::MOVTi16_ga_pcrel + ? ARM::MOVTi16 : ARM::t2MOVTi16); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); + + unsigned TF = MI->getOperand(2).getTargetFlags(); + bool isPIC = TF == ARMII::MO_HI16_NONLAZY_PIC; + const GlobalValue *GV = MI->getOperand(2).getGlobal(); + MCSymbol *GVSym = GetARMGVSymbol(GV); + const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext); + if (isPIC) { + MCSymbol *LabelSym = getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), + MI->getOperand(3).getImm(), OutContext); + const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext); + unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4; + const MCExpr *PCRelExpr = + ARMMCExpr::CreateUpper16(MCBinaryExpr::CreateSub(GVSymExpr, + MCBinaryExpr::CreateAdd(LabelSymExpr, + MCConstantExpr::Create(PCAdj, OutContext), + OutContext), OutContext), OutContext); + TmpInst.addOperand(MCOperand::CreateExpr(PCRelExpr)); + } else { + const MCExpr *RefExpr= ARMMCExpr::CreateUpper16(GVSymExpr, OutContext); + TmpInst.addOperand(MCOperand::CreateExpr(RefExpr)); + } + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::tPICADD: { + // This is a pseudo op for a label + instruction sequence, which looks like: + // LPC0: + // add r0, pc + // This adds the address of LPC0 to r0. + + // Emit the label. + OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), MI->getOperand(2).getImm(), + OutContext)); + + // Form and emit the add. + MCInst AddInst; + AddInst.setOpcode(ARM::tADDhirr); + AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + AddInst.addOperand(MCOperand::CreateReg(ARM::PC)); + // Add predicate operands. + AddInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + AddInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(AddInst); + return; + } + case ARM::PICADD: { + // This is a pseudo op for a label + instruction sequence, which looks like: + // LPC0: + // add r0, pc, r0 + // This adds the address of LPC0 to r0. + + // Emit the label. + OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), MI->getOperand(2).getImm(), + OutContext)); + + // Form and emit the add. + MCInst AddInst; + AddInst.setOpcode(ARM::ADDrr); + AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + AddInst.addOperand(MCOperand::CreateReg(ARM::PC)); + AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); + // Add predicate operands. + AddInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm())); + AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(4).getReg())); + // Add 's' bit operand (always reg0 for this) + AddInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(AddInst); + return; + } + case ARM::PICSTR: + case ARM::PICSTRB: + case ARM::PICSTRH: + case ARM::PICLDR: + case ARM::PICLDRB: + case ARM::PICLDRH: + case ARM::PICLDRSB: + case ARM::PICLDRSH: { + // This is a pseudo op for a label + instruction sequence, which looks like: + // LPC0: + // OP r0, [pc, r0] + // The LCP0 label is referenced by a constant pool entry in order to get + // a PC-relative address at the ldr instruction. + + // Emit the label. + OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), MI->getOperand(2).getImm(), + OutContext)); + + // Form and emit the load + unsigned Opcode; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + case ARM::PICSTR: Opcode = ARM::STRrs; break; + case ARM::PICSTRB: Opcode = ARM::STRBrs; break; + case ARM::PICSTRH: Opcode = ARM::STRH; break; + case ARM::PICLDR: Opcode = ARM::LDRrs; break; + case ARM::PICLDRB: Opcode = ARM::LDRBrs; break; + case ARM::PICLDRH: Opcode = ARM::LDRH; break; + case ARM::PICLDRSB: Opcode = ARM::LDRSB; break; + case ARM::PICLDRSH: Opcode = ARM::LDRSH; break; + } + MCInst LdStInst; + LdStInst.setOpcode(Opcode); + LdStInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + LdStInst.addOperand(MCOperand::CreateReg(ARM::PC)); + LdStInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); + LdStInst.addOperand(MCOperand::CreateImm(0)); + // Add predicate operands. + LdStInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm())); + LdStInst.addOperand(MCOperand::CreateReg(MI->getOperand(4).getReg())); + OutStreamer.EmitInstruction(LdStInst); + + return; + } + case ARM::CONSTPOOL_ENTRY: { + /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool + /// in the function. The first operand is the ID# for this instruction, the + /// second is the index into the MachineConstantPool that this is, the third + /// is the size in bytes of this constant pool entry. + unsigned LabelId = (unsigned)MI->getOperand(0).getImm(); + unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex(); + + EmitAlignment(2); + + // Mark the constant pool entry as data if we're not already in a data + // region. + OutStreamer.EmitDataRegion(); + OutStreamer.EmitLabel(GetCPISymbol(LabelId)); + + const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx]; + if (MCPE.isMachineConstantPoolEntry()) + EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); + else + EmitGlobalConstant(MCPE.Val.ConstVal); + return; + } + case ARM::t2BR_JT: { + // Lower and emit the instruction itself, then the jump table following it. + MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + // Output the data for the jump table itself + EmitJump2Table(MI); + return; + } + case ARM::t2TBB_JT: { + // Lower and emit the instruction itself, then the jump table following it. + MCInst TmpInst; + + TmpInst.setOpcode(ARM::t2TBB); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + // Output the data for the jump table itself + EmitJump2Table(MI); + // Make sure the next instruction is 2-byte aligned. + EmitAlignment(1); + return; + } + case ARM::t2TBH_JT: { + // Lower and emit the instruction itself, then the jump table following it. + MCInst TmpInst; + + TmpInst.setOpcode(ARM::t2TBH); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + // Output the data for the jump table itself + EmitJump2Table(MI); + return; + } + case ARM::tBR_JTr: + case ARM::BR_JTr: { + // Lower and emit the instruction itself, then the jump table following it. + // mov pc, target + MCInst TmpInst; + unsigned Opc = MI->getOpcode() == ARM::BR_JTr ? + ARM::MOVr : ARM::tMOVr; + TmpInst.setOpcode(Opc); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + if (Opc == ARM::MOVr) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + + // Make sure the Thumb jump table is 4-byte aligned. + if (Opc == ARM::tMOVr) + EmitAlignment(2); + + // Output the data for the jump table itself + EmitJumpTable(MI); + return; + } + case ARM::BR_JTm: { + // Lower and emit the instruction itself, then the jump table following it. + // ldr pc, target + MCInst TmpInst; + if (MI->getOperand(1).getReg() == 0) { + // literal offset + TmpInst.setOpcode(ARM::LDRi12); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm())); + } else { + TmpInst.setOpcode(ARM::LDRrs); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); + TmpInst.addOperand(MCOperand::CreateImm(0)); + } + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + + // Output the data for the jump table itself + EmitJumpTable(MI); + return; + } + case ARM::BR_JTadd: { + // Lower and emit the instruction itself, then the jump table following it. + // add pc, target, idx + MCInst TmpInst; + TmpInst.setOpcode(ARM::ADDrr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Add 's' bit operand (always reg0 for this) + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + + // Output the data for the jump table itself + EmitJumpTable(MI); + return; + } + case ARM::TRAP: { + // Non-Darwin binutils don't yet support the "trap" mnemonic. + // FIXME: Remove this special case when they do. + if (!Subtarget->isTargetDarwin()) { + //.long 0xe7ffdefe @ trap + uint32_t Val = 0xe7ffdefeUL; + OutStreamer.AddComment("trap"); + OutStreamer.EmitIntValue(Val, 4); + return; + } + break; + } + case ARM::tTRAP: { + // Non-Darwin binutils don't yet support the "trap" mnemonic. + // FIXME: Remove this special case when they do. + if (!Subtarget->isTargetDarwin()) { + //.short 57086 @ trap + uint16_t Val = 0xdefe; + OutStreamer.AddComment("trap"); + OutStreamer.EmitIntValue(Val, 2); + return; + } + break; + } + case ARM::t2Int_eh_sjlj_setjmp: + case ARM::t2Int_eh_sjlj_setjmp_nofp: + case ARM::tInt_eh_sjlj_setjmp: { + // Two incoming args: GPR:$src, GPR:$val + // mov $val, pc + // adds $val, #7 + // str $val, [$src, #4] + // movs r0, #0 + // b 1f + // movs r0, #1 + // 1: + unsigned SrcReg = MI->getOperand(0).getReg(); + unsigned ValReg = MI->getOperand(1).getReg(); + MCSymbol *Label = GetARMSJLJEHLabel(); + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVr); + TmpInst.addOperand(MCOperand::CreateReg(ValReg)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.AddComment("eh_setjmp begin"); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tADDi3); + TmpInst.addOperand(MCOperand::CreateReg(ValReg)); + // 's' bit operand + TmpInst.addOperand(MCOperand::CreateReg(ARM::CPSR)); + TmpInst.addOperand(MCOperand::CreateReg(ValReg)); + TmpInst.addOperand(MCOperand::CreateImm(7)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tSTRi); + TmpInst.addOperand(MCOperand::CreateReg(ValReg)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + // The offset immediate is #4. The operand value is scaled by 4 for the + // tSTR instruction. + TmpInst.addOperand(MCOperand::CreateImm(1)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVi8); + TmpInst.addOperand(MCOperand::CreateReg(ARM::R0)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::CPSR)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + const MCExpr *SymbolExpr = MCSymbolRefExpr::Create(Label, OutContext); + MCInst TmpInst; + TmpInst.setOpcode(ARM::tB); + TmpInst.addOperand(MCOperand::CreateExpr(SymbolExpr)); + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVi8); + TmpInst.addOperand(MCOperand::CreateReg(ARM::R0)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::CPSR)); + TmpInst.addOperand(MCOperand::CreateImm(1)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.AddComment("eh_setjmp end"); + OutStreamer.EmitInstruction(TmpInst); + } + OutStreamer.EmitLabel(Label); + return; + } + + case ARM::Int_eh_sjlj_setjmp_nofp: + case ARM::Int_eh_sjlj_setjmp: { + // Two incoming args: GPR:$src, GPR:$val + // add $val, pc, #8 + // str $val, [$src, #+4] + // mov r0, #0 + // add pc, pc, #0 + // mov r0, #1 + unsigned SrcReg = MI->getOperand(0).getReg(); + unsigned ValReg = MI->getOperand(1).getReg(); + + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::ADDri); + TmpInst.addOperand(MCOperand::CreateReg(ValReg)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateImm(8)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // 's' bit operand (always reg0 for this). + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.AddComment("eh_setjmp begin"); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::STRi12); + TmpInst.addOperand(MCOperand::CreateReg(ValReg)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + TmpInst.addOperand(MCOperand::CreateImm(4)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVi); + TmpInst.addOperand(MCOperand::CreateReg(ARM::R0)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // 's' bit operand (always reg0 for this). + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::ADDri); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateReg(ARM::PC)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // 's' bit operand (always reg0 for this). + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVi); + TmpInst.addOperand(MCOperand::CreateReg(ARM::R0)); + TmpInst.addOperand(MCOperand::CreateImm(1)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // 's' bit operand (always reg0 for this). + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.AddComment("eh_setjmp end"); + OutStreamer.EmitInstruction(TmpInst); + } + return; + } + case ARM::Int_eh_sjlj_longjmp: { + // ldr sp, [$src, #8] + // ldr $scratch, [$src, #4] + // ldr r7, [$src] + // bx $scratch + unsigned SrcReg = MI->getOperand(0).getReg(); + unsigned ScratchReg = MI->getOperand(1).getReg(); + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::LDRi12); + TmpInst.addOperand(MCOperand::CreateReg(ARM::SP)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + TmpInst.addOperand(MCOperand::CreateImm(8)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::LDRi12); + TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + TmpInst.addOperand(MCOperand::CreateImm(4)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::LDRi12); + TmpInst.addOperand(MCOperand::CreateReg(ARM::R7)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::BX); + TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + return; + } + case ARM::tInt_eh_sjlj_longjmp: { + // ldr $scratch, [$src, #8] + // mov sp, $scratch + // ldr $scratch, [$src, #4] + // ldr r7, [$src] + // bx $scratch + unsigned SrcReg = MI->getOperand(0).getReg(); + unsigned ScratchReg = MI->getOperand(1).getReg(); + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tLDRi); + TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + // The offset immediate is #8. The operand value is scaled by 4 for the + // tLDR instruction. + TmpInst.addOperand(MCOperand::CreateImm(2)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::SP)); + TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tLDRi); + TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + TmpInst.addOperand(MCOperand::CreateImm(1)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tLDRr); + TmpInst.addOperand(MCOperand::CreateReg(ARM::R7)); + TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + { + MCInst TmpInst; + TmpInst.setOpcode(ARM::tBX); + TmpInst.addOperand(MCOperand::CreateReg(ScratchReg)); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + } + return; + } + } + + MCInst TmpInst; + LowerARMMachineInstrToMCInst(MI, TmpInst, *this); + + OutStreamer.EmitInstruction(TmpInst); +} + +//===----------------------------------------------------------------------===// +// Target Registry Stuff +//===----------------------------------------------------------------------===// + +// Force static initialization. +extern "C" void LLVMInitializeARMAsmPrinter() { + RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget); + RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget); +} + diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h new file mode 100644 index 0000000..7741fc4 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h @@ -0,0 +1,127 @@ +//===-- ARMAsmPrinter.h - Print machine code to an ARM .s file ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// ARM Assembly printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMASMPRINTER_H +#define ARMASMPRINTER_H + +#include "ARM.h" +#include "ARMTargetMachine.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { + +class MCOperand; + +namespace ARM { + enum DW_ISA { + DW_ISA_ARM_thumb = 1, + DW_ISA_ARM_arm = 2 + }; +} + +class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter { + + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when printing asm code for different targets. + const ARMSubtarget *Subtarget; + + /// AFI - Keep a pointer to ARMFunctionInfo for the current + /// MachineFunction. + ARMFunctionInfo *AFI; + + /// MCP - Keep a pointer to constantpool entries of the current + /// MachineFunction. + const MachineConstantPool *MCP; + +public: + explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL) { + Subtarget = &TM.getSubtarget<ARMSubtarget>(); + } + + virtual const char *getPassName() const { + return "ARM Assembly Printer"; + } + + void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O, + const char *Modifier = 0); + + virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O); + virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O); + + void EmitJumpTable(const MachineInstr *MI); + void EmitJump2Table(const MachineInstr *MI); + virtual void EmitInstruction(const MachineInstr *MI); + bool runOnMachineFunction(MachineFunction &F); + + virtual void EmitConstantPool() {} // we emit constant pools customly! + virtual void EmitFunctionEntryLabel(); + void EmitStartOfAsmFile(Module &M); + void EmitEndOfAsmFile(Module &M); + + // lowerOperand - Convert a MachineOperand into the equivalent MCOperand. + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp); + +private: + // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile() + void emitAttributes(); + + // Helper for ELF .o only + void emitARMAttributeSection(); + + // Generic helper used to emit e.g. ARMv5 mul pseudos + void EmitPatchedInstruction(const MachineInstr *MI, unsigned TargetOpc); + + void EmitUnwindingInstruction(const MachineInstr *MI); + + // emitPseudoExpansionLowering - tblgen'erated. + bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, + const MachineInstr *MI); + +public: + void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); + + MachineLocation getDebugValueLocation(const MachineInstr *MI) const; + + /// EmitDwarfRegOp - Emit dwarf register operation. + virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const; + + virtual unsigned getISAEncoding() { + // ARM/Darwin adds ISA to the DWARF info for each function. + if (!Subtarget->isTargetDarwin()) + return 0; + return Subtarget->isThumb() ? + llvm::ARM::DW_ISA_ARM_thumb : llvm::ARM::DW_ISA_ARM_arm; + } + + MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol); + MCSymbol *GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2, + const MachineBasicBlock *MBB) const; + MCSymbol *GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const; + + MCSymbol *GetARMSJLJEHLabel(void) const; + + MCSymbol *GetARMGVSymbol(const GlobalValue *GV); + + /// EmitMachineConstantPoolValue - Print a machine constantpool value to + /// the .s file. + virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV); +}; +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp new file mode 100644 index 0000000..408edfc --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -0,0 +1,2830 @@ +//===- ARMBaseInstrInfo.cpp - ARM Instruction Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Base ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARMBaseInstrInfo.h" +#include "ARM.h" +#include "ARMConstantPoolValue.h" +#include "ARMHazardRecognizer.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMRegisterInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/GlobalValue.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/ADT/STLExtras.h" + +#define GET_INSTRINFO_CTOR +#include "ARMGenInstrInfo.inc" + +using namespace llvm; + +static cl::opt<bool> +EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden, + cl::desc("Enable ARM 2-addr to 3-addr conv")); + +static cl::opt<bool> +WidenVMOVS("widen-vmovs", cl::Hidden, + cl::desc("Widen ARM vmovs to vmovd when possible")); + +/// ARM_MLxEntry - Record information about MLA / MLS instructions. +struct ARM_MLxEntry { + unsigned MLxOpc; // MLA / MLS opcode + unsigned MulOpc; // Expanded multiplication opcode + unsigned AddSubOpc; // Expanded add / sub opcode + bool NegAcc; // True if the acc is negated before the add / sub. + bool HasLane; // True if instruction has an extra "lane" operand. +}; + +static const ARM_MLxEntry ARM_MLxTable[] = { + // MLxOpc, MulOpc, AddSubOpc, NegAcc, HasLane + // fp scalar ops + { ARM::VMLAS, ARM::VMULS, ARM::VADDS, false, false }, + { ARM::VMLSS, ARM::VMULS, ARM::VSUBS, false, false }, + { ARM::VMLAD, ARM::VMULD, ARM::VADDD, false, false }, + { ARM::VMLSD, ARM::VMULD, ARM::VSUBD, false, false }, + { ARM::VNMLAS, ARM::VNMULS, ARM::VSUBS, true, false }, + { ARM::VNMLSS, ARM::VMULS, ARM::VSUBS, true, false }, + { ARM::VNMLAD, ARM::VNMULD, ARM::VSUBD, true, false }, + { ARM::VNMLSD, ARM::VMULD, ARM::VSUBD, true, false }, + + // fp SIMD ops + { ARM::VMLAfd, ARM::VMULfd, ARM::VADDfd, false, false }, + { ARM::VMLSfd, ARM::VMULfd, ARM::VSUBfd, false, false }, + { ARM::VMLAfq, ARM::VMULfq, ARM::VADDfq, false, false }, + { ARM::VMLSfq, ARM::VMULfq, ARM::VSUBfq, false, false }, + { ARM::VMLAslfd, ARM::VMULslfd, ARM::VADDfd, false, true }, + { ARM::VMLSslfd, ARM::VMULslfd, ARM::VSUBfd, false, true }, + { ARM::VMLAslfq, ARM::VMULslfq, ARM::VADDfq, false, true }, + { ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true }, +}; + +ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI) + : ARMGenInstrInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), + Subtarget(STI) { + for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) { + if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) + assert(false && "Duplicated entries?"); + MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc); + MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc); + } +} + +// Use a ScoreboardHazardRecognizer for prepass ARM scheduling. TargetInstrImpl +// currently defaults to no prepass hazard recognizer. +ScheduleHazardRecognizer *ARMBaseInstrInfo:: +CreateTargetHazardRecognizer(const TargetMachine *TM, + const ScheduleDAG *DAG) const { + if (usePreRAHazardRecognizer()) { + const InstrItineraryData *II = TM->getInstrItineraryData(); + return new ScoreboardHazardRecognizer(II, DAG, "pre-RA-sched"); + } + return TargetInstrInfoImpl::CreateTargetHazardRecognizer(TM, DAG); +} + +ScheduleHazardRecognizer *ARMBaseInstrInfo:: +CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const { + if (Subtarget.isThumb2() || Subtarget.hasVFP2()) + return (ScheduleHazardRecognizer *) + new ARMHazardRecognizer(II, *this, getRegisterInfo(), Subtarget, DAG); + return TargetInstrInfoImpl::CreateTargetPostRAHazardRecognizer(II, DAG); +} + +MachineInstr * +ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { + // FIXME: Thumb2 support. + + if (!EnableARM3Addr) + return NULL; + + MachineInstr *MI = MBBI; + MachineFunction &MF = *MI->getParent()->getParent(); + uint64_t TSFlags = MI->getDesc().TSFlags; + bool isPre = false; + switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) { + default: return NULL; + case ARMII::IndexModePre: + isPre = true; + break; + case ARMII::IndexModePost: + break; + } + + // Try splitting an indexed load/store to an un-indexed one plus an add/sub + // operation. + unsigned MemOpc = getUnindexedOpcode(MI->getOpcode()); + if (MemOpc == 0) + return NULL; + + MachineInstr *UpdateMI = NULL; + MachineInstr *MemMI = NULL; + unsigned AddrMode = (TSFlags & ARMII::AddrModeMask); + const MCInstrDesc &MCID = MI->getDesc(); + unsigned NumOps = MCID.getNumOperands(); + bool isLoad = !MCID.mayStore(); + const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0); + const MachineOperand &Base = MI->getOperand(2); + const MachineOperand &Offset = MI->getOperand(NumOps-3); + unsigned WBReg = WB.getReg(); + unsigned BaseReg = Base.getReg(); + unsigned OffReg = Offset.getReg(); + unsigned OffImm = MI->getOperand(NumOps-2).getImm(); + ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm(); + switch (AddrMode) { + default: + assert(false && "Unknown indexed op!"); + return NULL; + case ARMII::AddrMode2: { + bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub; + unsigned Amt = ARM_AM::getAM2Offset(OffImm); + if (OffReg == 0) { + if (ARM_AM::getSOImmVal(Amt) == -1) + // Can't encode it in a so_imm operand. This transformation will + // add more than 1 instruction. Abandon! + return NULL; + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBri : ARM::ADDri), WBReg) + .addReg(BaseReg).addImm(Amt) + .addImm(Pred).addReg(0).addReg(0); + } else if (Amt != 0) { + ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm); + unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt); + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBrsi : ARM::ADDrsi), WBReg) + .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc) + .addImm(Pred).addReg(0).addReg(0); + } else + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg) + .addReg(BaseReg).addReg(OffReg) + .addImm(Pred).addReg(0).addReg(0); + break; + } + case ARMII::AddrMode3 : { + bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub; + unsigned Amt = ARM_AM::getAM3Offset(OffImm); + if (OffReg == 0) + // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand. + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBri : ARM::ADDri), WBReg) + .addReg(BaseReg).addImm(Amt) + .addImm(Pred).addReg(0).addReg(0); + else + UpdateMI = BuildMI(MF, MI->getDebugLoc(), + get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg) + .addReg(BaseReg).addReg(OffReg) + .addImm(Pred).addReg(0).addReg(0); + break; + } + } + + std::vector<MachineInstr*> NewMIs; + if (isPre) { + if (isLoad) + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc), MI->getOperand(0).getReg()) + .addReg(WBReg).addImm(0).addImm(Pred); + else + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc)).addReg(MI->getOperand(1).getReg()) + .addReg(WBReg).addReg(0).addImm(0).addImm(Pred); + NewMIs.push_back(MemMI); + NewMIs.push_back(UpdateMI); + } else { + if (isLoad) + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc), MI->getOperand(0).getReg()) + .addReg(BaseReg).addImm(0).addImm(Pred); + else + MemMI = BuildMI(MF, MI->getDebugLoc(), + get(MemOpc)).addReg(MI->getOperand(1).getReg()) + .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred); + if (WB.isDead()) + UpdateMI->getOperand(0).setIsDead(); + NewMIs.push_back(UpdateMI); + NewMIs.push_back(MemMI); + } + + // Transfer LiveVariables states, kill / dead info. + if (LV) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + unsigned Reg = MO.getReg(); + + LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); + if (MO.isDef()) { + MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI; + if (MO.isDead()) + LV->addVirtualRegisterDead(Reg, NewMI); + } + if (MO.isUse() && MO.isKill()) { + for (unsigned j = 0; j < 2; ++j) { + // Look at the two new MI's in reverse order. + MachineInstr *NewMI = NewMIs[j]; + if (!NewMI->readsRegister(Reg)) + continue; + LV->addVirtualRegisterKilled(Reg, NewMI); + if (VI.removeKill(MI)) + VI.Kills.push_back(NewMI); + break; + } + } + } + } + } + + MFI->insert(MBBI, NewMIs[1]); + MFI->insert(MBBI, NewMIs[0]); + return NewMIs[0]; +} + +// Branch analysis. +bool +ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + unsigned LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (isUncondBranchOpcode(LastOpc)) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(LastInst->getOperand(1)); + Cond.push_back(LastInst->getOperand(2)); + return false; + } + return true; // Can't handle indirect branch. + } + + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If AllowModify is true and the block ends with two or more unconditional + // branches, delete all but the first unconditional branch. + if (AllowModify && isUncondBranchOpcode(LastOpc)) { + while (isUncondBranchOpcode(SecondLastOpc)) { + LastInst->eraseFromParent(); + LastInst = SecondLastInst; + LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + // Return now the only terminator is an unconditional branch. + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else { + SecondLastInst = I; + SecondLastOpc = SecondLastInst->getOpcode(); + } + } + } + + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + return true; + + // If the block ends with a B and a Bcc, handle it. + if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + Cond.push_back(SecondLastInst->getOperand(1)); + Cond.push_back(SecondLastInst->getOperand(2)); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } + + // If the block ends with two unconditional branches, handle it. The second + // one is not executed, so remove it. + if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return false; + } + + // ...likewise if it ends with a branch table followed by an unconditional + // branch. The branch folder can create these, and we must get rid of them for + // correctness of Thumb constant islands. + if ((isJumpTableBranchOpcode(SecondLastOpc) || + isIndirectBranchOpcode(SecondLastOpc)) && + isUncondBranchOpcode(LastOpc)) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return true; + } + + // Otherwise, can't handle this. + return true; +} + + +unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) return 0; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } + if (!isUncondBranchOpcode(I->getOpcode()) && + !isCondBranchOpcode(I->getOpcode())) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) return 1; + --I; + if (!isCondBranchOpcode(I->getOpcode())) + return 1; + + // Remove the branch. + I->eraseFromParent(); + return 2; +} + +unsigned +ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { + ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>(); + int BOpc = !AFI->isThumbFunction() + ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB); + int BccOpc = !AFI->isThumbFunction() + ? ARM::Bcc : (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc); + bool isThumb = AFI->isThumbFunction() || AFI->isThumb2Function(); + + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 2 || Cond.size() == 0) && + "ARM branch conditions have two components!"); + + if (FBB == 0) { + if (Cond.empty()) { // Unconditional branch? + if (isThumb) + BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).addImm(ARMCC::AL).addReg(0); + else + BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); + } else + BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); + return 1; + } + + // Two-way conditional branch. + BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); + if (isThumb) + BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).addImm(ARMCC::AL).addReg(0); + else + BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); + return 2; +} + +bool ARMBaseInstrInfo:: +ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm(); + Cond[0].setImm(ARMCC::getOppositeCondition(CC)); + return false; +} + +bool ARMBaseInstrInfo:: +PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const { + unsigned Opc = MI->getOpcode(); + if (isUncondBranchOpcode(Opc)) { + MI->setDesc(get(getMatchingCondBranchOpcode(Opc))); + MI->addOperand(MachineOperand::CreateImm(Pred[0].getImm())); + MI->addOperand(MachineOperand::CreateReg(Pred[1].getReg(), false)); + return true; + } + + int PIdx = MI->findFirstPredOperandIdx(); + if (PIdx != -1) { + MachineOperand &PMO = MI->getOperand(PIdx); + PMO.setImm(Pred[0].getImm()); + MI->getOperand(PIdx+1).setReg(Pred[1].getReg()); + return true; + } + return false; +} + +bool ARMBaseInstrInfo:: +SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const { + if (Pred1.size() > 2 || Pred2.size() > 2) + return false; + + ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm(); + ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm(); + if (CC1 == CC2) + return true; + + switch (CC1) { + default: + return false; + case ARMCC::AL: + return true; + case ARMCC::HS: + return CC2 == ARMCC::HI; + case ARMCC::LS: + return CC2 == ARMCC::LO || CC2 == ARMCC::EQ; + case ARMCC::GE: + return CC2 == ARMCC::GT; + case ARMCC::LE: + return CC2 == ARMCC::LT; + } +} + +bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + // FIXME: This confuses implicit_def with optional CPSR def. + const MCInstrDesc &MCID = MI->getDesc(); + if (!MCID.getImplicitDefs() && !MCID.hasOptionalDef()) + return false; + + bool Found = false; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.getReg() == ARM::CPSR) { + Pred.push_back(MO); + Found = true; + } + } + + return Found; +} + +/// isPredicable - Return true if the specified instruction can be predicated. +/// By default, this returns true for every instruction with a +/// PredicateOperand. +bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const { + const MCInstrDesc &MCID = MI->getDesc(); + if (!MCID.isPredicable()) + return false; + + if ((MCID.TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) { + ARMFunctionInfo *AFI = + MI->getParent()->getParent()->getInfo<ARMFunctionInfo>(); + return AFI->isThumb2Function(); + } + return true; +} + +/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing. +LLVM_ATTRIBUTE_NOINLINE +static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT, + unsigned JTI); +static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT, + unsigned JTI) { + assert(JTI < JT.size()); + return JT[JTI].MBBs.size(); +} + +/// GetInstSize - Return the size of the specified MachineInstr. +/// +unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { + const MachineBasicBlock &MBB = *MI->getParent(); + const MachineFunction *MF = MBB.getParent(); + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + + const MCInstrDesc &MCID = MI->getDesc(); + if (MCID.getSize()) + return MCID.getSize(); + + // If this machine instr is an inline asm, measure it. + if (MI->getOpcode() == ARM::INLINEASM) + return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); + if (MI->isLabel()) + return 0; + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + case TargetOpcode::PROLOG_LABEL: + case TargetOpcode::EH_LABEL: + case TargetOpcode::DBG_VALUE: + return 0; + case ARM::MOVi16_ga_pcrel: + case ARM::MOVTi16_ga_pcrel: + case ARM::t2MOVi16_ga_pcrel: + case ARM::t2MOVTi16_ga_pcrel: + return 4; + case ARM::MOVi32imm: + case ARM::t2MOVi32imm: + return 8; + case ARM::CONSTPOOL_ENTRY: + // If this machine instr is a constant pool entry, its size is recorded as + // operand #2. + return MI->getOperand(2).getImm(); + case ARM::Int_eh_sjlj_longjmp: + return 16; + case ARM::tInt_eh_sjlj_longjmp: + return 10; + case ARM::Int_eh_sjlj_setjmp: + case ARM::Int_eh_sjlj_setjmp_nofp: + return 20; + case ARM::tInt_eh_sjlj_setjmp: + case ARM::t2Int_eh_sjlj_setjmp: + case ARM::t2Int_eh_sjlj_setjmp_nofp: + return 12; + case ARM::BR_JTr: + case ARM::BR_JTm: + case ARM::BR_JTadd: + case ARM::tBR_JTr: + case ARM::t2BR_JT: + case ARM::t2TBB_JT: + case ARM::t2TBH_JT: { + // These are jumptable branches, i.e. a branch followed by an inlined + // jumptable. The size is 4 + 4 * number of entries. For TBB, each + // entry is one byte; TBH two byte each. + unsigned EntrySize = (Opc == ARM::t2TBB_JT) + ? 1 : ((Opc == ARM::t2TBH_JT) ? 2 : 4); + unsigned NumOps = MCID.getNumOperands(); + MachineOperand JTOP = + MI->getOperand(NumOps - (MCID.isPredicable() ? 3 : 2)); + unsigned JTI = JTOP.getIndex(); + const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + assert(MJTI != 0); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + assert(JTI < JT.size()); + // Thumb instructions are 2 byte aligned, but JT entries are 4 byte + // 4 aligned. The assembler / linker may add 2 byte padding just before + // the JT entries. The size does not include this padding; the + // constant islands pass does separate bookkeeping for it. + // FIXME: If we know the size of the function is less than (1 << 16) *2 + // bytes, we can use 16-bit entries instead. Then there won't be an + // alignment issue. + unsigned InstSize = (Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT) ? 2 : 4; + unsigned NumEntries = getNumJTEntries(JT, JTI); + if (Opc == ARM::t2TBB_JT && (NumEntries & 1)) + // Make sure the instruction that follows TBB is 2-byte aligned. + // FIXME: Constant island pass should insert an "ALIGN" instruction + // instead. + ++NumEntries; + return NumEntries * EntrySize + InstSize; + } + default: + // Otherwise, pseudo-instruction sizes are zero. + return 0; + } + return 0; // Not reached +} + +void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + bool GPRDest = ARM::GPRRegClass.contains(DestReg); + bool GPRSrc = ARM::GPRRegClass.contains(SrcReg); + + if (GPRDest && GPRSrc) { + AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)))); + return; + } + + bool SPRDest = ARM::SPRRegClass.contains(DestReg); + bool SPRSrc = ARM::SPRRegClass.contains(SrcReg); + + unsigned Opc = 0; + if (SPRDest && SPRSrc) + Opc = ARM::VMOVS; + else if (GPRDest && SPRSrc) + Opc = ARM::VMOVRS; + else if (SPRDest && GPRSrc) + Opc = ARM::VMOVSR; + else if (ARM::DPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVD; + else if (ARM::QPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VORRq; + + if (Opc) { + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg); + MIB.addReg(SrcReg, getKillRegState(KillSrc)); + if (Opc == ARM::VORRq) + MIB.addReg(SrcReg, getKillRegState(KillSrc)); + AddDefaultPred(MIB); + return; + } + + // Generate instructions for VMOVQQ and VMOVQQQQ pseudos in place. + if (ARM::QQPRRegClass.contains(DestReg, SrcReg) || + ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + assert(ARM::qsub_0 + 3 == ARM::qsub_3 && "Expected contiguous enum."); + unsigned EndSubReg = ARM::QQPRRegClass.contains(DestReg, SrcReg) ? + ARM::qsub_1 : ARM::qsub_3; + for (unsigned i = ARM::qsub_0, e = EndSubReg + 1; i != e; ++i) { + unsigned Dst = TRI->getSubReg(DestReg, i); + unsigned Src = TRI->getSubReg(SrcReg, i); + MachineInstrBuilder Mov = + AddDefaultPred(BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VORRq)) + .addReg(Dst, RegState::Define) + .addReg(Src, getKillRegState(KillSrc)) + .addReg(Src, getKillRegState(KillSrc))); + if (i == EndSubReg) { + Mov->addRegisterDefined(DestReg, TRI); + if (KillSrc) + Mov->addRegisterKilled(SrcReg, TRI); + } + } + return; + } + llvm_unreachable("Impossible reg-to-reg copy"); +} + +static const +MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, + unsigned Reg, unsigned SubIdx, unsigned State, + const TargetRegisterInfo *TRI) { + if (!SubIdx) + return MIB.addReg(Reg, State); + + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); + return MIB.addReg(Reg, State, SubIdx); +} + +void ARMBaseInstrInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo( + PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOStore, + MFI.getObjectSize(FI), + Align); + + switch (RC->getSize()) { + case 4: + if (ARM::GPRRegClass.hasSubClassEq(RC)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else if (ARM::SPRRegClass.hasSubClassEq(RC)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else + llvm_unreachable("Unknown reg class!"); + break; + case 8: + if (ARM::DPRRegClass.hasSubClassEq(RC)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else + llvm_unreachable("Unknown reg class!"); + break; + case 16: + if (ARM::QPRRegClass.hasSubClassEq(RC)) { + if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64Pseudo)) + .addFrameIndex(FI).addImm(16) + .addReg(SrcReg, getKillRegState(isKill)) + .addMemOperand(MMO)); + } else { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI) + .addMemOperand(MMO)); + } + } else + llvm_unreachable("Unknown reg class!"); + break; + case 32: + if (ARM::QQPRRegClass.hasSubClassEq(RC)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + // FIXME: It's possible to only store part of the QQ register if the + // spilled def has a sub-register index. + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo)) + .addFrameIndex(FI).addImm(16) + .addReg(SrcReg, getKillRegState(isKill)) + .addMemOperand(MMO)); + } else { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA)) + .addFrameIndex(FI)) + .addMemOperand(MMO); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); + AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); + } + } else + llvm_unreachable("Unknown reg class!"); + break; + case 64: + if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA)) + .addFrameIndex(FI)) + .addMemOperand(MMO); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI); + AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI); + } else + llvm_unreachable("Unknown reg class!"); + break; + default: + llvm_unreachable("Unknown reg class!"); + } +} + +unsigned +ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case ARM::STRrs: + case ARM::t2STRs: // FIXME: don't use t2STRs to access frame. + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isReg() && + MI->getOperand(3).isImm() && + MI->getOperand(2).getReg() == 0 && + MI->getOperand(3).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::STRi12: + case ARM::t2STRi12: + case ARM::tSTRspi: + case ARM::VSTRD: + case ARM::VSTRS: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::VST1q64Pseudo: + if (MI->getOperand(0).isFI() && + MI->getOperand(2).getSubReg() == 0) { + FrameIndex = MI->getOperand(0).getIndex(); + return MI->getOperand(2).getReg(); + } + break; + case ARM::VSTMQIA: + if (MI->getOperand(1).isFI() && + MI->getOperand(0).getSubReg() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + + return 0; +} + +unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { + const MachineMemOperand *Dummy; + return MI->getDesc().mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex); +} + +void ARMBaseInstrInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + MachineMemOperand *MMO = + MF.getMachineMemOperand( + MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), + Align); + + switch (RC->getSize()) { + case 4: + if (ARM::GPRRegClass.hasSubClassEq(RC)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + + } else if (ARM::SPRRegClass.hasSubClassEq(RC)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else + llvm_unreachable("Unknown reg class!"); + break; + case 8: + if (ARM::DPRRegClass.hasSubClassEq(RC)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } else + llvm_unreachable("Unknown reg class!"); + break; + case 16: + if (ARM::QPRRegClass.hasSubClassEq(RC)) { + if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64Pseudo), DestReg) + .addFrameIndex(FI).addImm(16) + .addMemOperand(MMO)); + } else { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg) + .addFrameIndex(FI) + .addMemOperand(MMO)); + } + } else + llvm_unreachable("Unknown reg class!"); + break; + case 32: + if (ARM::QQPRRegClass.hasSubClassEq(RC)) { + if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg) + .addFrameIndex(FI).addImm(16) + .addMemOperand(MMO)); + } else { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA)) + .addFrameIndex(FI)) + .addMemOperand(MMO); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); + MIB.addReg(DestReg, RegState::Define | RegState::Implicit); + } + } else + llvm_unreachable("Unknown reg class!"); + break; + case 64: + if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA)) + .addFrameIndex(FI)) + .addMemOperand(MMO); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::Define, TRI); + MIB.addReg(DestReg, RegState::Define | RegState::Implicit); + } else + llvm_unreachable("Unknown reg class!"); + break; + default: + llvm_unreachable("Unknown regclass!"); + } +} + +unsigned +ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case ARM::LDRrs: + case ARM::t2LDRs: // FIXME: don't use t2LDRs to access frame. + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isReg() && + MI->getOperand(3).isImm() && + MI->getOperand(2).getReg() == 0 && + MI->getOperand(3).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::LDRi12: + case ARM::t2LDRi12: + case ARM::tLDRspi: + case ARM::VLDRD: + case ARM::VLDRS: + if (MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && + MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::VLD1q64Pseudo: + if (MI->getOperand(1).isFI() && + MI->getOperand(0).getSubReg() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + case ARM::VLDMQIA: + if (MI->getOperand(1).isFI() && + MI->getOperand(0).getSubReg() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + + return 0; +} + +unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { + const MachineMemOperand *Dummy; + return MI->getDesc().mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex); +} + +bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const{ + // This hook gets to expand COPY instructions before they become + // copyPhysReg() calls. Look for VMOVS instructions that can legally be + // widened to VMOVD. We prefer the VMOVD when possible because it may be + // changed into a VORR that can go down the NEON pipeline. + if (!WidenVMOVS || !MI->isCopy()) + return false; + + // Look for a copy between even S-registers. That is where we keep floats + // when using NEON v2f32 instructions for f32 arithmetic. + unsigned DstRegS = MI->getOperand(0).getReg(); + unsigned SrcRegS = MI->getOperand(1).getReg(); + if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS)) + return false; + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + unsigned DstRegD = TRI->getMatchingSuperReg(DstRegS, ARM::ssub_0, + &ARM::DPRRegClass); + unsigned SrcRegD = TRI->getMatchingSuperReg(SrcRegS, ARM::ssub_0, + &ARM::DPRRegClass); + if (!DstRegD || !SrcRegD) + return false; + + // We want to widen this into a DstRegD = VMOVD SrcRegD copy. This is only + // legal if the COPY already defines the full DstRegD, and it isn't a + // sub-register insertion. + if (!MI->definesRegister(DstRegD, TRI) || MI->readsRegister(DstRegD, TRI)) + return false; + + // A dead copy shouldn't show up here, but reject it just in case. + if (MI->getOperand(0).isDead()) + return false; + + // All clear, widen the COPY. + DEBUG(dbgs() << "widening: " << *MI); + + // Get rid of the old <imp-def> of DstRegD. Leave it if it defines a Q-reg + // or some other super-register. + int ImpDefIdx = MI->findRegisterDefOperandIdx(DstRegD); + if (ImpDefIdx != -1) + MI->RemoveOperand(ImpDefIdx); + + // Change the opcode and operands. + MI->setDesc(get(ARM::VMOVD)); + MI->getOperand(0).setReg(DstRegD); + MI->getOperand(1).setReg(SrcRegD); + AddDefaultPred(MachineInstrBuilder(MI)); + + // We are now reading SrcRegD instead of SrcRegS. This may upset the + // register scavenger and machine verifier, so we need to indicate that we + // are reading an undefined value from SrcRegD, but a proper value from + // SrcRegS. + MI->getOperand(1).setIsUndef(); + MachineInstrBuilder(MI).addReg(SrcRegS, RegState::Implicit); + + // SrcRegD may actually contain an unrelated value in the ssub_1 + // sub-register. Don't kill it. Only kill the ssub_0 sub-register. + if (MI->getOperand(1).isKill()) { + MI->getOperand(1).setIsKill(false); + MI->addRegisterKilled(SrcRegS, TRI, true); + } + + DEBUG(dbgs() << "replaced by: " << *MI); + return true; +} + +MachineInstr* +ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, + int FrameIx, uint64_t Offset, + const MDNode *MDPtr, + DebugLoc DL) const { + MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::DBG_VALUE)) + .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr); + return &*MIB; +} + +/// Create a copy of a const pool value. Update CPI to the new index and return +/// the label UID. +static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { + MachineConstantPool *MCP = MF.getConstantPool(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI]; + assert(MCPE.isMachineConstantPoolEntry() && + "Expecting a machine constantpool entry!"); + ARMConstantPoolValue *ACPV = + static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal); + + unsigned PCLabelId = AFI->createPICLabelUId(); + ARMConstantPoolValue *NewCPV = 0; + // FIXME: The below assumes PIC relocation model and that the function + // is Thumb mode (t1 or t2). PCAdjustment would be 8 for ARM mode PIC, and + // zero for non-PIC in ARM or Thumb. The callers are all of thumb LDR + // instructions, so that's probably OK, but is PIC always correct when + // we get here? + if (ACPV->isGlobalValue()) + NewCPV = ARMConstantPoolConstant:: + Create(cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId, + ARMCP::CPValue, 4); + else if (ACPV->isExtSymbol()) + NewCPV = ARMConstantPoolSymbol:: + Create(MF.getFunction()->getContext(), + cast<ARMConstantPoolSymbol>(ACPV)->getSymbol(), PCLabelId, 4); + else if (ACPV->isBlockAddress()) + NewCPV = ARMConstantPoolConstant:: + Create(cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress(), PCLabelId, + ARMCP::CPBlockAddress, 4); + else if (ACPV->isLSDA()) + NewCPV = ARMConstantPoolConstant::Create(MF.getFunction(), PCLabelId, + ARMCP::CPLSDA, 4); + else if (ACPV->isMachineBasicBlock()) + NewCPV = ARMConstantPoolMBB:: + Create(MF.getFunction()->getContext(), + cast<ARMConstantPoolMBB>(ACPV)->getMBB(), PCLabelId, 4); + else + llvm_unreachable("Unexpected ARM constantpool value type!!"); + CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment()); + return PCLabelId; +} + +void ARMBaseInstrInfo:: +reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo &TRI) const { + unsigned Opcode = Orig->getOpcode(); + switch (Opcode) { + default: { + MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); + MI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); + MBB.insert(I, MI); + break; + } + case ARM::tLDRpci_pic: + case ARM::t2LDRpci_pic: { + MachineFunction &MF = *MBB.getParent(); + unsigned CPI = Orig->getOperand(1).getIndex(); + unsigned PCLabelId = duplicateCPV(MF, CPI); + MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode), + DestReg) + .addConstantPoolIndex(CPI).addImm(PCLabelId); + MIB->setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end()); + break; + } + } +} + +MachineInstr * +ARMBaseInstrInfo::duplicate(MachineInstr *Orig, MachineFunction &MF) const { + MachineInstr *MI = TargetInstrInfoImpl::duplicate(Orig, MF); + switch(Orig->getOpcode()) { + case ARM::tLDRpci_pic: + case ARM::t2LDRpci_pic: { + unsigned CPI = Orig->getOperand(1).getIndex(); + unsigned PCLabelId = duplicateCPV(MF, CPI); + Orig->getOperand(1).setIndex(CPI); + Orig->getOperand(2).setImm(PCLabelId); + break; + } + } + return MI; +} + +bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, + const MachineInstr *MI1, + const MachineRegisterInfo *MRI) const { + int Opcode = MI0->getOpcode(); + if (Opcode == ARM::t2LDRpci || + Opcode == ARM::t2LDRpci_pic || + Opcode == ARM::tLDRpci || + Opcode == ARM::tLDRpci_pic || + Opcode == ARM::MOV_ga_dyn || + Opcode == ARM::MOV_ga_pcrel || + Opcode == ARM::MOV_ga_pcrel_ldr || + Opcode == ARM::t2MOV_ga_dyn || + Opcode == ARM::t2MOV_ga_pcrel) { + if (MI1->getOpcode() != Opcode) + return false; + if (MI0->getNumOperands() != MI1->getNumOperands()) + return false; + + const MachineOperand &MO0 = MI0->getOperand(1); + const MachineOperand &MO1 = MI1->getOperand(1); + if (MO0.getOffset() != MO1.getOffset()) + return false; + + if (Opcode == ARM::MOV_ga_dyn || + Opcode == ARM::MOV_ga_pcrel || + Opcode == ARM::MOV_ga_pcrel_ldr || + Opcode == ARM::t2MOV_ga_dyn || + Opcode == ARM::t2MOV_ga_pcrel) + // Ignore the PC labels. + return MO0.getGlobal() == MO1.getGlobal(); + + const MachineFunction *MF = MI0->getParent()->getParent(); + const MachineConstantPool *MCP = MF->getConstantPool(); + int CPI0 = MO0.getIndex(); + int CPI1 = MO1.getIndex(); + const MachineConstantPoolEntry &MCPE0 = MCP->getConstants()[CPI0]; + const MachineConstantPoolEntry &MCPE1 = MCP->getConstants()[CPI1]; + bool isARMCP0 = MCPE0.isMachineConstantPoolEntry(); + bool isARMCP1 = MCPE1.isMachineConstantPoolEntry(); + if (isARMCP0 && isARMCP1) { + ARMConstantPoolValue *ACPV0 = + static_cast<ARMConstantPoolValue*>(MCPE0.Val.MachineCPVal); + ARMConstantPoolValue *ACPV1 = + static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal); + return ACPV0->hasSameValue(ACPV1); + } else if (!isARMCP0 && !isARMCP1) { + return MCPE0.Val.ConstVal == MCPE1.Val.ConstVal; + } + return false; + } else if (Opcode == ARM::PICLDR) { + if (MI1->getOpcode() != Opcode) + return false; + if (MI0->getNumOperands() != MI1->getNumOperands()) + return false; + + unsigned Addr0 = MI0->getOperand(1).getReg(); + unsigned Addr1 = MI1->getOperand(1).getReg(); + if (Addr0 != Addr1) { + if (!MRI || + !TargetRegisterInfo::isVirtualRegister(Addr0) || + !TargetRegisterInfo::isVirtualRegister(Addr1)) + return false; + + // This assumes SSA form. + MachineInstr *Def0 = MRI->getVRegDef(Addr0); + MachineInstr *Def1 = MRI->getVRegDef(Addr1); + // Check if the loaded value, e.g. a constantpool of a global address, are + // the same. + if (!produceSameValue(Def0, Def1, MRI)) + return false; + } + + for (unsigned i = 3, e = MI0->getNumOperands(); i != e; ++i) { + // %vreg12<def> = PICLDR %vreg11, 0, pred:14, pred:%noreg + const MachineOperand &MO0 = MI0->getOperand(i); + const MachineOperand &MO1 = MI1->getOperand(i); + if (!MO0.isIdenticalTo(MO1)) + return false; + } + return true; + } + + return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs); +} + +/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to +/// determine if two loads are loading from the same base address. It should +/// only return true if the base pointers are the same and the only differences +/// between the two addresses is the offset. It also returns the offsets by +/// reference. +bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, + int64_t &Offset2) const { + // Don't worry about Thumb: just ARM and Thumb2. + if (Subtarget.isThumb1Only()) return false; + + if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) + return false; + + switch (Load1->getMachineOpcode()) { + default: + return false; + case ARM::LDRi12: + case ARM::LDRBi12: + case ARM::LDRD: + case ARM::LDRH: + case ARM::LDRSB: + case ARM::LDRSH: + case ARM::VLDRD: + case ARM::VLDRS: + case ARM::t2LDRi8: + case ARM::t2LDRDi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRi12: + case ARM::t2LDRSHi12: + break; + } + + switch (Load2->getMachineOpcode()) { + default: + return false; + case ARM::LDRi12: + case ARM::LDRBi12: + case ARM::LDRD: + case ARM::LDRH: + case ARM::LDRSB: + case ARM::LDRSH: + case ARM::VLDRD: + case ARM::VLDRS: + case ARM::t2LDRi8: + case ARM::t2LDRDi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRi12: + case ARM::t2LDRSHi12: + break; + } + + // Check if base addresses and chain operands match. + if (Load1->getOperand(0) != Load2->getOperand(0) || + Load1->getOperand(4) != Load2->getOperand(4)) + return false; + + // Index should be Reg0. + if (Load1->getOperand(3) != Load2->getOperand(3)) + return false; + + // Determine the offsets. + if (isa<ConstantSDNode>(Load1->getOperand(1)) && + isa<ConstantSDNode>(Load2->getOperand(1))) { + Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getSExtValue(); + Offset2 = cast<ConstantSDNode>(Load2->getOperand(1))->getSExtValue(); + return true; + } + + return false; +} + +/// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to +/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should +/// be scheduled togther. On some targets if two loads are loading from +/// addresses in the same cache line, it's better if they are scheduled +/// together. This function takes two integers that represent the load offsets +/// from the common base address. It returns true if it decides it's desirable +/// to schedule the two loads together. "NumLoads" is the number of loads that +/// have already been scheduled after Load1. +bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const { + // Don't worry about Thumb: just ARM and Thumb2. + if (Subtarget.isThumb1Only()) return false; + + assert(Offset2 > Offset1); + + if ((Offset2 - Offset1) / 8 > 64) + return false; + + if (Load1->getMachineOpcode() != Load2->getMachineOpcode()) + return false; // FIXME: overly conservative? + + // Four loads in a row should be sufficient. + if (NumLoads >= 3) + return false; + + return true; +} + +bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + // Debug info is never a scheduling boundary. It's necessary to be explicit + // due to the special treatment of IT instructions below, otherwise a + // dbg_value followed by an IT will result in the IT instruction being + // considered a scheduling hazard, which is wrong. It should be the actual + // instruction preceding the dbg_value instruction(s), just like it is + // when debug info is not present. + if (MI->isDebugValue()) + return false; + + // Terminators and labels can't be scheduled around. + if (MI->getDesc().isTerminator() || MI->isLabel()) + return true; + + // Treat the start of the IT block as a scheduling boundary, but schedule + // t2IT along with all instructions following it. + // FIXME: This is a big hammer. But the alternative is to add all potential + // true and anti dependencies to IT block instructions as implicit operands + // to the t2IT instruction. The added compile time and complexity does not + // seem worth it. + MachineBasicBlock::const_iterator I = MI; + // Make sure to skip any dbg_value instructions + while (++I != MBB->end() && I->isDebugValue()) + ; + if (I != MBB->end() && I->getOpcode() == ARM::t2IT) + return true; + + // Don't attempt to schedule around any instruction that defines + // a stack-oriented pointer, as it's unlikely to be profitable. This + // saves compile time, because it doesn't require every single + // stack slot reference to depend on the instruction that does the + // modification. + if (MI->definesRegister(ARM::SP)) + return true; + + return false; +} + +bool ARMBaseInstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCycles, unsigned ExtraPredCycles, + const BranchProbability &Probability) const { + if (!NumCycles) + return false; + + // Attempt to estimate the relative costs of predication versus branching. + unsigned UnpredCost = Probability.getNumerator() * NumCycles; + UnpredCost /= Probability.getDenominator(); + UnpredCost += 1; // The branch itself + UnpredCost += Subtarget.getMispredictionPenalty() / 10; + + return (NumCycles + ExtraPredCycles) <= UnpredCost; +} + +bool ARMBaseInstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned TCycles, unsigned TExtra, + MachineBasicBlock &FMBB, + unsigned FCycles, unsigned FExtra, + const BranchProbability &Probability) const { + if (!TCycles || !FCycles) + return false; + + // Attempt to estimate the relative costs of predication versus branching. + unsigned TUnpredCost = Probability.getNumerator() * TCycles; + TUnpredCost /= Probability.getDenominator(); + + uint32_t Comp = Probability.getDenominator() - Probability.getNumerator(); + unsigned FUnpredCost = Comp * FCycles; + FUnpredCost /= Probability.getDenominator(); + + unsigned UnpredCost = TUnpredCost + FUnpredCost; + UnpredCost += 1; // The branch itself + UnpredCost += Subtarget.getMispredictionPenalty() / 10; + + return (TCycles + FCycles + TExtra + FExtra) <= UnpredCost; +} + +/// getInstrPredicate - If instruction is predicated, returns its predicate +/// condition, otherwise returns AL. It also returns the condition code +/// register by reference. +ARMCC::CondCodes +llvm::getInstrPredicate(const MachineInstr *MI, unsigned &PredReg) { + int PIdx = MI->findFirstPredOperandIdx(); + if (PIdx == -1) { + PredReg = 0; + return ARMCC::AL; + } + + PredReg = MI->getOperand(PIdx+1).getReg(); + return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm(); +} + + +int llvm::getMatchingCondBranchOpcode(int Opc) { + if (Opc == ARM::B) + return ARM::Bcc; + else if (Opc == ARM::tB) + return ARM::tBcc; + else if (Opc == ARM::t2B) + return ARM::t2Bcc; + + llvm_unreachable("Unknown unconditional branch opcode!"); + return 0; +} + + +/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether the +/// instruction is encoded with an 'S' bit is determined by the optional CPSR +/// def operand. +/// +/// This will go away once we can teach tblgen how to set the optional CPSR def +/// operand itself. +struct AddSubFlagsOpcodePair { + unsigned PseudoOpc; + unsigned MachineOpc; +}; + +static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = { + {ARM::ADDSri, ARM::ADDri}, + {ARM::ADDSrr, ARM::ADDrr}, + {ARM::ADDSrsi, ARM::ADDrsi}, + {ARM::ADDSrsr, ARM::ADDrsr}, + + {ARM::SUBSri, ARM::SUBri}, + {ARM::SUBSrr, ARM::SUBrr}, + {ARM::SUBSrsi, ARM::SUBrsi}, + {ARM::SUBSrsr, ARM::SUBrsr}, + + {ARM::RSBSri, ARM::RSBri}, + {ARM::RSBSrr, ARM::RSBrr}, + {ARM::RSBSrsi, ARM::RSBrsi}, + {ARM::RSBSrsr, ARM::RSBrsr}, + + {ARM::t2ADDSri, ARM::t2ADDri}, + {ARM::t2ADDSrr, ARM::t2ADDrr}, + {ARM::t2ADDSrs, ARM::t2ADDrs}, + + {ARM::t2SUBSri, ARM::t2SUBri}, + {ARM::t2SUBSrr, ARM::t2SUBrr}, + {ARM::t2SUBSrs, ARM::t2SUBrs}, + + {ARM::t2RSBSri, ARM::t2RSBri}, + {ARM::t2RSBSrs, ARM::t2RSBrs}, +}; + +unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) { + static const int NPairs = + sizeof(AddSubFlagsOpcodeMap) / sizeof(AddSubFlagsOpcodePair); + for (AddSubFlagsOpcodePair *OpcPair = &AddSubFlagsOpcodeMap[0], + *End = &AddSubFlagsOpcodeMap[NPairs]; OpcPair != End; ++OpcPair) { + if (OldOpc == OpcPair->PseudoOpc) { + return OpcPair->MachineOpc; + } + } + return 0; +} + +void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const ARMBaseInstrInfo &TII, unsigned MIFlags) { + bool isSub = NumBytes < 0; + if (isSub) NumBytes = -NumBytes; + + while (NumBytes) { + unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes); + unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt); + assert(ThisVal && "Didn't extract field correctly"); + + // We will handle these bits from offset, clear them. + NumBytes &= ~ThisVal; + + assert(ARM_AM::getSOImmVal(ThisVal) != -1 && "Bit extraction didn't work?"); + + // Build the new ADD / SUB. + unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri; + BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) + .addReg(BaseReg, RegState::Kill).addImm(ThisVal) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0) + .setMIFlags(MIFlags); + BaseReg = DestReg; + } +} + +bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) { + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MI.getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + bool isSub = false; + + // Memory operands in inline assembly always use AddrMode2. + if (Opcode == ARM::INLINEASM) + AddrMode = ARMII::AddrMode2; + + if (Opcode == ARM::ADDri) { + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + if (Offset == 0) { + // Turn it into a move. + MI.setDesc(TII.get(ARM::MOVr)); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.RemoveOperand(FrameRegIdx+1); + Offset = 0; + return true; + } else if (Offset < 0) { + Offset = -Offset; + isSub = true; + MI.setDesc(TII.get(ARM::SUBri)); + } + + // Common case: small offset, fits into instruction. + if (ARM_AM::getSOImmVal(Offset) != -1) { + // Replace the FrameIndex with sp / fp + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); + Offset = 0; + return true; + } + + // Otherwise, pull as much of the immedidate into this ADDri/SUBri + // as possible. + unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset); + unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt); + + // We will handle these bits from offset, clear them. + Offset &= ~ThisImmVal; + + // Get the properly encoded SOImmVal field. + assert(ARM_AM::getSOImmVal(ThisImmVal) != -1 && + "Bit extraction didn't work?"); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal); + } else { + unsigned ImmIdx = 0; + int InstrOffs = 0; + unsigned NumBits = 0; + unsigned Scale = 1; + switch (AddrMode) { + case ARMII::AddrMode_i12: { + ImmIdx = FrameRegIdx + 1; + InstrOffs = MI.getOperand(ImmIdx).getImm(); + NumBits = 12; + break; + } + case ARMII::AddrMode2: { + ImmIdx = FrameRegIdx+2; + InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs *= -1; + NumBits = 12; + break; + } + case ARMII::AddrMode3: { + ImmIdx = FrameRegIdx+2; + InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs *= -1; + NumBits = 8; + break; + } + case ARMII::AddrMode4: + case ARMII::AddrMode6: + // Can't fold any offset even if it's zero. + return false; + case ARMII::AddrMode5: { + ImmIdx = FrameRegIdx+1; + InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs *= -1; + NumBits = 8; + Scale = 4; + break; + } + default: + llvm_unreachable("Unsupported addressing mode!"); + break; + } + + Offset += InstrOffs * Scale; + assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!"); + if (Offset < 0) { + Offset = -Offset; + isSub = true; + } + + // Attempt to fold address comp. if opcode has offset bits + if (NumBits > 0) { + // Common case: small offset, fits into instruction. + MachineOperand &ImmOp = MI.getOperand(ImmIdx); + int ImmedOffset = Offset / Scale; + unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) { + // Replace the FrameIndex with sp + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + // FIXME: When addrmode2 goes away, this will simplify (like the + // T2 version), as the LDR.i12 versions don't need the encoding + // tricks for the offset value. + if (isSub) { + if (AddrMode == ARMII::AddrMode_i12) + ImmedOffset = -ImmedOffset; + else + ImmedOffset |= 1 << NumBits; + } + ImmOp.ChangeToImmediate(ImmedOffset); + Offset = 0; + return true; + } + + // Otherwise, it didn't fit. Pull in what we can to simplify the immed. + ImmedOffset = ImmedOffset & Mask; + if (isSub) { + if (AddrMode == ARMII::AddrMode_i12) + ImmedOffset = -ImmedOffset; + else + ImmedOffset |= 1 << NumBits; + } + ImmOp.ChangeToImmediate(ImmedOffset); + Offset &= ~(Mask*Scale); + } + } + + Offset = (isSub) ? -Offset : Offset; + return Offset == 0; +} + +bool ARMBaseInstrInfo:: +AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, int &CmpMask, + int &CmpValue) const { + switch (MI->getOpcode()) { + default: break; + case ARM::CMPri: + case ARM::t2CMPri: + SrcReg = MI->getOperand(0).getReg(); + CmpMask = ~0; + CmpValue = MI->getOperand(1).getImm(); + return true; + case ARM::TSTri: + case ARM::t2TSTri: + SrcReg = MI->getOperand(0).getReg(); + CmpMask = MI->getOperand(1).getImm(); + CmpValue = 0; + return true; + } + + return false; +} + +/// isSuitableForMask - Identify a suitable 'and' instruction that +/// operates on the given source register and applies the same mask +/// as a 'tst' instruction. Provide a limited look-through for copies. +/// When successful, MI will hold the found instruction. +static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg, + int CmpMask, bool CommonUse) { + switch (MI->getOpcode()) { + case ARM::ANDri: + case ARM::t2ANDri: + if (CmpMask != MI->getOperand(2).getImm()) + return false; + if (SrcReg == MI->getOperand(CommonUse ? 1 : 0).getReg()) + return true; + break; + case ARM::COPY: { + // Walk down one instruction which is potentially an 'and'. + const MachineInstr &Copy = *MI; + MachineBasicBlock::iterator AND( + llvm::next(MachineBasicBlock::iterator(MI))); + if (AND == MI->getParent()->end()) return false; + MI = AND; + return isSuitableForMask(MI, Copy.getOperand(0).getReg(), + CmpMask, true); + } + } + + return false; +} + +/// OptimizeCompareInstr - Convert the instruction supplying the argument to the +/// comparison into one that sets the zero bit in the flags register. +bool ARMBaseInstrInfo:: +OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, + int CmpValue, const MachineRegisterInfo *MRI) const { + if (CmpValue != 0) + return false; + + MachineRegisterInfo::def_iterator DI = MRI->def_begin(SrcReg); + if (llvm::next(DI) != MRI->def_end()) + // Only support one definition. + return false; + + MachineInstr *MI = &*DI; + + // Masked compares sometimes use the same register as the corresponding 'and'. + if (CmpMask != ~0) { + if (!isSuitableForMask(MI, SrcReg, CmpMask, false)) { + MI = 0; + for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(SrcReg), + UE = MRI->use_end(); UI != UE; ++UI) { + if (UI->getParent() != CmpInstr->getParent()) continue; + MachineInstr *PotentialAND = &*UI; + if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true)) + continue; + MI = PotentialAND; + break; + } + if (!MI) return false; + } + } + + // Conservatively refuse to convert an instruction which isn't in the same BB + // as the comparison. + if (MI->getParent() != CmpInstr->getParent()) + return false; + + // Check that CPSR isn't set between the comparison instruction and the one we + // want to change. + MachineBasicBlock::const_iterator I = CmpInstr, E = MI, + B = MI->getParent()->begin(); + + // Early exit if CmpInstr is at the beginning of the BB. + if (I == B) return false; + + --I; + for (; I != E; --I) { + const MachineInstr &Instr = *I; + + for (unsigned IO = 0, EO = Instr.getNumOperands(); IO != EO; ++IO) { + const MachineOperand &MO = Instr.getOperand(IO); + if (!MO.isReg()) continue; + + // This instruction modifies or uses CPSR after the one we want to + // change. We can't do this transformation. + if (MO.getReg() == ARM::CPSR) + return false; + } + + if (I == B) + // The 'and' is below the comparison instruction. + return false; + } + + // Set the "zero" bit in CPSR. + switch (MI->getOpcode()) { + default: break; + case ARM::RSBrr: + case ARM::RSBri: + case ARM::RSCrr: + case ARM::RSCri: + case ARM::ADDrr: + case ARM::ADDri: + case ARM::ADCrr: + case ARM::ADCri: + case ARM::SUBrr: + case ARM::SUBri: + case ARM::SBCrr: + case ARM::SBCri: + case ARM::t2RSBri: + case ARM::t2ADDrr: + case ARM::t2ADDri: + case ARM::t2ADCrr: + case ARM::t2ADCri: + case ARM::t2SUBrr: + case ARM::t2SUBri: + case ARM::t2SBCrr: + case ARM::t2SBCri: + case ARM::ANDrr: + case ARM::ANDri: + case ARM::t2ANDrr: + case ARM::t2ANDri: + case ARM::ORRrr: + case ARM::ORRri: + case ARM::t2ORRrr: + case ARM::t2ORRri: + case ARM::EORrr: + case ARM::EORri: + case ARM::t2EORrr: + case ARM::t2EORri: { + // Scan forward for the use of CPSR, if it's a conditional code requires + // checking of V bit, then this is not safe to do. If we can't find the + // CPSR use (i.e. used in another block), then it's not safe to perform + // the optimization. + bool isSafe = false; + I = CmpInstr; + E = MI->getParent()->end(); + while (!isSafe && ++I != E) { + const MachineInstr &Instr = *I; + for (unsigned IO = 0, EO = Instr.getNumOperands(); + !isSafe && IO != EO; ++IO) { + const MachineOperand &MO = Instr.getOperand(IO); + if (!MO.isReg() || MO.getReg() != ARM::CPSR) + continue; + if (MO.isDef()) { + isSafe = true; + break; + } + // Condition code is after the operand before CPSR. + ARMCC::CondCodes CC = (ARMCC::CondCodes)Instr.getOperand(IO-1).getImm(); + switch (CC) { + default: + isSafe = true; + break; + case ARMCC::VS: + case ARMCC::VC: + case ARMCC::GE: + case ARMCC::LT: + case ARMCC::GT: + case ARMCC::LE: + return false; + } + } + } + + if (!isSafe) + return false; + + // Toggle the optional operand to CPSR. + MI->getOperand(5).setReg(ARM::CPSR); + MI->getOperand(5).setIsDef(true); + CmpInstr->eraseFromParent(); + return true; + } + } + + return false; +} + +bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI, + MachineInstr *DefMI, unsigned Reg, + MachineRegisterInfo *MRI) const { + // Fold large immediates into add, sub, or, xor. + unsigned DefOpc = DefMI->getOpcode(); + if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm) + return false; + if (!DefMI->getOperand(1).isImm()) + // Could be t2MOVi32imm <ga:xx> + return false; + + if (!MRI->hasOneNonDBGUse(Reg)) + return false; + + unsigned UseOpc = UseMI->getOpcode(); + unsigned NewUseOpc = 0; + uint32_t ImmVal = (uint32_t)DefMI->getOperand(1).getImm(); + uint32_t SOImmValV1 = 0, SOImmValV2 = 0; + bool Commute = false; + switch (UseOpc) { + default: return false; + case ARM::SUBrr: + case ARM::ADDrr: + case ARM::ORRrr: + case ARM::EORrr: + case ARM::t2SUBrr: + case ARM::t2ADDrr: + case ARM::t2ORRrr: + case ARM::t2EORrr: { + Commute = UseMI->getOperand(2).getReg() != Reg; + switch (UseOpc) { + default: break; + case ARM::SUBrr: { + if (Commute) + return false; + ImmVal = -ImmVal; + NewUseOpc = ARM::SUBri; + // Fallthrough + } + case ARM::ADDrr: + case ARM::ORRrr: + case ARM::EORrr: { + if (!ARM_AM::isSOImmTwoPartVal(ImmVal)) + return false; + SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal); + SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal); + switch (UseOpc) { + default: break; + case ARM::ADDrr: NewUseOpc = ARM::ADDri; break; + case ARM::ORRrr: NewUseOpc = ARM::ORRri; break; + case ARM::EORrr: NewUseOpc = ARM::EORri; break; + } + break; + } + case ARM::t2SUBrr: { + if (Commute) + return false; + ImmVal = -ImmVal; + NewUseOpc = ARM::t2SUBri; + // Fallthrough + } + case ARM::t2ADDrr: + case ARM::t2ORRrr: + case ARM::t2EORrr: { + if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal)) + return false; + SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal); + SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal); + switch (UseOpc) { + default: break; + case ARM::t2ADDrr: NewUseOpc = ARM::t2ADDri; break; + case ARM::t2ORRrr: NewUseOpc = ARM::t2ORRri; break; + case ARM::t2EORrr: NewUseOpc = ARM::t2EORri; break; + } + break; + } + } + } + } + + unsigned OpIdx = Commute ? 2 : 1; + unsigned Reg1 = UseMI->getOperand(OpIdx).getReg(); + bool isKill = UseMI->getOperand(OpIdx).isKill(); + unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg)); + AddDefaultCC(AddDefaultPred(BuildMI(*UseMI->getParent(), + *UseMI, UseMI->getDebugLoc(), + get(NewUseOpc), NewReg) + .addReg(Reg1, getKillRegState(isKill)) + .addImm(SOImmValV1))); + UseMI->setDesc(get(NewUseOpc)); + UseMI->getOperand(1).setReg(NewReg); + UseMI->getOperand(1).setIsKill(); + UseMI->getOperand(2).ChangeToImmediate(SOImmValV2); + DefMI->eraseFromParent(); + return true; +} + +unsigned +ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, + const MachineInstr *MI) const { + if (!ItinData || ItinData->isEmpty()) + return 1; + + const MCInstrDesc &Desc = MI->getDesc(); + unsigned Class = Desc.getSchedClass(); + unsigned UOps = ItinData->Itineraries[Class].NumMicroOps; + if (UOps) + return UOps; + + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + llvm_unreachable("Unexpected multi-uops instruction!"); + break; + case ARM::VLDMQIA: + case ARM::VSTMQIA: + return 2; + + // The number of uOps for load / store multiple are determined by the number + // registers. + // + // On Cortex-A8, each pair of register loads / stores can be scheduled on the + // same cycle. The scheduling for the first load / store must be done + // separately by assuming the the address is not 64-bit aligned. + // + // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address + // is not 64-bit aligned, then AGU would take an extra cycle. For VFP / NEON + // load / store multiple, the formula is (#reg / 2) + (#reg % 2) + 1. + case ARM::VLDMDIA: + case ARM::VLDMDIA_UPD: + case ARM::VLDMDDB_UPD: + case ARM::VLDMSIA: + case ARM::VLDMSIA_UPD: + case ARM::VLDMSDB_UPD: + case ARM::VSTMDIA: + case ARM::VSTMDIA_UPD: + case ARM::VSTMDDB_UPD: + case ARM::VSTMSIA: + case ARM::VSTMSIA_UPD: + case ARM::VSTMSDB_UPD: { + unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands(); + return (NumRegs / 2) + (NumRegs % 2) + 1; + } + + case ARM::LDMIA_RET: + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + case ARM::LDMIA_UPD: + case ARM::LDMDA_UPD: + case ARM::LDMDB_UPD: + case ARM::LDMIB_UPD: + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + case ARM::STMIA_UPD: + case ARM::STMDA_UPD: + case ARM::STMDB_UPD: + case ARM::STMIB_UPD: + case ARM::tLDMIA: + case ARM::tLDMIA_UPD: + case ARM::tSTMIA_UPD: + case ARM::tPOP_RET: + case ARM::tPOP: + case ARM::tPUSH: + case ARM::t2LDMIA_RET: + case ARM::t2LDMIA: + case ARM::t2LDMDB: + case ARM::t2LDMIA_UPD: + case ARM::t2LDMDB_UPD: + case ARM::t2STMIA: + case ARM::t2STMDB: + case ARM::t2STMIA_UPD: + case ARM::t2STMDB_UPD: { + unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1; + if (Subtarget.isCortexA8()) { + if (NumRegs < 4) + return 2; + // 4 registers would be issued: 2, 2. + // 5 registers would be issued: 2, 2, 1. + UOps = (NumRegs / 2); + if (NumRegs % 2) + ++UOps; + return UOps; + } else if (Subtarget.isCortexA9()) { + UOps = (NumRegs / 2); + // If there are odd number of registers or if it's not 64-bit aligned, + // then it takes an extra AGU (Address Generation Unit) cycle. + if ((NumRegs % 2) || + !MI->hasOneMemOperand() || + (*MI->memoperands_begin())->getAlignment() < 8) + ++UOps; + return UOps; + } else { + // Assume the worst. + return NumRegs; + } + } + } +} + +int +ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData, + const MCInstrDesc &DefMCID, + unsigned DefClass, + unsigned DefIdx, unsigned DefAlign) const { + int RegNo = (int)(DefIdx+1) - DefMCID.getNumOperands() + 1; + if (RegNo <= 0) + // Def is the address writeback. + return ItinData->getOperandCycle(DefClass, DefIdx); + + int DefCycle; + if (Subtarget.isCortexA8()) { + // (regno / 2) + (regno % 2) + 1 + DefCycle = RegNo / 2 + 1; + if (RegNo % 2) + ++DefCycle; + } else if (Subtarget.isCortexA9()) { + DefCycle = RegNo; + bool isSLoad = false; + + switch (DefMCID.getOpcode()) { + default: break; + case ARM::VLDMSIA: + case ARM::VLDMSIA_UPD: + case ARM::VLDMSDB_UPD: + isSLoad = true; + break; + } + + // If there are odd number of 'S' registers or if it's not 64-bit aligned, + // then it takes an extra cycle. + if ((isSLoad && (RegNo % 2)) || DefAlign < 8) + ++DefCycle; + } else { + // Assume the worst. + DefCycle = RegNo + 2; + } + + return DefCycle; +} + +int +ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData, + const MCInstrDesc &DefMCID, + unsigned DefClass, + unsigned DefIdx, unsigned DefAlign) const { + int RegNo = (int)(DefIdx+1) - DefMCID.getNumOperands() + 1; + if (RegNo <= 0) + // Def is the address writeback. + return ItinData->getOperandCycle(DefClass, DefIdx); + + int DefCycle; + if (Subtarget.isCortexA8()) { + // 4 registers would be issued: 1, 2, 1. + // 5 registers would be issued: 1, 2, 2. + DefCycle = RegNo / 2; + if (DefCycle < 1) + DefCycle = 1; + // Result latency is issue cycle + 2: E2. + DefCycle += 2; + } else if (Subtarget.isCortexA9()) { + DefCycle = (RegNo / 2); + // If there are odd number of registers or if it's not 64-bit aligned, + // then it takes an extra AGU (Address Generation Unit) cycle. + if ((RegNo % 2) || DefAlign < 8) + ++DefCycle; + // Result latency is AGU cycles + 2. + DefCycle += 2; + } else { + // Assume the worst. + DefCycle = RegNo + 2; + } + + return DefCycle; +} + +int +ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData, + const MCInstrDesc &UseMCID, + unsigned UseClass, + unsigned UseIdx, unsigned UseAlign) const { + int RegNo = (int)(UseIdx+1) - UseMCID.getNumOperands() + 1; + if (RegNo <= 0) + return ItinData->getOperandCycle(UseClass, UseIdx); + + int UseCycle; + if (Subtarget.isCortexA8()) { + // (regno / 2) + (regno % 2) + 1 + UseCycle = RegNo / 2 + 1; + if (RegNo % 2) + ++UseCycle; + } else if (Subtarget.isCortexA9()) { + UseCycle = RegNo; + bool isSStore = false; + + switch (UseMCID.getOpcode()) { + default: break; + case ARM::VSTMSIA: + case ARM::VSTMSIA_UPD: + case ARM::VSTMSDB_UPD: + isSStore = true; + break; + } + + // If there are odd number of 'S' registers or if it's not 64-bit aligned, + // then it takes an extra cycle. + if ((isSStore && (RegNo % 2)) || UseAlign < 8) + ++UseCycle; + } else { + // Assume the worst. + UseCycle = RegNo + 2; + } + + return UseCycle; +} + +int +ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData, + const MCInstrDesc &UseMCID, + unsigned UseClass, + unsigned UseIdx, unsigned UseAlign) const { + int RegNo = (int)(UseIdx+1) - UseMCID.getNumOperands() + 1; + if (RegNo <= 0) + return ItinData->getOperandCycle(UseClass, UseIdx); + + int UseCycle; + if (Subtarget.isCortexA8()) { + UseCycle = RegNo / 2; + if (UseCycle < 2) + UseCycle = 2; + // Read in E3. + UseCycle += 2; + } else if (Subtarget.isCortexA9()) { + UseCycle = (RegNo / 2); + // If there are odd number of registers or if it's not 64-bit aligned, + // then it takes an extra AGU (Address Generation Unit) cycle. + if ((RegNo % 2) || UseAlign < 8) + ++UseCycle; + } else { + // Assume the worst. + UseCycle = 1; + } + return UseCycle; +} + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const MCInstrDesc &DefMCID, + unsigned DefIdx, unsigned DefAlign, + const MCInstrDesc &UseMCID, + unsigned UseIdx, unsigned UseAlign) const { + unsigned DefClass = DefMCID.getSchedClass(); + unsigned UseClass = UseMCID.getSchedClass(); + + if (DefIdx < DefMCID.getNumDefs() && UseIdx < UseMCID.getNumOperands()) + return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); + + // This may be a def / use of a variable_ops instruction, the operand + // latency might be determinable dynamically. Let the target try to + // figure it out. + int DefCycle = -1; + bool LdmBypass = false; + switch (DefMCID.getOpcode()) { + default: + DefCycle = ItinData->getOperandCycle(DefClass, DefIdx); + break; + + case ARM::VLDMDIA: + case ARM::VLDMDIA_UPD: + case ARM::VLDMDDB_UPD: + case ARM::VLDMSIA: + case ARM::VLDMSIA_UPD: + case ARM::VLDMSDB_UPD: + DefCycle = getVLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign); + break; + + case ARM::LDMIA_RET: + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + case ARM::LDMIA_UPD: + case ARM::LDMDA_UPD: + case ARM::LDMDB_UPD: + case ARM::LDMIB_UPD: + case ARM::tLDMIA: + case ARM::tLDMIA_UPD: + case ARM::tPUSH: + case ARM::t2LDMIA_RET: + case ARM::t2LDMIA: + case ARM::t2LDMDB: + case ARM::t2LDMIA_UPD: + case ARM::t2LDMDB_UPD: + LdmBypass = 1; + DefCycle = getLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign); + break; + } + + if (DefCycle == -1) + // We can't seem to determine the result latency of the def, assume it's 2. + DefCycle = 2; + + int UseCycle = -1; + switch (UseMCID.getOpcode()) { + default: + UseCycle = ItinData->getOperandCycle(UseClass, UseIdx); + break; + + case ARM::VSTMDIA: + case ARM::VSTMDIA_UPD: + case ARM::VSTMDDB_UPD: + case ARM::VSTMSIA: + case ARM::VSTMSIA_UPD: + case ARM::VSTMSDB_UPD: + UseCycle = getVSTMUseCycle(ItinData, UseMCID, UseClass, UseIdx, UseAlign); + break; + + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + case ARM::STMIA_UPD: + case ARM::STMDA_UPD: + case ARM::STMDB_UPD: + case ARM::STMIB_UPD: + case ARM::tSTMIA_UPD: + case ARM::tPOP_RET: + case ARM::tPOP: + case ARM::t2STMIA: + case ARM::t2STMDB: + case ARM::t2STMIA_UPD: + case ARM::t2STMDB_UPD: + UseCycle = getSTMUseCycle(ItinData, UseMCID, UseClass, UseIdx, UseAlign); + break; + } + + if (UseCycle == -1) + // Assume it's read in the first stage. + UseCycle = 1; + + UseCycle = DefCycle - UseCycle + 1; + if (UseCycle > 0) { + if (LdmBypass) { + // It's a variable_ops instruction so we can't use DefIdx here. Just use + // first def operand. + if (ItinData->hasPipelineForwarding(DefClass, DefMCID.getNumOperands()-1, + UseClass, UseIdx)) + --UseCycle; + } else if (ItinData->hasPipelineForwarding(DefClass, DefIdx, + UseClass, UseIdx)) { + --UseCycle; + } + } + + return UseCycle; +} + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || + DefMI->isRegSequence() || DefMI->isImplicitDef()) + return 1; + + const MCInstrDesc &DefMCID = DefMI->getDesc(); + if (!ItinData || ItinData->isEmpty()) + return DefMCID.mayLoad() ? 3 : 1; + + const MCInstrDesc &UseMCID = UseMI->getDesc(); + const MachineOperand &DefMO = DefMI->getOperand(DefIdx); + if (DefMO.getReg() == ARM::CPSR) { + if (DefMI->getOpcode() == ARM::FMSTAT) { + // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?) + return Subtarget.isCortexA9() ? 1 : 20; + } + + // CPSR set and branch can be paired in the same cycle. + if (UseMCID.isBranch()) + return 0; + } + + unsigned DefAlign = DefMI->hasOneMemOperand() + ? (*DefMI->memoperands_begin())->getAlignment() : 0; + unsigned UseAlign = UseMI->hasOneMemOperand() + ? (*UseMI->memoperands_begin())->getAlignment() : 0; + int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, + UseMCID, UseIdx, UseAlign); + + if (Latency > 1 && + (Subtarget.isCortexA8() || Subtarget.isCortexA9())) { + // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2] + // variants are one cycle cheaper. + switch (DefMCID.getOpcode()) { + default: break; + case ARM::LDRrs: + case ARM::LDRBrs: { + unsigned ShOpVal = DefMI->getOperand(3).getImm(); + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (ShImm == 0 || + (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)) + --Latency; + break; + } + case ARM::t2LDRs: + case ARM::t2LDRBs: + case ARM::t2LDRHs: + case ARM::t2LDRSHs: { + // Thumb2 mode: lsl only. + unsigned ShAmt = DefMI->getOperand(3).getImm(); + if (ShAmt == 0 || ShAmt == 2) + --Latency; + break; + } + } + } + + if (DefAlign < 8 && Subtarget.isCortexA9()) + switch (DefMCID.getOpcode()) { + default: break; + case ARM::VLD1q8: + case ARM::VLD1q16: + case ARM::VLD1q32: + case ARM::VLD1q64: + case ARM::VLD1q8_UPD: + case ARM::VLD1q16_UPD: + case ARM::VLD1q32_UPD: + case ARM::VLD1q64_UPD: + case ARM::VLD2d8: + case ARM::VLD2d16: + case ARM::VLD2d32: + case ARM::VLD2q8: + case ARM::VLD2q16: + case ARM::VLD2q32: + case ARM::VLD2d8_UPD: + case ARM::VLD2d16_UPD: + case ARM::VLD2d32_UPD: + case ARM::VLD2q8_UPD: + case ARM::VLD2q16_UPD: + case ARM::VLD2q32_UPD: + case ARM::VLD3d8: + case ARM::VLD3d16: + case ARM::VLD3d32: + case ARM::VLD1d64T: + case ARM::VLD3d8_UPD: + case ARM::VLD3d16_UPD: + case ARM::VLD3d32_UPD: + case ARM::VLD1d64T_UPD: + case ARM::VLD3q8_UPD: + case ARM::VLD3q16_UPD: + case ARM::VLD3q32_UPD: + case ARM::VLD4d8: + case ARM::VLD4d16: + case ARM::VLD4d32: + case ARM::VLD1d64Q: + case ARM::VLD4d8_UPD: + case ARM::VLD4d16_UPD: + case ARM::VLD4d32_UPD: + case ARM::VLD1d64Q_UPD: + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + case ARM::VLD1DUPq8: + case ARM::VLD1DUPq16: + case ARM::VLD1DUPq32: + case ARM::VLD1DUPq8_UPD: + case ARM::VLD1DUPq16_UPD: + case ARM::VLD1DUPq32_UPD: + case ARM::VLD2DUPd8: + case ARM::VLD2DUPd16: + case ARM::VLD2DUPd32: + case ARM::VLD2DUPd8_UPD: + case ARM::VLD2DUPd16_UPD: + case ARM::VLD2DUPd32_UPD: + case ARM::VLD4DUPd8: + case ARM::VLD4DUPd16: + case ARM::VLD4DUPd32: + case ARM::VLD4DUPd8_UPD: + case ARM::VLD4DUPd16_UPD: + case ARM::VLD4DUPd32_UPD: + case ARM::VLD1LNd8: + case ARM::VLD1LNd16: + case ARM::VLD1LNd32: + case ARM::VLD1LNd8_UPD: + case ARM::VLD1LNd16_UPD: + case ARM::VLD1LNd32_UPD: + case ARM::VLD2LNd8: + case ARM::VLD2LNd16: + case ARM::VLD2LNd32: + case ARM::VLD2LNq16: + case ARM::VLD2LNq32: + case ARM::VLD2LNd8_UPD: + case ARM::VLD2LNd16_UPD: + case ARM::VLD2LNd32_UPD: + case ARM::VLD2LNq16_UPD: + case ARM::VLD2LNq32_UPD: + case ARM::VLD4LNd8: + case ARM::VLD4LNd16: + case ARM::VLD4LNd32: + case ARM::VLD4LNq16: + case ARM::VLD4LNq32: + case ARM::VLD4LNd8_UPD: + case ARM::VLD4LNd16_UPD: + case ARM::VLD4LNd32_UPD: + case ARM::VLD4LNq16_UPD: + case ARM::VLD4LNq32_UPD: + // If the address is not 64-bit aligned, the latencies of these + // instructions increases by one. + ++Latency; + break; + } + + return Latency; +} + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const { + if (!DefNode->isMachineOpcode()) + return 1; + + const MCInstrDesc &DefMCID = get(DefNode->getMachineOpcode()); + + if (isZeroCost(DefMCID.Opcode)) + return 0; + + if (!ItinData || ItinData->isEmpty()) + return DefMCID.mayLoad() ? 3 : 1; + + if (!UseNode->isMachineOpcode()) { + int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx); + if (Subtarget.isCortexA9()) + return Latency <= 2 ? 1 : Latency - 1; + else + return Latency <= 3 ? 1 : Latency - 2; + } + + const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode()); + const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode); + unsigned DefAlign = !DefMN->memoperands_empty() + ? (*DefMN->memoperands_begin())->getAlignment() : 0; + const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode); + unsigned UseAlign = !UseMN->memoperands_empty() + ? (*UseMN->memoperands_begin())->getAlignment() : 0; + int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, + UseMCID, UseIdx, UseAlign); + + if (Latency > 1 && + (Subtarget.isCortexA8() || Subtarget.isCortexA9())) { + // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2] + // variants are one cycle cheaper. + switch (DefMCID.getOpcode()) { + default: break; + case ARM::LDRrs: + case ARM::LDRBrs: { + unsigned ShOpVal = + cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue(); + unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal); + if (ShImm == 0 || + (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)) + --Latency; + break; + } + case ARM::t2LDRs: + case ARM::t2LDRBs: + case ARM::t2LDRHs: + case ARM::t2LDRSHs: { + // Thumb2 mode: lsl only. + unsigned ShAmt = + cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue(); + if (ShAmt == 0 || ShAmt == 2) + --Latency; + break; + } + } + } + + if (DefAlign < 8 && Subtarget.isCortexA9()) + switch (DefMCID.getOpcode()) { + default: break; + case ARM::VLD1q8Pseudo: + case ARM::VLD1q16Pseudo: + case ARM::VLD1q32Pseudo: + case ARM::VLD1q64Pseudo: + case ARM::VLD1q8Pseudo_UPD: + case ARM::VLD1q16Pseudo_UPD: + case ARM::VLD1q32Pseudo_UPD: + case ARM::VLD1q64Pseudo_UPD: + case ARM::VLD2d8Pseudo: + case ARM::VLD2d16Pseudo: + case ARM::VLD2d32Pseudo: + case ARM::VLD2q8Pseudo: + case ARM::VLD2q16Pseudo: + case ARM::VLD2q32Pseudo: + case ARM::VLD2d8Pseudo_UPD: + case ARM::VLD2d16Pseudo_UPD: + case ARM::VLD2d32Pseudo_UPD: + case ARM::VLD2q8Pseudo_UPD: + case ARM::VLD2q16Pseudo_UPD: + case ARM::VLD2q32Pseudo_UPD: + case ARM::VLD3d8Pseudo: + case ARM::VLD3d16Pseudo: + case ARM::VLD3d32Pseudo: + case ARM::VLD1d64TPseudo: + case ARM::VLD3d8Pseudo_UPD: + case ARM::VLD3d16Pseudo_UPD: + case ARM::VLD3d32Pseudo_UPD: + case ARM::VLD1d64TPseudo_UPD: + case ARM::VLD3q8Pseudo_UPD: + case ARM::VLD3q16Pseudo_UPD: + case ARM::VLD3q32Pseudo_UPD: + case ARM::VLD3q8oddPseudo: + case ARM::VLD3q16oddPseudo: + case ARM::VLD3q32oddPseudo: + case ARM::VLD3q8oddPseudo_UPD: + case ARM::VLD3q16oddPseudo_UPD: + case ARM::VLD3q32oddPseudo_UPD: + case ARM::VLD4d8Pseudo: + case ARM::VLD4d16Pseudo: + case ARM::VLD4d32Pseudo: + case ARM::VLD1d64QPseudo: + case ARM::VLD4d8Pseudo_UPD: + case ARM::VLD4d16Pseudo_UPD: + case ARM::VLD4d32Pseudo_UPD: + case ARM::VLD1d64QPseudo_UPD: + case ARM::VLD4q8Pseudo_UPD: + case ARM::VLD4q16Pseudo_UPD: + case ARM::VLD4q32Pseudo_UPD: + case ARM::VLD4q8oddPseudo: + case ARM::VLD4q16oddPseudo: + case ARM::VLD4q32oddPseudo: + case ARM::VLD4q8oddPseudo_UPD: + case ARM::VLD4q16oddPseudo_UPD: + case ARM::VLD4q32oddPseudo_UPD: + case ARM::VLD1DUPq8Pseudo: + case ARM::VLD1DUPq16Pseudo: + case ARM::VLD1DUPq32Pseudo: + case ARM::VLD1DUPq8Pseudo_UPD: + case ARM::VLD1DUPq16Pseudo_UPD: + case ARM::VLD1DUPq32Pseudo_UPD: + case ARM::VLD2DUPd8Pseudo: + case ARM::VLD2DUPd16Pseudo: + case ARM::VLD2DUPd32Pseudo: + case ARM::VLD2DUPd8Pseudo_UPD: + case ARM::VLD2DUPd16Pseudo_UPD: + case ARM::VLD2DUPd32Pseudo_UPD: + case ARM::VLD4DUPd8Pseudo: + case ARM::VLD4DUPd16Pseudo: + case ARM::VLD4DUPd32Pseudo: + case ARM::VLD4DUPd8Pseudo_UPD: + case ARM::VLD4DUPd16Pseudo_UPD: + case ARM::VLD4DUPd32Pseudo_UPD: + case ARM::VLD1LNq8Pseudo: + case ARM::VLD1LNq16Pseudo: + case ARM::VLD1LNq32Pseudo: + case ARM::VLD1LNq8Pseudo_UPD: + case ARM::VLD1LNq16Pseudo_UPD: + case ARM::VLD1LNq32Pseudo_UPD: + case ARM::VLD2LNd8Pseudo: + case ARM::VLD2LNd16Pseudo: + case ARM::VLD2LNd32Pseudo: + case ARM::VLD2LNq16Pseudo: + case ARM::VLD2LNq32Pseudo: + case ARM::VLD2LNd8Pseudo_UPD: + case ARM::VLD2LNd16Pseudo_UPD: + case ARM::VLD2LNd32Pseudo_UPD: + case ARM::VLD2LNq16Pseudo_UPD: + case ARM::VLD2LNq32Pseudo_UPD: + case ARM::VLD4LNd8Pseudo: + case ARM::VLD4LNd16Pseudo: + case ARM::VLD4LNd32Pseudo: + case ARM::VLD4LNq16Pseudo: + case ARM::VLD4LNq32Pseudo: + case ARM::VLD4LNd8Pseudo_UPD: + case ARM::VLD4LNd16Pseudo_UPD: + case ARM::VLD4LNd32Pseudo_UPD: + case ARM::VLD4LNq16Pseudo_UPD: + case ARM::VLD4LNq32Pseudo_UPD: + // If the address is not 64-bit aligned, the latencies of these + // instructions increases by one. + ++Latency; + break; + } + + return Latency; +} + +int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + if (MI->isCopyLike() || MI->isInsertSubreg() || + MI->isRegSequence() || MI->isImplicitDef()) + return 1; + + if (!ItinData || ItinData->isEmpty()) + return 1; + + const MCInstrDesc &MCID = MI->getDesc(); + unsigned Class = MCID.getSchedClass(); + unsigned UOps = ItinData->Itineraries[Class].NumMicroOps; + if (PredCost && MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) + // When predicated, CPSR is an additional source operand for CPSR updating + // instructions, this apparently increases their latencies. + *PredCost = 1; + if (UOps) + return ItinData->getStageLatency(Class); + return getNumMicroOps(ItinData, MI); +} + +int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const { + if (!Node->isMachineOpcode()) + return 1; + + if (!ItinData || ItinData->isEmpty()) + return 1; + + unsigned Opcode = Node->getMachineOpcode(); + switch (Opcode) { + default: + return ItinData->getStageLatency(get(Opcode).getSchedClass()); + case ARM::VLDMQIA: + case ARM::VSTMQIA: + return 2; + } +} + +bool ARMBaseInstrInfo:: +hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask; + unsigned UDomain = UseMI->getDesc().TSFlags & ARMII::DomainMask; + if (Subtarget.isCortexA8() && + (DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP)) + // CortexA8 VFP instructions are not pipelined. + return true; + + // Hoist VFP / NEON instructions with 4 or higher latency. + int Latency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); + if (Latency <= 3) + return false; + return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON || + UDomain == ARMII::DomainVFP || UDomain == ARMII::DomainNEON; +} + +bool ARMBaseInstrInfo:: +hasLowDefLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx) const { + if (!ItinData || ItinData->isEmpty()) + return false; + + unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask; + if (DDomain == ARMII::DomainGeneral) { + unsigned DefClass = DefMI->getDesc().getSchedClass(); + int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx); + return (DefCycle != -1 && DefCycle <= 2); + } + return false; +} + +bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr *MI, + StringRef &ErrInfo) const { + if (convertAddSubFlagsOpcode(MI->getOpcode())) { + ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG"; + return false; + } + return true; +} + +bool +ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc, + unsigned &AddSubOpc, + bool &NegAcc, bool &HasLane) const { + DenseMap<unsigned, unsigned>::const_iterator I = MLxEntryMap.find(Opcode); + if (I == MLxEntryMap.end()) + return false; + + const ARM_MLxEntry &Entry = ARM_MLxTable[I->second]; + MulOpc = Entry.MulOpc; + AddSubOpc = Entry.AddSubOpc; + NegAcc = Entry.NegAcc; + HasLane = Entry.HasLane; + return true; +} + +//===----------------------------------------------------------------------===// +// Execution domains. +//===----------------------------------------------------------------------===// +// +// Some instructions go down the NEON pipeline, some go down the VFP pipeline, +// and some can go down both. The vmov instructions go down the VFP pipeline, +// but they can be changed to vorr equivalents that are executed by the NEON +// pipeline. +// +// We use the following execution domain numbering: +// +enum ARMExeDomain { + ExeGeneric = 0, + ExeVFP = 1, + ExeNEON = 2 +}; +// +// Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h +// +std::pair<uint16_t, uint16_t> +ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const { + // VMOVD is a VFP instruction, but can be changed to NEON if it isn't + // predicated. + if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI)) + return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON)); + + // No other instructions can be swizzled, so just determine their domain. + unsigned Domain = MI->getDesc().TSFlags & ARMII::DomainMask; + + if (Domain & ARMII::DomainNEON) + return std::make_pair(ExeNEON, 0); + + // Certain instructions can go either way on Cortex-A8. + // Treat them as NEON instructions. + if ((Domain & ARMII::DomainNEONA8) && Subtarget.isCortexA8()) + return std::make_pair(ExeNEON, 0); + + if (Domain & ARMII::DomainVFP) + return std::make_pair(ExeVFP, 0); + + return std::make_pair(ExeGeneric, 0); +} + +void +ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { + // We only know how to change VMOVD into VORR. + assert(MI->getOpcode() == ARM::VMOVD && "Can only swizzle VMOVD"); + if (Domain != ExeNEON) + return; + + // Zap the predicate operands. + assert(!isPredicated(MI) && "Cannot predicate a VORRd"); + MI->RemoveOperand(3); + MI->RemoveOperand(2); + + // Change to a VORRd which requires two identical use operands. + MI->setDesc(get(ARM::VORRd)); + + // Add the extra source operand and new predicates. + // This will go before any implicit ops. + AddDefaultPred(MachineInstrBuilder(MI).addOperand(MI->getOperand(1))); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h new file mode 100644 index 0000000..0f9f321 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -0,0 +1,384 @@ +//===- ARMBaseInstrInfo.h - ARM Base Instruction Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Base ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMBASEINSTRUCTIONINFO_H +#define ARMBASEINSTRUCTIONINFO_H + +#include "ARM.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallSet.h" + +#define GET_INSTRINFO_HEADER +#include "ARMGenInstrInfo.inc" + +namespace llvm { + class ARMSubtarget; + class ARMBaseRegisterInfo; + +class ARMBaseInstrInfo : public ARMGenInstrInfo { + const ARMSubtarget &Subtarget; + +protected: + // Can be only subclassed. + explicit ARMBaseInstrInfo(const ARMSubtarget &STI); + +public: + // Return the non-pre/post incrementing version of 'Opc'. Return 0 + // if there is not such an opcode. + virtual unsigned getUnindexedOpcode(unsigned Opc) const =0; + + virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0; + const ARMSubtarget &getSubtarget() const { return Subtarget; } + + ScheduleHazardRecognizer * + CreateTargetHazardRecognizer(const TargetMachine *TM, + const ScheduleDAG *DAG) const; + + ScheduleHazardRecognizer * + CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const; + + // Branch analysis. + virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify = false) const; + virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + + virtual + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + + // Predication support. + bool isPredicated(const MachineInstr *MI) const { + int PIdx = MI->findFirstPredOperandIdx(); + return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL; + } + + ARMCC::CondCodes getPredicate(const MachineInstr *MI) const { + int PIdx = MI->findFirstPredOperandIdx(); + return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm() + : ARMCC::AL; + } + + virtual + bool PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const; + + virtual + bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const; + + virtual bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const; + + virtual bool isPredicable(MachineInstr *MI) const; + + /// GetInstSize - Returns the size of the specified MachineInstr. + /// + virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const; + + virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const; + virtual unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const; + virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + virtual unsigned isStoreToStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + + virtual void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + + virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, + int FrameIx, + uint64_t Offset, + const MDNode *MDPtr, + DebugLoc DL) const; + + virtual void reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo &TRI) const; + + MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const; + + virtual bool produceSameValue(const MachineInstr *MI0, + const MachineInstr *MI1, + const MachineRegisterInfo *MRI) const; + + /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to + /// determine if two loads are loading from the same base address. It should + /// only return true if the base pointers are the same and the only + /// differences between the two addresses is the offset. It also returns the + /// offsets by reference. + virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, int64_t &Offset2)const; + + /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to + /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads + /// should be scheduled togther. On some targets if two loads are loading from + /// addresses in the same cache line, it's better if they are scheduled + /// together. This function takes two integers that represent the load offsets + /// from the common base address. It returns true if it decides it's desirable + /// to schedule the two loads together. "NumLoads" is the number of loads that + /// have already been scheduled after Load1. + virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const; + + virtual bool isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const; + + virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCycles, unsigned ExtraPredCycles, + const BranchProbability &Probability) const; + + virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumT, unsigned ExtraT, + MachineBasicBlock &FMBB, + unsigned NumF, unsigned ExtraF, + const BranchProbability &Probability) const; + + virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumCycles, + const BranchProbability + &Probability) const { + return NumCycles == 1; + } + + /// AnalyzeCompare - For a comparison instruction, return the source register + /// in SrcReg and the value it compares against in CmpValue. Return true if + /// the comparison instruction can be analyzed. + virtual bool AnalyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + int &CmpMask, int &CmpValue) const; + + /// OptimizeCompareInstr - Convert the instruction to set the zero flag so + /// that we can remove a "comparison with zero". + virtual bool OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const; + + /// FoldImmediate - 'Reg' is known to be defined by a move immediate + /// instruction, try to fold the immediate into the use instruction. + virtual bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const; + + virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData, + const MachineInstr *MI) const; + + virtual + int getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const; + virtual + int getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const; + + /// VFP/NEON execution domains. + std::pair<uint16_t, uint16_t> + getExecutionDomain(const MachineInstr *MI) const; + void setExecutionDomain(MachineInstr *MI, unsigned Domain) const; + +private: + int getVLDMDefCycle(const InstrItineraryData *ItinData, + const MCInstrDesc &DefMCID, + unsigned DefClass, + unsigned DefIdx, unsigned DefAlign) const; + int getLDMDefCycle(const InstrItineraryData *ItinData, + const MCInstrDesc &DefMCID, + unsigned DefClass, + unsigned DefIdx, unsigned DefAlign) const; + int getVSTMUseCycle(const InstrItineraryData *ItinData, + const MCInstrDesc &UseMCID, + unsigned UseClass, + unsigned UseIdx, unsigned UseAlign) const; + int getSTMUseCycle(const InstrItineraryData *ItinData, + const MCInstrDesc &UseMCID, + unsigned UseClass, + unsigned UseIdx, unsigned UseAlign) const; + int getOperandLatency(const InstrItineraryData *ItinData, + const MCInstrDesc &DefMCID, + unsigned DefIdx, unsigned DefAlign, + const MCInstrDesc &UseMCID, + unsigned UseIdx, unsigned UseAlign) const; + + int getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, unsigned *PredCost = 0) const; + + int getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const; + + bool hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const; + bool hasLowDefLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx) const; + + /// verifyInstruction - Perform target specific instruction verification. + bool verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const; + +private: + /// Modeling special VFP / NEON fp MLA / MLS hazards. + + /// MLxEntryMap - Map fp MLA / MLS to the corresponding entry in the internal + /// MLx table. + DenseMap<unsigned, unsigned> MLxEntryMap; + + /// MLxHazardOpcodes - Set of add / sub and multiply opcodes that would cause + /// stalls when scheduled together with fp MLA / MLS opcodes. + SmallSet<unsigned, 16> MLxHazardOpcodes; + +public: + /// isFpMLxInstruction - Return true if the specified opcode is a fp MLA / MLS + /// instruction. + bool isFpMLxInstruction(unsigned Opcode) const { + return MLxEntryMap.count(Opcode); + } + + /// isFpMLxInstruction - This version also returns the multiply opcode and the + /// addition / subtraction opcode to expand to. Return true for 'HasLane' for + /// the MLX instructions with an extra lane operand. + bool isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc, + unsigned &AddSubOpc, bool &NegAcc, + bool &HasLane) const; + + /// canCauseFpMLxStall - Return true if an instruction of the specified opcode + /// will cause stalls when scheduled after (within 4-cycle window) a fp + /// MLA / MLS instruction. + bool canCauseFpMLxStall(unsigned Opcode) const { + return MLxHazardOpcodes.count(Opcode); + } +}; + +static inline +const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) { + return MIB.addImm((int64_t)ARMCC::AL).addReg(0); +} + +static inline +const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) { + return MIB.addReg(0); +} + +static inline +const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB, + bool isDead = false) { + return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead)); +} + +static inline +const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) { + return MIB.addReg(0); +} + +static inline +bool isUncondBranchOpcode(int Opc) { + return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B; +} + +static inline +bool isCondBranchOpcode(int Opc) { + return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc; +} + +static inline +bool isJumpTableBranchOpcode(int Opc) { + return Opc == ARM::BR_JTr || Opc == ARM::BR_JTm || Opc == ARM::BR_JTadd || + Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT; +} + +static inline +bool isIndirectBranchOpcode(int Opc) { + return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND; +} + +/// getInstrPredicate - If instruction is predicated, returns its predicate +/// condition, otherwise returns AL. It also returns the condition code +/// register by reference. +ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg); + +int getMatchingCondBranchOpcode(int Opc); + + +/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether +/// the instruction is encoded with an 'S' bit is determined by the optional +/// CPSR def operand. +unsigned convertAddSubFlagsOpcode(unsigned OldOpc); + +/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emits a series of +/// instructions to materializea destreg = basereg + immediate in ARM / Thumb2 +/// code. +void emitARMRegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const ARMBaseInstrInfo &TII, unsigned MIFlags = 0); + +void emitT2RegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const ARMBaseInstrInfo &TII, unsigned MIFlags = 0); +void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned BaseReg, + int NumBytes, const TargetInstrInfo &TII, + const ARMBaseRegisterInfo& MRI, + unsigned MIFlags = 0); + + +/// rewriteARMFrameIndex / rewriteT2FrameIndex - +/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the +/// offset could not be handled directly in MI, and return the left-over +/// portion by reference. +bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII); + +bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII); + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp new file mode 100644 index 0000000..7c42342 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -0,0 +1,1208 @@ +//===- ARMBaseRegisterInfo.cpp - ARM Register Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the base ARM implementation of TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMFrameLowering.h" +#include "ARMInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMSubtarget.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/LLVMContext.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/CommandLine.h" + +#define GET_REGINFO_TARGET_DESC +#include "ARMGenRegisterInfo.inc" + +using namespace llvm; + +static cl::opt<bool> +ForceAllBaseRegAlloc("arm-force-base-reg-alloc", cl::Hidden, cl::init(false), + cl::desc("Force use of virtual base registers for stack load/store")); +static cl::opt<bool> +EnableLocalStackAlloc("enable-local-stack-alloc", cl::init(true), cl::Hidden, + cl::desc("Enable pre-regalloc stack frame index allocation")); +static cl::opt<bool> +EnableBasePointer("arm-use-base-pointer", cl::Hidden, cl::init(true), + cl::desc("Enable use of a base pointer for complex stack frames")); + +ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, + const ARMSubtarget &sti) + : ARMGenRegisterInfo(ARM::LR), TII(tii), STI(sti), + FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11), + BasePtr(ARM::R6) { +} + +const unsigned* +ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + static const unsigned CalleeSavedRegs[] = { + ARM::LR, ARM::R11, ARM::R10, ARM::R9, ARM::R8, + ARM::R7, ARM::R6, ARM::R5, ARM::R4, + + ARM::D15, ARM::D14, ARM::D13, ARM::D12, + ARM::D11, ARM::D10, ARM::D9, ARM::D8, + 0 + }; + + static const unsigned DarwinCalleeSavedRegs[] = { + // Darwin ABI deviates from ARM standard ABI. R9 is not a callee-saved + // register. + ARM::LR, ARM::R7, ARM::R6, ARM::R5, ARM::R4, + ARM::R11, ARM::R10, ARM::R8, + + ARM::D15, ARM::D14, ARM::D13, ARM::D12, + ARM::D11, ARM::D10, ARM::D9, ARM::D8, + 0 + }; + return STI.isTargetDarwin() ? DarwinCalleeSavedRegs : CalleeSavedRegs; +} + +BitVector ARMBaseRegisterInfo:: +getReservedRegs(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + // FIXME: avoid re-calculating this every time. + BitVector Reserved(getNumRegs()); + Reserved.set(ARM::SP); + Reserved.set(ARM::PC); + Reserved.set(ARM::FPSCR); + if (TFI->hasFP(MF)) + Reserved.set(FramePtr); + if (hasBasePointer(MF)) + Reserved.set(BasePtr); + // Some targets reserve R9. + if (STI.isR9Reserved()) + Reserved.set(ARM::R9); + // Reserve D16-D31 if the subtarget doesn't support them. + if (!STI.hasVFP3() || STI.hasD16()) { + assert(ARM::D31 == ARM::D16 + 15); + for (unsigned i = 0; i != 16; ++i) + Reserved.set(ARM::D16 + i); + } + return Reserved; +} + +bool ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF, + unsigned Reg) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + switch (Reg) { + default: break; + case ARM::SP: + case ARM::PC: + return true; + case ARM::R6: + if (hasBasePointer(MF)) + return true; + break; + case ARM::R7: + case ARM::R11: + if (FramePtr == Reg && TFI->hasFP(MF)) + return true; + break; + case ARM::R9: + return STI.isR9Reserved(); + } + + return false; +} + +const TargetRegisterClass * +ARMBaseRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, + unsigned SubIdx) const { + switch (SubIdx) { + default: return 0; + case ARM::ssub_0: + case ARM::ssub_1: + case ARM::ssub_2: + case ARM::ssub_3: { + // S sub-registers. + if (A->getSize() == 8) { + if (B == &ARM::SPR_8RegClass) + return &ARM::DPR_8RegClass; + assert(B == &ARM::SPRRegClass && "Expecting SPR register class!"); + if (A == &ARM::DPR_8RegClass) + return A; + return &ARM::DPR_VFP2RegClass; + } + + if (A->getSize() == 16) { + if (B == &ARM::SPR_8RegClass) + return &ARM::QPR_8RegClass; + return &ARM::QPR_VFP2RegClass; + } + + if (A->getSize() == 32) { + if (B == &ARM::SPR_8RegClass) + return 0; // Do not allow coalescing! + return &ARM::QQPR_VFP2RegClass; + } + + assert(A->getSize() == 64 && "Expecting a QQQQ register class!"); + return 0; // Do not allow coalescing! + } + case ARM::dsub_0: + case ARM::dsub_1: + case ARM::dsub_2: + case ARM::dsub_3: { + // D sub-registers. + if (A->getSize() == 16) { + if (B == &ARM::DPR_VFP2RegClass) + return &ARM::QPR_VFP2RegClass; + if (B == &ARM::DPR_8RegClass) + return 0; // Do not allow coalescing! + return A; + } + + if (A->getSize() == 32) { + if (B == &ARM::DPR_VFP2RegClass) + return &ARM::QQPR_VFP2RegClass; + if (B == &ARM::DPR_8RegClass) + return 0; // Do not allow coalescing! + return A; + } + + assert(A->getSize() == 64 && "Expecting a QQQQ register class!"); + if (B != &ARM::DPRRegClass) + return 0; // Do not allow coalescing! + return A; + } + case ARM::dsub_4: + case ARM::dsub_5: + case ARM::dsub_6: + case ARM::dsub_7: { + // D sub-registers of QQQQ registers. + if (A->getSize() == 64 && B == &ARM::DPRRegClass) + return A; + return 0; // Do not allow coalescing! + } + + case ARM::qsub_0: + case ARM::qsub_1: { + // Q sub-registers. + if (A->getSize() == 32) { + if (B == &ARM::QPR_VFP2RegClass) + return &ARM::QQPR_VFP2RegClass; + if (B == &ARM::QPR_8RegClass) + return 0; // Do not allow coalescing! + return A; + } + + assert(A->getSize() == 64 && "Expecting a QQQQ register class!"); + if (B == &ARM::QPRRegClass) + return A; + return 0; // Do not allow coalescing! + } + case ARM::qsub_2: + case ARM::qsub_3: { + // Q sub-registers of QQQQ registers. + if (A->getSize() == 64 && B == &ARM::QPRRegClass) + return A; + return 0; // Do not allow coalescing! + } + } + return 0; +} + +bool +ARMBaseRegisterInfo::canCombineSubRegIndices(const TargetRegisterClass *RC, + SmallVectorImpl<unsigned> &SubIndices, + unsigned &NewSubIdx) const { + + unsigned Size = RC->getSize() * 8; + if (Size < 6) + return 0; + + NewSubIdx = 0; // Whole register. + unsigned NumRegs = SubIndices.size(); + if (NumRegs == 8) { + // 8 D registers -> 1 QQQQ register. + return (Size == 512 && + SubIndices[0] == ARM::dsub_0 && + SubIndices[1] == ARM::dsub_1 && + SubIndices[2] == ARM::dsub_2 && + SubIndices[3] == ARM::dsub_3 && + SubIndices[4] == ARM::dsub_4 && + SubIndices[5] == ARM::dsub_5 && + SubIndices[6] == ARM::dsub_6 && + SubIndices[7] == ARM::dsub_7); + } else if (NumRegs == 4) { + if (SubIndices[0] == ARM::qsub_0) { + // 4 Q registers -> 1 QQQQ register. + return (Size == 512 && + SubIndices[1] == ARM::qsub_1 && + SubIndices[2] == ARM::qsub_2 && + SubIndices[3] == ARM::qsub_3); + } else if (SubIndices[0] == ARM::dsub_0) { + // 4 D registers -> 1 QQ register. + if (Size >= 256 && + SubIndices[1] == ARM::dsub_1 && + SubIndices[2] == ARM::dsub_2 && + SubIndices[3] == ARM::dsub_3) { + if (Size == 512) + NewSubIdx = ARM::qqsub_0; + return true; + } + } else if (SubIndices[0] == ARM::dsub_4) { + // 4 D registers -> 1 QQ register (2nd). + if (Size == 512 && + SubIndices[1] == ARM::dsub_5 && + SubIndices[2] == ARM::dsub_6 && + SubIndices[3] == ARM::dsub_7) { + NewSubIdx = ARM::qqsub_1; + return true; + } + } else if (SubIndices[0] == ARM::ssub_0) { + // 4 S registers -> 1 Q register. + if (Size >= 128 && + SubIndices[1] == ARM::ssub_1 && + SubIndices[2] == ARM::ssub_2 && + SubIndices[3] == ARM::ssub_3) { + if (Size >= 256) + NewSubIdx = ARM::qsub_0; + return true; + } + } + } else if (NumRegs == 2) { + if (SubIndices[0] == ARM::qsub_0) { + // 2 Q registers -> 1 QQ register. + if (Size >= 256 && SubIndices[1] == ARM::qsub_1) { + if (Size == 512) + NewSubIdx = ARM::qqsub_0; + return true; + } + } else if (SubIndices[0] == ARM::qsub_2) { + // 2 Q registers -> 1 QQ register (2nd). + if (Size == 512 && SubIndices[1] == ARM::qsub_3) { + NewSubIdx = ARM::qqsub_1; + return true; + } + } else if (SubIndices[0] == ARM::dsub_0) { + // 2 D registers -> 1 Q register. + if (Size >= 128 && SubIndices[1] == ARM::dsub_1) { + if (Size >= 256) + NewSubIdx = ARM::qsub_0; + return true; + } + } else if (SubIndices[0] == ARM::dsub_2) { + // 2 D registers -> 1 Q register (2nd). + if (Size >= 256 && SubIndices[1] == ARM::dsub_3) { + NewSubIdx = ARM::qsub_1; + return true; + } + } else if (SubIndices[0] == ARM::dsub_4) { + // 2 D registers -> 1 Q register (3rd). + if (Size == 512 && SubIndices[1] == ARM::dsub_5) { + NewSubIdx = ARM::qsub_2; + return true; + } + } else if (SubIndices[0] == ARM::dsub_6) { + // 2 D registers -> 1 Q register (3rd). + if (Size == 512 && SubIndices[1] == ARM::dsub_7) { + NewSubIdx = ARM::qsub_3; + return true; + } + } else if (SubIndices[0] == ARM::ssub_0) { + // 2 S registers -> 1 D register. + if (SubIndices[1] == ARM::ssub_1) { + if (Size >= 128) + NewSubIdx = ARM::dsub_0; + return true; + } + } else if (SubIndices[0] == ARM::ssub_2) { + // 2 S registers -> 1 D register (2nd). + if (Size >= 128 && SubIndices[1] == ARM::ssub_3) { + NewSubIdx = ARM::dsub_1; + return true; + } + } + } + return false; +} + +const TargetRegisterClass* +ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) + const { + const TargetRegisterClass *Super = RC; + TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); + do { + switch (Super->getID()) { + case ARM::GPRRegClassID: + case ARM::SPRRegClassID: + case ARM::DPRRegClassID: + case ARM::QPRRegClassID: + case ARM::QQPRRegClassID: + case ARM::QQQQPRRegClassID: + return Super; + } + Super = *I++; + } while (Super); + return RC; +} + +const TargetRegisterClass * +ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const { + return ARM::GPRRegisterClass; +} + +const TargetRegisterClass * +ARMBaseRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { + if (RC == &ARM::CCRRegClass) + return 0; // Can't copy CCR registers. + return RC; +} + +unsigned +ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + switch (RC->getID()) { + default: + return 0; + case ARM::tGPRRegClassID: + return TFI->hasFP(MF) ? 4 : 5; + case ARM::GPRRegClassID: { + unsigned FP = TFI->hasFP(MF) ? 1 : 0; + return 10 - FP - (STI.isR9Reserved() ? 1 : 0); + } + case ARM::SPRRegClassID: // Currently not used as 'rep' register class. + case ARM::DPRRegClassID: + return 32 - 10; + } +} + +/// getRawAllocationOrder - Returns the register allocation order for a +/// specified register class with a target-dependent hint. +ArrayRef<unsigned> +ARMBaseRegisterInfo::getRawAllocationOrder(const TargetRegisterClass *RC, + unsigned HintType, unsigned HintReg, + const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + // Alternative register allocation orders when favoring even / odd registers + // of register pairs. + + // No FP, R9 is available. + static const unsigned GPREven1[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, ARM::R10, + ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, + ARM::R9, ARM::R11 + }; + static const unsigned GPROdd1[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R9, ARM::R11, + ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, + ARM::R8, ARM::R10 + }; + + // FP is R7, R9 is available. + static const unsigned GPREven2[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R8, ARM::R10, + ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, + ARM::R9, ARM::R11 + }; + static const unsigned GPROdd2[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R9, ARM::R11, + ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, + ARM::R8, ARM::R10 + }; + + // FP is R11, R9 is available. + static const unsigned GPREven3[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, + ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, + ARM::R9 + }; + static const unsigned GPROdd3[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R6, ARM::R9, + ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R7, + ARM::R8 + }; + + // No FP, R9 is not available. + static const unsigned GPREven4[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R10, + ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8, + ARM::R11 + }; + static const unsigned GPROdd4[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R11, + ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8, + ARM::R10 + }; + + // FP is R7, R9 is not available. + static const unsigned GPREven5[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R10, + ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, ARM::R8, + ARM::R11 + }; + static const unsigned GPROdd5[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R11, + ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8, + ARM::R10 + }; + + // FP is R11, R9 is not available. + static const unsigned GPREven6[] = { + ARM::R0, ARM::R2, ARM::R4, ARM::R6, + ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8 + }; + static const unsigned GPROdd6[] = { + ARM::R1, ARM::R3, ARM::R5, ARM::R7, + ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8 + }; + + // We only support even/odd hints for GPR and rGPR. + if (RC != ARM::GPRRegisterClass && RC != ARM::rGPRRegisterClass) + return RC->getRawAllocationOrder(MF); + + if (HintType == ARMRI::RegPairEven) { + if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0) + // It's no longer possible to fulfill this hint. Return the default + // allocation order. + return RC->getRawAllocationOrder(MF); + + if (!TFI->hasFP(MF)) { + if (!STI.isR9Reserved()) + return makeArrayRef(GPREven1); + else + return makeArrayRef(GPREven4); + } else if (FramePtr == ARM::R7) { + if (!STI.isR9Reserved()) + return makeArrayRef(GPREven2); + else + return makeArrayRef(GPREven5); + } else { // FramePtr == ARM::R11 + if (!STI.isR9Reserved()) + return makeArrayRef(GPREven3); + else + return makeArrayRef(GPREven6); + } + } else if (HintType == ARMRI::RegPairOdd) { + if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0) + // It's no longer possible to fulfill this hint. Return the default + // allocation order. + return RC->getRawAllocationOrder(MF); + + if (!TFI->hasFP(MF)) { + if (!STI.isR9Reserved()) + return makeArrayRef(GPROdd1); + else + return makeArrayRef(GPROdd4); + } else if (FramePtr == ARM::R7) { + if (!STI.isR9Reserved()) + return makeArrayRef(GPROdd2); + else + return makeArrayRef(GPROdd5); + } else { // FramePtr == ARM::R11 + if (!STI.isR9Reserved()) + return makeArrayRef(GPROdd3); + else + return makeArrayRef(GPROdd6); + } + } + return RC->getRawAllocationOrder(MF); +} + +/// ResolveRegAllocHint - Resolves the specified register allocation hint +/// to a physical register. Returns the physical register if it is successful. +unsigned +ARMBaseRegisterInfo::ResolveRegAllocHint(unsigned Type, unsigned Reg, + const MachineFunction &MF) const { + if (Reg == 0 || !isPhysicalRegister(Reg)) + return 0; + if (Type == 0) + return Reg; + else if (Type == (unsigned)ARMRI::RegPairOdd) + // Odd register. + return getRegisterPairOdd(Reg, MF); + else if (Type == (unsigned)ARMRI::RegPairEven) + // Even register. + return getRegisterPairEven(Reg, MF); + return 0; +} + +void +ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg, + MachineFunction &MF) const { + MachineRegisterInfo *MRI = &MF.getRegInfo(); + std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg); + if ((Hint.first == (unsigned)ARMRI::RegPairOdd || + Hint.first == (unsigned)ARMRI::RegPairEven) && + TargetRegisterInfo::isVirtualRegister(Hint.second)) { + // If 'Reg' is one of the even / odd register pair and it's now changed + // (e.g. coalesced) into a different register. The other register of the + // pair allocation hint must be updated to reflect the relationship + // change. + unsigned OtherReg = Hint.second; + Hint = MRI->getRegAllocationHint(OtherReg); + if (Hint.second == Reg) + // Make sure the pair has not already divorced. + MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg); + } +} + +bool +ARMBaseRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const { + // CortexA9 has a Write-after-write hazard for NEON registers. + if (!STI.isCortexA9()) + return false; + + switch (RC->getID()) { + case ARM::DPRRegClassID: + case ARM::DPR_8RegClassID: + case ARM::DPR_VFP2RegClassID: + case ARM::QPRRegClassID: + case ARM::QPR_8RegClassID: + case ARM::QPR_VFP2RegClassID: + case ARM::SPRRegClassID: + case ARM::SPR_8RegClassID: + // Avoid reusing S, D, and Q registers. + // Don't increase register pressure for QQ and QQQQ. + return true; + default: + return false; + } +} + +bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + if (!EnableBasePointer) + return false; + + if (needsStackRealignment(MF) && MFI->hasVarSizedObjects()) + return true; + + // Thumb has trouble with negative offsets from the FP. Thumb2 has a limited + // negative range for ldr/str (255), and thumb1 is positive offsets only. + // It's going to be better to use the SP or Base Pointer instead. When there + // are variable sized objects, we can't reference off of the SP, so we + // reserve a Base Pointer. + if (AFI->isThumbFunction() && MFI->hasVarSizedObjects()) { + // Conservatively estimate whether the negative offset from the frame + // pointer will be sufficient to reach. If a function has a smallish + // frame, it's less likely to have lots of spills and callee saved + // space, so it's all more likely to be within range of the frame pointer. + // If it's wrong, the scavenger will still enable access to work, it just + // won't be optimal. + if (AFI->isThumb2Function() && MFI->getLocalFrameSize() < 128) + return false; + return true; + } + + return false; +} + +bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + // We can't realign the stack if: + // 1. Dynamic stack realignment is explicitly disabled, + // 2. This is a Thumb1 function (it's not useful, so we don't bother), or + // 3. There are VLAs in the function and the base pointer is disabled. + return (RealignStack && !AFI->isThumb1OnlyFunction() && + (!MFI->hasVarSizedObjects() || EnableBasePointer)); +} + +bool ARMBaseRegisterInfo:: +needsStackRealignment(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *F = MF.getFunction(); + unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment(); + bool requiresRealignment = ((MFI->getLocalFrameMaxAlign() > StackAlign) || + F->hasFnAttr(Attribute::StackAlignment)); + + return requiresRealignment && canRealignStack(MF); +} + +bool ARMBaseRegisterInfo:: +cannotEliminateFrame(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + if (DisableFramePointerElim(MF) && MFI->adjustsStack()) + return true; + return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() + || needsStackRealignment(MF); +} + +unsigned +ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + if (TFI->hasFP(MF)) + return FramePtr; + return ARM::SP; +} + +unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const { + llvm_unreachable("What is the exception register"); + return 0; +} + +unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const { + llvm_unreachable("What is the exception handler register"); + return 0; +} + +unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg, + const MachineFunction &MF) const { + switch (Reg) { + default: break; + // Return 0 if either register of the pair is a special register. + // So no R12, etc. + case ARM::R1: return ARM::R0; + case ARM::R3: return ARM::R2; + case ARM::R5: return ARM::R4; + case ARM::R7: + return (isReservedReg(MF, ARM::R7) || isReservedReg(MF, ARM::R6)) + ? 0 : ARM::R6; + case ARM::R9: return isReservedReg(MF, ARM::R9) ? 0 :ARM::R8; + case ARM::R11: return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10; + + case ARM::S1: return ARM::S0; + case ARM::S3: return ARM::S2; + case ARM::S5: return ARM::S4; + case ARM::S7: return ARM::S6; + case ARM::S9: return ARM::S8; + case ARM::S11: return ARM::S10; + case ARM::S13: return ARM::S12; + case ARM::S15: return ARM::S14; + case ARM::S17: return ARM::S16; + case ARM::S19: return ARM::S18; + case ARM::S21: return ARM::S20; + case ARM::S23: return ARM::S22; + case ARM::S25: return ARM::S24; + case ARM::S27: return ARM::S26; + case ARM::S29: return ARM::S28; + case ARM::S31: return ARM::S30; + + case ARM::D1: return ARM::D0; + case ARM::D3: return ARM::D2; + case ARM::D5: return ARM::D4; + case ARM::D7: return ARM::D6; + case ARM::D9: return ARM::D8; + case ARM::D11: return ARM::D10; + case ARM::D13: return ARM::D12; + case ARM::D15: return ARM::D14; + case ARM::D17: return ARM::D16; + case ARM::D19: return ARM::D18; + case ARM::D21: return ARM::D20; + case ARM::D23: return ARM::D22; + case ARM::D25: return ARM::D24; + case ARM::D27: return ARM::D26; + case ARM::D29: return ARM::D28; + case ARM::D31: return ARM::D30; + } + + return 0; +} + +unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg, + const MachineFunction &MF) const { + switch (Reg) { + default: break; + // Return 0 if either register of the pair is a special register. + // So no R12, etc. + case ARM::R0: return ARM::R1; + case ARM::R2: return ARM::R3; + case ARM::R4: return ARM::R5; + case ARM::R6: + return (isReservedReg(MF, ARM::R7) || isReservedReg(MF, ARM::R6)) + ? 0 : ARM::R7; + case ARM::R8: return isReservedReg(MF, ARM::R9) ? 0 :ARM::R9; + case ARM::R10: return isReservedReg(MF, ARM::R11) ? 0 : ARM::R11; + + case ARM::S0: return ARM::S1; + case ARM::S2: return ARM::S3; + case ARM::S4: return ARM::S5; + case ARM::S6: return ARM::S7; + case ARM::S8: return ARM::S9; + case ARM::S10: return ARM::S11; + case ARM::S12: return ARM::S13; + case ARM::S14: return ARM::S15; + case ARM::S16: return ARM::S17; + case ARM::S18: return ARM::S19; + case ARM::S20: return ARM::S21; + case ARM::S22: return ARM::S23; + case ARM::S24: return ARM::S25; + case ARM::S26: return ARM::S27; + case ARM::S28: return ARM::S29; + case ARM::S30: return ARM::S31; + + case ARM::D0: return ARM::D1; + case ARM::D2: return ARM::D3; + case ARM::D4: return ARM::D5; + case ARM::D6: return ARM::D7; + case ARM::D8: return ARM::D9; + case ARM::D10: return ARM::D11; + case ARM::D12: return ARM::D13; + case ARM::D14: return ARM::D15; + case ARM::D16: return ARM::D17; + case ARM::D18: return ARM::D19; + case ARM::D20: return ARM::D21; + case ARM::D22: return ARM::D23; + case ARM::D24: return ARM::D25; + case ARM::D26: return ARM::D27; + case ARM::D28: return ARM::D29; + case ARM::D30: return ARM::D31; + } + + return 0; +} + +/// emitLoadConstPool - Emits a load from constpool to materialize the +/// specified immediate. +void ARMBaseRegisterInfo:: +emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred, + unsigned PredReg, unsigned MIFlags) const { + MachineFunction &MF = *MBB.getParent(); + MachineConstantPool *ConstantPool = MF.getConstantPool(); + const Constant *C = + ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx) + .addImm(0).addImm(Pred).addReg(PredReg) + .setMIFlags(MIFlags); +} + +bool ARMBaseRegisterInfo:: +requiresRegisterScavenging(const MachineFunction &MF) const { + return true; +} + +bool ARMBaseRegisterInfo:: +requiresFrameIndexScavenging(const MachineFunction &MF) const { + return true; +} + +bool ARMBaseRegisterInfo:: +requiresVirtualBaseRegisters(const MachineFunction &MF) const { + return EnableLocalStackAlloc; +} + +static void +emitSPUpdate(bool isARM, + MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + DebugLoc dl, const ARMBaseInstrInfo &TII, + int NumBytes, + ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) { + if (isARM) + emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, + Pred, PredReg, TII); + else + emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, + Pred, PredReg, TII); +} + + +void ARMBaseRegisterInfo:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + if (!TFI->hasReservedCallFrame(MF)) { + // If we have alloca, convert as follows: + // ADJCALLSTACKDOWN -> sub, sp, sp, amount + // ADJCALLSTACKUP -> add, sp, sp, amount + MachineInstr *Old = I; + DebugLoc dl = Old->getDebugLoc(); + unsigned Amount = Old->getOperand(0).getImm(); + if (Amount != 0) { + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. + unsigned Align = TFI->getStackAlignment(); + Amount = (Amount+Align-1)/Align*Align; + + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + assert(!AFI->isThumb1OnlyFunction() && + "This eliminateCallFramePseudoInstr does not support Thumb1!"); + bool isARM = !AFI->isThumbFunction(); + + // Replace the pseudo instruction with a new instruction... + unsigned Opc = Old->getOpcode(); + int PIdx = Old->findFirstPredOperandIdx(); + ARMCC::CondCodes Pred = (PIdx == -1) + ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(PIdx).getImm(); + if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { + // Note: PredReg is operand 2 for ADJCALLSTACKDOWN. + unsigned PredReg = Old->getOperand(2).getReg(); + emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, Pred, PredReg); + } else { + // Note: PredReg is operand 3 for ADJCALLSTACKUP. + unsigned PredReg = Old->getOperand(3).getReg(); + assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); + emitSPUpdate(isARM, MBB, I, dl, TII, Amount, Pred, PredReg); + } + } + } + MBB.erase(I); +} + +int64_t ARMBaseRegisterInfo:: +getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const { + const MCInstrDesc &Desc = MI->getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + int64_t InstrOffs = 0;; + int Scale = 1; + unsigned ImmIdx = 0; + switch (AddrMode) { + case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i12: + case ARMII::AddrMode_i12: + InstrOffs = MI->getOperand(Idx+1).getImm(); + Scale = 1; + break; + case ARMII::AddrMode5: { + // VFP address mode. + const MachineOperand &OffOp = MI->getOperand(Idx+1); + InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm()); + if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub) + InstrOffs = -InstrOffs; + Scale = 4; + break; + } + case ARMII::AddrMode2: { + ImmIdx = Idx+2; + InstrOffs = ARM_AM::getAM2Offset(MI->getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM2Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs = -InstrOffs; + break; + } + case ARMII::AddrMode3: { + ImmIdx = Idx+2; + InstrOffs = ARM_AM::getAM3Offset(MI->getOperand(ImmIdx).getImm()); + if (ARM_AM::getAM3Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub) + InstrOffs = -InstrOffs; + break; + } + case ARMII::AddrModeT1_s: { + ImmIdx = Idx+1; + InstrOffs = MI->getOperand(ImmIdx).getImm(); + Scale = 4; + break; + } + default: + llvm_unreachable("Unsupported addressing mode!"); + break; + } + + return InstrOffs * Scale; +} + +/// needsFrameBaseReg - Returns true if the instruction's frame index +/// reference would be better served by a base register other than FP +/// or SP. Used by LocalStackFrameAllocation to determine which frame index +/// references it should create new base registers for. +bool ARMBaseRegisterInfo:: +needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { + for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) { + assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!"); + } + + // It's the load/store FI references that cause issues, as it can be difficult + // to materialize the offset if it won't fit in the literal field. Estimate + // based on the size of the local frame and some conservative assumptions + // about the rest of the stack frame (note, this is pre-regalloc, so + // we don't know everything for certain yet) whether this offset is likely + // to be out of range of the immediate. Return true if so. + + // We only generate virtual base registers for loads and stores, so + // return false for everything else. + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case ARM::LDRi12: case ARM::LDRH: case ARM::LDRBi12: + case ARM::STRi12: case ARM::STRH: case ARM::STRBi12: + case ARM::t2LDRi12: case ARM::t2LDRi8: + case ARM::t2STRi12: case ARM::t2STRi8: + case ARM::VLDRS: case ARM::VLDRD: + case ARM::VSTRS: case ARM::VSTRD: + case ARM::tSTRspi: case ARM::tLDRspi: + if (ForceAllBaseRegAlloc) + return true; + break; + default: + return false; + } + + // Without a virtual base register, if the function has variable sized + // objects, all fixed-size local references will be via the frame pointer, + // Approximate the offset and see if it's legal for the instruction. + // Note that the incoming offset is based on the SP value at function entry, + // so it'll be negative. + MachineFunction &MF = *MI->getParent()->getParent(); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + // Estimate an offset from the frame pointer. + // Conservatively assume all callee-saved registers get pushed. R4-R6 + // will be earlier than the FP, so we ignore those. + // R7, LR + int64_t FPOffset = Offset - 8; + // ARM and Thumb2 functions also need to consider R8-R11 and D8-D15 + if (!AFI->isThumbFunction() || !AFI->isThumb1OnlyFunction()) + FPOffset -= 80; + // Estimate an offset from the stack pointer. + // The incoming offset is relating to the SP at the start of the function, + // but when we access the local it'll be relative to the SP after local + // allocation, so adjust our SP-relative offset by that allocation size. + Offset = -Offset; + Offset += MFI->getLocalFrameSize(); + // Assume that we'll have at least some spill slots allocated. + // FIXME: This is a total SWAG number. We should run some statistics + // and pick a real one. + Offset += 128; // 128 bytes of spill slots + + // If there is a frame pointer, try using it. + // The FP is only available if there is no dynamic realignment. We + // don't know for sure yet whether we'll need that, so we guess based + // on whether there are any local variables that would trigger it. + unsigned StackAlign = TFI->getStackAlignment(); + if (TFI->hasFP(MF) && + !((MFI->getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) { + if (isFrameOffsetLegal(MI, FPOffset)) + return false; + } + // If we can reference via the stack pointer, try that. + // FIXME: This (and the code that resolves the references) can be improved + // to only disallow SP relative references in the live range of + // the VLA(s). In practice, it's unclear how much difference that + // would make, but it may be worth doing. + if (!MFI->hasVarSizedObjects() && isFrameOffsetLegal(MI, Offset)) + return false; + + // The offset likely isn't legal, we want to allocate a virtual base register. + return true; +} + +/// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to +/// be a pointer to FrameIdx at the beginning of the basic block. +void ARMBaseRegisterInfo:: +materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, int FrameIdx, + int64_t Offset) const { + ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>(); + unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : + (AFI->isThumb1OnlyFunction() ? ARM::tADDrSPi : ARM::t2ADDri); + + MachineBasicBlock::iterator Ins = MBB->begin(); + DebugLoc DL; // Defaults to "unknown" + if (Ins != MBB->end()) + DL = Ins->getDebugLoc(); + + const MCInstrDesc &MCID = TII.get(ADDriOpc); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this)); + + MachineInstrBuilder MIB = AddDefaultPred(BuildMI(*MBB, Ins, DL, MCID, BaseReg) + .addFrameIndex(FrameIdx).addImm(Offset)); + + if (!AFI->isThumb1OnlyFunction()) + AddDefaultCC(MIB); +} + +void +ARMBaseRegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I, + unsigned BaseReg, int64_t Offset) const { + MachineInstr &MI = *I; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + int Off = Offset; // ARM doesn't need the general 64-bit offsets + unsigned i = 0; + + assert(!AFI->isThumb1OnlyFunction() && + "This resolveFrameIndex does not support Thumb1!"); + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + bool Done = false; + if (!AFI->isThumbFunction()) + Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII); + else { + assert(AFI->isThumb2Function()); + Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII); + } + assert (Done && "Unable to resolve frame index!"); + (void)Done; +} + +bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, + int64_t Offset) const { + const MCInstrDesc &Desc = MI->getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + unsigned i = 0; + + while (!MI->getOperand(i).isFI()) { + ++i; + assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!"); + } + + // AddrMode4 and AddrMode6 cannot handle any offset. + if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6) + return Offset == 0; + + unsigned NumBits = 0; + unsigned Scale = 1; + bool isSigned = true; + switch (AddrMode) { + case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i12: + // i8 supports only negative, and i12 supports only positive, so + // based on Offset sign, consider the appropriate instruction + Scale = 1; + if (Offset < 0) { + NumBits = 8; + Offset = -Offset; + } else { + NumBits = 12; + } + break; + case ARMII::AddrMode5: + // VFP address mode. + NumBits = 8; + Scale = 4; + break; + case ARMII::AddrMode_i12: + case ARMII::AddrMode2: + NumBits = 12; + break; + case ARMII::AddrMode3: + NumBits = 8; + break; + case ARMII::AddrModeT1_s: + NumBits = 5; + Scale = 4; + isSigned = false; + break; + default: + llvm_unreachable("Unsupported addressing mode!"); + break; + } + + Offset += getFrameIndexInstrOffset(MI, i); + // Make sure the offset is encodable for instructions that scale the + // immediate. + if ((Offset & (Scale-1)) != 0) + return false; + + if (isSigned && Offset < 0) + Offset = -Offset; + + unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) + return true; + + return false; +} + +void +ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS) const { + unsigned i = 0; + MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const ARMFrameLowering *TFI = + static_cast<const ARMFrameLowering*>(MF.getTarget().getFrameLowering()); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + assert(!AFI->isThumb1OnlyFunction() && + "This eliminateFrameIndex does not support Thumb1!"); + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + + int FrameIndex = MI.getOperand(i).getIndex(); + unsigned FrameReg; + + int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj); + + // Special handling of dbg_value instructions. + if (MI.isDebugValue()) { + MI.getOperand(i). ChangeToRegister(FrameReg, false /*isDef*/); + MI.getOperand(i+1).ChangeToImmediate(Offset); + return; + } + + // Modify MI as necessary to handle as much of 'Offset' as possible + bool Done = false; + if (!AFI->isThumbFunction()) + Done = rewriteARMFrameIndex(MI, i, FrameReg, Offset, TII); + else { + assert(AFI->isThumb2Function()); + Done = rewriteT2FrameIndex(MI, i, FrameReg, Offset, TII); + } + if (Done) + return; + + // If we get here, the immediate doesn't fit into the instruction. We folded + // as much as possible above, handle the rest, providing a register that is + // SP+LargeImm. + assert((Offset || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 || + (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) && + "This code isn't needed if offset already handled!"); + + unsigned ScratchReg = 0; + int PIdx = MI.findFirstPredOperandIdx(); + ARMCC::CondCodes Pred = (PIdx == -1) + ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm(); + unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg(); + if (Offset == 0) + // Must be addrmode4/6. + MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false); + else { + ScratchReg = MF.getRegInfo().createVirtualRegister(ARM::GPRRegisterClass); + if (!AFI->isThumbFunction()) + emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, + Offset, Pred, PredReg, TII); + else { + assert(AFI->isThumb2Function()); + emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, + Offset, Pred, PredReg, TII); + } + // Update the original instruction to use the scratch register. + MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true); + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h new file mode 100644 index 0000000..fee17ff --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -0,0 +1,201 @@ +//===- ARMBaseRegisterInfo.h - ARM Register Information Impl ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the base ARM implementation of TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMBASEREGISTERINFO_H +#define ARMBASEREGISTERINFO_H + +#include "ARM.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#include "ARMGenRegisterInfo.inc" + +namespace llvm { + class ARMSubtarget; + class ARMBaseInstrInfo; + class Type; + +/// Register allocation hints. +namespace ARMRI { + enum { + RegPairOdd = 1, + RegPairEven = 2 + }; +} + +/// isARMArea1Register - Returns true if the register is a low register (r0-r7) +/// or a stack/pc register that we should push/pop. +static inline bool isARMArea1Register(unsigned Reg, bool isDarwin) { + using namespace ARM; + switch (Reg) { + case R0: case R1: case R2: case R3: + case R4: case R5: case R6: case R7: + case LR: case SP: case PC: + return true; + case R8: case R9: case R10: case R11: + // For darwin we want r7 and lr to be next to each other. + return !isDarwin; + default: + return false; + } +} + +static inline bool isARMArea2Register(unsigned Reg, bool isDarwin) { + using namespace ARM; + switch (Reg) { + case R8: case R9: case R10: case R11: + // Darwin has this second area. + return isDarwin; + default: + return false; + } +} + +static inline bool isARMArea3Register(unsigned Reg, bool isDarwin) { + using namespace ARM; + switch (Reg) { + case D15: case D14: case D13: case D12: + case D11: case D10: case D9: case D8: + return true; + default: + return false; + } +} + +class ARMBaseRegisterInfo : public ARMGenRegisterInfo { +protected: + const ARMBaseInstrInfo &TII; + const ARMSubtarget &STI; + + /// FramePtr - ARM physical register used as frame ptr. + unsigned FramePtr; + + /// BasePtr - ARM physical register used as a base ptr in complex stack + /// frames. I.e., when we need a 3rd base, not just SP and FP, due to + /// variable size stack objects. + unsigned BasePtr; + + // Can be only subclassed. + explicit ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, + const ARMSubtarget &STI); + + // Return the opcode that implements 'Op', or 0 if no opcode + unsigned getOpcode(int Op) const; + +public: + /// Code Generation virtual methods... + const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; + + BitVector getReservedRegs(const MachineFunction &MF) const; + + /// getMatchingSuperRegClass - Return a subclass of the specified register + /// class A so that each register in it has a sub-register of the + /// specified sub-register index which is in the specified register class B. + virtual const TargetRegisterClass * + getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, unsigned Idx) const; + + /// canCombineSubRegIndices - Given a register class and a list of + /// subregister indices, return true if it's possible to combine the + /// subregister indices into one that corresponds to a larger + /// subregister. Return the new subregister index by reference. Note the + /// new index may be zero if the given subregisters can be combined to + /// form the whole register. + virtual bool canCombineSubRegIndices(const TargetRegisterClass *RC, + SmallVectorImpl<unsigned> &SubIndices, + unsigned &NewSubIdx) const; + + const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass* + getCrossCopyRegClass(const TargetRegisterClass *RC) const; + + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const; + + ArrayRef<unsigned> getRawAllocationOrder(const TargetRegisterClass *RC, + unsigned HintType, unsigned HintReg, + const MachineFunction &MF) const; + + unsigned ResolveRegAllocHint(unsigned Type, unsigned Reg, + const MachineFunction &MF) const; + + void UpdateRegAllocHint(unsigned Reg, unsigned NewReg, + MachineFunction &MF) const; + + virtual bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const; + + bool hasBasePointer(const MachineFunction &MF) const; + + bool canRealignStack(const MachineFunction &MF) const; + bool needsStackRealignment(const MachineFunction &MF) const; + int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const; + bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const; + void materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, int FrameIdx, + int64_t Offset) const; + void resolveFrameIndex(MachineBasicBlock::iterator I, + unsigned BaseReg, int64_t Offset) const; + bool isFrameOffsetLegal(const MachineInstr *MI, int64_t Offset) const; + + bool cannotEliminateFrame(const MachineFunction &MF) const; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const; + unsigned getBaseRegister() const { return BasePtr; } + + // Exception handling queries. + unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; + + bool isLowRegister(unsigned Reg) const; + + + /// emitLoadConstPool - Emits a load from constpool to materialize the + /// specified immediate. + virtual void emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0, + unsigned MIFlags = MachineInstr::NoFlags)const; + + /// Code Generation virtual methods... + virtual bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; + + virtual bool requiresRegisterScavenging(const MachineFunction &MF) const; + + virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const; + + virtual bool requiresVirtualBaseRegisters(const MachineFunction &MF) const; + + virtual void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + virtual void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; + +private: + unsigned getRegisterPairEven(unsigned Reg, const MachineFunction &MF) const; + + unsigned getRegisterPairOdd(unsigned Reg, const MachineFunction &MF) const; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h b/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h new file mode 100644 index 0000000..69eddf0 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h @@ -0,0 +1,131 @@ +//===-------- ARMBuildAttrs.h - ARM Build Attributes ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains enumerations and support routines for ARM build attributes +// as defined in ARM ABI addenda document (ABI release 2.08). +// +//===----------------------------------------------------------------------===// + +#ifndef __TARGET_ARMBUILDATTRS_H__ +#define __TARGET_ARMBUILDATTRS_H__ + +namespace ARMBuildAttrs { + enum SpecialAttr { + // This is for the .cpu asm attr. It translates into one or more + // AttrType (below) entries in the .ARM.attributes section in the ELF. + SEL_CPU + }; + + enum AttrType { + // Rest correspond to ELF/.ARM.attributes + File = 1, + Section = 2, + Symbol = 3, + CPU_raw_name = 4, + CPU_name = 5, + CPU_arch = 6, + CPU_arch_profile = 7, + ARM_ISA_use = 8, + THUMB_ISA_use = 9, + VFP_arch = 10, + WMMX_arch = 11, + Advanced_SIMD_arch = 12, + PCS_config = 13, + ABI_PCS_R9_use = 14, + ABI_PCS_RW_data = 15, + ABI_PCS_RO_data = 16, + ABI_PCS_GOT_use = 17, + ABI_PCS_wchar_t = 18, + ABI_FP_rounding = 19, + ABI_FP_denormal = 20, + ABI_FP_exceptions = 21, + ABI_FP_user_exceptions = 22, + ABI_FP_number_model = 23, + ABI_align8_needed = 24, + ABI_align8_preserved = 25, + ABI_enum_size = 26, + ABI_HardFP_use = 27, + ABI_VFP_args = 28, + ABI_WMMX_args = 29, + ABI_optimization_goals = 30, + ABI_FP_optimization_goals = 31, + compatibility = 32, + CPU_unaligned_access = 34, + VFP_HP_extension = 36, + ABI_FP_16bit_format = 38, + MPextension_use = 42, // was 70, 2.08 ABI + DIV_use = 44, + nodefaults = 64, + also_compatible_with = 65, + T2EE_use = 66, + conformance = 67, + Virtualization_use = 68, + MPextension_use_old = 70 + }; + + // Magic numbers for .ARM.attributes + enum AttrMagic { + Format_Version = 0x41 + }; + + // Legal Values for CPU_arch, (=6), uleb128 + enum CPUArch { + Pre_v4 = 0, + v4 = 1, // e.g. SA110 + v4T = 2, // e.g. ARM7TDMI + v5T = 3, // e.g. ARM9TDMI + v5TE = 4, // e.g. ARM946E_S + v5TEJ = 5, // e.g. ARM926EJ_S + v6 = 6, // e.g. ARM1136J_S + v6KZ = 7, // e.g. ARM1176JZ_S + v6T2 = 8, // e.g. ARM1156T2F_S + v6K = 9, // e.g. ARM1136J_S + v7 = 10, // e.g. Cortex A8, Cortex M3 + v6_M = 11, // e.g. Cortex M1 + v6S_M = 12, // v6_M with the System extensions + v7E_M = 13 // v7_M with DSP extensions + }; + + enum CPUArchProfile { // (=7), uleb128 + Not_Applicable = 0, // pre v7, or cross-profile code + ApplicationProfile = (0x41), // 'A' (e.g. for Cortex A8) + RealTimeProfile = (0x52), // 'R' (e.g. for Cortex R4) + MicroControllerProfile = (0x4D), // 'M' (e.g. for Cortex M3) + SystemProfile = (0x53) // 'S' Application or real-time profile + }; + + // The following have a lot of common use cases + enum { + //ARMISAUse (=8), uleb128 and THUMBISAUse (=9), uleb128 + Not_Allowed = 0, + Allowed = 1, + + // FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10) + AllowFPv2 = 2, // v2 FP ISA permitted (implies use of the v1 FP ISA) + AllowFPv3A = 3, // v3 FP ISA permitted (implies use of the v2 FP ISA) + AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31 + AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA) + AllowFPv4B = 6, // v4 FP ISA was permitted, but only D0-D15, S0-S31 + + // Tag_WMMX_arch, (=11), uleb128 + AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions) + + // Tag_WMMX_arch, (=11), uleb128 + AllowWMMXv1 = 2, // The user permitted this entity to use WMMX v2 + + // Tag_ABI_FP_denormal, (=20), uleb128 + PreserveFPSign = 2, // sign when flushed-to-zero is preserved + + // Tag_ABI_FP_number_model, (=23), uleb128 + AllowRTABI = 2, // numbers, infinities, and one quiet NaN (see [RTABI]) + AllowIEE754 = 3 // this code to use all the IEEE 754-defined FP encodings + }; +} + +#endif // __TARGET_ARMBUILDATTRS_H__ diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h new file mode 100644 index 0000000..ff7db1f --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h @@ -0,0 +1,160 @@ +//===-- ARMCallingConv.h - ARM Custom Calling Convention Routines ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the ARM Calling Convention that +// aren't done by tablegen. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMCALLINGCONV_H +#define ARMCALLINGCONV_H + +#include "llvm/CallingConv.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "ARMBaseInstrInfo.h" +#include "ARMRegisterInfo.h" +#include "ARMSubtarget.h" +#include "ARM.h" + +namespace llvm { + +// APCS f64 is in register pairs, possibly split to stack +static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + CCState &State, bool CanFail) { + static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + + // Try to get the first register. + if (unsigned Reg = State.AllocateReg(RegList, 4)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else { + // For the 2nd half of a v2f64, do not fail. + if (CanFail) + return false; + + // Put the whole thing on the stack. + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(8, 4), + LocVT, LocInfo)); + return true; + } + + // Try to get the second register. + if (unsigned Reg = State.AllocateReg(RegList, 4)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(4, 4), + LocVT, LocInfo)); + return true; +} + +static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) + return false; + if (LocVT == MVT::v2f64 && + !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) + return false; + return true; // we handled it +} + +// AAPCS f64 is in aligned register pairs +static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + CCState &State, bool CanFail) { + static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; + static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; + static const unsigned ShadowRegList[] = { ARM::R0, ARM::R1 }; + + unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2); + if (Reg == 0) { + // For the 2nd half of a v2f64, do not just fail. + if (CanFail) + return false; + + // Put the whole thing on the stack. + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(8, 8), + LocVT, LocInfo)); + return true; + } + + unsigned i; + for (i = 0; i < 2; ++i) + if (HiRegList[i] == Reg) + break; + + unsigned T = State.AllocateReg(LoRegList[i]); + (void)T; + assert(T == LoRegList[i] && "Could not allocate register"); + + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], + LocVT, LocInfo)); + return true; +} + +static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) + return false; + if (LocVT == MVT::v2f64 && + !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) + return false; + return true; // we handled it +} + +static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, CCState &State) { + static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; + static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; + + unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2); + if (Reg == 0) + return false; // we didn't handle it + + unsigned i; + for (i = 0; i < 2; ++i) + if (HiRegList[i] == Reg) + break; + + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], + LocVT, LocInfo)); + return true; +} + +static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) + return false; + if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) + return false; + return true; // we handled it +} + +static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, + State); +} + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td new file mode 100644 index 0000000..d2981c0 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td @@ -0,0 +1,164 @@ +//===- ARMCallingConv.td - Calling Conventions for ARM -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This describes the calling conventions for ARM architecture. +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. +class CCIfSubtarget<string F, CCAction A>: + CCIf<!strconcat("State.getTarget().getSubtarget<ARMSubtarget>().", F), A>; + +/// CCIfAlign - Match of the original alignment of the arg +class CCIfAlign<string Align, CCAction A>: + CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; + +//===----------------------------------------------------------------------===// +// ARM APCS Calling Convention +//===----------------------------------------------------------------------===// +def CC_ARM_APCS : CallingConv<[ + + // Handles byval parameters. + CCIfByVal<CCPassByVal<4, 4>>, + + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack + CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>, + + CCIfType<[f32], CCBitConvertToType<i32>>, + CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>, + + CCIfType<[i32], CCAssignToStack<4, 4>>, + CCIfType<[f64], CCAssignToStack<8, 4>>, + CCIfType<[v2f64], CCAssignToStack<16, 4>> +]>; + +def RetCC_ARM_APCS : CallingConv<[ + CCIfType<[f32], CCBitConvertToType<i32>>, + + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>, + + CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>> +]>; + +//===----------------------------------------------------------------------===// +// ARM APCS Calling Convention for FastCC (when VFP2 or later is available) +//===----------------------------------------------------------------------===// +def FastCC_ARM_APCS : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<CC_ARM_APCS> +]>; + +def RetFastCC_ARM_APCS : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<RetCC_ARM_APCS> +]>; + + +//===----------------------------------------------------------------------===// +// ARM AAPCS (EABI) Calling Convention, common parts +//===----------------------------------------------------------------------===// + +def CC_ARM_AAPCS_Common : CallingConv<[ + + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // i64/f64 is passed in even pairs of GPRs + // i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register + // (and the same is true for f64 if VFP is not enabled) + CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>, + CCIfType<[i32], CCIf<"State.getNextStackOffset() == 0 &&" + "ArgFlags.getOrigAlign() != 8", + CCAssignToReg<[R0, R1, R2, R3]>>>, + + CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, R3>>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[f64], CCAssignToStack<8, 8>>, + CCIfType<[v2f64], CCAssignToStack<16, 8>> +]>; + +def RetCC_ARM_AAPCS_Common : CallingConv<[ + CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>> +]>; + +//===----------------------------------------------------------------------===// +// ARM AAPCS (EABI) Calling Convention +//===----------------------------------------------------------------------===// + +def CC_ARM_AAPCS : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>, + CCIfType<[f32], CCBitConvertToType<i32>>, + CCDelegateTo<CC_ARM_AAPCS_Common> +]>; + +def RetCC_ARM_AAPCS : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>, + CCIfType<[f32], CCBitConvertToType<i32>>, + CCDelegateTo<RetCC_ARM_AAPCS_Common> +]>; + +//===----------------------------------------------------------------------===// +// ARM AAPCS-VFP (EABI) Calling Convention +// Also used for FastCC (when VFP2 or later is available) +//===----------------------------------------------------------------------===// + +def CC_ARM_AAPCS_VFP : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<CC_ARM_AAPCS_Common> +]>; + +def RetCC_ARM_AAPCS_VFP : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<RetCC_ARM_AAPCS_Common> +]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp new file mode 100644 index 0000000..4148d4a --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp @@ -0,0 +1,1903 @@ +//===-- ARM/ARMCodeEmitter.cpp - Convert ARM code to machine code ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the pass that transforms the ARM machine instructions into +// relocatable machine code. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jit" +#include "ARM.h" +#include "ARMConstantPoolValue.h" +#include "ARMInstrInfo.h" +#include "ARMRelocations.h" +#include "ARMSubtarget.h" +#include "ARMTargetMachine.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#ifndef NDEBUG +#include <iomanip> +#endif +using namespace llvm; + +STATISTIC(NumEmitted, "Number of machine instructions emitted"); + +namespace { + + class ARMCodeEmitter : public MachineFunctionPass { + ARMJITInfo *JTI; + const ARMInstrInfo *II; + const TargetData *TD; + const ARMSubtarget *Subtarget; + TargetMachine &TM; + JITCodeEmitter &MCE; + MachineModuleInfo *MMI; + const std::vector<MachineConstantPoolEntry> *MCPEs; + const std::vector<MachineJumpTableEntry> *MJTEs; + bool IsPIC; + bool IsThumb; + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineModuleInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + static char ID; + public: + ARMCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) + : MachineFunctionPass(ID), JTI(0), + II((const ARMInstrInfo *)tm.getInstrInfo()), + TD(tm.getTargetData()), TM(tm), + MCE(mce), MCPEs(0), MJTEs(0), + IsPIC(TM.getRelocationModel() == Reloc::PIC_), IsThumb(false) {} + + /// getBinaryCodeForInstr - This function, generated by the + /// CodeEmitterGenerator using TableGen, produces the binary encoding for + /// machine instructions. + unsigned getBinaryCodeForInstr(const MachineInstr &MI) const; + + bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "ARM Machine Code Emitter"; + } + + void emitInstruction(const MachineInstr &MI); + + private: + + void emitWordLE(unsigned Binary); + void emitDWordLE(uint64_t Binary); + void emitConstPoolInstruction(const MachineInstr &MI); + void emitMOVi32immInstruction(const MachineInstr &MI); + void emitMOVi2piecesInstruction(const MachineInstr &MI); + void emitLEApcrelJTInstruction(const MachineInstr &MI); + void emitPseudoMoveInstruction(const MachineInstr &MI); + void addPCLabel(unsigned LabelID); + void emitPseudoInstruction(const MachineInstr &MI); + unsigned getMachineSoRegOpValue(const MachineInstr &MI, + const MCInstrDesc &MCID, + const MachineOperand &MO, + unsigned OpIdx); + + unsigned getMachineSoImmOpValue(unsigned SoImm); + unsigned getAddrModeSBit(const MachineInstr &MI, + const MCInstrDesc &MCID) const; + + void emitDataProcessingInstruction(const MachineInstr &MI, + unsigned ImplicitRd = 0, + unsigned ImplicitRn = 0); + + void emitLoadStoreInstruction(const MachineInstr &MI, + unsigned ImplicitRd = 0, + unsigned ImplicitRn = 0); + + void emitMiscLoadStoreInstruction(const MachineInstr &MI, + unsigned ImplicitRn = 0); + + void emitLoadStoreMultipleInstruction(const MachineInstr &MI); + + void emitMulFrmInstruction(const MachineInstr &MI); + + void emitExtendInstruction(const MachineInstr &MI); + + void emitMiscArithInstruction(const MachineInstr &MI); + + void emitSaturateInstruction(const MachineInstr &MI); + + void emitBranchInstruction(const MachineInstr &MI); + + void emitInlineJumpTable(unsigned JTIndex); + + void emitMiscBranchInstruction(const MachineInstr &MI); + + void emitVFPArithInstruction(const MachineInstr &MI); + + void emitVFPConversionInstruction(const MachineInstr &MI); + + void emitVFPLoadStoreInstruction(const MachineInstr &MI); + + void emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI); + + void emitNEONLaneInstruction(const MachineInstr &MI); + void emitNEONDupInstruction(const MachineInstr &MI); + void emitNEON1RegModImmInstruction(const MachineInstr &MI); + void emitNEON2RegInstruction(const MachineInstr &MI); + void emitNEON3RegInstruction(const MachineInstr &MI); + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MachineInstr &MI, + const MachineOperand &MO) const; + unsigned getMachineOpValue(const MachineInstr &MI, unsigned OpIdx) const { + return getMachineOpValue(MI, MI.getOperand(OpIdx)); + } + + // FIXME: The legacy JIT ARMCodeEmitter doesn't rely on the the + // TableGen'erated getBinaryCodeForInstr() function to encode any + // operand values, instead querying getMachineOpValue() directly for + // each operand it needs to encode. Thus, any of the new encoder + // helper functions can simply return 0 as the values the return + // are already handled elsewhere. They are placeholders to allow this + // encoder to continue to function until the MC encoder is sufficiently + // far along that this one can be eliminated entirely. + unsigned NEONThumb2DataIPostEncoder(const MachineInstr &MI, unsigned Val) + const { return 0; } + unsigned NEONThumb2LoadStorePostEncoder(const MachineInstr &MI,unsigned Val) + const { return 0; } + unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val) + const { return 0; } + unsigned VFPThumb2PostEncoder(const MachineInstr&MI, unsigned Val) + const { return 0; } + unsigned getAdrLabelOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbAdrLabelOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbBLTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbBLXTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbBRTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbBCCTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbCBTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getUnconditionalBranchTargetOpValue(const MachineInstr &MI, + unsigned Op) const { return 0; } + unsigned getARMBranchTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getARMBLXTargetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getCCOutOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getSOImmOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2SOImmOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getSORegRegOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getSORegImmOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getThumbAddrModeRegRegOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2AddrModeImm12OpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2AddrModeImm8OpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2Imm8s4OpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2AddrModeImm8s4OpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2AddrModeImm0_1020s4OpValue(const MachineInstr &MI,unsigned Op) + const { return 0; } + unsigned getT2AddrModeImm8OffsetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2AddrModeImm12OffsetOpValue(const MachineInstr &MI,unsigned Op) + const { return 0; } + unsigned getT2AddrModeSORegOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2SORegOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getT2AdrLabelOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getAddrMode6AddressOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getAddrMode6OneLane32AddressOpValue(const MachineInstr &MI, + unsigned Op) + const { return 0; } + unsigned getAddrMode6DupAddressOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getAddrMode6OffsetOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getBitfieldInvertedMaskOpValue(const MachineInstr &MI, + unsigned Op) const { return 0; } + unsigned getSsatBitPosValue(const MachineInstr &MI, + unsigned Op) const { return 0; } + uint32_t getLdStmModeOpValue(const MachineInstr &MI, unsigned OpIdx) + const {return 0; } + uint32_t getLdStSORegOpValue(const MachineInstr &MI, unsigned OpIdx) + const { return 0; } + + unsigned getAddrModeImm12OpValue(const MachineInstr &MI, unsigned Op) + const { + // {17-13} = reg + // {12} = (U)nsigned (add == '1', sub == '0') + // {11-0} = imm12 + const MachineOperand &MO = MI.getOperand(Op); + const MachineOperand &MO1 = MI.getOperand(Op + 1); + if (!MO.isReg()) { + emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry); + return 0; + } + unsigned Reg = getARMRegisterNumbering(MO.getReg()); + int32_t Imm12 = MO1.getImm(); + uint32_t Binary; + Binary = Imm12 & 0xfff; + if (Imm12 >= 0) + Binary |= (1 << 12); + Binary |= (Reg << 13); + return Binary; + } + + unsigned getHiLo16ImmOpValue(const MachineInstr &MI, unsigned Op) const { + return 0; + } + + uint32_t getAddrMode2OpValue(const MachineInstr &MI, unsigned OpIdx) + const { return 0;} + uint32_t getAddrMode2OffsetOpValue(const MachineInstr &MI, unsigned OpIdx) + const { return 0;} + uint32_t getPostIdxRegOpValue(const MachineInstr &MI, unsigned OpIdx) + const { return 0;} + uint32_t getAddrMode3OffsetOpValue(const MachineInstr &MI, unsigned OpIdx) + const { return 0;} + uint32_t getAddrMode3OpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + uint32_t getAddrModeThumbSPOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + uint32_t getAddrModeSOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + uint32_t getAddrModeISOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + uint32_t getAddrModePCOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + uint32_t getAddrMode5OpValue(const MachineInstr &MI, unsigned Op) const { + // {17-13} = reg + // {12} = (U)nsigned (add == '1', sub == '0') + // {11-0} = imm12 + const MachineOperand &MO = MI.getOperand(Op); + const MachineOperand &MO1 = MI.getOperand(Op + 1); + if (!MO.isReg()) { + emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry); + return 0; + } + unsigned Reg = getARMRegisterNumbering(MO.getReg()); + int32_t Imm12 = MO1.getImm(); + + // Special value for #-0 + if (Imm12 == INT32_MIN) + Imm12 = 0; + + // Immediate is always encoded as positive. The 'U' bit controls add vs + // sub. + bool isAdd = true; + if (Imm12 < 0) { + Imm12 = -Imm12; + isAdd = false; + } + + uint32_t Binary = Imm12 & 0xfff; + if (isAdd) + Binary |= (1 << 12); + Binary |= (Reg << 13); + return Binary; + } + unsigned getNEONVcvtImm32OpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + + unsigned getRegisterListOpValue(const MachineInstr &MI, unsigned Op) + const { return 0; } + + unsigned getShiftRight8Imm(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getShiftRight16Imm(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getShiftRight32Imm(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getShiftRight64Imm(const MachineInstr &MI, unsigned Op) + const { return 0; } + + /// getMovi32Value - Return binary encoding of operand for movw/movt. If the + /// machine operand requires relocation, record the relocation and return + /// zero. + unsigned getMovi32Value(const MachineInstr &MI,const MachineOperand &MO, + unsigned Reloc); + + /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value. + /// + unsigned getShiftOp(unsigned Imm) const ; + + /// Routines that handle operands which add machine relocations which are + /// fixed up by the relocation stage. + void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, + bool MayNeedFarStub, bool Indirect, + intptr_t ACPV = 0) const; + void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const; + void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const; + void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const; + void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc, + intptr_t JTBase = 0) const; + }; +} + +char ARMCodeEmitter::ID = 0; + +/// createARMJITCodeEmitterPass - Return a pass that emits the collected ARM +/// code to the specified MCE object. +FunctionPass *llvm::createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM, + JITCodeEmitter &JCE) { + return new ARMCodeEmitter(TM, JCE); +} + +bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) { + assert((MF.getTarget().getRelocationModel() != Reloc::Default || + MF.getTarget().getRelocationModel() != Reloc::Static) && + "JIT relocation model must be set to static or default!"); + JTI = ((ARMTargetMachine &)MF.getTarget()).getJITInfo(); + II = ((const ARMTargetMachine &)MF.getTarget()).getInstrInfo(); + TD = ((const ARMTargetMachine &)MF.getTarget()).getTargetData(); + Subtarget = &TM.getSubtarget<ARMSubtarget>(); + MCPEs = &MF.getConstantPool()->getConstants(); + MJTEs = 0; + if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables(); + IsPIC = TM.getRelocationModel() == Reloc::PIC_; + IsThumb = MF.getInfo<ARMFunctionInfo>()->isThumbFunction(); + JTI->Initialize(MF, IsPIC); + MMI = &getAnalysis<MachineModuleInfo>(); + MCE.setModuleInfo(MMI); + + do { + DEBUG(errs() << "JITTing function '" + << MF.getFunction()->getName() << "'\n"); + MCE.startFunction(MF); + for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); + MBB != E; ++MBB) { + MCE.StartMachineBasicBlock(MBB); + for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) + emitInstruction(*I); + } + } while (MCE.finishFunction(MF)); + + return false; +} + +/// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value. +/// +unsigned ARMCodeEmitter::getShiftOp(unsigned Imm) const { + switch (ARM_AM::getAM2ShiftOpc(Imm)) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::asr: return 2; + case ARM_AM::lsl: return 0; + case ARM_AM::lsr: return 1; + case ARM_AM::ror: + case ARM_AM::rrx: return 3; + } + return 0; +} + +/// getMovi32Value - Return binary encoding of operand for movw/movt. If the +/// machine operand requires relocation, record the relocation and return zero. +unsigned ARMCodeEmitter::getMovi32Value(const MachineInstr &MI, + const MachineOperand &MO, + unsigned Reloc) { + assert(((Reloc == ARM::reloc_arm_movt) || (Reloc == ARM::reloc_arm_movw)) + && "Relocation to this function should be for movt or movw"); + + if (MO.isImm()) + return static_cast<unsigned>(MO.getImm()); + else if (MO.isGlobal()) + emitGlobalAddress(MO.getGlobal(), Reloc, true, false); + else if (MO.isSymbol()) + emitExternalSymbolAddress(MO.getSymbolName(), Reloc); + else if (MO.isMBB()) + emitMachineBasicBlock(MO.getMBB(), Reloc); + else { +#ifndef NDEBUG + errs() << MO; +#endif + llvm_unreachable("Unsupported operand type for movw/movt"); + } + return 0; +} + +/// getMachineOpValue - Return binary encoding of operand. If the machine +/// operand requires relocation, record the relocation and return zero. +unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI, + const MachineOperand &MO) const { + if (MO.isReg()) + return getARMRegisterNumbering(MO.getReg()); + else if (MO.isImm()) + return static_cast<unsigned>(MO.getImm()); + else if (MO.isGlobal()) + emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true, false); + else if (MO.isSymbol()) + emitExternalSymbolAddress(MO.getSymbolName(), ARM::reloc_arm_branch); + else if (MO.isCPI()) { + const MCInstrDesc &MCID = MI.getDesc(); + // For VFP load, the immediate offset is multiplied by 4. + unsigned Reloc = ((MCID.TSFlags & ARMII::FormMask) == ARMII::VFPLdStFrm) + ? ARM::reloc_arm_vfp_cp_entry : ARM::reloc_arm_cp_entry; + emitConstPoolAddress(MO.getIndex(), Reloc); + } else if (MO.isJTI()) + emitJumpTableAddress(MO.getIndex(), ARM::reloc_arm_relative); + else if (MO.isMBB()) + emitMachineBasicBlock(MO.getMBB(), ARM::reloc_arm_branch); + else + llvm_unreachable("Unable to encode MachineOperand!"); + return 0; +} + +/// emitGlobalAddress - Emit the specified address to the code stream. +/// +void ARMCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, + bool MayNeedFarStub, bool Indirect, + intptr_t ACPV) const { + MachineRelocation MR = Indirect + ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc, + const_cast<GlobalValue *>(GV), + ACPV, MayNeedFarStub) + : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc, + const_cast<GlobalValue *>(GV), ACPV, + MayNeedFarStub); + MCE.addRelocation(MR); +} + +/// emitExternalSymbolAddress - Arrange for the address of an external symbol to +/// be emitted to the current location in the function, and allow it to be PC +/// relative. +void ARMCodeEmitter:: +emitExternalSymbolAddress(const char *ES, unsigned Reloc) const { + MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(), + Reloc, ES)); +} + +/// emitConstPoolAddress - Arrange for the address of an constant pool +/// to be emitted to the current location in the function, and allow it to be PC +/// relative. +void ARMCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const { + // Tell JIT emitter we'll resolve the address. + MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(), + Reloc, CPI, 0, true)); +} + +/// emitJumpTableAddress - Arrange for the address of a jump table to +/// be emitted to the current location in the function, and allow it to be PC +/// relative. +void ARMCodeEmitter:: +emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const { + MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(), + Reloc, JTIndex, 0, true)); +} + +/// emitMachineBasicBlock - Emit the specified address basic block. +void ARMCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB, + unsigned Reloc, + intptr_t JTBase) const { + MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(), + Reloc, BB, JTBase)); +} + +void ARMCodeEmitter::emitWordLE(unsigned Binary) { + DEBUG(errs() << " 0x"; + errs().write_hex(Binary) << "\n"); + MCE.emitWordLE(Binary); +} + +void ARMCodeEmitter::emitDWordLE(uint64_t Binary) { + DEBUG(errs() << " 0x"; + errs().write_hex(Binary) << "\n"); + MCE.emitDWordLE(Binary); +} + +void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) { + DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI); + + MCE.processDebugLoc(MI.getDebugLoc(), true); + + ++NumEmitted; // Keep track of the # of mi's emitted + switch (MI.getDesc().TSFlags & ARMII::FormMask) { + default: { + llvm_unreachable("Unhandled instruction encoding format!"); + break; + } + case ARMII::MiscFrm: + if (MI.getOpcode() == ARM::LEApcrelJT) { + // Materialize jumptable address. + emitLEApcrelJTInstruction(MI); + break; + } + llvm_unreachable("Unhandled instruction encoding!"); + break; + case ARMII::Pseudo: + emitPseudoInstruction(MI); + break; + case ARMII::DPFrm: + case ARMII::DPSoRegFrm: + emitDataProcessingInstruction(MI); + break; + case ARMII::LdFrm: + case ARMII::StFrm: + emitLoadStoreInstruction(MI); + break; + case ARMII::LdMiscFrm: + case ARMII::StMiscFrm: + emitMiscLoadStoreInstruction(MI); + break; + case ARMII::LdStMulFrm: + emitLoadStoreMultipleInstruction(MI); + break; + case ARMII::MulFrm: + emitMulFrmInstruction(MI); + break; + case ARMII::ExtFrm: + emitExtendInstruction(MI); + break; + case ARMII::ArithMiscFrm: + emitMiscArithInstruction(MI); + break; + case ARMII::SatFrm: + emitSaturateInstruction(MI); + break; + case ARMII::BrFrm: + emitBranchInstruction(MI); + break; + case ARMII::BrMiscFrm: + emitMiscBranchInstruction(MI); + break; + // VFP instructions. + case ARMII::VFPUnaryFrm: + case ARMII::VFPBinaryFrm: + emitVFPArithInstruction(MI); + break; + case ARMII::VFPConv1Frm: + case ARMII::VFPConv2Frm: + case ARMII::VFPConv3Frm: + case ARMII::VFPConv4Frm: + case ARMII::VFPConv5Frm: + emitVFPConversionInstruction(MI); + break; + case ARMII::VFPLdStFrm: + emitVFPLoadStoreInstruction(MI); + break; + case ARMII::VFPLdStMulFrm: + emitVFPLoadStoreMultipleInstruction(MI); + break; + + // NEON instructions. + case ARMII::NGetLnFrm: + case ARMII::NSetLnFrm: + emitNEONLaneInstruction(MI); + break; + case ARMII::NDupFrm: + emitNEONDupInstruction(MI); + break; + case ARMII::N1RegModImmFrm: + emitNEON1RegModImmInstruction(MI); + break; + case ARMII::N2RegFrm: + emitNEON2RegInstruction(MI); + break; + case ARMII::N3RegFrm: + emitNEON3RegInstruction(MI); + break; + } + MCE.processDebugLoc(MI.getDebugLoc(), false); +} + +void ARMCodeEmitter::emitConstPoolInstruction(const MachineInstr &MI) { + unsigned CPI = MI.getOperand(0).getImm(); // CP instruction index. + unsigned CPIndex = MI.getOperand(1).getIndex(); // Actual cp entry index. + const MachineConstantPoolEntry &MCPE = (*MCPEs)[CPIndex]; + + // Remember the CONSTPOOL_ENTRY address for later relocation. + JTI->addConstantPoolEntryAddr(CPI, MCE.getCurrentPCValue()); + + // Emit constpool island entry. In most cases, the actual values will be + // resolved and relocated after code emission. + if (MCPE.isMachineConstantPoolEntry()) { + ARMConstantPoolValue *ACPV = + static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal); + + DEBUG(errs() << " ** ARM constant pool #" << CPI << " @ " + << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n'); + + assert(ACPV->isGlobalValue() && "unsupported constant pool value"); + const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV(); + if (GV) { + Reloc::Model RelocM = TM.getRelocationModel(); + emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry, + isa<Function>(GV), + Subtarget->GVIsIndirectSymbol(GV, RelocM), + (intptr_t)ACPV); + } else { + const char *Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol(); + emitExternalSymbolAddress(Sym, ARM::reloc_arm_absolute); + } + emitWordLE(0); + } else { + const Constant *CV = MCPE.Val.ConstVal; + + DEBUG({ + errs() << " ** Constant pool #" << CPI << " @ " + << (void*)MCE.getCurrentPCValue() << " "; + if (const Function *F = dyn_cast<Function>(CV)) + errs() << F->getName(); + else + errs() << *CV; + errs() << '\n'; + }); + + if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) { + emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV), false); + emitWordLE(0); + } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) { + uint32_t Val = uint32_t(*CI->getValue().getRawData()); + emitWordLE(Val); + } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) { + if (CFP->getType()->isFloatTy()) + emitWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); + else if (CFP->getType()->isDoubleTy()) + emitDWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); + else { + llvm_unreachable("Unable to handle this constantpool entry!"); + } + } else { + llvm_unreachable("Unable to handle this constantpool entry!"); + } + } +} + +void ARMCodeEmitter::emitMOVi32immInstruction(const MachineInstr &MI) { + const MachineOperand &MO0 = MI.getOperand(0); + const MachineOperand &MO1 = MI.getOperand(1); + + // Emit the 'movw' instruction. + unsigned Binary = 0x30 << 20; // mov: Insts{27-20} = 0b00110000 + + unsigned Lo16 = getMovi32Value(MI, MO1, ARM::reloc_arm_movw) & 0xFFFF; + + // Set the conditional execution predicate. + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode Rd. + Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift; + + // Encode imm16 as imm4:imm12 + Binary |= Lo16 & 0xFFF; // Insts{11-0} = imm12 + Binary |= ((Lo16 >> 12) & 0xF) << 16; // Insts{19-16} = imm4 + emitWordLE(Binary); + + unsigned Hi16 = getMovi32Value(MI, MO1, ARM::reloc_arm_movt) >> 16; + // Emit the 'movt' instruction. + Binary = 0x34 << 20; // movt: Insts{27-20} = 0b00110100 + + // Set the conditional execution predicate. + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode Rd. + Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift; + + // Encode imm16 as imm4:imm1, same as movw above. + Binary |= Hi16 & 0xFFF; + Binary |= ((Hi16 >> 12) & 0xF) << 16; + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitMOVi2piecesInstruction(const MachineInstr &MI) { + const MachineOperand &MO0 = MI.getOperand(0); + const MachineOperand &MO1 = MI.getOperand(1); + assert(MO1.isImm() && ARM_AM::isSOImmTwoPartVal(MO1.getImm()) && + "Not a valid so_imm value!"); + unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO1.getImm()); + unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO1.getImm()); + + // Emit the 'mov' instruction. + unsigned Binary = 0xd << 21; // mov: Insts{24-21} = 0b1101 + + // Set the conditional execution predicate. + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode Rd. + Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift; + + // Encode so_imm. + // Set bit I(25) to identify this is the immediate form of <shifter_op> + Binary |= 1 << ARMII::I_BitShift; + Binary |= getMachineSoImmOpValue(V1); + emitWordLE(Binary); + + // Now the 'orr' instruction. + Binary = 0xc << 21; // orr: Insts{24-21} = 0b1100 + + // Set the conditional execution predicate. + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode Rd. + Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift; + + // Encode Rn. + Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRnShift; + + // Encode so_imm. + // Set bit I(25) to identify this is the immediate form of <shifter_op> + Binary |= 1 << ARMII::I_BitShift; + Binary |= getMachineSoImmOpValue(V2); + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitLEApcrelJTInstruction(const MachineInstr &MI) { + // It's basically add r, pc, (LJTI - $+8) + + const MCInstrDesc &MCID = MI.getDesc(); + + // Emit the 'add' instruction. + unsigned Binary = 0x4 << 21; // add: Insts{24-21} = 0b0100 + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode S bit if MI modifies CPSR. + Binary |= getAddrModeSBit(MI, MCID); + + // Encode Rd. + Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift; + + // Encode Rn which is PC. + Binary |= getARMRegisterNumbering(ARM::PC) << ARMII::RegRnShift; + + // Encode the displacement. + Binary |= 1 << ARMII::I_BitShift; + emitJumpTableAddress(MI.getOperand(1).getIndex(), ARM::reloc_arm_jt_base); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitPseudoMoveInstruction(const MachineInstr &MI) { + unsigned Opcode = MI.getDesc().Opcode; + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode S bit if MI modifies CPSR. + if (Opcode == ARM::MOVsrl_flag || Opcode == ARM::MOVsra_flag) + Binary |= 1 << ARMII::S_BitShift; + + // Encode register def if there is one. + Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift; + + // Encode the shift operation. + switch (Opcode) { + default: break; + case ARM::RRX: + // rrx + Binary |= 0x6 << 4; + break; + case ARM::MOVsrl_flag: + // lsr #1 + Binary |= (0x2 << 4) | (1 << 7); + break; + case ARM::MOVsra_flag: + // asr #1 + Binary |= (0x4 << 4) | (1 << 7); + break; + } + + // Encode register Rm. + Binary |= getMachineOpValue(MI, 1); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::addPCLabel(unsigned LabelID) { + DEBUG(errs() << " ** LPC" << LabelID << " @ " + << (void*)MCE.getCurrentPCValue() << '\n'); + JTI->addPCLabelAddr(LabelID, MCE.getCurrentPCValue()); +} + +void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) { + unsigned Opcode = MI.getDesc().Opcode; + switch (Opcode) { + default: + llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction"); + case ARM::BX_CALL: + case ARM::BMOVPCRX_CALL: + case ARM::BXr9_CALL: + case ARM::BMOVPCRXr9_CALL: { + // First emit mov lr, pc + unsigned Binary = 0x01a0e00f; + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + emitWordLE(Binary); + + // and then emit the branch. + emitMiscBranchInstruction(MI); + break; + } + case TargetOpcode::INLINEASM: { + // We allow inline assembler nodes with empty bodies - they can + // implicitly define registers, which is ok for JIT. + if (MI.getOperand(0).getSymbolName()[0]) { + report_fatal_error("JIT does not support inline asm!"); + } + break; + } + case TargetOpcode::PROLOG_LABEL: + case TargetOpcode::EH_LABEL: + MCE.emitLabel(MI.getOperand(0).getMCSymbol()); + break; + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + // Do nothing. + break; + case ARM::CONSTPOOL_ENTRY: + emitConstPoolInstruction(MI); + break; + case ARM::PICADD: { + // Remember of the address of the PC label for relocation later. + addPCLabel(MI.getOperand(2).getImm()); + // PICADD is just an add instruction that implicitly read pc. + emitDataProcessingInstruction(MI, 0, ARM::PC); + break; + } + case ARM::PICLDR: + case ARM::PICLDRB: + case ARM::PICSTR: + case ARM::PICSTRB: { + // Remember of the address of the PC label for relocation later. + addPCLabel(MI.getOperand(2).getImm()); + // These are just load / store instructions that implicitly read pc. + emitLoadStoreInstruction(MI, 0, ARM::PC); + break; + } + case ARM::PICLDRH: + case ARM::PICLDRSH: + case ARM::PICLDRSB: + case ARM::PICSTRH: { + // Remember of the address of the PC label for relocation later. + addPCLabel(MI.getOperand(2).getImm()); + // These are just load / store instructions that implicitly read pc. + emitMiscLoadStoreInstruction(MI, ARM::PC); + break; + } + + case ARM::MOVi32imm: + // Two instructions to materialize a constant. + if (Subtarget->hasV6T2Ops()) + emitMOVi32immInstruction(MI); + else + emitMOVi2piecesInstruction(MI); + break; + + case ARM::LEApcrelJT: + // Materialize jumptable address. + emitLEApcrelJTInstruction(MI); + break; + case ARM::RRX: + case ARM::MOVsrl_flag: + case ARM::MOVsra_flag: + emitPseudoMoveInstruction(MI); + break; + } +} + +unsigned ARMCodeEmitter::getMachineSoRegOpValue(const MachineInstr &MI, + const MCInstrDesc &MCID, + const MachineOperand &MO, + unsigned OpIdx) { + unsigned Binary = getMachineOpValue(MI, MO); + + const MachineOperand &MO1 = MI.getOperand(OpIdx + 1); + const MachineOperand &MO2 = MI.getOperand(OpIdx + 2); + ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm()); + + // Encode the shift opcode. + unsigned SBits = 0; + unsigned Rs = MO1.getReg(); + if (Rs) { + // Set shift operand (bit[7:4]). + // LSL - 0001 + // LSR - 0011 + // ASR - 0101 + // ROR - 0111 + // RRX - 0110 and bit[11:8] clear. + switch (SOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::lsl: SBits = 0x1; break; + case ARM_AM::lsr: SBits = 0x3; break; + case ARM_AM::asr: SBits = 0x5; break; + case ARM_AM::ror: SBits = 0x7; break; + case ARM_AM::rrx: SBits = 0x6; break; + } + } else { + // Set shift operand (bit[6:4]). + // LSL - 000 + // LSR - 010 + // ASR - 100 + // ROR - 110 + switch (SOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::lsl: SBits = 0x0; break; + case ARM_AM::lsr: SBits = 0x2; break; + case ARM_AM::asr: SBits = 0x4; break; + case ARM_AM::ror: SBits = 0x6; break; + } + } + Binary |= SBits << 4; + if (SOpc == ARM_AM::rrx) + return Binary; + + // Encode the shift operation Rs or shift_imm (except rrx). + if (Rs) { + // Encode Rs bit[11:8]. + assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0); + return Binary | (getARMRegisterNumbering(Rs) << ARMII::RegRsShift); + } + + // Encode shift_imm bit[11:7]. + return Binary | ARM_AM::getSORegOffset(MO2.getImm()) << 7; +} + +unsigned ARMCodeEmitter::getMachineSoImmOpValue(unsigned SoImm) { + int SoImmVal = ARM_AM::getSOImmVal(SoImm); + assert(SoImmVal != -1 && "Not a valid so_imm value!"); + + // Encode rotate_imm. + unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1) + << ARMII::SoRotImmShift; + + // Encode immed_8. + Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal); + return Binary; +} + +unsigned ARMCodeEmitter::getAddrModeSBit(const MachineInstr &MI, + const MCInstrDesc &MCID) const { + for (unsigned i = MI.getNumOperands(), e = MCID.getNumOperands(); i >= e;--i){ + const MachineOperand &MO = MI.getOperand(i-1); + if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) + return 1 << ARMII::S_BitShift; + } + return 0; +} + +void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI, + unsigned ImplicitRd, + unsigned ImplicitRn) { + const MCInstrDesc &MCID = MI.getDesc(); + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode S bit if MI modifies CPSR. + Binary |= getAddrModeSBit(MI, MCID); + + // Encode register def if there is one. + unsigned NumDefs = MCID.getNumDefs(); + unsigned OpIdx = 0; + if (NumDefs) + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift; + else if (ImplicitRd) + // Special handling for implicit use (e.g. PC). + Binary |= (getARMRegisterNumbering(ImplicitRd) << ARMII::RegRdShift); + + if (MCID.Opcode == ARM::MOVi16) { + // Get immediate from MI. + unsigned Lo16 = getMovi32Value(MI, MI.getOperand(OpIdx), + ARM::reloc_arm_movw); + // Encode imm which is the same as in emitMOVi32immInstruction(). + Binary |= Lo16 & 0xFFF; + Binary |= ((Lo16 >> 12) & 0xF) << 16; + emitWordLE(Binary); + return; + } else if(MCID.Opcode == ARM::MOVTi16) { + unsigned Hi16 = (getMovi32Value(MI, MI.getOperand(OpIdx), + ARM::reloc_arm_movt) >> 16); + Binary |= Hi16 & 0xFFF; + Binary |= ((Hi16 >> 12) & 0xF) << 16; + emitWordLE(Binary); + return; + } else if ((MCID.Opcode == ARM::BFC) || (MCID.Opcode == ARM::BFI)) { + uint32_t v = ~MI.getOperand(2).getImm(); + int32_t lsb = CountTrailingZeros_32(v); + int32_t msb = (32 - CountLeadingZeros_32(v)) - 1; + // Instr{20-16} = msb, Instr{11-7} = lsb + Binary |= (msb & 0x1F) << 16; + Binary |= (lsb & 0x1F) << 7; + emitWordLE(Binary); + return; + } else if ((MCID.Opcode == ARM::UBFX) || (MCID.Opcode == ARM::SBFX)) { + // Encode Rn in Instr{0-3} + Binary |= getMachineOpValue(MI, OpIdx++); + + uint32_t lsb = MI.getOperand(OpIdx++).getImm(); + uint32_t widthm1 = MI.getOperand(OpIdx++).getImm() - 1; + + // Instr{20-16} = widthm1, Instr{11-7} = lsb + Binary |= (widthm1 & 0x1F) << 16; + Binary |= (lsb & 0x1F) << 7; + emitWordLE(Binary); + return; + } + + // If this is a two-address operand, skip it. e.g. MOVCCr operand 1. + if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) + ++OpIdx; + + // Encode first non-shifter register operand if there is one. + bool isUnary = MCID.TSFlags & ARMII::UnaryDP; + if (!isUnary) { + if (ImplicitRn) + // Special handling for implicit use (e.g. PC). + Binary |= (getARMRegisterNumbering(ImplicitRn) << ARMII::RegRnShift); + else { + Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRnShift; + ++OpIdx; + } + } + + // Encode shifter operand. + const MachineOperand &MO = MI.getOperand(OpIdx); + if ((MCID.TSFlags & ARMII::FormMask) == ARMII::DPSoRegFrm) { + // Encode SoReg. + emitWordLE(Binary | getMachineSoRegOpValue(MI, MCID, MO, OpIdx)); + return; + } + + if (MO.isReg()) { + // Encode register Rm. + emitWordLE(Binary | getARMRegisterNumbering(MO.getReg())); + return; + } + + // Encode so_imm. + Binary |= getMachineSoImmOpValue((unsigned)MO.getImm()); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI, + unsigned ImplicitRd, + unsigned ImplicitRn) { + const MCInstrDesc &MCID = MI.getDesc(); + unsigned Form = MCID.TSFlags & ARMII::FormMask; + bool IsPrePost = (MCID.TSFlags & ARMII::IndexModeMask) != 0; + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // If this is an LDRi12, STRi12 or LDRcp, nothing more needs be done. + if (MI.getOpcode() == ARM::LDRi12 || MI.getOpcode() == ARM::LDRcp || + MI.getOpcode() == ARM::STRi12) { + emitWordLE(Binary); + return; + } + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + unsigned OpIdx = 0; + + // Operand 0 of a pre- and post-indexed store is the address base + // writeback. Skip it. + bool Skipped = false; + if (IsPrePost && Form == ARMII::StFrm) { + ++OpIdx; + Skipped = true; + } + + // Set first operand + if (ImplicitRd) + // Special handling for implicit use (e.g. PC). + Binary |= (getARMRegisterNumbering(ImplicitRd) << ARMII::RegRdShift); + else + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift; + + // Set second operand + if (ImplicitRn) + // Special handling for implicit use (e.g. PC). + Binary |= (getARMRegisterNumbering(ImplicitRn) << ARMII::RegRnShift); + else + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift; + + // If this is a two-address operand, skip it. e.g. LDR_PRE. + if (!Skipped && MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) + ++OpIdx; + + const MachineOperand &MO2 = MI.getOperand(OpIdx); + unsigned AM2Opc = (ImplicitRn == ARM::PC) + ? 0 : MI.getOperand(OpIdx+1).getImm(); + + // Set bit U(23) according to sign of immed value (positive or negative). + Binary |= ((ARM_AM::getAM2Op(AM2Opc) == ARM_AM::add ? 1 : 0) << + ARMII::U_BitShift); + if (!MO2.getReg()) { // is immediate + if (ARM_AM::getAM2Offset(AM2Opc)) + // Set the value of offset_12 field + Binary |= ARM_AM::getAM2Offset(AM2Opc); + emitWordLE(Binary); + return; + } + + // Set bit I(25), because this is not in immediate encoding. + Binary |= 1 << ARMII::I_BitShift; + assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg())); + // Set bit[3:0] to the corresponding Rm register + Binary |= getARMRegisterNumbering(MO2.getReg()); + + // If this instr is in scaled register offset/index instruction, set + // shift_immed(bit[11:7]) and shift(bit[6:5]) fields. + if (unsigned ShImm = ARM_AM::getAM2Offset(AM2Opc)) { + Binary |= getShiftOp(AM2Opc) << ARMII::ShiftImmShift; // shift + Binary |= ShImm << ARMII::ShiftShift; // shift_immed + } + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitMiscLoadStoreInstruction(const MachineInstr &MI, + unsigned ImplicitRn) { + const MCInstrDesc &MCID = MI.getDesc(); + unsigned Form = MCID.TSFlags & ARMII::FormMask; + bool IsPrePost = (MCID.TSFlags & ARMII::IndexModeMask) != 0; + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + unsigned OpIdx = 0; + + // Operand 0 of a pre- and post-indexed store is the address base + // writeback. Skip it. + bool Skipped = false; + if (IsPrePost && Form == ARMII::StMiscFrm) { + ++OpIdx; + Skipped = true; + } + + // Set first operand + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift; + + // Skip LDRD and STRD's second operand. + if (MCID.Opcode == ARM::LDRD || MCID.Opcode == ARM::STRD) + ++OpIdx; + + // Set second operand + if (ImplicitRn) + // Special handling for implicit use (e.g. PC). + Binary |= (getARMRegisterNumbering(ImplicitRn) << ARMII::RegRnShift); + else + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift; + + // If this is a two-address operand, skip it. e.g. LDRH_POST. + if (!Skipped && MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) + ++OpIdx; + + const MachineOperand &MO2 = MI.getOperand(OpIdx); + unsigned AM3Opc = (ImplicitRn == ARM::PC) + ? 0 : MI.getOperand(OpIdx+1).getImm(); + + // Set bit U(23) according to sign of immed value (positive or negative) + Binary |= ((ARM_AM::getAM3Op(AM3Opc) == ARM_AM::add ? 1 : 0) << + ARMII::U_BitShift); + + // If this instr is in register offset/index encoding, set bit[3:0] + // to the corresponding Rm register. + if (MO2.getReg()) { + Binary |= getARMRegisterNumbering(MO2.getReg()); + emitWordLE(Binary); + return; + } + + // This instr is in immediate offset/index encoding, set bit 22 to 1. + Binary |= 1 << ARMII::AM3_I_BitShift; + if (unsigned ImmOffs = ARM_AM::getAM3Offset(AM3Opc)) { + // Set operands + Binary |= (ImmOffs >> 4) << ARMII::ImmHiShift; // immedH + Binary |= (ImmOffs & 0xF); // immedL + } + + emitWordLE(Binary); +} + +static unsigned getAddrModeUPBits(unsigned Mode) { + unsigned Binary = 0; + + // Set addressing mode by modifying bits U(23) and P(24) + // IA - Increment after - bit U = 1 and bit P = 0 + // IB - Increment before - bit U = 1 and bit P = 1 + // DA - Decrement after - bit U = 0 and bit P = 0 + // DB - Decrement before - bit U = 0 and bit P = 1 + switch (Mode) { + default: llvm_unreachable("Unknown addressing sub-mode!"); + case ARM_AM::da: break; + case ARM_AM::db: Binary |= 0x1 << ARMII::P_BitShift; break; + case ARM_AM::ia: Binary |= 0x1 << ARMII::U_BitShift; break; + case ARM_AM::ib: Binary |= 0x3 << ARMII::U_BitShift; break; + } + + return Binary; +} + +void ARMCodeEmitter::emitLoadStoreMultipleInstruction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + bool IsUpdating = (MCID.TSFlags & ARMII::IndexModeMask) != 0; + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Skip operand 0 of an instruction with base register update. + unsigned OpIdx = 0; + if (IsUpdating) + ++OpIdx; + + // Set base address operand + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift; + + // Set addressing mode by modifying bits U(23) and P(24) + ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(MI.getOpcode()); + Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(Mode)); + + // Set bit W(21) + if (IsUpdating) + Binary |= 0x1 << ARMII::W_BitShift; + + // Set registers + for (unsigned i = OpIdx+2, e = MI.getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() || MO.isImplicit()) + break; + unsigned RegNum = getARMRegisterNumbering(MO.getReg()); + assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && + RegNum < 16); + Binary |= 0x1 << RegNum; + } + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitMulFrmInstruction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode S bit if MI modifies CPSR. + Binary |= getAddrModeSBit(MI, MCID); + + // 32x32->64bit operations have two destination registers. The number + // of register definitions will tell us if that's what we're dealing with. + unsigned OpIdx = 0; + if (MCID.getNumDefs() == 2) + Binary |= getMachineOpValue (MI, OpIdx++) << ARMII::RegRdLoShift; + + // Encode Rd + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdHiShift; + + // Encode Rm + Binary |= getMachineOpValue(MI, OpIdx++); + + // Encode Rs + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRsShift; + + // Many multiple instructions (e.g. MLA) have three src operands. Encode + // it as Rn (for multiply, that's in the same offset as RdLo. + if (MCID.getNumOperands() > OpIdx && + !MCID.OpInfo[OpIdx].isPredicate() && + !MCID.OpInfo[OpIdx].isOptionalDef()) + Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRdLoShift; + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitExtendInstruction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + unsigned OpIdx = 0; + + // Encode Rd + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift; + + const MachineOperand &MO1 = MI.getOperand(OpIdx++); + const MachineOperand &MO2 = MI.getOperand(OpIdx); + if (MO2.isReg()) { + // Two register operand form. + // Encode Rn. + Binary |= getMachineOpValue(MI, MO1) << ARMII::RegRnShift; + + // Encode Rm. + Binary |= getMachineOpValue(MI, MO2); + ++OpIdx; + } else { + Binary |= getMachineOpValue(MI, MO1); + } + + // Encode rot imm (0, 8, 16, or 24) if it has a rotate immediate operand. + if (MI.getOperand(OpIdx).isImm() && + !MCID.OpInfo[OpIdx].isPredicate() && + !MCID.OpInfo[OpIdx].isOptionalDef()) + Binary |= (getMachineOpValue(MI, OpIdx) / 8) << ARMII::ExtRotImmShift; + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitMiscArithInstruction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // PKH instructions are finished at this point + if (MCID.Opcode == ARM::PKHBT || MCID.Opcode == ARM::PKHTB) { + emitWordLE(Binary); + return; + } + + unsigned OpIdx = 0; + + // Encode Rd + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift; + + const MachineOperand &MO = MI.getOperand(OpIdx++); + if (OpIdx == MCID.getNumOperands() || + MCID.OpInfo[OpIdx].isPredicate() || + MCID.OpInfo[OpIdx].isOptionalDef()) { + // Encode Rm and it's done. + Binary |= getMachineOpValue(MI, MO); + emitWordLE(Binary); + return; + } + + // Encode Rn. + Binary |= getMachineOpValue(MI, MO) << ARMII::RegRnShift; + + // Encode Rm. + Binary |= getMachineOpValue(MI, OpIdx++); + + // Encode shift_imm. + unsigned ShiftAmt = MI.getOperand(OpIdx).getImm(); + if (MCID.Opcode == ARM::PKHTB) { + assert(ShiftAmt != 0 && "PKHTB shift_imm is 0!"); + if (ShiftAmt == 32) + ShiftAmt = 0; + } + assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!"); + Binary |= ShiftAmt << ARMII::ShiftShift; + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitSaturateInstruction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + // Part of binary is determined by TableGen. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Encode Rd + Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift; + + // Encode saturate bit position. + unsigned Pos = MI.getOperand(1).getImm(); + if (MCID.Opcode == ARM::SSAT || MCID.Opcode == ARM::SSAT16) + Pos -= 1; + assert((Pos < 16 || (Pos < 32 && + MCID.Opcode != ARM::SSAT16 && + MCID.Opcode != ARM::USAT16)) && + "saturate bit position out of range"); + Binary |= Pos << 16; + + // Encode Rm + Binary |= getMachineOpValue(MI, 2); + + // Encode shift_imm. + if (MCID.getNumOperands() == 4) { + unsigned ShiftOp = MI.getOperand(3).getImm(); + ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp); + if (Opc == ARM_AM::asr) + Binary |= (1 << 6); + unsigned ShiftAmt = MI.getOperand(3).getImm(); + if (ShiftAmt == 32 && Opc == ARM_AM::asr) + ShiftAmt = 0; + assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!"); + Binary |= ShiftAmt << ARMII::ShiftShift; + } + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitBranchInstruction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + if (MCID.Opcode == ARM::TPsoft) { + llvm_unreachable("ARM::TPsoft FIXME"); // FIXME + } + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Set signed_immed_24 field + Binary |= getMachineOpValue(MI, 0); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitInlineJumpTable(unsigned JTIndex) { + // Remember the base address of the inline jump table. + uintptr_t JTBase = MCE.getCurrentPCValue(); + JTI->addJumpTableBaseAddr(JTIndex, JTBase); + DEBUG(errs() << " ** Jump Table #" << JTIndex << " @ " << (void*)JTBase + << '\n'); + + // Now emit the jump table entries. + const std::vector<MachineBasicBlock*> &MBBs = (*MJTEs)[JTIndex].MBBs; + for (unsigned i = 0, e = MBBs.size(); i != e; ++i) { + if (IsPIC) + // DestBB address - JT base. + emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_pic_jt, JTBase); + else + // Absolute DestBB address. + emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_absolute); + emitWordLE(0); + } +} + +void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + // Handle jump tables. + if (MCID.Opcode == ARM::BR_JTr || MCID.Opcode == ARM::BR_JTadd) { + // First emit a ldr pc, [] instruction. + emitDataProcessingInstruction(MI, ARM::PC); + + // Then emit the inline jump table. + unsigned JTIndex = + (MCID.Opcode == ARM::BR_JTr) + ? MI.getOperand(1).getIndex() : MI.getOperand(2).getIndex(); + emitInlineJumpTable(JTIndex); + return; + } else if (MCID.Opcode == ARM::BR_JTm) { + // First emit a ldr pc, [] instruction. + emitLoadStoreInstruction(MI, ARM::PC); + + // Then emit the inline jump table. + emitInlineJumpTable(MI.getOperand(3).getIndex()); + return; + } + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + if (MCID.Opcode == ARM::BX_RET || MCID.Opcode == ARM::MOVPCLR) + // The return register is LR. + Binary |= getARMRegisterNumbering(ARM::LR); + else + // otherwise, set the return register + Binary |= getMachineOpValue(MI, 0); + + emitWordLE(Binary); +} + +static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegD = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + bool isSPVFP = ARM::SPRRegisterClass->contains(RegD); + RegD = getARMRegisterNumbering(RegD); + if (!isSPVFP) + Binary |= RegD << ARMII::RegRdShift; + else { + Binary |= ((RegD & 0x1E) >> 1) << ARMII::RegRdShift; + Binary |= (RegD & 0x01) << ARMII::D_BitShift; + } + return Binary; +} + +static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegN = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + bool isSPVFP = ARM::SPRRegisterClass->contains(RegN); + RegN = getARMRegisterNumbering(RegN); + if (!isSPVFP) + Binary |= RegN << ARMII::RegRnShift; + else { + Binary |= ((RegN & 0x1E) >> 1) << ARMII::RegRnShift; + Binary |= (RegN & 0x01) << ARMII::N_BitShift; + } + return Binary; +} + +static unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegM = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + bool isSPVFP = ARM::SPRRegisterClass->contains(RegM); + RegM = getARMRegisterNumbering(RegM); + if (!isSPVFP) + Binary |= RegM; + else { + Binary |= ((RegM & 0x1E) >> 1); + Binary |= (RegM & 0x01) << ARMII::M_BitShift; + } + return Binary; +} + +void ARMCodeEmitter::emitVFPArithInstruction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + unsigned OpIdx = 0; + assert((Binary & ARMII::D_BitShift) == 0 && + (Binary & ARMII::N_BitShift) == 0 && + (Binary & ARMII::M_BitShift) == 0 && "VFP encoding bug!"); + + // Encode Dd / Sd. + Binary |= encodeVFPRd(MI, OpIdx++); + + // If this is a two-address operand, skip it, e.g. FMACD. + if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) + ++OpIdx; + + // Encode Dn / Sn. + if ((MCID.TSFlags & ARMII::FormMask) == ARMII::VFPBinaryFrm) + Binary |= encodeVFPRn(MI, OpIdx++); + + if (OpIdx == MCID.getNumOperands() || + MCID.OpInfo[OpIdx].isPredicate() || + MCID.OpInfo[OpIdx].isOptionalDef()) { + // FCMPEZD etc. has only one operand. + emitWordLE(Binary); + return; + } + + // Encode Dm / Sm. + Binary |= encodeVFPRm(MI, OpIdx); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitVFPConversionInstruction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + unsigned Form = MCID.TSFlags & ARMII::FormMask; + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + switch (Form) { + default: break; + case ARMII::VFPConv1Frm: + case ARMII::VFPConv2Frm: + case ARMII::VFPConv3Frm: + // Encode Dd / Sd. + Binary |= encodeVFPRd(MI, 0); + break; + case ARMII::VFPConv4Frm: + // Encode Dn / Sn. + Binary |= encodeVFPRn(MI, 0); + break; + case ARMII::VFPConv5Frm: + // Encode Dm / Sm. + Binary |= encodeVFPRm(MI, 0); + break; + } + + switch (Form) { + default: break; + case ARMII::VFPConv1Frm: + // Encode Dm / Sm. + Binary |= encodeVFPRm(MI, 1); + break; + case ARMII::VFPConv2Frm: + case ARMII::VFPConv3Frm: + // Encode Dn / Sn. + Binary |= encodeVFPRn(MI, 1); + break; + case ARMII::VFPConv4Frm: + case ARMII::VFPConv5Frm: + // Encode Dd / Sd. + Binary |= encodeVFPRd(MI, 1); + break; + } + + if (Form == ARMII::VFPConv5Frm) + // Encode Dn / Sn. + Binary |= encodeVFPRn(MI, 2); + else if (Form == ARMII::VFPConv3Frm) + // Encode Dm / Sm. + Binary |= encodeVFPRm(MI, 2); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitVFPLoadStoreInstruction(const MachineInstr &MI) { + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + unsigned OpIdx = 0; + + // Encode Dd / Sd. + Binary |= encodeVFPRd(MI, OpIdx++); + + // Encode address base. + const MachineOperand &Base = MI.getOperand(OpIdx++); + Binary |= getMachineOpValue(MI, Base) << ARMII::RegRnShift; + + // If there is a non-zero immediate offset, encode it. + if (Base.isReg()) { + const MachineOperand &Offset = MI.getOperand(OpIdx); + if (unsigned ImmOffs = ARM_AM::getAM5Offset(Offset.getImm())) { + if (ARM_AM::getAM5Op(Offset.getImm()) == ARM_AM::add) + Binary |= 1 << ARMII::U_BitShift; + Binary |= ImmOffs; + emitWordLE(Binary); + return; + } + } + + // If immediate offset is omitted, default to +0. + Binary |= 1 << ARMII::U_BitShift; + + emitWordLE(Binary); +} + +void +ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + bool IsUpdating = (MCID.TSFlags & ARMII::IndexModeMask) != 0; + + // Part of binary is determined by TableGn. + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= II->getPredicate(&MI) << ARMII::CondShift; + + // Skip operand 0 of an instruction with base register update. + unsigned OpIdx = 0; + if (IsUpdating) + ++OpIdx; + + // Set base address operand + Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift; + + // Set addressing mode by modifying bits U(23) and P(24) + ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(MI.getOpcode()); + Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(Mode)); + + // Set bit W(21) + if (IsUpdating) + Binary |= 0x1 << ARMII::W_BitShift; + + // First register is encoded in Dd. + Binary |= encodeVFPRd(MI, OpIdx+2); + + // Count the number of registers. + unsigned NumRegs = 1; + for (unsigned i = OpIdx+3, e = MI.getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() || MO.isImplicit()) + break; + ++NumRegs; + } + // Bit 8 will be set if <list> is consecutive 64-bit registers (e.g., D0) + // Otherwise, it will be 0, in the case of 32-bit registers. + if(Binary & 0x100) + Binary |= NumRegs * 2; + else + Binary |= NumRegs; + + emitWordLE(Binary); +} + +static unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegD = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + RegD = getARMRegisterNumbering(RegD); + Binary |= (RegD & 0xf) << ARMII::RegRdShift; + Binary |= ((RegD >> 4) & 1) << ARMII::D_BitShift; + return Binary; +} + +static unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegN = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + RegN = getARMRegisterNumbering(RegN); + Binary |= (RegN & 0xf) << ARMII::RegRnShift; + Binary |= ((RegN >> 4) & 1) << ARMII::N_BitShift; + return Binary; +} + +static unsigned encodeNEONRm(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegM = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + RegM = getARMRegisterNumbering(RegM); + Binary |= (RegM & 0xf); + Binary |= ((RegM >> 4) & 1) << ARMII::M_BitShift; + return Binary; +} + +/// convertNEONDataProcToThumb - Convert the ARM mode encoding for a NEON +/// data-processing instruction to the corresponding Thumb encoding. +static unsigned convertNEONDataProcToThumb(unsigned Binary) { + assert((Binary & 0xfe000000) == 0xf2000000 && + "not an ARM NEON data-processing instruction"); + unsigned UBit = (Binary >> 24) & 1; + return 0xef000000 | (UBit << 28) | (Binary & 0xffffff); +} + +void ARMCodeEmitter::emitNEONLaneInstruction(const MachineInstr &MI) { + unsigned Binary = getBinaryCodeForInstr(MI); + + unsigned RegTOpIdx, RegNOpIdx, LnOpIdx; + const MCInstrDesc &MCID = MI.getDesc(); + if ((MCID.TSFlags & ARMII::FormMask) == ARMII::NGetLnFrm) { + RegTOpIdx = 0; + RegNOpIdx = 1; + LnOpIdx = 2; + } else { // ARMII::NSetLnFrm + RegTOpIdx = 2; + RegNOpIdx = 0; + LnOpIdx = 3; + } + + // Set the conditional execution predicate + Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift; + + unsigned RegT = MI.getOperand(RegTOpIdx).getReg(); + RegT = getARMRegisterNumbering(RegT); + Binary |= (RegT << ARMII::RegRdShift); + Binary |= encodeNEONRn(MI, RegNOpIdx); + + unsigned LaneShift; + if ((Binary & (1 << 22)) != 0) + LaneShift = 0; // 8-bit elements + else if ((Binary & (1 << 5)) != 0) + LaneShift = 1; // 16-bit elements + else + LaneShift = 2; // 32-bit elements + + unsigned Lane = MI.getOperand(LnOpIdx).getImm() << LaneShift; + unsigned Opc1 = Lane >> 2; + unsigned Opc2 = Lane & 3; + assert((Opc1 & 3) == 0 && "out-of-range lane number operand"); + Binary |= (Opc1 << 21); + Binary |= (Opc2 << 5); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEONDupInstruction(const MachineInstr &MI) { + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift; + + unsigned RegT = MI.getOperand(1).getReg(); + RegT = getARMRegisterNumbering(RegT); + Binary |= (RegT << ARMII::RegRdShift); + Binary |= encodeNEONRn(MI, 0); + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEON1RegModImmInstruction(const MachineInstr &MI) { + unsigned Binary = getBinaryCodeForInstr(MI); + // Destination register is encoded in Dd. + Binary |= encodeNEONRd(MI, 0); + // Immediate fields: Op, Cmode, I, Imm3, Imm4 + unsigned Imm = MI.getOperand(1).getImm(); + unsigned Op = (Imm >> 12) & 1; + unsigned Cmode = (Imm >> 8) & 0xf; + unsigned I = (Imm >> 7) & 1; + unsigned Imm3 = (Imm >> 4) & 0x7; + unsigned Imm4 = Imm & 0xf; + Binary |= (I << 24) | (Imm3 << 16) | (Cmode << 8) | (Op << 5) | Imm4; + if (IsThumb) + Binary = convertNEONDataProcToThumb(Binary); + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEON2RegInstruction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + unsigned Binary = getBinaryCodeForInstr(MI); + // Destination register is encoded in Dd; source register in Dm. + unsigned OpIdx = 0; + Binary |= encodeNEONRd(MI, OpIdx++); + if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) + ++OpIdx; + Binary |= encodeNEONRm(MI, OpIdx); + if (IsThumb) + Binary = convertNEONDataProcToThumb(Binary); + // FIXME: This does not handle VDUPfdf or VDUPfqf. + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEON3RegInstruction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + unsigned Binary = getBinaryCodeForInstr(MI); + // Destination register is encoded in Dd; source registers in Dn and Dm. + unsigned OpIdx = 0; + Binary |= encodeNEONRd(MI, OpIdx++); + if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) + ++OpIdx; + Binary |= encodeNEONRn(MI, OpIdx++); + if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) + ++OpIdx; + Binary |= encodeNEONRm(MI, OpIdx); + if (IsThumb) + Binary = convertNEONDataProcToThumb(Binary); + // FIXME: This does not handle VMOVDneon or VMOVQ. + emitWordLE(Binary); +} + +#include "ARMGenCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp new file mode 100644 index 0000000..3e3a413 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -0,0 +1,1919 @@ +//===-- ARMConstantIslandPass.cpp - ARM constant islands ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that splits the constant pool up into 'islands' +// which are scattered through-out the function. This is required due to the +// limited pc-relative displacements that ARM has. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-cp-islands" +#include "ARM.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMInstrInfo.h" +#include "Thumb2InstrInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +#include <algorithm> +using namespace llvm; + +STATISTIC(NumCPEs, "Number of constpool entries"); +STATISTIC(NumSplit, "Number of uncond branches inserted"); +STATISTIC(NumCBrFixed, "Number of cond branches fixed"); +STATISTIC(NumUBrFixed, "Number of uncond branches fixed"); +STATISTIC(NumTBs, "Number of table branches generated"); +STATISTIC(NumT2CPShrunk, "Number of Thumb2 constantpool instructions shrunk"); +STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk"); +STATISTIC(NumCBZ, "Number of CBZ / CBNZ formed"); +STATISTIC(NumJTMoved, "Number of jump table destination blocks moved"); +STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted"); + + +static cl::opt<bool> +AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true), + cl::desc("Adjust basic block layout to better use TB[BH]")); + +namespace { + /// ARMConstantIslands - Due to limited PC-relative displacements, ARM + /// requires constant pool entries to be scattered among the instructions + /// inside a function. To do this, it completely ignores the normal LLVM + /// constant pool; instead, it places constants wherever it feels like with + /// special instructions. + /// + /// The terminology used in this pass includes: + /// Islands - Clumps of constants placed in the function. + /// Water - Potential places where an island could be formed. + /// CPE - A constant pool entry that has been placed somewhere, which + /// tracks a list of users. + class ARMConstantIslands : public MachineFunctionPass { + /// BBSizes - The size of each MachineBasicBlock in bytes of code, indexed + /// by MBB Number. The two-byte pads required for Thumb alignment are + /// counted as part of the following block (i.e., the offset and size for + /// a padded block will both be ==2 mod 4). + std::vector<unsigned> BBSizes; + + /// BBOffsets - the offset of each MBB in bytes, starting from 0. + /// The two-byte pads required for Thumb alignment are counted as part of + /// the following block. + std::vector<unsigned> BBOffsets; + + /// WaterList - A sorted list of basic blocks where islands could be placed + /// (i.e. blocks that don't fall through to the following block, due + /// to a return, unreachable, or unconditional branch). + std::vector<MachineBasicBlock*> WaterList; + + /// NewWaterList - The subset of WaterList that was created since the + /// previous iteration by inserting unconditional branches. + SmallSet<MachineBasicBlock*, 4> NewWaterList; + + typedef std::vector<MachineBasicBlock*>::iterator water_iterator; + + /// CPUser - One user of a constant pool, keeping the machine instruction + /// pointer, the constant pool being referenced, and the max displacement + /// allowed from the instruction to the CP. The HighWaterMark records the + /// highest basic block where a new CPEntry can be placed. To ensure this + /// pass terminates, the CP entries are initially placed at the end of the + /// function and then move monotonically to lower addresses. The + /// exception to this rule is when the current CP entry for a particular + /// CPUser is out of range, but there is another CP entry for the same + /// constant value in range. We want to use the existing in-range CP + /// entry, but if it later moves out of range, the search for new water + /// should resume where it left off. The HighWaterMark is used to record + /// that point. + struct CPUser { + MachineInstr *MI; + MachineInstr *CPEMI; + MachineBasicBlock *HighWaterMark; + unsigned MaxDisp; + bool NegOk; + bool IsSoImm; + CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp, + bool neg, bool soimm) + : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm) { + HighWaterMark = CPEMI->getParent(); + } + }; + + /// CPUsers - Keep track of all of the machine instructions that use various + /// constant pools and their max displacement. + std::vector<CPUser> CPUsers; + + /// CPEntry - One per constant pool entry, keeping the machine instruction + /// pointer, the constpool index, and the number of CPUser's which + /// reference this entry. + struct CPEntry { + MachineInstr *CPEMI; + unsigned CPI; + unsigned RefCount; + CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0) + : CPEMI(cpemi), CPI(cpi), RefCount(rc) {} + }; + + /// CPEntries - Keep track of all of the constant pool entry machine + /// instructions. For each original constpool index (i.e. those that + /// existed upon entry to this pass), it keeps a vector of entries. + /// Original elements are cloned as we go along; the clones are + /// put in the vector of the original element, but have distinct CPIs. + std::vector<std::vector<CPEntry> > CPEntries; + + /// ImmBranch - One per immediate branch, keeping the machine instruction + /// pointer, conditional or unconditional, the max displacement, + /// and (if isCond is true) the corresponding unconditional branch + /// opcode. + struct ImmBranch { + MachineInstr *MI; + unsigned MaxDisp : 31; + bool isCond : 1; + int UncondBr; + ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr) + : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {} + }; + + /// ImmBranches - Keep track of all the immediate branch instructions. + /// + std::vector<ImmBranch> ImmBranches; + + /// PushPopMIs - Keep track of all the Thumb push / pop instructions. + /// + SmallVector<MachineInstr*, 4> PushPopMIs; + + /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions. + SmallVector<MachineInstr*, 4> T2JumpTables; + + /// HasFarJump - True if any far jump instruction has been emitted during + /// the branch fix up pass. + bool HasFarJump; + + /// HasInlineAsm - True if the function contains inline assembly. + bool HasInlineAsm; + + const ARMInstrInfo *TII; + const ARMSubtarget *STI; + ARMFunctionInfo *AFI; + bool isThumb; + bool isThumb1; + bool isThumb2; + public: + static char ID; + ARMConstantIslands() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "ARM constant island placement and branch shortening pass"; + } + + private: + void DoInitialPlacement(MachineFunction &MF, + std::vector<MachineInstr*> &CPEMIs); + CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); + void JumpTableFunctionScan(MachineFunction &MF); + void InitialFunctionScan(MachineFunction &MF, + const std::vector<MachineInstr*> &CPEMIs); + MachineBasicBlock *SplitBlockBeforeInstr(MachineInstr *MI); + void UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB); + void AdjustBBOffsetsAfter(MachineBasicBlock *BB, int delta); + bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI); + int LookForExistingCPEntry(CPUser& U, unsigned UserOffset); + bool LookForWater(CPUser&U, unsigned UserOffset, water_iterator &WaterIter); + void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset, + MachineBasicBlock *&NewMBB); + bool HandleConstantPoolUser(MachineFunction &MF, unsigned CPUserIndex); + void RemoveDeadCPEMI(MachineInstr *CPEMI); + bool RemoveUnusedCPEntries(); + bool CPEIsInRange(MachineInstr *MI, unsigned UserOffset, + MachineInstr *CPEMI, unsigned Disp, bool NegOk, + bool DoDump = false); + bool WaterIsInRange(unsigned UserOffset, MachineBasicBlock *Water, + CPUser &U); + bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset, + unsigned Disp, bool NegativeOK, bool IsSoImm = false); + bool BBIsInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); + bool FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br); + bool FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br); + bool FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br); + bool UndoLRSpillRestore(); + bool OptimizeThumb2Instructions(MachineFunction &MF); + bool OptimizeThumb2Branches(MachineFunction &MF); + bool ReorderThumb2JumpTables(MachineFunction &MF); + bool OptimizeThumb2JumpTables(MachineFunction &MF); + MachineBasicBlock *AdjustJTTargetBlockForward(MachineBasicBlock *BB, + MachineBasicBlock *JTBB); + + unsigned GetOffsetOf(MachineInstr *MI) const; + void dumpBBs(); + void verify(MachineFunction &MF); + }; + char ARMConstantIslands::ID = 0; +} + +/// verify - check BBOffsets, BBSizes, alignment of islands +void ARMConstantIslands::verify(MachineFunction &MF) { + assert(BBOffsets.size() == BBSizes.size()); + for (unsigned i = 1, e = BBOffsets.size(); i != e; ++i) + assert(BBOffsets[i-1]+BBSizes[i-1] == BBOffsets[i]); + if (!isThumb) + return; +#ifndef NDEBUG + for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + MachineBasicBlock *MBB = MBBI; + if (!MBB->empty() && + MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) { + unsigned MBBId = MBB->getNumber(); + assert(HasInlineAsm || + (BBOffsets[MBBId]%4 == 0 && BBSizes[MBBId]%4 == 0) || + (BBOffsets[MBBId]%4 != 0 && BBSizes[MBBId]%4 != 0)); + } + } + for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { + CPUser &U = CPUsers[i]; + unsigned UserOffset = GetOffsetOf(U.MI) + (isThumb ? 4 : 8); + unsigned CPEOffset = GetOffsetOf(U.CPEMI); + unsigned Disp = UserOffset < CPEOffset ? CPEOffset - UserOffset : + UserOffset - CPEOffset; + assert(Disp <= U.MaxDisp || "Constant pool entry out of range!"); + } +#endif +} + +/// print block size and offset information - debugging +void ARMConstantIslands::dumpBBs() { + for (unsigned J = 0, E = BBOffsets.size(); J !=E; ++J) { + DEBUG(errs() << "block " << J << " offset " << BBOffsets[J] + << " size " << BBSizes[J] << "\n"); + } +} + +/// createARMConstantIslandPass - returns an instance of the constpool +/// island pass. +FunctionPass *llvm::createARMConstantIslandPass() { + return new ARMConstantIslands(); +} + +bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { + MachineConstantPool &MCP = *MF.getConstantPool(); + + TII = (const ARMInstrInfo*)MF.getTarget().getInstrInfo(); + AFI = MF.getInfo<ARMFunctionInfo>(); + STI = &MF.getTarget().getSubtarget<ARMSubtarget>(); + + isThumb = AFI->isThumbFunction(); + isThumb1 = AFI->isThumb1OnlyFunction(); + isThumb2 = AFI->isThumb2Function(); + + HasFarJump = false; + HasInlineAsm = false; + + // Renumber all of the machine basic blocks in the function, guaranteeing that + // the numbers agree with the position of the block in the function. + MF.RenumberBlocks(); + + // Try to reorder and otherwise adjust the block layout to make good use + // of the TB[BH] instructions. + bool MadeChange = false; + if (isThumb2 && AdjustJumpTableBlocks) { + JumpTableFunctionScan(MF); + MadeChange |= ReorderThumb2JumpTables(MF); + // Data is out of date, so clear it. It'll be re-computed later. + T2JumpTables.clear(); + // Blocks may have shifted around. Keep the numbering up to date. + MF.RenumberBlocks(); + } + + // Thumb1 functions containing constant pools get 4-byte alignment. + // This is so we can keep exact track of where the alignment padding goes. + + // ARM and Thumb2 functions need to be 4-byte aligned. + if (!isThumb1) + MF.EnsureAlignment(2); // 2 = log2(4) + + // Perform the initial placement of the constant pool entries. To start with, + // we put them all at the end of the function. + std::vector<MachineInstr*> CPEMIs; + if (!MCP.isEmpty()) { + DoInitialPlacement(MF, CPEMIs); + if (isThumb1) + MF.EnsureAlignment(2); // 2 = log2(4) + } + + /// The next UID to take is the first unused one. + AFI->initPICLabelUId(CPEMIs.size()); + + // Do the initial scan of the function, building up information about the + // sizes of each block, the location of all the water, and finding all of the + // constant pool users. + InitialFunctionScan(MF, CPEMIs); + CPEMIs.clear(); + DEBUG(dumpBBs()); + + + /// Remove dead constant pool entries. + MadeChange |= RemoveUnusedCPEntries(); + + // Iteratively place constant pool entries and fix up branches until there + // is no change. + unsigned NoCPIters = 0, NoBRIters = 0; + while (true) { + bool CPChange = false; + for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) + CPChange |= HandleConstantPoolUser(MF, i); + if (CPChange && ++NoCPIters > 30) + llvm_unreachable("Constant Island pass failed to converge!"); + DEBUG(dumpBBs()); + + // Clear NewWaterList now. If we split a block for branches, it should + // appear as "new water" for the next iteration of constant pool placement. + NewWaterList.clear(); + + bool BRChange = false; + for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) + BRChange |= FixUpImmediateBr(MF, ImmBranches[i]); + if (BRChange && ++NoBRIters > 30) + llvm_unreachable("Branch Fix Up pass failed to converge!"); + DEBUG(dumpBBs()); + + if (!CPChange && !BRChange) + break; + MadeChange = true; + } + + // Shrink 32-bit Thumb2 branch, load, and store instructions. + if (isThumb2 && !STI->prefers32BitThumb()) + MadeChange |= OptimizeThumb2Instructions(MF); + + // After a while, this might be made debug-only, but it is not expensive. + verify(MF); + + // If LR has been forced spilled and no far jump (i.e. BL) has been issued, + // undo the spill / restore of LR if possible. + if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump()) + MadeChange |= UndoLRSpillRestore(); + + // Save the mapping between original and cloned constpool entries. + for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { + for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) { + const CPEntry & CPE = CPEntries[i][j]; + AFI->recordCPEClone(i, CPE.CPI); + } + } + + DEBUG(errs() << '\n'; dumpBBs()); + + BBSizes.clear(); + BBOffsets.clear(); + WaterList.clear(); + CPUsers.clear(); + CPEntries.clear(); + ImmBranches.clear(); + PushPopMIs.clear(); + T2JumpTables.clear(); + + return MadeChange; +} + +/// DoInitialPlacement - Perform the initial placement of the constant pool +/// entries. To start with, we put them all at the end of the function. +void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF, + std::vector<MachineInstr*> &CPEMIs) { + // Create the basic block to hold the CPE's. + MachineBasicBlock *BB = MF.CreateMachineBasicBlock(); + MF.push_back(BB); + + // Add all of the constants from the constant pool to the end block, use an + // identity mapping of CPI's to CPE's. + const std::vector<MachineConstantPoolEntry> &CPs = + MF.getConstantPool()->getConstants(); + + const TargetData &TD = *MF.getTarget().getTargetData(); + for (unsigned i = 0, e = CPs.size(); i != e; ++i) { + unsigned Size = TD.getTypeAllocSize(CPs[i].getType()); + // Verify that all constant pool entries are a multiple of 4 bytes. If not, + // we would have to pad them out or something so that instructions stay + // aligned. + assert((Size & 3) == 0 && "CP Entry not multiple of 4 bytes!"); + MachineInstr *CPEMI = + BuildMI(BB, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY)) + .addImm(i).addConstantPoolIndex(i).addImm(Size); + CPEMIs.push_back(CPEMI); + + // Add a new CPEntry, but no corresponding CPUser yet. + std::vector<CPEntry> CPEs; + CPEs.push_back(CPEntry(CPEMI, i)); + CPEntries.push_back(CPEs); + ++NumCPEs; + DEBUG(errs() << "Moved CPI#" << i << " to end of function as #" << i + << "\n"); + } +} + +/// BBHasFallthrough - Return true if the specified basic block can fallthrough +/// into the block immediately after it. +static bool BBHasFallthrough(MachineBasicBlock *MBB) { + // Get the next machine basic block in the function. + MachineFunction::iterator MBBI = MBB; + // Can't fall off end of function. + if (llvm::next(MBBI) == MBB->getParent()->end()) + return false; + + MachineBasicBlock *NextBB = llvm::next(MBBI); + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) + if (*I == NextBB) + return true; + + return false; +} + +/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI, +/// look up the corresponding CPEntry. +ARMConstantIslands::CPEntry +*ARMConstantIslands::findConstPoolEntry(unsigned CPI, + const MachineInstr *CPEMI) { + std::vector<CPEntry> &CPEs = CPEntries[CPI]; + // Number of entries per constpool index should be small, just do a + // linear search. + for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { + if (CPEs[i].CPEMI == CPEMI) + return &CPEs[i]; + } + return NULL; +} + +/// JumpTableFunctionScan - Do a scan of the function, building up +/// information about the sizes of each block and the locations of all +/// the jump tables. +void ARMConstantIslands::JumpTableFunctionScan(MachineFunction &MF) { + for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + MachineBasicBlock &MBB = *MBBI; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) + if (I->getDesc().isBranch() && I->getOpcode() == ARM::t2BR_JT) + T2JumpTables.push_back(I); + } +} + +/// InitialFunctionScan - Do the initial scan of the function, building up +/// information about the sizes of each block, the location of all the water, +/// and finding all of the constant pool users. +void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, + const std::vector<MachineInstr*> &CPEMIs) { + // First thing, see if the function has any inline assembly in it. If so, + // we have to be conservative about alignment assumptions, as we don't + // know for sure the size of any instructions in the inline assembly. + for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + MachineBasicBlock &MBB = *MBBI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) + if (I->getOpcode() == ARM::INLINEASM) + HasInlineAsm = true; + } + + // Now go back through the instructions and build up our data structures. + unsigned Offset = 0; + for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + MachineBasicBlock &MBB = *MBBI; + + // If this block doesn't fall through into the next MBB, then this is + // 'water' that a constant pool island could be placed. + if (!BBHasFallthrough(&MBB)) + WaterList.push_back(&MBB); + + unsigned MBBSize = 0; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + if (I->isDebugValue()) + continue; + // Add instruction size to MBBSize. + MBBSize += TII->GetInstSizeInBytes(I); + + int Opc = I->getOpcode(); + if (I->getDesc().isBranch()) { + bool isCond = false; + unsigned Bits = 0; + unsigned Scale = 1; + int UOpc = Opc; + switch (Opc) { + default: + continue; // Ignore other JT branches + case ARM::tBR_JTr: + // A Thumb1 table jump may involve padding; for the offsets to + // be right, functions containing these must be 4-byte aligned. + // tBR_JTr expands to a mov pc followed by .align 2 and then the jump + // table entries. So this code checks whether offset of tBR_JTr + 2 + // is aligned. That is held in Offset+MBBSize, which already has + // 2 added in for the size of the mov pc instruction. + MF.EnsureAlignment(2U); + if ((Offset+MBBSize)%4 != 0 || HasInlineAsm) + // FIXME: Add a pseudo ALIGN instruction instead. + MBBSize += 2; // padding + continue; // Does not get an entry in ImmBranches + case ARM::t2BR_JT: + T2JumpTables.push_back(I); + continue; // Does not get an entry in ImmBranches + case ARM::Bcc: + isCond = true; + UOpc = ARM::B; + // Fallthrough + case ARM::B: + Bits = 24; + Scale = 4; + break; + case ARM::tBcc: + isCond = true; + UOpc = ARM::tB; + Bits = 8; + Scale = 2; + break; + case ARM::tB: + Bits = 11; + Scale = 2; + break; + case ARM::t2Bcc: + isCond = true; + UOpc = ARM::t2B; + Bits = 20; + Scale = 2; + break; + case ARM::t2B: + Bits = 24; + Scale = 2; + break; + } + + // Record this immediate branch. + unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale; + ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc)); + } + + if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET) + PushPopMIs.push_back(I); + + if (Opc == ARM::CONSTPOOL_ENTRY) + continue; + + // Scan the instructions for constant pool operands. + for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) + if (I->getOperand(op).isCPI()) { + // We found one. The addressing mode tells us the max displacement + // from the PC that this instruction permits. + + // Basic size info comes from the TSFlags field. + unsigned Bits = 0; + unsigned Scale = 1; + bool NegOk = false; + bool IsSoImm = false; + + switch (Opc) { + default: + llvm_unreachable("Unknown addressing mode for CP reference!"); + break; + + // Taking the address of a CP entry. + case ARM::LEApcrel: + // This takes a SoImm, which is 8 bit immediate rotated. We'll + // pretend the maximum offset is 255 * 4. Since each instruction + // 4 byte wide, this is always correct. We'll check for other + // displacements that fits in a SoImm as well. + Bits = 8; + Scale = 4; + NegOk = true; + IsSoImm = true; + break; + case ARM::t2LEApcrel: + Bits = 12; + NegOk = true; + break; + case ARM::tLEApcrel: + Bits = 8; + Scale = 4; + break; + + case ARM::LDRi12: + case ARM::LDRcp: + case ARM::t2LDRpci: + Bits = 12; // +-offset_12 + NegOk = true; + break; + + case ARM::tLDRpci: + Bits = 8; + Scale = 4; // +(offset_8*4) + break; + + case ARM::VLDRD: + case ARM::VLDRS: + Bits = 8; + Scale = 4; // +-(offset_8*4) + NegOk = true; + break; + } + + // Remember that this is a user of a CP entry. + unsigned CPI = I->getOperand(op).getIndex(); + MachineInstr *CPEMI = CPEMIs[CPI]; + unsigned MaxOffs = ((1 << Bits)-1) * Scale; + CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm)); + + // Increment corresponding CPEntry reference count. + CPEntry *CPE = findConstPoolEntry(CPI, CPEMI); + assert(CPE && "Cannot find a corresponding CPEntry!"); + CPE->RefCount++; + + // Instructions can only use one CP entry, don't bother scanning the + // rest of the operands. + break; + } + } + + // In thumb mode, if this block is a constpool island, we may need padding + // so it's aligned on 4 byte boundary. + if (isThumb && + !MBB.empty() && + MBB.begin()->getOpcode() == ARM::CONSTPOOL_ENTRY && + ((Offset%4) != 0 || HasInlineAsm)) + MBBSize += 2; + + BBSizes.push_back(MBBSize); + BBOffsets.push_back(Offset); + Offset += MBBSize; + } +} + +/// GetOffsetOf - Return the current offset of the specified machine instruction +/// from the start of the function. This offset changes as stuff is moved +/// around inside the function. +unsigned ARMConstantIslands::GetOffsetOf(MachineInstr *MI) const { + MachineBasicBlock *MBB = MI->getParent(); + + // The offset is composed of two things: the sum of the sizes of all MBB's + // before this instruction's block, and the offset from the start of the block + // it is in. + unsigned Offset = BBOffsets[MBB->getNumber()]; + + // If we're looking for a CONSTPOOL_ENTRY in Thumb, see if this block has + // alignment padding, and compensate if so. + if (isThumb && + MI->getOpcode() == ARM::CONSTPOOL_ENTRY && + (Offset%4 != 0 || HasInlineAsm)) + Offset += 2; + + // Sum instructions before MI in MBB. + for (MachineBasicBlock::iterator I = MBB->begin(); ; ++I) { + assert(I != MBB->end() && "Didn't find MI in its own basic block?"); + if (&*I == MI) return Offset; + Offset += TII->GetInstSizeInBytes(I); + } +} + +/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB +/// ID. +static bool CompareMBBNumbers(const MachineBasicBlock *LHS, + const MachineBasicBlock *RHS) { + return LHS->getNumber() < RHS->getNumber(); +} + +/// UpdateForInsertedWaterBlock - When a block is newly inserted into the +/// machine function, it upsets all of the block numbers. Renumber the blocks +/// and update the arrays that parallel this numbering. +void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) { + // Renumber the MBB's to keep them consecutive. + NewBB->getParent()->RenumberBlocks(NewBB); + + // Insert a size into BBSizes to align it properly with the (newly + // renumbered) block numbers. + BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0); + + // Likewise for BBOffsets. + BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0); + + // Next, update WaterList. Specifically, we need to add NewMBB as having + // available water after it. + water_iterator IP = + std::lower_bound(WaterList.begin(), WaterList.end(), NewBB, + CompareMBBNumbers); + WaterList.insert(IP, NewBB); +} + + +/// Split the basic block containing MI into two blocks, which are joined by +/// an unconditional branch. Update data structures and renumber blocks to +/// account for this change and returns the newly created block. +MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { + MachineBasicBlock *OrigBB = MI->getParent(); + MachineFunction &MF = *OrigBB->getParent(); + + // Create a new MBB for the code after the OrigBB. + MachineBasicBlock *NewBB = + MF.CreateMachineBasicBlock(OrigBB->getBasicBlock()); + MachineFunction::iterator MBBI = OrigBB; ++MBBI; + MF.insert(MBBI, NewBB); + + // Splice the instructions starting with MI over to NewBB. + NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); + + // Add an unconditional branch from OrigBB to NewBB. + // Note the new unconditional branch is not being recorded. + // There doesn't seem to be meaningful DebugInfo available; this doesn't + // correspond to anything in the source. + unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B; + if (!isThumb) + BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB); + else + BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB) + .addImm(ARMCC::AL).addReg(0); + ++NumSplit; + + // Update the CFG. All succs of OrigBB are now succs of NewBB. + while (!OrigBB->succ_empty()) { + MachineBasicBlock *Succ = *OrigBB->succ_begin(); + OrigBB->removeSuccessor(Succ); + NewBB->addSuccessor(Succ); + + // This pass should be run after register allocation, so there should be no + // PHI nodes to update. + assert((Succ->empty() || !Succ->begin()->isPHI()) + && "PHI nodes should be eliminated by now!"); + } + + // OrigBB branches to NewBB. + OrigBB->addSuccessor(NewBB); + + // Update internal data structures to account for the newly inserted MBB. + // This is almost the same as UpdateForInsertedWaterBlock, except that + // the Water goes after OrigBB, not NewBB. + MF.RenumberBlocks(NewBB); + + // Insert a size into BBSizes to align it properly with the (newly + // renumbered) block numbers. + BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0); + + // Likewise for BBOffsets. + BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0); + + // Next, update WaterList. Specifically, we need to add OrigMBB as having + // available water after it (but not if it's already there, which happens + // when splitting before a conditional branch that is followed by an + // unconditional branch - in that case we want to insert NewBB). + water_iterator IP = + std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB, + CompareMBBNumbers); + MachineBasicBlock* WaterBB = *IP; + if (WaterBB == OrigBB) + WaterList.insert(llvm::next(IP), NewBB); + else + WaterList.insert(IP, OrigBB); + NewWaterList.insert(OrigBB); + + unsigned OrigBBI = OrigBB->getNumber(); + unsigned NewBBI = NewBB->getNumber(); + + int delta = isThumb1 ? 2 : 4; + + // Figure out how large the OrigBB is. As the first half of the original + // block, it cannot contain a tablejump. The size includes + // the new jump we added. (It should be possible to do this without + // recounting everything, but it's very confusing, and this is rarely + // executed.) + unsigned OrigBBSize = 0; + for (MachineBasicBlock::iterator I = OrigBB->begin(), E = OrigBB->end(); + I != E; ++I) + OrigBBSize += TII->GetInstSizeInBytes(I); + BBSizes[OrigBBI] = OrigBBSize; + + // ...and adjust BBOffsets for NewBB accordingly. + BBOffsets[NewBBI] = BBOffsets[OrigBBI] + BBSizes[OrigBBI]; + + // Figure out how large the NewMBB is. As the second half of the original + // block, it may contain a tablejump. + unsigned NewBBSize = 0; + for (MachineBasicBlock::iterator I = NewBB->begin(), E = NewBB->end(); + I != E; ++I) + NewBBSize += TII->GetInstSizeInBytes(I); + // Set the size of NewBB in BBSizes. It does not include any padding now. + BBSizes[NewBBI] = NewBBSize; + + MachineInstr* ThumbJTMI = prior(NewBB->end()); + if (ThumbJTMI->getOpcode() == ARM::tBR_JTr) { + // We've added another 2-byte instruction before this tablejump, which + // means we will always need padding if we didn't before, and vice versa. + + // The original offset of the jump instruction was: + unsigned OrigOffset = BBOffsets[OrigBBI] + BBSizes[OrigBBI] - delta; + if (OrigOffset%4 == 0) { + // We had padding before and now we don't. No net change in code size. + delta = 0; + } else { + // We didn't have padding before and now we do. + BBSizes[NewBBI] += 2; + delta = 4; + } + } + + // All BBOffsets following these blocks must be modified. + if (delta) + AdjustBBOffsetsAfter(NewBB, delta); + + return NewBB; +} + +/// OffsetIsInRange - Checks whether UserOffset (the location of a constant pool +/// reference) is within MaxDisp of TrialOffset (a proposed location of a +/// constant pool entry). +bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset, + unsigned TrialOffset, unsigned MaxDisp, + bool NegativeOK, bool IsSoImm) { + // On Thumb offsets==2 mod 4 are rounded down by the hardware for + // purposes of the displacement computation; compensate for that here. + // Effectively, the valid range of displacements is 2 bytes smaller for such + // references. + unsigned TotalAdj = 0; + if (isThumb && UserOffset%4 !=0) { + UserOffset -= 2; + TotalAdj = 2; + } + // CPEs will be rounded up to a multiple of 4. + if (isThumb && TrialOffset%4 != 0) { + TrialOffset += 2; + TotalAdj += 2; + } + + // In Thumb2 mode, later branch adjustments can shift instructions up and + // cause alignment change. In the worst case scenario this can cause the + // user's effective address to be subtracted by 2 and the CPE's address to + // be plus 2. + if (isThumb2 && TotalAdj != 4) + MaxDisp -= (4 - TotalAdj); + + if (UserOffset <= TrialOffset) { + // User before the Trial. + if (TrialOffset - UserOffset <= MaxDisp) + return true; + // FIXME: Make use full range of soimm values. + } else if (NegativeOK) { + if (UserOffset - TrialOffset <= MaxDisp) + return true; + // FIXME: Make use full range of soimm values. + } + return false; +} + +/// WaterIsInRange - Returns true if a CPE placed after the specified +/// Water (a basic block) will be in range for the specific MI. + +bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset, + MachineBasicBlock* Water, CPUser &U) { + unsigned MaxDisp = U.MaxDisp; + unsigned CPEOffset = BBOffsets[Water->getNumber()] + + BBSizes[Water->getNumber()]; + + // If the CPE is to be inserted before the instruction, that will raise + // the offset of the instruction. + if (CPEOffset < UserOffset) + UserOffset += U.CPEMI->getOperand(2).getImm(); + + return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, U.NegOk, U.IsSoImm); +} + +/// CPEIsInRange - Returns true if the distance between specific MI and +/// specific ConstPool entry instruction can fit in MI's displacement field. +bool ARMConstantIslands::CPEIsInRange(MachineInstr *MI, unsigned UserOffset, + MachineInstr *CPEMI, unsigned MaxDisp, + bool NegOk, bool DoDump) { + unsigned CPEOffset = GetOffsetOf(CPEMI); + assert((CPEOffset%4 == 0 || HasInlineAsm) && "Misaligned CPE"); + + if (DoDump) { + DEBUG(errs() << "User of CPE#" << CPEMI->getOperand(0).getImm() + << " max delta=" << MaxDisp + << " insn address=" << UserOffset + << " CPE address=" << CPEOffset + << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI); + } + + return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, NegOk); +} + +#ifndef NDEBUG +/// BBIsJumpedOver - Return true of the specified basic block's only predecessor +/// unconditionally branches to its only successor. +static bool BBIsJumpedOver(MachineBasicBlock *MBB) { + if (MBB->pred_size() != 1 || MBB->succ_size() != 1) + return false; + + MachineBasicBlock *Succ = *MBB->succ_begin(); + MachineBasicBlock *Pred = *MBB->pred_begin(); + MachineInstr *PredMI = &Pred->back(); + if (PredMI->getOpcode() == ARM::B || PredMI->getOpcode() == ARM::tB + || PredMI->getOpcode() == ARM::t2B) + return PredMI->getOperand(0).getMBB() == Succ; + return false; +} +#endif // NDEBUG + +void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB, + int delta) { + MachineFunction::iterator MBBI = BB; MBBI = llvm::next(MBBI); + for(unsigned i = BB->getNumber()+1, e = BB->getParent()->getNumBlockIDs(); + i < e; ++i) { + BBOffsets[i] += delta; + // If some existing blocks have padding, adjust the padding as needed, a + // bit tricky. delta can be negative so don't use % on that. + if (!isThumb) + continue; + MachineBasicBlock *MBB = MBBI; + if (!MBB->empty() && !HasInlineAsm) { + // Constant pool entries require padding. + if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) { + unsigned OldOffset = BBOffsets[i] - delta; + if ((OldOffset%4) == 0 && (BBOffsets[i]%4) != 0) { + // add new padding + BBSizes[i] += 2; + delta += 2; + } else if ((OldOffset%4) != 0 && (BBOffsets[i]%4) == 0) { + // remove existing padding + BBSizes[i] -= 2; + delta -= 2; + } + } + // Thumb1 jump tables require padding. They should be at the end; + // following unconditional branches are removed by AnalyzeBranch. + // tBR_JTr expands to a mov pc followed by .align 2 and then the jump + // table entries. So this code checks whether offset of tBR_JTr + // is aligned; if it is, the offset of the jump table following the + // instruction will not be aligned, and we need padding. + MachineInstr *ThumbJTMI = prior(MBB->end()); + if (ThumbJTMI->getOpcode() == ARM::tBR_JTr) { + unsigned NewMIOffset = GetOffsetOf(ThumbJTMI); + unsigned OldMIOffset = NewMIOffset - delta; + if ((OldMIOffset%4) == 0 && (NewMIOffset%4) != 0) { + // remove existing padding + BBSizes[i] -= 2; + delta -= 2; + } else if ((OldMIOffset%4) != 0 && (NewMIOffset%4) == 0) { + // add new padding + BBSizes[i] += 2; + delta += 2; + } + } + if (delta==0) + return; + } + MBBI = llvm::next(MBBI); + } +} + +/// DecrementOldEntry - find the constant pool entry with index CPI +/// and instruction CPEMI, and decrement its refcount. If the refcount +/// becomes 0 remove the entry and instruction. Returns true if we removed +/// the entry, false if we didn't. + +bool ARMConstantIslands::DecrementOldEntry(unsigned CPI, MachineInstr *CPEMI) { + // Find the old entry. Eliminate it if it is no longer used. + CPEntry *CPE = findConstPoolEntry(CPI, CPEMI); + assert(CPE && "Unexpected!"); + if (--CPE->RefCount == 0) { + RemoveDeadCPEMI(CPEMI); + CPE->CPEMI = NULL; + --NumCPEs; + return true; + } + return false; +} + +/// LookForCPEntryInRange - see if the currently referenced CPE is in range; +/// if not, see if an in-range clone of the CPE is in range, and if so, +/// change the data structures so the user references the clone. Returns: +/// 0 = no existing entry found +/// 1 = entry found, and there were no code insertions or deletions +/// 2 = entry found, and there were code insertions or deletions +int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset) +{ + MachineInstr *UserMI = U.MI; + MachineInstr *CPEMI = U.CPEMI; + + // Check to see if the CPE is already in-range. + if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, U.NegOk, true)) { + DEBUG(errs() << "In range\n"); + return 1; + } + + // No. Look for previously created clones of the CPE that are in range. + unsigned CPI = CPEMI->getOperand(1).getIndex(); + std::vector<CPEntry> &CPEs = CPEntries[CPI]; + for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { + // We already tried this one + if (CPEs[i].CPEMI == CPEMI) + continue; + // Removing CPEs can leave empty entries, skip + if (CPEs[i].CPEMI == NULL) + continue; + if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, U.NegOk)) { + DEBUG(errs() << "Replacing CPE#" << CPI << " with CPE#" + << CPEs[i].CPI << "\n"); + // Point the CPUser node to the replacement + U.CPEMI = CPEs[i].CPEMI; + // Change the CPI in the instruction operand to refer to the clone. + for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j) + if (UserMI->getOperand(j).isCPI()) { + UserMI->getOperand(j).setIndex(CPEs[i].CPI); + break; + } + // Adjust the refcount of the clone... + CPEs[i].RefCount++; + // ...and the original. If we didn't remove the old entry, none of the + // addresses changed, so we don't need another pass. + return DecrementOldEntry(CPI, CPEMI) ? 2 : 1; + } + } + return 0; +} + +/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in +/// the specific unconditional branch instruction. +static inline unsigned getUnconditionalBrDisp(int Opc) { + switch (Opc) { + case ARM::tB: + return ((1<<10)-1)*2; + case ARM::t2B: + return ((1<<23)-1)*2; + default: + break; + } + + return ((1<<23)-1)*4; +} + +/// LookForWater - Look for an existing entry in the WaterList in which +/// we can place the CPE referenced from U so it's within range of U's MI. +/// Returns true if found, false if not. If it returns true, WaterIter +/// is set to the WaterList entry. For Thumb, prefer water that will not +/// introduce padding to water that will. To ensure that this pass +/// terminates, the CPE location for a particular CPUser is only allowed to +/// move to a lower address, so search backward from the end of the list and +/// prefer the first water that is in range. +bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset, + water_iterator &WaterIter) { + if (WaterList.empty()) + return false; + + bool FoundWaterThatWouldPad = false; + water_iterator IPThatWouldPad; + for (water_iterator IP = prior(WaterList.end()), + B = WaterList.begin();; --IP) { + MachineBasicBlock* WaterBB = *IP; + // Check if water is in range and is either at a lower address than the + // current "high water mark" or a new water block that was created since + // the previous iteration by inserting an unconditional branch. In the + // latter case, we want to allow resetting the high water mark back to + // this new water since we haven't seen it before. Inserting branches + // should be relatively uncommon and when it does happen, we want to be + // sure to take advantage of it for all the CPEs near that block, so that + // we don't insert more branches than necessary. + if (WaterIsInRange(UserOffset, WaterBB, U) && + (WaterBB->getNumber() < U.HighWaterMark->getNumber() || + NewWaterList.count(WaterBB))) { + unsigned WBBId = WaterBB->getNumber(); + if (isThumb && + (BBOffsets[WBBId] + BBSizes[WBBId])%4 != 0) { + // This is valid Water, but would introduce padding. Remember + // it in case we don't find any Water that doesn't do this. + if (!FoundWaterThatWouldPad) { + FoundWaterThatWouldPad = true; + IPThatWouldPad = IP; + } + } else { + WaterIter = IP; + return true; + } + } + if (IP == B) + break; + } + if (FoundWaterThatWouldPad) { + WaterIter = IPThatWouldPad; + return true; + } + return false; +} + +/// CreateNewWater - No existing WaterList entry will work for +/// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the +/// block is used if in range, and the conditional branch munged so control +/// flow is correct. Otherwise the block is split to create a hole with an +/// unconditional branch around it. In either case NewMBB is set to a +/// block following which the new island can be inserted (the WaterList +/// is not adjusted). +void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, + unsigned UserOffset, + MachineBasicBlock *&NewMBB) { + CPUser &U = CPUsers[CPUserIndex]; + MachineInstr *UserMI = U.MI; + MachineInstr *CPEMI = U.CPEMI; + MachineBasicBlock *UserMBB = UserMI->getParent(); + unsigned OffsetOfNextBlock = BBOffsets[UserMBB->getNumber()] + + BBSizes[UserMBB->getNumber()]; + assert(OffsetOfNextBlock== BBOffsets[UserMBB->getNumber()+1]); + + // If the block does not end in an unconditional branch already, and if the + // end of the block is within range, make new water there. (The addition + // below is for the unconditional branch we will be adding: 4 bytes on ARM + + // Thumb2, 2 on Thumb1. Possible Thumb1 alignment padding is allowed for + // inside OffsetIsInRange. + if (BBHasFallthrough(UserMBB) && + OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb1 ? 2: 4), + U.MaxDisp, U.NegOk, U.IsSoImm)) { + DEBUG(errs() << "Split at end of block\n"); + if (&UserMBB->back() == UserMI) + assert(BBHasFallthrough(UserMBB) && "Expected a fallthrough BB!"); + NewMBB = llvm::next(MachineFunction::iterator(UserMBB)); + // Add an unconditional branch from UserMBB to fallthrough block. + // Record it for branch lengthening; this new branch will not get out of + // range, but if the preceding conditional branch is out of range, the + // targets will be exchanged, and the altered branch may be out of + // range, so the machinery has to know about it. + int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B; + if (!isThumb) + BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB); + else + BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB) + .addImm(ARMCC::AL).addReg(0); + unsigned MaxDisp = getUnconditionalBrDisp(UncondBr); + ImmBranches.push_back(ImmBranch(&UserMBB->back(), + MaxDisp, false, UncondBr)); + int delta = isThumb1 ? 2 : 4; + BBSizes[UserMBB->getNumber()] += delta; + AdjustBBOffsetsAfter(UserMBB, delta); + } else { + // What a big block. Find a place within the block to split it. + // This is a little tricky on Thumb1 since instructions are 2 bytes + // and constant pool entries are 4 bytes: if instruction I references + // island CPE, and instruction I+1 references CPE', it will + // not work well to put CPE as far forward as possible, since then + // CPE' cannot immediately follow it (that location is 2 bytes + // farther away from I+1 than CPE was from I) and we'd need to create + // a new island. So, we make a first guess, then walk through the + // instructions between the one currently being looked at and the + // possible insertion point, and make sure any other instructions + // that reference CPEs will be able to use the same island area; + // if not, we back up the insertion point. + + // The 4 in the following is for the unconditional branch we'll be + // inserting (allows for long branch on Thumb1). Alignment of the + // island is handled inside OffsetIsInRange. + unsigned BaseInsertOffset = UserOffset + U.MaxDisp -4; + // This could point off the end of the block if we've already got + // constant pool entries following this block; only the last one is + // in the water list. Back past any possible branches (allow for a + // conditional and a maximally long unconditional). + if (BaseInsertOffset >= BBOffsets[UserMBB->getNumber()+1]) + BaseInsertOffset = BBOffsets[UserMBB->getNumber()+1] - + (isThumb1 ? 6 : 8); + unsigned EndInsertOffset = BaseInsertOffset + + CPEMI->getOperand(2).getImm(); + MachineBasicBlock::iterator MI = UserMI; + ++MI; + unsigned CPUIndex = CPUserIndex+1; + unsigned NumCPUsers = CPUsers.size(); + MachineInstr *LastIT = 0; + for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI); + Offset < BaseInsertOffset; + Offset += TII->GetInstSizeInBytes(MI), + MI = llvm::next(MI)) { + if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) { + CPUser &U = CPUsers[CPUIndex]; + if (!OffsetIsInRange(Offset, EndInsertOffset, + U.MaxDisp, U.NegOk, U.IsSoImm)) { + BaseInsertOffset -= (isThumb1 ? 2 : 4); + EndInsertOffset -= (isThumb1 ? 2 : 4); + } + // This is overly conservative, as we don't account for CPEMIs + // being reused within the block, but it doesn't matter much. + EndInsertOffset += CPUsers[CPUIndex].CPEMI->getOperand(2).getImm(); + CPUIndex++; + } + + // Remember the last IT instruction. + if (MI->getOpcode() == ARM::t2IT) + LastIT = MI; + } + + DEBUG(errs() << "Split in middle of big block\n"); + --MI; + + // Avoid splitting an IT block. + if (LastIT) { + unsigned PredReg = 0; + ARMCC::CondCodes CC = llvm::getITInstrPredicate(MI, PredReg); + if (CC != ARMCC::AL) + MI = LastIT; + } + NewMBB = SplitBlockBeforeInstr(MI); + } +} + +/// HandleConstantPoolUser - Analyze the specified user, checking to see if it +/// is out-of-range. If so, pick up the constant pool value and move it some +/// place in-range. Return true if we changed any addresses (thus must run +/// another pass of branch lengthening), false otherwise. +bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, + unsigned CPUserIndex) { + CPUser &U = CPUsers[CPUserIndex]; + MachineInstr *UserMI = U.MI; + MachineInstr *CPEMI = U.CPEMI; + unsigned CPI = CPEMI->getOperand(1).getIndex(); + unsigned Size = CPEMI->getOperand(2).getImm(); + // Compute this only once, it's expensive. The 4 or 8 is the value the + // hardware keeps in the PC. + unsigned UserOffset = GetOffsetOf(UserMI) + (isThumb ? 4 : 8); + + // See if the current entry is within range, or there is a clone of it + // in range. + int result = LookForExistingCPEntry(U, UserOffset); + if (result==1) return false; + else if (result==2) return true; + + // No existing clone of this CPE is within range. + // We will be generating a new clone. Get a UID for it. + unsigned ID = AFI->createPICLabelUId(); + + // Look for water where we can place this CPE. + MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock(); + MachineBasicBlock *NewMBB; + water_iterator IP; + if (LookForWater(U, UserOffset, IP)) { + DEBUG(errs() << "found water in range\n"); + MachineBasicBlock *WaterBB = *IP; + + // If the original WaterList entry was "new water" on this iteration, + // propagate that to the new island. This is just keeping NewWaterList + // updated to match the WaterList, which will be updated below. + if (NewWaterList.count(WaterBB)) { + NewWaterList.erase(WaterBB); + NewWaterList.insert(NewIsland); + } + // The new CPE goes before the following block (NewMBB). + NewMBB = llvm::next(MachineFunction::iterator(WaterBB)); + + } else { + // No water found. + DEBUG(errs() << "No water found\n"); + CreateNewWater(CPUserIndex, UserOffset, NewMBB); + + // SplitBlockBeforeInstr adds to WaterList, which is important when it is + // called while handling branches so that the water will be seen on the + // next iteration for constant pools, but in this context, we don't want + // it. Check for this so it will be removed from the WaterList. + // Also remove any entry from NewWaterList. + MachineBasicBlock *WaterBB = prior(MachineFunction::iterator(NewMBB)); + IP = std::find(WaterList.begin(), WaterList.end(), WaterBB); + if (IP != WaterList.end()) + NewWaterList.erase(WaterBB); + + // We are adding new water. Update NewWaterList. + NewWaterList.insert(NewIsland); + } + + // Remove the original WaterList entry; we want subsequent insertions in + // this vicinity to go after the one we're about to insert. This + // considerably reduces the number of times we have to move the same CPE + // more than once and is also important to ensure the algorithm terminates. + if (IP != WaterList.end()) + WaterList.erase(IP); + + // Okay, we know we can put an island before NewMBB now, do it! + MF.insert(NewMBB, NewIsland); + + // Update internal data structures to account for the newly inserted MBB. + UpdateForInsertedWaterBlock(NewIsland); + + // Decrement the old entry, and remove it if refcount becomes 0. + DecrementOldEntry(CPI, CPEMI); + + // Now that we have an island to add the CPE to, clone the original CPE and + // add it to the island. + U.HighWaterMark = NewIsland; + U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY)) + .addImm(ID).addConstantPoolIndex(CPI).addImm(Size); + CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1)); + ++NumCPEs; + + BBOffsets[NewIsland->getNumber()] = BBOffsets[NewMBB->getNumber()]; + // Compensate for .align 2 in thumb mode. + if (isThumb && (BBOffsets[NewIsland->getNumber()]%4 != 0 || HasInlineAsm)) + Size += 2; + // Increase the size of the island block to account for the new entry. + BBSizes[NewIsland->getNumber()] += Size; + AdjustBBOffsetsAfter(NewIsland, Size); + + // Finally, change the CPI in the instruction operand to be ID. + for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) + if (UserMI->getOperand(i).isCPI()) { + UserMI->getOperand(i).setIndex(ID); + break; + } + + DEBUG(errs() << " Moved CPE to #" << ID << " CPI=" << CPI + << '\t' << *UserMI); + + return true; +} + +/// RemoveDeadCPEMI - Remove a dead constant pool entry instruction. Update +/// sizes and offsets of impacted basic blocks. +void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) { + MachineBasicBlock *CPEBB = CPEMI->getParent(); + unsigned Size = CPEMI->getOperand(2).getImm(); + CPEMI->eraseFromParent(); + BBSizes[CPEBB->getNumber()] -= Size; + // All succeeding offsets have the current size value added in, fix this. + if (CPEBB->empty()) { + // In thumb1 mode, the size of island may be padded by two to compensate for + // the alignment requirement. Then it will now be 2 when the block is + // empty, so fix this. + // All succeeding offsets have the current size value added in, fix this. + if (BBSizes[CPEBB->getNumber()] != 0) { + Size += BBSizes[CPEBB->getNumber()]; + BBSizes[CPEBB->getNumber()] = 0; + } + } + AdjustBBOffsetsAfter(CPEBB, -Size); + // An island has only one predecessor BB and one successor BB. Check if + // this BB's predecessor jumps directly to this BB's successor. This + // shouldn't happen currently. + assert(!BBIsJumpedOver(CPEBB) && "How did this happen?"); + // FIXME: remove the empty blocks after all the work is done? +} + +/// RemoveUnusedCPEntries - Remove constant pool entries whose refcounts +/// are zero. +bool ARMConstantIslands::RemoveUnusedCPEntries() { + unsigned MadeChange = false; + for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { + std::vector<CPEntry> &CPEs = CPEntries[i]; + for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) { + if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) { + RemoveDeadCPEMI(CPEs[j].CPEMI); + CPEs[j].CPEMI = NULL; + MadeChange = true; + } + } + } + return MadeChange; +} + +/// BBIsInRange - Returns true if the distance between specific MI and +/// specific BB can fit in MI's displacement field. +bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB, + unsigned MaxDisp) { + unsigned PCAdj = isThumb ? 4 : 8; + unsigned BrOffset = GetOffsetOf(MI) + PCAdj; + unsigned DestOffset = BBOffsets[DestBB->getNumber()]; + + DEBUG(errs() << "Branch of destination BB#" << DestBB->getNumber() + << " from BB#" << MI->getParent()->getNumber() + << " max delta=" << MaxDisp + << " from " << GetOffsetOf(MI) << " to " << DestOffset + << " offset " << int(DestOffset-BrOffset) << "\t" << *MI); + + if (BrOffset <= DestOffset) { + // Branch before the Dest. + if (DestOffset-BrOffset <= MaxDisp) + return true; + } else { + if (BrOffset-DestOffset <= MaxDisp) + return true; + } + return false; +} + +/// FixUpImmediateBr - Fix up an immediate branch whose destination is too far +/// away to fit in its displacement field. +bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br) { + MachineInstr *MI = Br.MI; + MachineBasicBlock *DestBB = MI->getOperand(0).getMBB(); + + // Check to see if the DestBB is already in-range. + if (BBIsInRange(MI, DestBB, Br.MaxDisp)) + return false; + + if (!Br.isCond) + return FixUpUnconditionalBr(MF, Br); + return FixUpConditionalBr(MF, Br); +} + +/// FixUpUnconditionalBr - Fix up an unconditional branch whose destination is +/// too far away to fit in its displacement field. If the LR register has been +/// spilled in the epilogue, then we can use BL to implement a far jump. +/// Otherwise, add an intermediate branch instruction to a branch. +bool +ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br) { + MachineInstr *MI = Br.MI; + MachineBasicBlock *MBB = MI->getParent(); + if (!isThumb1) + llvm_unreachable("FixUpUnconditionalBr is Thumb1 only!"); + + // Use BL to implement far jump. + Br.MaxDisp = (1 << 21) * 2; + MI->setDesc(TII->get(ARM::tBfar)); + BBSizes[MBB->getNumber()] += 2; + AdjustBBOffsetsAfter(MBB, 2); + HasFarJump = true; + ++NumUBrFixed; + + DEBUG(errs() << " Changed B to long jump " << *MI); + + return true; +} + +/// FixUpConditionalBr - Fix up a conditional branch whose destination is too +/// far away to fit in its displacement field. It is converted to an inverse +/// conditional branch + an unconditional branch to the destination. +bool +ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) { + MachineInstr *MI = Br.MI; + MachineBasicBlock *DestBB = MI->getOperand(0).getMBB(); + + // Add an unconditional branch to the destination and invert the branch + // condition to jump over it: + // blt L1 + // => + // bge L2 + // b L1 + // L2: + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm(); + CC = ARMCC::getOppositeCondition(CC); + unsigned CCReg = MI->getOperand(2).getReg(); + + // If the branch is at the end of its MBB and that has a fall-through block, + // direct the updated conditional branch to the fall-through block. Otherwise, + // split the MBB before the next instruction. + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *BMI = &MBB->back(); + bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); + + ++NumCBrFixed; + if (BMI != MI) { + if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) && + BMI->getOpcode() == Br.UncondBr) { + // Last MI in the BB is an unconditional branch. Can we simply invert the + // condition and swap destinations: + // beq L1 + // b L2 + // => + // bne L2 + // b L1 + MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); + if (BBIsInRange(MI, NewDest, Br.MaxDisp)) { + DEBUG(errs() << " Invert Bcc condition and swap its destination with " + << *BMI); + BMI->getOperand(0).setMBB(DestBB); + MI->getOperand(0).setMBB(NewDest); + MI->getOperand(1).setImm(CC); + return true; + } + } + } + + if (NeedSplit) { + SplitBlockBeforeInstr(MI); + // No need for the branch to the next block. We're adding an unconditional + // branch to the destination. + int delta = TII->GetInstSizeInBytes(&MBB->back()); + BBSizes[MBB->getNumber()] -= delta; + MachineBasicBlock* SplitBB = llvm::next(MachineFunction::iterator(MBB)); + AdjustBBOffsetsAfter(SplitBB, -delta); + MBB->back().eraseFromParent(); + // BBOffsets[SplitBB] is wrong temporarily, fixed below + } + MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB)); + + DEBUG(errs() << " Insert B to BB#" << DestBB->getNumber() + << " also invert condition and change dest. to BB#" + << NextBB->getNumber() << "\n"); + + // Insert a new conditional branch and a new unconditional branch. + // Also update the ImmBranch as well as adding a new entry for the new branch. + BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode())) + .addMBB(NextBB).addImm(CC).addReg(CCReg); + Br.MI = &MBB->back(); + BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back()); + if (isThumb) + BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB) + .addImm(ARMCC::AL).addReg(0); + else + BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB); + BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back()); + unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr); + ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr)); + + // Remove the old conditional branch. It may or may not still be in MBB. + BBSizes[MI->getParent()->getNumber()] -= TII->GetInstSizeInBytes(MI); + MI->eraseFromParent(); + + // The net size change is an addition of one unconditional branch. + int delta = TII->GetInstSizeInBytes(&MBB->back()); + AdjustBBOffsetsAfter(MBB, delta); + return true; +} + +/// UndoLRSpillRestore - Remove Thumb push / pop instructions that only spills +/// LR / restores LR to pc. FIXME: This is done here because it's only possible +/// to do this if tBfar is not used. +bool ARMConstantIslands::UndoLRSpillRestore() { + bool MadeChange = false; + for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) { + MachineInstr *MI = PushPopMIs[i]; + // First two operands are predicates. + if (MI->getOpcode() == ARM::tPOP_RET && + MI->getOperand(2).getReg() == ARM::PC && + MI->getNumExplicitOperands() == 3) { + // Create the new insn and copy the predicate from the old. + BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)); + MI->eraseFromParent(); + MadeChange = true; + } + } + return MadeChange; +} + +bool ARMConstantIslands::OptimizeThumb2Instructions(MachineFunction &MF) { + bool MadeChange = false; + + // Shrink ADR and LDR from constantpool. + for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { + CPUser &U = CPUsers[i]; + unsigned Opcode = U.MI->getOpcode(); + unsigned NewOpc = 0; + unsigned Scale = 1; + unsigned Bits = 0; + switch (Opcode) { + default: break; + case ARM::t2LEApcrel: + if (isARMLowRegister(U.MI->getOperand(0).getReg())) { + NewOpc = ARM::tLEApcrel; + Bits = 8; + Scale = 4; + } + break; + case ARM::t2LDRpci: + if (isARMLowRegister(U.MI->getOperand(0).getReg())) { + NewOpc = ARM::tLDRpci; + Bits = 8; + Scale = 4; + } + break; + } + + if (!NewOpc) + continue; + + unsigned UserOffset = GetOffsetOf(U.MI) + 4; + unsigned MaxOffs = ((1 << Bits) - 1) * Scale; + // FIXME: Check if offset is multiple of scale if scale is not 4. + if (CPEIsInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) { + U.MI->setDesc(TII->get(NewOpc)); + MachineBasicBlock *MBB = U.MI->getParent(); + BBSizes[MBB->getNumber()] -= 2; + AdjustBBOffsetsAfter(MBB, -2); + ++NumT2CPShrunk; + MadeChange = true; + } + } + + MadeChange |= OptimizeThumb2Branches(MF); + MadeChange |= OptimizeThumb2JumpTables(MF); + return MadeChange; +} + +bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) { + bool MadeChange = false; + + for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) { + ImmBranch &Br = ImmBranches[i]; + unsigned Opcode = Br.MI->getOpcode(); + unsigned NewOpc = 0; + unsigned Scale = 1; + unsigned Bits = 0; + switch (Opcode) { + default: break; + case ARM::t2B: + NewOpc = ARM::tB; + Bits = 11; + Scale = 2; + break; + case ARM::t2Bcc: { + NewOpc = ARM::tBcc; + Bits = 8; + Scale = 2; + break; + } + } + if (NewOpc) { + unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale; + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + if (BBIsInRange(Br.MI, DestBB, MaxOffs)) { + Br.MI->setDesc(TII->get(NewOpc)); + MachineBasicBlock *MBB = Br.MI->getParent(); + BBSizes[MBB->getNumber()] -= 2; + AdjustBBOffsetsAfter(MBB, -2); + ++NumT2BrShrunk; + MadeChange = true; + } + } + + Opcode = Br.MI->getOpcode(); + if (Opcode != ARM::tBcc) + continue; + + NewOpc = 0; + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(Br.MI, PredReg); + if (Pred == ARMCC::EQ) + NewOpc = ARM::tCBZ; + else if (Pred == ARMCC::NE) + NewOpc = ARM::tCBNZ; + if (!NewOpc) + continue; + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + // Check if the distance is within 126. Subtract starting offset by 2 + // because the cmp will be eliminated. + unsigned BrOffset = GetOffsetOf(Br.MI) + 4 - 2; + unsigned DestOffset = BBOffsets[DestBB->getNumber()]; + if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) { + MachineBasicBlock::iterator CmpMI = Br.MI; + if (CmpMI != Br.MI->getParent()->begin()) { + --CmpMI; + if (CmpMI->getOpcode() == ARM::tCMPi8) { + unsigned Reg = CmpMI->getOperand(0).getReg(); + Pred = llvm::getInstrPredicate(CmpMI, PredReg); + if (Pred == ARMCC::AL && + CmpMI->getOperand(1).getImm() == 0 && + isARMLowRegister(Reg)) { + MachineBasicBlock *MBB = Br.MI->getParent(); + MachineInstr *NewBR = + BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc)) + .addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags()); + CmpMI->eraseFromParent(); + Br.MI->eraseFromParent(); + Br.MI = NewBR; + BBSizes[MBB->getNumber()] -= 2; + AdjustBBOffsetsAfter(MBB, -2); + ++NumCBZ; + MadeChange = true; + } + } + } + } + } + + return MadeChange; +} + +/// OptimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller +/// jumptables when it's possible. +bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { + bool MadeChange = false; + + // FIXME: After the tables are shrunk, can we get rid some of the + // constantpool tables? + MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); + if (MJTI == 0) return false; + + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { + MachineInstr *MI = T2JumpTables[i]; + const MCInstrDesc &MCID = MI->getDesc(); + unsigned NumOps = MCID.getNumOperands(); + unsigned JTOpIdx = NumOps - (MCID.isPredicable() ? 3 : 2); + MachineOperand JTOP = MI->getOperand(JTOpIdx); + unsigned JTI = JTOP.getIndex(); + assert(JTI < JT.size()); + + bool ByteOk = true; + bool HalfWordOk = true; + unsigned JTOffset = GetOffsetOf(MI) + 4; + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) { + MachineBasicBlock *MBB = JTBBs[j]; + unsigned DstOffset = BBOffsets[MBB->getNumber()]; + // Negative offset is not ok. FIXME: We should change BB layout to make + // sure all the branches are forward. + if (ByteOk && (DstOffset - JTOffset) > ((1<<8)-1)*2) + ByteOk = false; + unsigned TBHLimit = ((1<<16)-1)*2; + if (HalfWordOk && (DstOffset - JTOffset) > TBHLimit) + HalfWordOk = false; + if (!ByteOk && !HalfWordOk) + break; + } + + if (ByteOk || HalfWordOk) { + MachineBasicBlock *MBB = MI->getParent(); + unsigned BaseReg = MI->getOperand(0).getReg(); + bool BaseRegKill = MI->getOperand(0).isKill(); + if (!BaseRegKill) + continue; + unsigned IdxReg = MI->getOperand(1).getReg(); + bool IdxRegKill = MI->getOperand(1).isKill(); + + // Scan backwards to find the instruction that defines the base + // register. Due to post-RA scheduling, we can't count on it + // immediately preceding the branch instruction. + MachineBasicBlock::iterator PrevI = MI; + MachineBasicBlock::iterator B = MBB->begin(); + while (PrevI != B && !PrevI->definesRegister(BaseReg)) + --PrevI; + + // If for some reason we didn't find it, we can't do anything, so + // just skip this one. + if (!PrevI->definesRegister(BaseReg)) + continue; + + MachineInstr *AddrMI = PrevI; + bool OptOk = true; + // Examine the instruction that calculates the jumptable entry address. + // Make sure it only defines the base register and kills any uses + // other than the index register. + for (unsigned k = 0, eee = AddrMI->getNumOperands(); k != eee; ++k) { + const MachineOperand &MO = AddrMI->getOperand(k); + if (!MO.isReg() || !MO.getReg()) + continue; + if (MO.isDef() && MO.getReg() != BaseReg) { + OptOk = false; + break; + } + if (MO.isUse() && !MO.isKill() && MO.getReg() != IdxReg) { + OptOk = false; + break; + } + } + if (!OptOk) + continue; + + // Now scan back again to find the tLEApcrel or t2LEApcrelJT instruction + // that gave us the initial base register definition. + for (--PrevI; PrevI != B && !PrevI->definesRegister(BaseReg); --PrevI) + ; + + // The instruction should be a tLEApcrel or t2LEApcrelJT; we want + // to delete it as well. + MachineInstr *LeaMI = PrevI; + if ((LeaMI->getOpcode() != ARM::tLEApcrelJT && + LeaMI->getOpcode() != ARM::t2LEApcrelJT) || + LeaMI->getOperand(0).getReg() != BaseReg) + OptOk = false; + + if (!OptOk) + continue; + + unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT; + MachineInstr *NewJTMI = BuildMI(MBB, MI->getDebugLoc(), TII->get(Opc)) + .addReg(IdxReg, getKillRegState(IdxRegKill)) + .addJumpTableIndex(JTI, JTOP.getTargetFlags()) + .addImm(MI->getOperand(JTOpIdx+1).getImm()); + // FIXME: Insert an "ALIGN" instruction to ensure the next instruction + // is 2-byte aligned. For now, asm printer will fix it up. + unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI); + unsigned OrigSize = TII->GetInstSizeInBytes(AddrMI); + OrigSize += TII->GetInstSizeInBytes(LeaMI); + OrigSize += TII->GetInstSizeInBytes(MI); + + AddrMI->eraseFromParent(); + LeaMI->eraseFromParent(); + MI->eraseFromParent(); + + int delta = OrigSize - NewSize; + BBSizes[MBB->getNumber()] -= delta; + AdjustBBOffsetsAfter(MBB, -delta); + + ++NumTBs; + MadeChange = true; + } + } + + return MadeChange; +} + +/// ReorderThumb2JumpTables - Adjust the function's block layout to ensure that +/// jump tables always branch forwards, since that's what tbb and tbh need. +bool ARMConstantIslands::ReorderThumb2JumpTables(MachineFunction &MF) { + bool MadeChange = false; + + MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); + if (MJTI == 0) return false; + + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { + MachineInstr *MI = T2JumpTables[i]; + const MCInstrDesc &MCID = MI->getDesc(); + unsigned NumOps = MCID.getNumOperands(); + unsigned JTOpIdx = NumOps - (MCID.isPredicable() ? 3 : 2); + MachineOperand JTOP = MI->getOperand(JTOpIdx); + unsigned JTI = JTOP.getIndex(); + assert(JTI < JT.size()); + + // We prefer if target blocks for the jump table come after the jump + // instruction so we can use TB[BH]. Loop through the target blocks + // and try to adjust them such that that's true. + int JTNumber = MI->getParent()->getNumber(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) { + MachineBasicBlock *MBB = JTBBs[j]; + int DTNumber = MBB->getNumber(); + + if (DTNumber < JTNumber) { + // The destination precedes the switch. Try to move the block forward + // so we have a positive offset. + MachineBasicBlock *NewBB = + AdjustJTTargetBlockForward(MBB, MI->getParent()); + if (NewBB) + MJTI->ReplaceMBBInJumpTable(JTI, JTBBs[j], NewBB); + MadeChange = true; + } + } + } + + return MadeChange; +} + +MachineBasicBlock *ARMConstantIslands:: +AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) +{ + MachineFunction &MF = *BB->getParent(); + + // If the destination block is terminated by an unconditional branch, + // try to move it; otherwise, create a new block following the jump + // table that branches back to the actual target. This is a very simple + // heuristic. FIXME: We can definitely improve it. + MachineBasicBlock *TBB = 0, *FBB = 0; + SmallVector<MachineOperand, 4> Cond; + SmallVector<MachineOperand, 4> CondPrior; + MachineFunction::iterator BBi = BB; + MachineFunction::iterator OldPrior = prior(BBi); + + // If the block terminator isn't analyzable, don't try to move the block + bool B = TII->AnalyzeBranch(*BB, TBB, FBB, Cond); + + // If the block ends in an unconditional branch, move it. The prior block + // has to have an analyzable terminator for us to move this one. Be paranoid + // and make sure we're not trying to move the entry block of the function. + if (!B && Cond.empty() && BB != MF.begin() && + !TII->AnalyzeBranch(*OldPrior, TBB, FBB, CondPrior)) { + BB->moveAfter(JTBB); + OldPrior->updateTerminator(); + BB->updateTerminator(); + // Update numbering to account for the block being moved. + MF.RenumberBlocks(); + ++NumJTMoved; + return NULL; + } + + // Create a new MBB for the code after the jump BB. + MachineBasicBlock *NewBB = + MF.CreateMachineBasicBlock(JTBB->getBasicBlock()); + MachineFunction::iterator MBBI = JTBB; ++MBBI; + MF.insert(MBBI, NewBB); + + // Add an unconditional branch from NewBB to BB. + // There doesn't seem to be meaningful DebugInfo available; this doesn't + // correspond directly to anything in the source. + assert (isThumb2 && "Adjusting for TB[BH] but not in Thumb2?"); + BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B)).addMBB(BB) + .addImm(ARMCC::AL).addReg(0); + + // Update internal data structures to account for the newly inserted MBB. + MF.RenumberBlocks(NewBB); + + // Update the CFG. + NewBB->addSuccessor(BB); + JTBB->removeSuccessor(BB); + JTBB->addSuccessor(NewBB); + + ++NumJTInserted; + return NewBB; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp new file mode 100644 index 0000000..aadfd47 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -0,0 +1,319 @@ +//===- ARMConstantPoolValue.cpp - ARM constantpool value --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARM specific constantpool value class. +// +//===----------------------------------------------------------------------===// + +#include "ARMConstantPoolValue.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/Constant.h" +#include "llvm/Constants.h" +#include "llvm/GlobalValue.h" +#include "llvm/Type.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/Support/raw_ostream.h" +#include <cstdlib> +using namespace llvm; + +//===----------------------------------------------------------------------===// +// ARMConstantPoolValue +//===----------------------------------------------------------------------===// + +ARMConstantPoolValue::ARMConstantPoolValue(Type *Ty, unsigned id, + ARMCP::ARMCPKind kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier modifier, + bool addCurrentAddress) + : MachineConstantPoolValue(Ty), LabelId(id), Kind(kind), + PCAdjust(PCAdj), Modifier(modifier), + AddCurrentAddress(addCurrentAddress) {} + +ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, unsigned id, + ARMCP::ARMCPKind kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier modifier, + bool addCurrentAddress) + : MachineConstantPoolValue((Type*)Type::getInt32Ty(C)), + LabelId(id), Kind(kind), PCAdjust(PCAdj), Modifier(modifier), + AddCurrentAddress(addCurrentAddress) {} + +ARMConstantPoolValue::~ARMConstantPoolValue() {} + +const char *ARMConstantPoolValue::getModifierText() const { + switch (Modifier) { + default: llvm_unreachable("Unknown modifier!"); + // FIXME: Are these case sensitive? It'd be nice to lower-case all the + // strings if that's legal. + case ARMCP::no_modifier: return "none"; + case ARMCP::TLSGD: return "tlsgd"; + case ARMCP::GOT: return "GOT"; + case ARMCP::GOTOFF: return "GOTOFF"; + case ARMCP::GOTTPOFF: return "gottpoff"; + case ARMCP::TPOFF: return "tpoff"; + } +} + +int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) { + assert(false && "Shouldn't be calling this directly!"); + return -1; +} + +void +ARMConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddInteger(LabelId); + ID.AddInteger(PCAdjust); +} + +bool +ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) { + if (ACPV->Kind == Kind && + ACPV->PCAdjust == PCAdjust && + ACPV->Modifier == Modifier) { + if (ACPV->LabelId == LabelId) + return true; + // Two PC relative constpool entries containing the same GV address or + // external symbols. FIXME: What about blockaddress? + if (Kind == ARMCP::CPValue || Kind == ARMCP::CPExtSymbol) + return true; + } + return false; +} + +void ARMConstantPoolValue::dump() const { + errs() << " " << *this; +} + +void ARMConstantPoolValue::print(raw_ostream &O) const { + if (Modifier) O << "(" << getModifierText() << ")"; + if (PCAdjust != 0) { + O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust; + if (AddCurrentAddress) O << "-."; + O << ")"; + } +} + +//===----------------------------------------------------------------------===// +// ARMConstantPoolConstant +//===----------------------------------------------------------------------===// + +ARMConstantPoolConstant::ARMConstantPoolConstant(Type *Ty, + const Constant *C, + unsigned ID, + ARMCP::ARMCPKind Kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress) + : ARMConstantPoolValue(Ty, ID, Kind, PCAdj, Modifier, AddCurrentAddress), + CVal(C) {} + +ARMConstantPoolConstant::ARMConstantPoolConstant(const Constant *C, + unsigned ID, + ARMCP::ARMCPKind Kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress) + : ARMConstantPoolValue((Type*)C->getType(), ID, Kind, PCAdj, Modifier, + AddCurrentAddress), + CVal(C) {} + +ARMConstantPoolConstant * +ARMConstantPoolConstant::Create(const Constant *C, unsigned ID) { + return new ARMConstantPoolConstant(C, ID, ARMCP::CPValue, 0, + ARMCP::no_modifier, false); +} + +ARMConstantPoolConstant * +ARMConstantPoolConstant::Create(const GlobalValue *GV, + ARMCP::ARMCPModifier Modifier) { + return new ARMConstantPoolConstant((Type*)Type::getInt32Ty(GV->getContext()), + GV, 0, ARMCP::CPValue, 0, + Modifier, false); +} + +ARMConstantPoolConstant * +ARMConstantPoolConstant::Create(const Constant *C, unsigned ID, + ARMCP::ARMCPKind Kind, unsigned char PCAdj) { + return new ARMConstantPoolConstant(C, ID, Kind, PCAdj, + ARMCP::no_modifier, false); +} + +ARMConstantPoolConstant * +ARMConstantPoolConstant::Create(const Constant *C, unsigned ID, + ARMCP::ARMCPKind Kind, unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress) { + return new ARMConstantPoolConstant(C, ID, Kind, PCAdj, Modifier, + AddCurrentAddress); +} + +const GlobalValue *ARMConstantPoolConstant::getGV() const { + return dyn_cast_or_null<GlobalValue>(CVal); +} + +const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const { + return dyn_cast_or_null<BlockAddress>(CVal); +} + +int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) { + unsigned AlignMask = Alignment - 1; + const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants(); + for (unsigned i = 0, e = Constants.size(); i != e; ++i) { + if (Constants[i].isMachineConstantPoolEntry() && + (Constants[i].getAlignment() & AlignMask) == 0) { + ARMConstantPoolValue *CPV = + (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal; + ARMConstantPoolConstant *APC = dyn_cast<ARMConstantPoolConstant>(CPV); + if (!APC) continue; + if (APC->CVal == CVal && equals(APC)) + return i; + } + } + + return -1; +} + +bool ARMConstantPoolConstant::hasSameValue(ARMConstantPoolValue *ACPV) { + const ARMConstantPoolConstant *ACPC = dyn_cast<ARMConstantPoolConstant>(ACPV); + return ACPC && ACPC->CVal == CVal && ARMConstantPoolValue::hasSameValue(ACPV); +} + +void ARMConstantPoolConstant::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddPointer(CVal); + ARMConstantPoolValue::addSelectionDAGCSEId(ID); +} + +void ARMConstantPoolConstant::print(raw_ostream &O) const { + O << CVal->getName(); + ARMConstantPoolValue::print(O); +} + +//===----------------------------------------------------------------------===// +// ARMConstantPoolSymbol +//===----------------------------------------------------------------------===// + +ARMConstantPoolSymbol::ARMConstantPoolSymbol(LLVMContext &C, const char *s, + unsigned id, + unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress) + : ARMConstantPoolValue(C, id, ARMCP::CPExtSymbol, PCAdj, Modifier, + AddCurrentAddress), + S(strdup(s)) {} + +ARMConstantPoolSymbol::~ARMConstantPoolSymbol() { + free((void*)S); +} + +ARMConstantPoolSymbol * +ARMConstantPoolSymbol::Create(LLVMContext &C, const char *s, + unsigned ID, unsigned char PCAdj) { + return new ARMConstantPoolSymbol(C, s, ID, PCAdj, ARMCP::no_modifier, false); +} + +static bool CPV_streq(const char *S1, const char *S2) { + if (S1 == S2) + return true; + if (S1 && S2 && strcmp(S1, S2) == 0) + return true; + return false; +} + +int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) { + unsigned AlignMask = Alignment - 1; + const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants(); + for (unsigned i = 0, e = Constants.size(); i != e; ++i) { + if (Constants[i].isMachineConstantPoolEntry() && + (Constants[i].getAlignment() & AlignMask) == 0) { + ARMConstantPoolValue *CPV = + (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal; + ARMConstantPoolSymbol *APS = dyn_cast<ARMConstantPoolSymbol>(CPV); + if (!APS) continue; + + if (CPV_streq(APS->S, S) && equals(APS)) + return i; + } + } + + return -1; +} + +bool ARMConstantPoolSymbol::hasSameValue(ARMConstantPoolValue *ACPV) { + const ARMConstantPoolSymbol *ACPS = dyn_cast<ARMConstantPoolSymbol>(ACPV); + return ACPS && CPV_streq(ACPS->S, S) && + ARMConstantPoolValue::hasSameValue(ACPV); +} + +void ARMConstantPoolSymbol::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddPointer(S); + ARMConstantPoolValue::addSelectionDAGCSEId(ID); +} + +void ARMConstantPoolSymbol::print(raw_ostream &O) const { + O << S; + ARMConstantPoolValue::print(O); +} + +//===----------------------------------------------------------------------===// +// ARMConstantPoolMBB +//===----------------------------------------------------------------------===// + +ARMConstantPoolMBB::ARMConstantPoolMBB(LLVMContext &C, + const MachineBasicBlock *mbb, + unsigned id, unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress) + : ARMConstantPoolValue(C, id, ARMCP::CPMachineBasicBlock, PCAdj, + Modifier, AddCurrentAddress), + MBB(mbb) {} + +ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C, + const MachineBasicBlock *mbb, + unsigned ID, + unsigned char PCAdj) { + return new ARMConstantPoolMBB(C, mbb, ID, PCAdj, ARMCP::no_modifier, false); +} + +int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) { + unsigned AlignMask = Alignment - 1; + const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants(); + for (unsigned i = 0, e = Constants.size(); i != e; ++i) { + if (Constants[i].isMachineConstantPoolEntry() && + (Constants[i].getAlignment() & AlignMask) == 0) { + ARMConstantPoolValue *CPV = + (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal; + ARMConstantPoolMBB *APMBB = dyn_cast<ARMConstantPoolMBB>(CPV); + if (!APMBB) continue; + + if (APMBB->MBB == MBB && equals(APMBB)) + return i; + } + } + + return -1; +} + +bool ARMConstantPoolMBB::hasSameValue(ARMConstantPoolValue *ACPV) { + const ARMConstantPoolMBB *ACPMBB = dyn_cast<ARMConstantPoolMBB>(ACPV); + return ACPMBB && ACPMBB->MBB == MBB && + ARMConstantPoolValue::hasSameValue(ACPV); +} + +void ARMConstantPoolMBB::addSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddPointer(MBB); + ARMConstantPoolValue::addSelectionDAGCSEId(ID); +} + +void ARMConstantPoolMBB::print(raw_ostream &O) const { + ARMConstantPoolValue::print(O); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h new file mode 100644 index 0000000..0d0def3 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h @@ -0,0 +1,233 @@ +//===- ARMConstantPoolValue.h - ARM constantpool value ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARM specific constantpool value class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H +#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H + +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/Support/ErrorHandling.h" +#include <cstddef> + +namespace llvm { + +class BlockAddress; +class Constant; +class GlobalValue; +class LLVMContext; +class MachineBasicBlock; + +namespace ARMCP { + enum ARMCPKind { + CPValue, + CPExtSymbol, + CPBlockAddress, + CPLSDA, + CPMachineBasicBlock + }; + + enum ARMCPModifier { + no_modifier, + TLSGD, + GOT, + GOTOFF, + GOTTPOFF, + TPOFF + }; +} + +/// ARMConstantPoolValue - ARM specific constantpool value. This is used to +/// represent PC-relative displacement between the address of the load +/// instruction and the constant being loaded, i.e. (&GV-(LPIC+8)). +class ARMConstantPoolValue : public MachineConstantPoolValue { + unsigned LabelId; // Label id of the load. + ARMCP::ARMCPKind Kind; // Kind of constant. + unsigned char PCAdjust; // Extra adjustment if constantpool is pc-relative. + // 8 for ARM, 4 for Thumb. + ARMCP::ARMCPModifier Modifier; // GV modifier i.e. (&GV(modifier)-(LPIC+8)) + bool AddCurrentAddress; + +protected: + ARMConstantPoolValue(Type *Ty, unsigned id, ARMCP::ARMCPKind Kind, + unsigned char PCAdj, ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + + ARMConstantPoolValue(LLVMContext &C, unsigned id, ARMCP::ARMCPKind Kind, + unsigned char PCAdj, ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); +public: + virtual ~ARMConstantPoolValue(); + + ARMCP::ARMCPModifier getModifier() const { return Modifier; } + const char *getModifierText() const; + bool hasModifier() const { return Modifier != ARMCP::no_modifier; } + + bool mustAddCurrentAddress() const { return AddCurrentAddress; } + + unsigned getLabelId() const { return LabelId; } + unsigned char getPCAdjustment() const { return PCAdjust; } + + bool isGlobalValue() const { return Kind == ARMCP::CPValue; } + bool isExtSymbol() const { return Kind == ARMCP::CPExtSymbol; } + bool isBlockAddress() const { return Kind == ARMCP::CPBlockAddress; } + bool isLSDA() const { return Kind == ARMCP::CPLSDA; } + bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; } + + virtual unsigned getRelocationInfo() const { return 2; } + + virtual int getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment); + + virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID); + + /// hasSameValue - Return true if this ARM constpool value can share the same + /// constantpool entry as another ARM constpool value. + virtual bool hasSameValue(ARMConstantPoolValue *ACPV); + + bool equals(const ARMConstantPoolValue *A) const { + return this->LabelId == A->LabelId && + this->PCAdjust == A->PCAdjust && + this->Modifier == A->Modifier; + } + + virtual void print(raw_ostream &O) const; + void print(raw_ostream *O) const { if (O) print(*O); } + void dump() const; + + static bool classof(const ARMConstantPoolValue *) { return true; } +}; + +inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) { + V.print(O); + return O; +} + +/// ARMConstantPoolConstant - ARM-specific constant pool values for Constants, +/// Functions, and BlockAddresses. +class ARMConstantPoolConstant : public ARMConstantPoolValue { + const Constant *CVal; // Constant being loaded. + + ARMConstantPoolConstant(const Constant *C, + unsigned ID, + ARMCP::ARMCPKind Kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + ARMConstantPoolConstant(Type *Ty, const Constant *C, + unsigned ID, + ARMCP::ARMCPKind Kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + +public: + static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID); + static ARMConstantPoolConstant *Create(const GlobalValue *GV, + ARMCP::ARMCPModifier Modifier); + static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID, + ARMCP::ARMCPKind Kind, + unsigned char PCAdj); + static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID, + ARMCP::ARMCPKind Kind, + unsigned char PCAdj, + ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + + const GlobalValue *getGV() const; + const BlockAddress *getBlockAddress() const; + + virtual int getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment); + + /// hasSameValue - Return true if this ARM constpool value can share the same + /// constantpool entry as another ARM constpool value. + virtual bool hasSameValue(ARMConstantPoolValue *ACPV); + + virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID); + + virtual void print(raw_ostream &O) const; + static bool classof(const ARMConstantPoolValue *APV) { + return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA(); + } + static bool classof(const ARMConstantPoolConstant *) { return true; } +}; + +/// ARMConstantPoolSymbol - ARM-specific constantpool values for external +/// symbols. +class ARMConstantPoolSymbol : public ARMConstantPoolValue { + const char *S; // ExtSymbol being loaded. + + ARMConstantPoolSymbol(LLVMContext &C, const char *s, unsigned id, + unsigned char PCAdj, ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + +public: + ~ARMConstantPoolSymbol(); + + static ARMConstantPoolSymbol *Create(LLVMContext &C, const char *s, + unsigned ID, unsigned char PCAdj); + + const char *getSymbol() const { return S; } + + virtual int getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment); + + virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID); + + /// hasSameValue - Return true if this ARM constpool value can share the same + /// constantpool entry as another ARM constpool value. + virtual bool hasSameValue(ARMConstantPoolValue *ACPV); + + virtual void print(raw_ostream &O) const; + + static bool classof(const ARMConstantPoolValue *ACPV) { + return ACPV->isExtSymbol(); + } + static bool classof(const ARMConstantPoolSymbol *) { return true; } +}; + +/// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic +/// block. +class ARMConstantPoolMBB : public ARMConstantPoolValue { + const MachineBasicBlock *MBB; // Machine basic block. + + ARMConstantPoolMBB(LLVMContext &C, const MachineBasicBlock *mbb, unsigned id, + unsigned char PCAdj, ARMCP::ARMCPModifier Modifier, + bool AddCurrentAddress); + +public: + static ARMConstantPoolMBB *Create(LLVMContext &C, + const MachineBasicBlock *mbb, + unsigned ID, unsigned char PCAdj); + + const MachineBasicBlock *getMBB() const { return MBB; } + + virtual int getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment); + + virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID); + + /// hasSameValue - Return true if this ARM constpool value can share the same + /// constantpool entry as another ARM constpool value. + virtual bool hasSameValue(ARMConstantPoolValue *ACPV); + + virtual void print(raw_ostream &O) const; + + static bool classof(const ARMConstantPoolValue *ACPV) { + return ACPV->isMachineBasicBlock(); + } + static bool classof(const ARMConstantPoolMBB *) { return true; } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.cpp new file mode 100644 index 0000000..51e68b4 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.cpp @@ -0,0 +1,83 @@ +//===-- ARMELFWriterInfo.cpp - ELF Writer Info for the ARM backend --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements ELF writer information for the ARM backend. +// +//===----------------------------------------------------------------------===// + +#include "ARMELFWriterInfo.h" +#include "ARMRelocations.h" +#include "llvm/Function.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Implementation of the ARMELFWriterInfo class +//===----------------------------------------------------------------------===// + +ARMELFWriterInfo::ARMELFWriterInfo(TargetMachine &TM) + : TargetELFWriterInfo(TM.getTargetData()->getPointerSizeInBits() == 64, + TM.getTargetData()->isLittleEndian()) { +} + +ARMELFWriterInfo::~ARMELFWriterInfo() {} + +unsigned ARMELFWriterInfo::getRelocationType(unsigned MachineRelTy) const { + switch (MachineRelTy) { + case ARM::reloc_arm_absolute: + case ARM::reloc_arm_relative: + case ARM::reloc_arm_cp_entry: + case ARM::reloc_arm_vfp_cp_entry: + case ARM::reloc_arm_machine_cp_entry: + case ARM::reloc_arm_jt_base: + case ARM::reloc_arm_pic_jt: + assert(0 && "unsupported ARM relocation type"); break; + + case ARM::reloc_arm_branch: return ELF::R_ARM_CALL; break; + case ARM::reloc_arm_movt: return ELF::R_ARM_MOVT_ABS; break; + case ARM::reloc_arm_movw: return ELF::R_ARM_MOVW_ABS_NC; break; + default: + llvm_unreachable("unknown ARM relocation type"); break; + } + return 0; +} + +long int ARMELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy, + long int Modifier) const { + assert(0 && "ARMELFWriterInfo::getDefaultAddendForRelTy() not implemented"); + return 0; +} + +unsigned ARMELFWriterInfo::getRelocationTySize(unsigned RelTy) const { + assert(0 && "ARMELFWriterInfo::getRelocationTySize() not implemented"); + return 0; +} + +bool ARMELFWriterInfo::isPCRelativeRel(unsigned RelTy) const { + assert(0 && "ARMELFWriterInfo::isPCRelativeRel() not implemented"); + return 1; +} + +unsigned ARMELFWriterInfo::getAbsoluteLabelMachineRelTy() const { + assert(0 && + "ARMELFWriterInfo::getAbsoluteLabelMachineRelTy() not implemented"); + return 0; +} + +long int ARMELFWriterInfo::computeRelocation(unsigned SymOffset, + unsigned RelOffset, + unsigned RelTy) const { + assert(0 && + "ARMELFWriterInfo::getAbsoluteLabelMachineRelTy() not implemented"); + return 0; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.h b/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.h new file mode 100644 index 0000000..1c4e532 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMELFWriterInfo.h @@ -0,0 +1,58 @@ +//===-- ARMELFWriterInfo.h - ELF Writer Info for ARM ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements ELF writer information for the ARM backend. +// +//===----------------------------------------------------------------------===// + +#ifndef ARM_ELF_WRITER_INFO_H +#define ARM_ELF_WRITER_INFO_H + +#include "llvm/Target/TargetELFWriterInfo.h" + +namespace llvm { + + class ARMELFWriterInfo : public TargetELFWriterInfo { + public: + ARMELFWriterInfo(TargetMachine &TM); + virtual ~ARMELFWriterInfo(); + + /// getRelocationType - Returns the target specific ELF Relocation type. + /// 'MachineRelTy' contains the object code independent relocation type + virtual unsigned getRelocationType(unsigned MachineRelTy) const; + + /// hasRelocationAddend - True if the target uses an addend in the + /// ELF relocation entry. + virtual bool hasRelocationAddend() const { return false; } + + /// getDefaultAddendForRelTy - Gets the default addend value for a + /// relocation entry based on the target ELF relocation type. + virtual long int getDefaultAddendForRelTy(unsigned RelTy, + long int Modifier = 0) const; + + /// getRelTySize - Returns the size of relocatable field in bits + virtual unsigned getRelocationTySize(unsigned RelTy) const; + + /// isPCRelativeRel - True if the relocation type is pc relative + virtual bool isPCRelativeRel(unsigned RelTy) const; + + /// getJumpTableRelocationTy - Returns the machine relocation type used + /// to reference a jumptable. + virtual unsigned getAbsoluteLabelMachineRelTy() const; + + /// computeRelocation - Some relocatable fields could be relocated + /// directly, avoiding the relocation symbol emission, compute the + /// final relocation value for this symbol. + virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset, + unsigned RelTy) const; + }; + +} // end llvm namespace + +#endif // ARM_ELF_WRITER_INFO_H diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp new file mode 100644 index 0000000..7872cb9 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -0,0 +1,1317 @@ +//===-- ARMExpandPseudoInsts.cpp - Expand pseudo instructions -----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands pseudo instructions into target +// instructions to allow proper scheduling, if-conversion, and other late +// optimizations. This pass should be run after register allocation but before +// the post-regalloc scheduling pass. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-pseudo" +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMRegisterInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove! +using namespace llvm; + +static cl::opt<bool> +VerifyARMPseudo("verify-arm-pseudo-expand", cl::Hidden, + cl::desc("Verify machine code after expanding ARM pseudos")); + +namespace { + class ARMExpandPseudo : public MachineFunctionPass { + public: + static char ID; + ARMExpandPseudo() : MachineFunctionPass(ID) {} + + const ARMBaseInstrInfo *TII; + const TargetRegisterInfo *TRI; + const ARMSubtarget *STI; + ARMFunctionInfo *AFI; + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "ARM pseudo instruction expansion pass"; + } + + private: + void TransferImpOps(MachineInstr &OldMI, + MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI); + bool ExpandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + bool ExpandMBB(MachineBasicBlock &MBB); + void ExpandVLD(MachineBasicBlock::iterator &MBBI); + void ExpandVST(MachineBasicBlock::iterator &MBBI); + void ExpandLaneOp(MachineBasicBlock::iterator &MBBI); + void ExpandVTBL(MachineBasicBlock::iterator &MBBI, + unsigned Opc, bool IsExt, unsigned NumRegs); + void ExpandMOV32BitImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI); + }; + char ARMExpandPseudo::ID = 0; +} + +/// TransferImpOps - Transfer implicit operands on the pseudo instruction to +/// the instructions created from the expansion. +void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI, + MachineInstrBuilder &UseMI, + MachineInstrBuilder &DefMI) { + const MCInstrDesc &Desc = OldMI.getDesc(); + for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); + i != e; ++i) { + const MachineOperand &MO = OldMI.getOperand(i); + assert(MO.isReg() && MO.getReg()); + if (MO.isUse()) + UseMI.addOperand(MO); + else + DefMI.addOperand(MO); + } +} + +namespace { + // Constants for register spacing in NEON load/store instructions. + // For quad-register load-lane and store-lane pseudo instructors, the + // spacing is initially assumed to be EvenDblSpc, and that is changed to + // OddDblSpc depending on the lane number operand. + enum NEONRegSpacing { + SingleSpc, + EvenDblSpc, + OddDblSpc + }; + + // Entries for NEON load/store information table. The table is sorted by + // PseudoOpc for fast binary-search lookups. + struct NEONLdStTableEntry { + unsigned PseudoOpc; + unsigned RealOpc; + bool IsLoad; + bool HasWriteBack; + NEONRegSpacing RegSpacing; + unsigned char NumRegs; // D registers loaded or stored + unsigned char RegElts; // elements per D register; used for lane ops + + // Comparison methods for binary search of the table. + bool operator<(const NEONLdStTableEntry &TE) const { + return PseudoOpc < TE.PseudoOpc; + } + friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) { + return TE.PseudoOpc < PseudoOpc; + } + friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc, + const NEONLdStTableEntry &TE) { + return PseudoOpc < TE.PseudoOpc; + } + }; +} + +static const NEONLdStTableEntry NEONLdStTable[] = { +{ ARM::VLD1DUPq16Pseudo, ARM::VLD1DUPq16, true, false, SingleSpc, 2, 4}, +{ ARM::VLD1DUPq16Pseudo_UPD, ARM::VLD1DUPq16_UPD, true, true, SingleSpc, 2, 4}, +{ ARM::VLD1DUPq32Pseudo, ARM::VLD1DUPq32, true, false, SingleSpc, 2, 2}, +{ ARM::VLD1DUPq32Pseudo_UPD, ARM::VLD1DUPq32_UPD, true, true, SingleSpc, 2, 2}, +{ ARM::VLD1DUPq8Pseudo, ARM::VLD1DUPq8, true, false, SingleSpc, 2, 8}, +{ ARM::VLD1DUPq8Pseudo_UPD, ARM::VLD1DUPq8_UPD, true, true, SingleSpc, 2, 8}, + +{ ARM::VLD1LNq16Pseudo, ARM::VLD1LNd16, true, false, EvenDblSpc, 1, 4 }, +{ ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true, EvenDblSpc, 1, 4 }, +{ ARM::VLD1LNq32Pseudo, ARM::VLD1LNd32, true, false, EvenDblSpc, 1, 2 }, +{ ARM::VLD1LNq32Pseudo_UPD, ARM::VLD1LNd32_UPD, true, true, EvenDblSpc, 1, 2 }, +{ ARM::VLD1LNq8Pseudo, ARM::VLD1LNd8, true, false, EvenDblSpc, 1, 8 }, +{ ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, true, EvenDblSpc, 1, 8 }, + +{ ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, SingleSpc, 4, 1 }, +{ ARM::VLD1d64QPseudo_UPD, ARM::VLD1d64Q_UPD, true, true, SingleSpc, 4, 1 }, +{ ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, SingleSpc, 3, 1 }, +{ ARM::VLD1d64TPseudo_UPD, ARM::VLD1d64T_UPD, true, true, SingleSpc, 3, 1 }, + +{ ARM::VLD1q16Pseudo, ARM::VLD1q16, true, false, SingleSpc, 2, 4 }, +{ ARM::VLD1q16Pseudo_UPD, ARM::VLD1q16_UPD, true, true, SingleSpc, 2, 4 }, +{ ARM::VLD1q32Pseudo, ARM::VLD1q32, true, false, SingleSpc, 2, 2 }, +{ ARM::VLD1q32Pseudo_UPD, ARM::VLD1q32_UPD, true, true, SingleSpc, 2, 2 }, +{ ARM::VLD1q64Pseudo, ARM::VLD1q64, true, false, SingleSpc, 2, 1 }, +{ ARM::VLD1q64Pseudo_UPD, ARM::VLD1q64_UPD, true, true, SingleSpc, 2, 1 }, +{ ARM::VLD1q8Pseudo, ARM::VLD1q8, true, false, SingleSpc, 2, 8 }, +{ ARM::VLD1q8Pseudo_UPD, ARM::VLD1q8_UPD, true, true, SingleSpc, 2, 8 }, + +{ ARM::VLD2DUPd16Pseudo, ARM::VLD2DUPd16, true, false, SingleSpc, 2, 4}, +{ ARM::VLD2DUPd16Pseudo_UPD, ARM::VLD2DUPd16_UPD, true, true, SingleSpc, 2, 4}, +{ ARM::VLD2DUPd32Pseudo, ARM::VLD2DUPd32, true, false, SingleSpc, 2, 2}, +{ ARM::VLD2DUPd32Pseudo_UPD, ARM::VLD2DUPd32_UPD, true, true, SingleSpc, 2, 2}, +{ ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd8, true, false, SingleSpc, 2, 8}, +{ ARM::VLD2DUPd8Pseudo_UPD, ARM::VLD2DUPd8_UPD, true, true, SingleSpc, 2, 8}, + +{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, SingleSpc, 2, 4 }, +{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, SingleSpc, 2, 4 }, +{ ARM::VLD2LNd32Pseudo, ARM::VLD2LNd32, true, false, SingleSpc, 2, 2 }, +{ ARM::VLD2LNd32Pseudo_UPD, ARM::VLD2LNd32_UPD, true, true, SingleSpc, 2, 2 }, +{ ARM::VLD2LNd8Pseudo, ARM::VLD2LNd8, true, false, SingleSpc, 2, 8 }, +{ ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd8_UPD, true, true, SingleSpc, 2, 8 }, +{ ARM::VLD2LNq16Pseudo, ARM::VLD2LNq16, true, false, EvenDblSpc, 2, 4 }, +{ ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq16_UPD, true, true, EvenDblSpc, 2, 4 }, +{ ARM::VLD2LNq32Pseudo, ARM::VLD2LNq32, true, false, EvenDblSpc, 2, 2 }, +{ ARM::VLD2LNq32Pseudo_UPD, ARM::VLD2LNq32_UPD, true, true, EvenDblSpc, 2, 2 }, + +{ ARM::VLD2d16Pseudo, ARM::VLD2d16, true, false, SingleSpc, 2, 4 }, +{ ARM::VLD2d16Pseudo_UPD, ARM::VLD2d16_UPD, true, true, SingleSpc, 2, 4 }, +{ ARM::VLD2d32Pseudo, ARM::VLD2d32, true, false, SingleSpc, 2, 2 }, +{ ARM::VLD2d32Pseudo_UPD, ARM::VLD2d32_UPD, true, true, SingleSpc, 2, 2 }, +{ ARM::VLD2d8Pseudo, ARM::VLD2d8, true, false, SingleSpc, 2, 8 }, +{ ARM::VLD2d8Pseudo_UPD, ARM::VLD2d8_UPD, true, true, SingleSpc, 2, 8 }, + +{ ARM::VLD2q16Pseudo, ARM::VLD2q16, true, false, SingleSpc, 4, 4 }, +{ ARM::VLD2q16Pseudo_UPD, ARM::VLD2q16_UPD, true, true, SingleSpc, 4, 4 }, +{ ARM::VLD2q32Pseudo, ARM::VLD2q32, true, false, SingleSpc, 4, 2 }, +{ ARM::VLD2q32Pseudo_UPD, ARM::VLD2q32_UPD, true, true, SingleSpc, 4, 2 }, +{ ARM::VLD2q8Pseudo, ARM::VLD2q8, true, false, SingleSpc, 4, 8 }, +{ ARM::VLD2q8Pseudo_UPD, ARM::VLD2q8_UPD, true, true, SingleSpc, 4, 8 }, + +{ ARM::VLD3DUPd16Pseudo, ARM::VLD3DUPd16, true, false, SingleSpc, 3, 4}, +{ ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true, SingleSpc, 3, 4}, +{ ARM::VLD3DUPd32Pseudo, ARM::VLD3DUPd32, true, false, SingleSpc, 3, 2}, +{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, SingleSpc, 3, 2}, +{ ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd8, true, false, SingleSpc, 3, 8}, +{ ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, SingleSpc, 3, 8}, + +{ ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, SingleSpc, 3, 4 }, +{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, SingleSpc, 3, 4 }, +{ ARM::VLD3LNd32Pseudo, ARM::VLD3LNd32, true, false, SingleSpc, 3, 2 }, +{ ARM::VLD3LNd32Pseudo_UPD, ARM::VLD3LNd32_UPD, true, true, SingleSpc, 3, 2 }, +{ ARM::VLD3LNd8Pseudo, ARM::VLD3LNd8, true, false, SingleSpc, 3, 8 }, +{ ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd8_UPD, true, true, SingleSpc, 3, 8 }, +{ ARM::VLD3LNq16Pseudo, ARM::VLD3LNq16, true, false, EvenDblSpc, 3, 4 }, +{ ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq16_UPD, true, true, EvenDblSpc, 3, 4 }, +{ ARM::VLD3LNq32Pseudo, ARM::VLD3LNq32, true, false, EvenDblSpc, 3, 2 }, +{ ARM::VLD3LNq32Pseudo_UPD, ARM::VLD3LNq32_UPD, true, true, EvenDblSpc, 3, 2 }, + +{ ARM::VLD3d16Pseudo, ARM::VLD3d16, true, false, SingleSpc, 3, 4 }, +{ ARM::VLD3d16Pseudo_UPD, ARM::VLD3d16_UPD, true, true, SingleSpc, 3, 4 }, +{ ARM::VLD3d32Pseudo, ARM::VLD3d32, true, false, SingleSpc, 3, 2 }, +{ ARM::VLD3d32Pseudo_UPD, ARM::VLD3d32_UPD, true, true, SingleSpc, 3, 2 }, +{ ARM::VLD3d8Pseudo, ARM::VLD3d8, true, false, SingleSpc, 3, 8 }, +{ ARM::VLD3d8Pseudo_UPD, ARM::VLD3d8_UPD, true, true, SingleSpc, 3, 8 }, + +{ ARM::VLD3q16Pseudo_UPD, ARM::VLD3q16_UPD, true, true, EvenDblSpc, 3, 4 }, +{ ARM::VLD3q16oddPseudo, ARM::VLD3q16, true, false, OddDblSpc, 3, 4 }, +{ ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q16_UPD, true, true, OddDblSpc, 3, 4 }, +{ ARM::VLD3q32Pseudo_UPD, ARM::VLD3q32_UPD, true, true, EvenDblSpc, 3, 2 }, +{ ARM::VLD3q32oddPseudo, ARM::VLD3q32, true, false, OddDblSpc, 3, 2 }, +{ ARM::VLD3q32oddPseudo_UPD, ARM::VLD3q32_UPD, true, true, OddDblSpc, 3, 2 }, +{ ARM::VLD3q8Pseudo_UPD, ARM::VLD3q8_UPD, true, true, EvenDblSpc, 3, 8 }, +{ ARM::VLD3q8oddPseudo, ARM::VLD3q8, true, false, OddDblSpc, 3, 8 }, +{ ARM::VLD3q8oddPseudo_UPD, ARM::VLD3q8_UPD, true, true, OddDblSpc, 3, 8 }, + +{ ARM::VLD4DUPd16Pseudo, ARM::VLD4DUPd16, true, false, SingleSpc, 4, 4}, +{ ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd16_UPD, true, true, SingleSpc, 4, 4}, +{ ARM::VLD4DUPd32Pseudo, ARM::VLD4DUPd32, true, false, SingleSpc, 4, 2}, +{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, SingleSpc, 4, 2}, +{ ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd8, true, false, SingleSpc, 4, 8}, +{ ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, SingleSpc, 4, 8}, + +{ ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, SingleSpc, 4, 4 }, +{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, SingleSpc, 4, 4 }, +{ ARM::VLD4LNd32Pseudo, ARM::VLD4LNd32, true, false, SingleSpc, 4, 2 }, +{ ARM::VLD4LNd32Pseudo_UPD, ARM::VLD4LNd32_UPD, true, true, SingleSpc, 4, 2 }, +{ ARM::VLD4LNd8Pseudo, ARM::VLD4LNd8, true, false, SingleSpc, 4, 8 }, +{ ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd8_UPD, true, true, SingleSpc, 4, 8 }, +{ ARM::VLD4LNq16Pseudo, ARM::VLD4LNq16, true, false, EvenDblSpc, 4, 4 }, +{ ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq16_UPD, true, true, EvenDblSpc, 4, 4 }, +{ ARM::VLD4LNq32Pseudo, ARM::VLD4LNq32, true, false, EvenDblSpc, 4, 2 }, +{ ARM::VLD4LNq32Pseudo_UPD, ARM::VLD4LNq32_UPD, true, true, EvenDblSpc, 4, 2 }, + +{ ARM::VLD4d16Pseudo, ARM::VLD4d16, true, false, SingleSpc, 4, 4 }, +{ ARM::VLD4d16Pseudo_UPD, ARM::VLD4d16_UPD, true, true, SingleSpc, 4, 4 }, +{ ARM::VLD4d32Pseudo, ARM::VLD4d32, true, false, SingleSpc, 4, 2 }, +{ ARM::VLD4d32Pseudo_UPD, ARM::VLD4d32_UPD, true, true, SingleSpc, 4, 2 }, +{ ARM::VLD4d8Pseudo, ARM::VLD4d8, true, false, SingleSpc, 4, 8 }, +{ ARM::VLD4d8Pseudo_UPD, ARM::VLD4d8_UPD, true, true, SingleSpc, 4, 8 }, + +{ ARM::VLD4q16Pseudo_UPD, ARM::VLD4q16_UPD, true, true, EvenDblSpc, 4, 4 }, +{ ARM::VLD4q16oddPseudo, ARM::VLD4q16, true, false, OddDblSpc, 4, 4 }, +{ ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q16_UPD, true, true, OddDblSpc, 4, 4 }, +{ ARM::VLD4q32Pseudo_UPD, ARM::VLD4q32_UPD, true, true, EvenDblSpc, 4, 2 }, +{ ARM::VLD4q32oddPseudo, ARM::VLD4q32, true, false, OddDblSpc, 4, 2 }, +{ ARM::VLD4q32oddPseudo_UPD, ARM::VLD4q32_UPD, true, true, OddDblSpc, 4, 2 }, +{ ARM::VLD4q8Pseudo_UPD, ARM::VLD4q8_UPD, true, true, EvenDblSpc, 4, 8 }, +{ ARM::VLD4q8oddPseudo, ARM::VLD4q8, true, false, OddDblSpc, 4, 8 }, +{ ARM::VLD4q8oddPseudo_UPD, ARM::VLD4q8_UPD, true, true, OddDblSpc, 4, 8 }, + +{ ARM::VST1LNq16Pseudo, ARM::VST1LNd16, false, false, EvenDblSpc, 1, 4 }, +{ ARM::VST1LNq16Pseudo_UPD, ARM::VST1LNd16_UPD,false, true, EvenDblSpc, 1, 4 }, +{ ARM::VST1LNq32Pseudo, ARM::VST1LNd32, false, false, EvenDblSpc, 1, 2 }, +{ ARM::VST1LNq32Pseudo_UPD, ARM::VST1LNd32_UPD,false, true, EvenDblSpc, 1, 2 }, +{ ARM::VST1LNq8Pseudo, ARM::VST1LNd8, false, false, EvenDblSpc, 1, 8 }, +{ ARM::VST1LNq8Pseudo_UPD, ARM::VST1LNd8_UPD, false, true, EvenDblSpc, 1, 8 }, + +{ ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, SingleSpc, 4, 1 }, +{ ARM::VST1d64QPseudo_UPD, ARM::VST1d64Q_UPD, false, true, SingleSpc, 4, 1 }, +{ ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, SingleSpc, 3, 1 }, +{ ARM::VST1d64TPseudo_UPD, ARM::VST1d64T_UPD, false, true, SingleSpc, 3, 1 }, + +{ ARM::VST1q16Pseudo, ARM::VST1q16, false, false, SingleSpc, 2, 4 }, +{ ARM::VST1q16Pseudo_UPD, ARM::VST1q16_UPD, false, true, SingleSpc, 2, 4 }, +{ ARM::VST1q32Pseudo, ARM::VST1q32, false, false, SingleSpc, 2, 2 }, +{ ARM::VST1q32Pseudo_UPD, ARM::VST1q32_UPD, false, true, SingleSpc, 2, 2 }, +{ ARM::VST1q64Pseudo, ARM::VST1q64, false, false, SingleSpc, 2, 1 }, +{ ARM::VST1q64Pseudo_UPD, ARM::VST1q64_UPD, false, true, SingleSpc, 2, 1 }, +{ ARM::VST1q8Pseudo, ARM::VST1q8, false, false, SingleSpc, 2, 8 }, +{ ARM::VST1q8Pseudo_UPD, ARM::VST1q8_UPD, false, true, SingleSpc, 2, 8 }, + +{ ARM::VST2LNd16Pseudo, ARM::VST2LNd16, false, false, SingleSpc, 2, 4 }, +{ ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, SingleSpc, 2, 4 }, +{ ARM::VST2LNd32Pseudo, ARM::VST2LNd32, false, false, SingleSpc, 2, 2 }, +{ ARM::VST2LNd32Pseudo_UPD, ARM::VST2LNd32_UPD, false, true, SingleSpc, 2, 2 }, +{ ARM::VST2LNd8Pseudo, ARM::VST2LNd8, false, false, SingleSpc, 2, 8 }, +{ ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd8_UPD, false, true, SingleSpc, 2, 8 }, +{ ARM::VST2LNq16Pseudo, ARM::VST2LNq16, false, false, EvenDblSpc, 2, 4}, +{ ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq16_UPD, false, true, EvenDblSpc, 2, 4}, +{ ARM::VST2LNq32Pseudo, ARM::VST2LNq32, false, false, EvenDblSpc, 2, 2}, +{ ARM::VST2LNq32Pseudo_UPD, ARM::VST2LNq32_UPD, false, true, EvenDblSpc, 2, 2}, + +{ ARM::VST2d16Pseudo, ARM::VST2d16, false, false, SingleSpc, 2, 4 }, +{ ARM::VST2d16Pseudo_UPD, ARM::VST2d16_UPD, false, true, SingleSpc, 2, 4 }, +{ ARM::VST2d32Pseudo, ARM::VST2d32, false, false, SingleSpc, 2, 2 }, +{ ARM::VST2d32Pseudo_UPD, ARM::VST2d32_UPD, false, true, SingleSpc, 2, 2 }, +{ ARM::VST2d8Pseudo, ARM::VST2d8, false, false, SingleSpc, 2, 8 }, +{ ARM::VST2d8Pseudo_UPD, ARM::VST2d8_UPD, false, true, SingleSpc, 2, 8 }, + +{ ARM::VST2q16Pseudo, ARM::VST2q16, false, false, SingleSpc, 4, 4 }, +{ ARM::VST2q16Pseudo_UPD, ARM::VST2q16_UPD, false, true, SingleSpc, 4, 4 }, +{ ARM::VST2q32Pseudo, ARM::VST2q32, false, false, SingleSpc, 4, 2 }, +{ ARM::VST2q32Pseudo_UPD, ARM::VST2q32_UPD, false, true, SingleSpc, 4, 2 }, +{ ARM::VST2q8Pseudo, ARM::VST2q8, false, false, SingleSpc, 4, 8 }, +{ ARM::VST2q8Pseudo_UPD, ARM::VST2q8_UPD, false, true, SingleSpc, 4, 8 }, + +{ ARM::VST3LNd16Pseudo, ARM::VST3LNd16, false, false, SingleSpc, 3, 4 }, +{ ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd16_UPD, false, true, SingleSpc, 3, 4 }, +{ ARM::VST3LNd32Pseudo, ARM::VST3LNd32, false, false, SingleSpc, 3, 2 }, +{ ARM::VST3LNd32Pseudo_UPD, ARM::VST3LNd32_UPD, false, true, SingleSpc, 3, 2 }, +{ ARM::VST3LNd8Pseudo, ARM::VST3LNd8, false, false, SingleSpc, 3, 8 }, +{ ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd8_UPD, false, true, SingleSpc, 3, 8 }, +{ ARM::VST3LNq16Pseudo, ARM::VST3LNq16, false, false, EvenDblSpc, 3, 4}, +{ ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq16_UPD, false, true, EvenDblSpc, 3, 4}, +{ ARM::VST3LNq32Pseudo, ARM::VST3LNq32, false, false, EvenDblSpc, 3, 2}, +{ ARM::VST3LNq32Pseudo_UPD, ARM::VST3LNq32_UPD, false, true, EvenDblSpc, 3, 2}, + +{ ARM::VST3d16Pseudo, ARM::VST3d16, false, false, SingleSpc, 3, 4 }, +{ ARM::VST3d16Pseudo_UPD, ARM::VST3d16_UPD, false, true, SingleSpc, 3, 4 }, +{ ARM::VST3d32Pseudo, ARM::VST3d32, false, false, SingleSpc, 3, 2 }, +{ ARM::VST3d32Pseudo_UPD, ARM::VST3d32_UPD, false, true, SingleSpc, 3, 2 }, +{ ARM::VST3d8Pseudo, ARM::VST3d8, false, false, SingleSpc, 3, 8 }, +{ ARM::VST3d8Pseudo_UPD, ARM::VST3d8_UPD, false, true, SingleSpc, 3, 8 }, + +{ ARM::VST3q16Pseudo_UPD, ARM::VST3q16_UPD, false, true, EvenDblSpc, 3, 4 }, +{ ARM::VST3q16oddPseudo, ARM::VST3q16, false, false, OddDblSpc, 3, 4 }, +{ ARM::VST3q16oddPseudo_UPD, ARM::VST3q16_UPD, false, true, OddDblSpc, 3, 4 }, +{ ARM::VST3q32Pseudo_UPD, ARM::VST3q32_UPD, false, true, EvenDblSpc, 3, 2 }, +{ ARM::VST3q32oddPseudo, ARM::VST3q32, false, false, OddDblSpc, 3, 2 }, +{ ARM::VST3q32oddPseudo_UPD, ARM::VST3q32_UPD, false, true, OddDblSpc, 3, 2 }, +{ ARM::VST3q8Pseudo_UPD, ARM::VST3q8_UPD, false, true, EvenDblSpc, 3, 8 }, +{ ARM::VST3q8oddPseudo, ARM::VST3q8, false, false, OddDblSpc, 3, 8 }, +{ ARM::VST3q8oddPseudo_UPD, ARM::VST3q8_UPD, false, true, OddDblSpc, 3, 8 }, + +{ ARM::VST4LNd16Pseudo, ARM::VST4LNd16, false, false, SingleSpc, 4, 4 }, +{ ARM::VST4LNd16Pseudo_UPD, ARM::VST4LNd16_UPD, false, true, SingleSpc, 4, 4 }, +{ ARM::VST4LNd32Pseudo, ARM::VST4LNd32, false, false, SingleSpc, 4, 2 }, +{ ARM::VST4LNd32Pseudo_UPD, ARM::VST4LNd32_UPD, false, true, SingleSpc, 4, 2 }, +{ ARM::VST4LNd8Pseudo, ARM::VST4LNd8, false, false, SingleSpc, 4, 8 }, +{ ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd8_UPD, false, true, SingleSpc, 4, 8 }, +{ ARM::VST4LNq16Pseudo, ARM::VST4LNq16, false, false, EvenDblSpc, 4, 4}, +{ ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq16_UPD, false, true, EvenDblSpc, 4, 4}, +{ ARM::VST4LNq32Pseudo, ARM::VST4LNq32, false, false, EvenDblSpc, 4, 2}, +{ ARM::VST4LNq32Pseudo_UPD, ARM::VST4LNq32_UPD, false, true, EvenDblSpc, 4, 2}, + +{ ARM::VST4d16Pseudo, ARM::VST4d16, false, false, SingleSpc, 4, 4 }, +{ ARM::VST4d16Pseudo_UPD, ARM::VST4d16_UPD, false, true, SingleSpc, 4, 4 }, +{ ARM::VST4d32Pseudo, ARM::VST4d32, false, false, SingleSpc, 4, 2 }, +{ ARM::VST4d32Pseudo_UPD, ARM::VST4d32_UPD, false, true, SingleSpc, 4, 2 }, +{ ARM::VST4d8Pseudo, ARM::VST4d8, false, false, SingleSpc, 4, 8 }, +{ ARM::VST4d8Pseudo_UPD, ARM::VST4d8_UPD, false, true, SingleSpc, 4, 8 }, + +{ ARM::VST4q16Pseudo_UPD, ARM::VST4q16_UPD, false, true, EvenDblSpc, 4, 4 }, +{ ARM::VST4q16oddPseudo, ARM::VST4q16, false, false, OddDblSpc, 4, 4 }, +{ ARM::VST4q16oddPseudo_UPD, ARM::VST4q16_UPD, false, true, OddDblSpc, 4, 4 }, +{ ARM::VST4q32Pseudo_UPD, ARM::VST4q32_UPD, false, true, EvenDblSpc, 4, 2 }, +{ ARM::VST4q32oddPseudo, ARM::VST4q32, false, false, OddDblSpc, 4, 2 }, +{ ARM::VST4q32oddPseudo_UPD, ARM::VST4q32_UPD, false, true, OddDblSpc, 4, 2 }, +{ ARM::VST4q8Pseudo_UPD, ARM::VST4q8_UPD, false, true, EvenDblSpc, 4, 8 }, +{ ARM::VST4q8oddPseudo, ARM::VST4q8, false, false, OddDblSpc, 4, 8 }, +{ ARM::VST4q8oddPseudo_UPD, ARM::VST4q8_UPD, false, true, OddDblSpc, 4, 8 } +}; + +/// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON +/// load or store pseudo instruction. +static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { + unsigned NumEntries = array_lengthof(NEONLdStTable); + +#ifndef NDEBUG + // Make sure the table is sorted. + static bool TableChecked = false; + if (!TableChecked) { + for (unsigned i = 0; i != NumEntries-1; ++i) + assert(NEONLdStTable[i] < NEONLdStTable[i+1] && + "NEONLdStTable is not sorted!"); + TableChecked = true; + } +#endif + + const NEONLdStTableEntry *I = + std::lower_bound(NEONLdStTable, NEONLdStTable + NumEntries, Opcode); + if (I != NEONLdStTable + NumEntries && I->PseudoOpc == Opcode) + return I; + return NULL; +} + +/// GetDSubRegs - Get 4 D subregisters of a Q, QQ, or QQQQ register, +/// corresponding to the specified register spacing. Not all of the results +/// are necessarily valid, e.g., a Q register only has 2 D subregisters. +static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc, + const TargetRegisterInfo *TRI, unsigned &D0, + unsigned &D1, unsigned &D2, unsigned &D3) { + if (RegSpc == SingleSpc) { + D0 = TRI->getSubReg(Reg, ARM::dsub_0); + D1 = TRI->getSubReg(Reg, ARM::dsub_1); + D2 = TRI->getSubReg(Reg, ARM::dsub_2); + D3 = TRI->getSubReg(Reg, ARM::dsub_3); + } else if (RegSpc == EvenDblSpc) { + D0 = TRI->getSubReg(Reg, ARM::dsub_0); + D1 = TRI->getSubReg(Reg, ARM::dsub_2); + D2 = TRI->getSubReg(Reg, ARM::dsub_4); + D3 = TRI->getSubReg(Reg, ARM::dsub_6); + } else { + assert(RegSpc == OddDblSpc && "unknown register spacing"); + D0 = TRI->getSubReg(Reg, ARM::dsub_1); + D1 = TRI->getSubReg(Reg, ARM::dsub_3); + D2 = TRI->getSubReg(Reg, ARM::dsub_5); + D3 = TRI->getSubReg(Reg, ARM::dsub_7); + } +} + +/// ExpandVLD - Translate VLD pseudo instructions with Q, QQ or QQQQ register +/// operands to real VLD instructions with D register operands. +void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); + assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed"); + NEONRegSpacing RegSpc = TableEntry->RegSpacing; + unsigned NumRegs = TableEntry->NumRegs; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(TableEntry->RealOpc)); + unsigned OpIdx = 0; + + bool DstIsDead = MI.getOperand(OpIdx).isDead(); + unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + unsigned D0, D1, D2, D3; + GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); + MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 2) + MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 3) + MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); + + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the addrmode6 operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + // Copy the am6offset operand. + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // For an instruction writing double-spaced subregs, the pseudo instruction + // has an extra operand that is a use of the super-register. Record the + // operand index and skip over it. + unsigned SrcOpIdx = 0; + if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc) + SrcOpIdx = OpIdx++; + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the super-register source operand used for double-spaced subregs over + // to the new instruction as an implicit operand. + if (SrcOpIdx != 0) { + MachineOperand MO = MI.getOperand(SrcOpIdx); + MO.setImplicit(true); + MIB.addOperand(MO); + } + // Add an implicit def for the super-register. + MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); + TransferImpOps(MI, MIB, MIB); + + // Transfer memoperands. + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + + MI.eraseFromParent(); +} + +/// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register +/// operands to real VST instructions with D register operands. +void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); + assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed"); + NEONRegSpacing RegSpc = TableEntry->RegSpacing; + unsigned NumRegs = TableEntry->NumRegs; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(TableEntry->RealOpc)); + unsigned OpIdx = 0; + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the addrmode6 operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + // Copy the am6offset operand. + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); + + bool SrcIsKill = MI.getOperand(OpIdx).isKill(); + unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + unsigned D0, D1, D2, D3; + GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3); + MIB.addReg(D0).addReg(D1); + if (NumRegs > 2) + MIB.addReg(D2); + if (NumRegs > 3) + MIB.addReg(D3); + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + if (SrcIsKill) // Add an implicit kill for the super-reg. + MIB->addRegisterKilled(SrcReg, TRI, true); + TransferImpOps(MI, MIB, MIB); + + // Transfer memoperands. + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + + MI.eraseFromParent(); +} + +/// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ +/// register operands to real instructions with D register operands. +void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); + assert(TableEntry && "NEONLdStTable lookup failed"); + NEONRegSpacing RegSpc = TableEntry->RegSpacing; + unsigned NumRegs = TableEntry->NumRegs; + unsigned RegElts = TableEntry->RegElts; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(TableEntry->RealOpc)); + unsigned OpIdx = 0; + // The lane operand is always the 3rd from last operand, before the 2 + // predicate operands. + unsigned Lane = MI.getOperand(MI.getDesc().getNumOperands() - 3).getImm(); + + // Adjust the lane and spacing as needed for Q registers. + assert(RegSpc != OddDblSpc && "unexpected register spacing for VLD/VST-lane"); + if (RegSpc == EvenDblSpc && Lane >= RegElts) { + RegSpc = OddDblSpc; + Lane -= RegElts; + } + assert(Lane < RegElts && "out of range lane for VLD/VST-lane"); + + unsigned D0 = 0, D1 = 0, D2 = 0, D3 = 0; + unsigned DstReg = 0; + bool DstIsDead = false; + if (TableEntry->IsLoad) { + DstIsDead = MI.getOperand(OpIdx).isDead(); + DstReg = MI.getOperand(OpIdx++).getReg(); + GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); + MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 1) + MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 2) + MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 3) + MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); + } + + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the addrmode6 operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + // Copy the am6offset operand. + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Grab the super-register source. + MachineOperand MO = MI.getOperand(OpIdx++); + if (!TableEntry->IsLoad) + GetDSubRegs(MO.getReg(), RegSpc, TRI, D0, D1, D2, D3); + + // Add the subregs as sources of the new instruction. + unsigned SrcFlags = (getUndefRegState(MO.isUndef()) | + getKillRegState(MO.isKill())); + MIB.addReg(D0, SrcFlags); + if (NumRegs > 1) + MIB.addReg(D1, SrcFlags); + if (NumRegs > 2) + MIB.addReg(D2, SrcFlags); + if (NumRegs > 3) + MIB.addReg(D3, SrcFlags); + + // Add the lane number operand. + MIB.addImm(Lane); + OpIdx += 1; + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the super-register source to be an implicit source. + MO.setImplicit(true); + MIB.addOperand(MO); + if (TableEntry->IsLoad) + // Add an implicit def for the super-register. + MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); +} + +/// ExpandVTBL - Translate VTBL and VTBX pseudo instructions with Q or QQ +/// register operands to real instructions with D register operands. +void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, + unsigned Opc, bool IsExt, unsigned NumRegs) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)); + unsigned OpIdx = 0; + + // Transfer the destination register operand. + MIB.addOperand(MI.getOperand(OpIdx++)); + if (IsExt) + MIB.addOperand(MI.getOperand(OpIdx++)); + + bool SrcIsKill = MI.getOperand(OpIdx).isKill(); + unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + unsigned D0, D1, D2, D3; + GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3); + MIB.addReg(D0).addReg(D1); + if (NumRegs > 2) + MIB.addReg(D2); + if (NumRegs > 3) + MIB.addReg(D3); + + // Copy the other source register operand. + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + if (SrcIsKill) // Add an implicit kill for the super-reg. + MIB->addRegisterKilled(SrcReg, TRI, true); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); +} + +void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg); + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm; + const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1); + MachineInstrBuilder LO16, HI16; + + if (!STI->hasV6T2Ops() && + (Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) { + // Expand into a movi + orr. + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg); + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg); + + assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!"); + unsigned ImmVal = (unsigned)MO.getImm(); + unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); + unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); + LO16 = LO16.addImm(SOImmValV1); + HI16 = HI16.addImm(SOImmValV2); + LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16.addImm(Pred).addReg(PredReg).addReg(0); + HI16.addImm(Pred).addReg(PredReg).addReg(0); + TransferImpOps(MI, LO16, HI16); + MI.eraseFromParent(); + return; + } + + unsigned LO16Opc = 0; + unsigned HI16Opc = 0; + if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) { + LO16Opc = ARM::t2MOVi16; + HI16Opc = ARM::t2MOVTi16; + } else { + LO16Opc = ARM::MOVi16; + HI16Opc = ARM::MOVTi16; + } + + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg); + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg); + + if (MO.isImm()) { + unsigned Imm = MO.getImm(); + unsigned Lo16 = Imm & 0xffff; + unsigned Hi16 = (Imm >> 16) & 0xffff; + LO16 = LO16.addImm(Lo16); + HI16 = HI16.addImm(Hi16); + } else { + const GlobalValue *GV = MO.getGlobal(); + unsigned TF = MO.getTargetFlags(); + LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16); + HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); + } + + LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16.addImm(Pred).addReg(PredReg); + HI16.addImm(Pred).addReg(PredReg); + + TransferImpOps(MI, LO16, HI16); + MI.eraseFromParent(); +} + +bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + return false; + case ARM::VMOVScc: + case ARM::VMOVDcc: { + unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc), + MI.getOperand(1).getReg()) + .addReg(MI.getOperand(2).getReg(), + getKillRegState(MI.getOperand(2).isKill())) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()); + + MI.eraseFromParent(); + return true; + } + case ARM::t2MOVCCr: + case ARM::MOVCCr: { + unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVr : ARM::MOVr; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc), + MI.getOperand(1).getReg()) + .addReg(MI.getOperand(2).getReg(), + getKillRegState(MI.getOperand(2).isKill())) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } + case ARM::MOVCCsi: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi), + (MI.getOperand(1).getReg())) + .addReg(MI.getOperand(2).getReg(), + getKillRegState(MI.getOperand(2).isKill())) + .addImm(MI.getOperand(3).getImm()) + .addImm(MI.getOperand(4).getImm()) // 'pred' + .addReg(MI.getOperand(5).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } + + case ARM::MOVCCsr: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr), + (MI.getOperand(1).getReg())) + .addReg(MI.getOperand(2).getReg(), + getKillRegState(MI.getOperand(2).isKill())) + .addReg(MI.getOperand(3).getReg(), + getKillRegState(MI.getOperand(3).isKill())) + .addImm(MI.getOperand(4).getImm()) + .addImm(MI.getOperand(5).getImm()) // 'pred' + .addReg(MI.getOperand(6).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } + case ARM::MOVCCi16: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi16), + MI.getOperand(1).getReg()) + .addImm(MI.getOperand(2).getImm()) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()); + + MI.eraseFromParent(); + return true; + } + case ARM::t2MOVCCi: + case ARM::MOVCCi: { + unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVi : ARM::MOVi; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc), + MI.getOperand(1).getReg()) + .addImm(MI.getOperand(2).getImm()) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } + case ARM::MVNCCi: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MVNi), + MI.getOperand(1).getReg()) + .addImm(MI.getOperand(2).getImm()) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } + case ARM::Int_eh_sjlj_dispatchsetup: { + MachineFunction &MF = *MI.getParent()->getParent(); + const ARMBaseInstrInfo *AII = + static_cast<const ARMBaseInstrInfo*>(TII); + const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); + // For functions using a base pointer, we rematerialize it (via the frame + // pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it + // for us. Otherwise, expand to nothing. + if (RI.hasBasePointer(MF)) { + int32_t NumBytes = AFI->getFramePtrSpillOffset(); + unsigned FramePtr = RI.getFrameRegister(MF); + assert(MF.getTarget().getFrameLowering()->hasFP(MF) && + "base pointer without frame pointer?"); + + if (AFI->isThumb2Function()) { + llvm::emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, ARMCC::AL, 0, *TII); + } else if (AFI->isThumbFunction()) { + llvm::emitThumbRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, *TII, RI); + } else { + llvm::emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, ARMCC::AL, 0, + *TII); + } + // If there's dynamic realignment, adjust for it. + if (RI.needsStackRealignment(MF)) { + MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned MaxAlign = MFI->getMaxAlignment(); + assert (!AFI->isThumb1OnlyFunction()); + // Emit bic r6, r6, MaxAlign + unsigned bicOpc = AFI->isThumbFunction() ? + ARM::t2BICri : ARM::BICri; + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(bicOpc), ARM::R6) + .addReg(ARM::R6, RegState::Kill) + .addImm(MaxAlign-1))); + } + + } + MI.eraseFromParent(); + return true; + } + + case ARM::MOVsrl_flag: + case ARM::MOVsra_flag: { + // These are just fancy MOVs insructions. + AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi), + MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)) + .addImm(ARM_AM::getSORegOpc((Opcode == ARM::MOVsrl_flag ? + ARM_AM::lsr : ARM_AM::asr), + 1))) + .addReg(ARM::CPSR, RegState::Define); + MI.eraseFromParent(); + return true; + } + case ARM::RRX: { + // This encodes as "MOVs Rd, Rm, rrx + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),TII->get(ARM::MOVsi), + MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)) + .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0))) + .addReg(0); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + case ARM::tTPsoft: + case ARM::TPsoft: { + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == ARM::tTPsoft ? ARM::tBL : ARM::BL)) + .addExternalSymbol("__aeabi_read_tp", 0); + + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + case ARM::tLDRpci_pic: + case ARM::t2LDRpci_pic: { + unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic) + ? ARM::tLDRpci : ARM::t2LDRpci; + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + MachineInstrBuilder MIB1 = + AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(NewLdOpc), DstReg) + .addOperand(MI.getOperand(1))); + MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::tPICADD)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addOperand(MI.getOperand(2)); + TransferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; + } + + case ARM::MOV_ga_dyn: + case ARM::MOV_ga_pcrel: + case ARM::MOV_ga_pcrel_ldr: + case ARM::t2MOV_ga_dyn: + case ARM::t2MOV_ga_pcrel: { + // Expand into movw + movw. Also "add pc" / ldr [pc] in PIC mode. + unsigned LabelId = AFI->createPICLabelUId(); + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + const MachineOperand &MO1 = MI.getOperand(1); + const GlobalValue *GV = MO1.getGlobal(); + unsigned TF = MO1.getTargetFlags(); + bool isARM = (Opcode != ARM::t2MOV_ga_pcrel && Opcode!=ARM::t2MOV_ga_dyn); + bool isPIC = (Opcode != ARM::MOV_ga_dyn && Opcode != ARM::t2MOV_ga_dyn); + unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel; + unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel :ARM::t2MOVTi16_ga_pcrel; + unsigned LO16TF = isPIC + ? ARMII::MO_LO16_NONLAZY_PIC : ARMII::MO_LO16_NONLAZY; + unsigned HI16TF = isPIC + ? ARMII::MO_HI16_NONLAZY_PIC : ARMII::MO_HI16_NONLAZY; + unsigned PICAddOpc = isARM + ? (Opcode == ARM::MOV_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD) + : ARM::tPICADD; + MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(LO16Opc), DstReg) + .addGlobalAddress(GV, MO1.getOffset(), TF | LO16TF) + .addImm(LabelId); + MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(HI16Opc), DstReg) + .addReg(DstReg) + .addGlobalAddress(GV, MO1.getOffset(), TF | HI16TF) + .addImm(LabelId); + if (!isPIC) { + TransferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; + } + + MachineInstrBuilder MIB3 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(PICAddOpc)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg).addImm(LabelId); + if (isARM) { + AddDefaultPred(MIB3); + if (Opcode == ARM::MOV_ga_pcrel_ldr) + MIB2->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + } + TransferImpOps(MI, MIB1, MIB3); + MI.eraseFromParent(); + return true; + } + + case ARM::MOVi32imm: + case ARM::MOVCCi32imm: + case ARM::t2MOVi32imm: + case ARM::t2MOVCCi32imm: + ExpandMOV32BitImm(MBB, MBBI); + return true; + + case ARM::VLDMQIA: { + unsigned NewOpc = ARM::VLDMDIA; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); + unsigned OpIdx = 0; + + // Grab the Q register destination. + bool DstIsDead = MI.getOperand(OpIdx).isDead(); + unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + + // Copy the source register. + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Add the destination operands (D subregs). + unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0); + unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1); + MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); + + // Add an implicit def for the super-register. + MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + + case ARM::VSTMQIA: { + unsigned NewOpc = ARM::VSTMDIA; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); + unsigned OpIdx = 0; + + // Grab the Q register source. + bool SrcIsKill = MI.getOperand(OpIdx).isKill(); + unsigned SrcReg = MI.getOperand(OpIdx++).getReg(); + + // Copy the destination register. + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Add the source operands (D subregs). + unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); + unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); + MIB.addReg(D0).addReg(D1); + + if (SrcIsKill) // Add an implicit kill for the Q register. + MIB->addRegisterKilled(SrcReg, TRI, true); + + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + case ARM::VDUPfqf: + case ARM::VDUPfdf:{ + unsigned NewOpc = Opcode == ARM::VDUPfqf ? ARM::VDUPLN32q : + ARM::VDUPLN32d; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); + unsigned OpIdx = 0; + unsigned SrcReg = MI.getOperand(1).getReg(); + unsigned Lane = getARMRegisterNumbering(SrcReg) & 1; + unsigned DReg = TRI->getMatchingSuperReg(SrcReg, + Lane & 1 ? ARM::ssub_1 : ARM::ssub_0, + &ARM::DPR_VFP2RegClass); + // The lane is [0,1] for the containing DReg superregister. + // Copy the dst/src register operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addReg(DReg); + ++OpIdx; + // Add the lane select operand. + MIB.addImm(Lane); + // Add the predicate operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + + case ARM::VLD1q8Pseudo: + case ARM::VLD1q16Pseudo: + case ARM::VLD1q32Pseudo: + case ARM::VLD1q64Pseudo: + case ARM::VLD1q8Pseudo_UPD: + case ARM::VLD1q16Pseudo_UPD: + case ARM::VLD1q32Pseudo_UPD: + case ARM::VLD1q64Pseudo_UPD: + case ARM::VLD2d8Pseudo: + case ARM::VLD2d16Pseudo: + case ARM::VLD2d32Pseudo: + case ARM::VLD2q8Pseudo: + case ARM::VLD2q16Pseudo: + case ARM::VLD2q32Pseudo: + case ARM::VLD2d8Pseudo_UPD: + case ARM::VLD2d16Pseudo_UPD: + case ARM::VLD2d32Pseudo_UPD: + case ARM::VLD2q8Pseudo_UPD: + case ARM::VLD2q16Pseudo_UPD: + case ARM::VLD2q32Pseudo_UPD: + case ARM::VLD3d8Pseudo: + case ARM::VLD3d16Pseudo: + case ARM::VLD3d32Pseudo: + case ARM::VLD1d64TPseudo: + case ARM::VLD3d8Pseudo_UPD: + case ARM::VLD3d16Pseudo_UPD: + case ARM::VLD3d32Pseudo_UPD: + case ARM::VLD1d64TPseudo_UPD: + case ARM::VLD3q8Pseudo_UPD: + case ARM::VLD3q16Pseudo_UPD: + case ARM::VLD3q32Pseudo_UPD: + case ARM::VLD3q8oddPseudo: + case ARM::VLD3q16oddPseudo: + case ARM::VLD3q32oddPseudo: + case ARM::VLD3q8oddPseudo_UPD: + case ARM::VLD3q16oddPseudo_UPD: + case ARM::VLD3q32oddPseudo_UPD: + case ARM::VLD4d8Pseudo: + case ARM::VLD4d16Pseudo: + case ARM::VLD4d32Pseudo: + case ARM::VLD1d64QPseudo: + case ARM::VLD4d8Pseudo_UPD: + case ARM::VLD4d16Pseudo_UPD: + case ARM::VLD4d32Pseudo_UPD: + case ARM::VLD1d64QPseudo_UPD: + case ARM::VLD4q8Pseudo_UPD: + case ARM::VLD4q16Pseudo_UPD: + case ARM::VLD4q32Pseudo_UPD: + case ARM::VLD4q8oddPseudo: + case ARM::VLD4q16oddPseudo: + case ARM::VLD4q32oddPseudo: + case ARM::VLD4q8oddPseudo_UPD: + case ARM::VLD4q16oddPseudo_UPD: + case ARM::VLD4q32oddPseudo_UPD: + case ARM::VLD1DUPq8Pseudo: + case ARM::VLD1DUPq16Pseudo: + case ARM::VLD1DUPq32Pseudo: + case ARM::VLD1DUPq8Pseudo_UPD: + case ARM::VLD1DUPq16Pseudo_UPD: + case ARM::VLD1DUPq32Pseudo_UPD: + case ARM::VLD2DUPd8Pseudo: + case ARM::VLD2DUPd16Pseudo: + case ARM::VLD2DUPd32Pseudo: + case ARM::VLD2DUPd8Pseudo_UPD: + case ARM::VLD2DUPd16Pseudo_UPD: + case ARM::VLD2DUPd32Pseudo_UPD: + case ARM::VLD3DUPd8Pseudo: + case ARM::VLD3DUPd16Pseudo: + case ARM::VLD3DUPd32Pseudo: + case ARM::VLD3DUPd8Pseudo_UPD: + case ARM::VLD3DUPd16Pseudo_UPD: + case ARM::VLD3DUPd32Pseudo_UPD: + case ARM::VLD4DUPd8Pseudo: + case ARM::VLD4DUPd16Pseudo: + case ARM::VLD4DUPd32Pseudo: + case ARM::VLD4DUPd8Pseudo_UPD: + case ARM::VLD4DUPd16Pseudo_UPD: + case ARM::VLD4DUPd32Pseudo_UPD: + ExpandVLD(MBBI); + return true; + + case ARM::VST1q8Pseudo: + case ARM::VST1q16Pseudo: + case ARM::VST1q32Pseudo: + case ARM::VST1q64Pseudo: + case ARM::VST1q8Pseudo_UPD: + case ARM::VST1q16Pseudo_UPD: + case ARM::VST1q32Pseudo_UPD: + case ARM::VST1q64Pseudo_UPD: + case ARM::VST2d8Pseudo: + case ARM::VST2d16Pseudo: + case ARM::VST2d32Pseudo: + case ARM::VST2q8Pseudo: + case ARM::VST2q16Pseudo: + case ARM::VST2q32Pseudo: + case ARM::VST2d8Pseudo_UPD: + case ARM::VST2d16Pseudo_UPD: + case ARM::VST2d32Pseudo_UPD: + case ARM::VST2q8Pseudo_UPD: + case ARM::VST2q16Pseudo_UPD: + case ARM::VST2q32Pseudo_UPD: + case ARM::VST3d8Pseudo: + case ARM::VST3d16Pseudo: + case ARM::VST3d32Pseudo: + case ARM::VST1d64TPseudo: + case ARM::VST3d8Pseudo_UPD: + case ARM::VST3d16Pseudo_UPD: + case ARM::VST3d32Pseudo_UPD: + case ARM::VST1d64TPseudo_UPD: + case ARM::VST3q8Pseudo_UPD: + case ARM::VST3q16Pseudo_UPD: + case ARM::VST3q32Pseudo_UPD: + case ARM::VST3q8oddPseudo: + case ARM::VST3q16oddPseudo: + case ARM::VST3q32oddPseudo: + case ARM::VST3q8oddPseudo_UPD: + case ARM::VST3q16oddPseudo_UPD: + case ARM::VST3q32oddPseudo_UPD: + case ARM::VST4d8Pseudo: + case ARM::VST4d16Pseudo: + case ARM::VST4d32Pseudo: + case ARM::VST1d64QPseudo: + case ARM::VST4d8Pseudo_UPD: + case ARM::VST4d16Pseudo_UPD: + case ARM::VST4d32Pseudo_UPD: + case ARM::VST1d64QPseudo_UPD: + case ARM::VST4q8Pseudo_UPD: + case ARM::VST4q16Pseudo_UPD: + case ARM::VST4q32Pseudo_UPD: + case ARM::VST4q8oddPseudo: + case ARM::VST4q16oddPseudo: + case ARM::VST4q32oddPseudo: + case ARM::VST4q8oddPseudo_UPD: + case ARM::VST4q16oddPseudo_UPD: + case ARM::VST4q32oddPseudo_UPD: + ExpandVST(MBBI); + return true; + + case ARM::VLD1LNq8Pseudo: + case ARM::VLD1LNq16Pseudo: + case ARM::VLD1LNq32Pseudo: + case ARM::VLD1LNq8Pseudo_UPD: + case ARM::VLD1LNq16Pseudo_UPD: + case ARM::VLD1LNq32Pseudo_UPD: + case ARM::VLD2LNd8Pseudo: + case ARM::VLD2LNd16Pseudo: + case ARM::VLD2LNd32Pseudo: + case ARM::VLD2LNq16Pseudo: + case ARM::VLD2LNq32Pseudo: + case ARM::VLD2LNd8Pseudo_UPD: + case ARM::VLD2LNd16Pseudo_UPD: + case ARM::VLD2LNd32Pseudo_UPD: + case ARM::VLD2LNq16Pseudo_UPD: + case ARM::VLD2LNq32Pseudo_UPD: + case ARM::VLD3LNd8Pseudo: + case ARM::VLD3LNd16Pseudo: + case ARM::VLD3LNd32Pseudo: + case ARM::VLD3LNq16Pseudo: + case ARM::VLD3LNq32Pseudo: + case ARM::VLD3LNd8Pseudo_UPD: + case ARM::VLD3LNd16Pseudo_UPD: + case ARM::VLD3LNd32Pseudo_UPD: + case ARM::VLD3LNq16Pseudo_UPD: + case ARM::VLD3LNq32Pseudo_UPD: + case ARM::VLD4LNd8Pseudo: + case ARM::VLD4LNd16Pseudo: + case ARM::VLD4LNd32Pseudo: + case ARM::VLD4LNq16Pseudo: + case ARM::VLD4LNq32Pseudo: + case ARM::VLD4LNd8Pseudo_UPD: + case ARM::VLD4LNd16Pseudo_UPD: + case ARM::VLD4LNd32Pseudo_UPD: + case ARM::VLD4LNq16Pseudo_UPD: + case ARM::VLD4LNq32Pseudo_UPD: + case ARM::VST1LNq8Pseudo: + case ARM::VST1LNq16Pseudo: + case ARM::VST1LNq32Pseudo: + case ARM::VST1LNq8Pseudo_UPD: + case ARM::VST1LNq16Pseudo_UPD: + case ARM::VST1LNq32Pseudo_UPD: + case ARM::VST2LNd8Pseudo: + case ARM::VST2LNd16Pseudo: + case ARM::VST2LNd32Pseudo: + case ARM::VST2LNq16Pseudo: + case ARM::VST2LNq32Pseudo: + case ARM::VST2LNd8Pseudo_UPD: + case ARM::VST2LNd16Pseudo_UPD: + case ARM::VST2LNd32Pseudo_UPD: + case ARM::VST2LNq16Pseudo_UPD: + case ARM::VST2LNq32Pseudo_UPD: + case ARM::VST3LNd8Pseudo: + case ARM::VST3LNd16Pseudo: + case ARM::VST3LNd32Pseudo: + case ARM::VST3LNq16Pseudo: + case ARM::VST3LNq32Pseudo: + case ARM::VST3LNd8Pseudo_UPD: + case ARM::VST3LNd16Pseudo_UPD: + case ARM::VST3LNd32Pseudo_UPD: + case ARM::VST3LNq16Pseudo_UPD: + case ARM::VST3LNq32Pseudo_UPD: + case ARM::VST4LNd8Pseudo: + case ARM::VST4LNd16Pseudo: + case ARM::VST4LNd32Pseudo: + case ARM::VST4LNq16Pseudo: + case ARM::VST4LNq32Pseudo: + case ARM::VST4LNd8Pseudo_UPD: + case ARM::VST4LNd16Pseudo_UPD: + case ARM::VST4LNd32Pseudo_UPD: + case ARM::VST4LNq16Pseudo_UPD: + case ARM::VST4LNq32Pseudo_UPD: + ExpandLaneOp(MBBI); + return true; + + case ARM::VTBL2Pseudo: ExpandVTBL(MBBI, ARM::VTBL2, false, 2); return true; + case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false, 3); return true; + case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false, 4); return true; + case ARM::VTBX2Pseudo: ExpandVTBL(MBBI, ARM::VTBX2, true, 2); return true; + case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true, 3); return true; + case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true, 4); return true; + } + + return false; +} + +bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = llvm::next(MBBI); + Modified |= ExpandMI(MBB, MBBI); + MBBI = NMBBI; + } + + return Modified; +} + +bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) { + const TargetMachine &TM = MF.getTarget(); + TII = static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo()); + TRI = TM.getRegisterInfo(); + STI = &TM.getSubtarget<ARMSubtarget>(); + AFI = MF.getInfo<ARMFunctionInfo>(); + + bool Modified = false; + for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E; + ++MFI) + Modified |= ExpandMBB(*MFI); + if (VerifyARMPseudo) + MF.verify(this, "After expanding ARM pseudo instructions."); + return Modified; +} + +/// createARMExpandPseudoPass - returns an instance of the pseudo instruction +/// expansion pass. +FunctionPass *llvm::createARMExpandPseudoPass() { + return new ARMExpandPseudo(); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp new file mode 100644 index 0000000..9bc7ef2 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -0,0 +1,2120 @@ +//===-- ARMFastISel.cpp - ARM FastISel implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the ARM-specific support for the FastISel class. Some +// of the target-specific code is generated by tablegen in the file +// ARMGenFastISel.inc, which is #included here. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMCallingConv.h" +#include "ARMRegisterInfo.h" +#include "ARMTargetMachine.h" +#include "ARMSubtarget.h" +#include "ARMConstantPoolValue.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CallingConv.h" +#include "llvm/DerivedTypes.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Module.h" +#include "llvm/Operator.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +static cl::opt<bool> +DisableARMFastISel("disable-arm-fast-isel", + cl::desc("Turn off experimental ARM fast-isel support"), + cl::init(false), cl::Hidden); + +extern cl::opt<bool> EnableARMLongCalls; + +namespace { + + // All possible address modes, plus some. + typedef struct Address { + enum { + RegBase, + FrameIndexBase + } BaseType; + + union { + unsigned Reg; + int FI; + } Base; + + int Offset; + + // Innocuous defaults for our address. + Address() + : BaseType(RegBase), Offset(0) { + Base.Reg = 0; + } + } Address; + +class ARMFastISel : public FastISel { + + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. + const ARMSubtarget *Subtarget; + const TargetMachine &TM; + const TargetInstrInfo &TII; + const TargetLowering &TLI; + ARMFunctionInfo *AFI; + + // Convenience variables to avoid some queries. + bool isThumb; + LLVMContext *Context; + + public: + explicit ARMFastISel(FunctionLoweringInfo &funcInfo) + : FastISel(funcInfo), + TM(funcInfo.MF->getTarget()), + TII(*TM.getInstrInfo()), + TLI(*TM.getTargetLowering()) { + Subtarget = &TM.getSubtarget<ARMSubtarget>(); + AFI = funcInfo.MF->getInfo<ARMFunctionInfo>(); + isThumb = AFI->isThumbFunction(); + Context = &funcInfo.Fn->getContext(); + } + + // Code from FastISel.cpp. + virtual unsigned FastEmitInst_(unsigned MachineInstOpcode, + const TargetRegisterClass *RC); + virtual unsigned FastEmitInst_r(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill); + virtual unsigned FastEmitInst_rr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill); + virtual unsigned FastEmitInst_rrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + unsigned Op2, bool Op2IsKill); + virtual unsigned FastEmitInst_ri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + uint64_t Imm); + virtual unsigned FastEmitInst_rf(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + const ConstantFP *FPImm); + virtual unsigned FastEmitInst_rri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + uint64_t Imm); + virtual unsigned FastEmitInst_i(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm); + virtual unsigned FastEmitInst_ii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm1, uint64_t Imm2); + + virtual unsigned FastEmitInst_extractsubreg(MVT RetVT, + unsigned Op0, bool Op0IsKill, + uint32_t Idx); + + // Backend specific FastISel code. + virtual bool TargetSelectInstruction(const Instruction *I); + virtual unsigned TargetMaterializeConstant(const Constant *C); + virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI); + + #include "ARMGenFastISel.inc" + + // Instruction selection routines. + private: + bool SelectLoad(const Instruction *I); + bool SelectStore(const Instruction *I); + bool SelectBranch(const Instruction *I); + bool SelectCmp(const Instruction *I); + bool SelectFPExt(const Instruction *I); + bool SelectFPTrunc(const Instruction *I); + bool SelectBinaryOp(const Instruction *I, unsigned ISDOpcode); + bool SelectSIToFP(const Instruction *I); + bool SelectFPToSI(const Instruction *I); + bool SelectSDiv(const Instruction *I); + bool SelectSRem(const Instruction *I); + bool SelectCall(const Instruction *I); + bool SelectSelect(const Instruction *I); + bool SelectRet(const Instruction *I); + bool SelectIntCast(const Instruction *I); + + // Utility routines. + private: + bool isTypeLegal(Type *Ty, MVT &VT); + bool isLoadTypeLegal(Type *Ty, MVT &VT); + bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr); + bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr); + bool ARMComputeAddress(const Value *Obj, Address &Addr); + void ARMSimplifyAddress(Address &Addr, EVT VT); + unsigned ARMMaterializeFP(const ConstantFP *CFP, EVT VT); + unsigned ARMMaterializeInt(const Constant *C, EVT VT); + unsigned ARMMaterializeGV(const GlobalValue *GV, EVT VT); + unsigned ARMMoveToFPReg(EVT VT, unsigned SrcReg); + unsigned ARMMoveToIntReg(EVT VT, unsigned SrcReg); + unsigned ARMSelectCallOp(const GlobalValue *GV); + + // Call handling routines. + private: + bool FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, + unsigned &ResultReg); + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool Return); + bool ProcessCallArgs(SmallVectorImpl<Value*> &Args, + SmallVectorImpl<unsigned> &ArgRegs, + SmallVectorImpl<MVT> &ArgVTs, + SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, + SmallVectorImpl<unsigned> &RegArgs, + CallingConv::ID CC, + unsigned &NumBytes); + bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, + const Instruction *I, CallingConv::ID CC, + unsigned &NumBytes); + bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call); + + // OptionalDef handling routines. + private: + bool isARMNEONPred(const MachineInstr *MI); + bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR); + const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB); + void AddLoadStoreOperands(EVT VT, Address &Addr, + const MachineInstrBuilder &MIB, + unsigned Flags); +}; + +} // end anonymous namespace + +#include "ARMGenCallingConv.inc" + +// DefinesOptionalPredicate - This is different from DefinesPredicate in that +// we don't care about implicit defs here, just places we'll need to add a +// default CCReg argument. Sets CPSR if we're setting CPSR instead of CCR. +bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { + const MCInstrDesc &MCID = MI->getDesc(); + if (!MCID.hasOptionalDef()) + return false; + + // Look to see if our OptionalDef is defining CPSR or CCR. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) continue; + if (MO.getReg() == ARM::CPSR) + *CPSR = true; + } + return true; +} + +bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) { + const MCInstrDesc &MCID = MI->getDesc(); + + // If we're a thumb2 or not NEON function we were handled via isPredicable. + if ((MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON || + AFI->isThumb2Function()) + return false; + + for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) + if (MCID.OpInfo[i].isPredicate()) + return true; + + return false; +} + +// If the machine is predicable go ahead and add the predicate operands, if +// it needs default CC operands add those. +// TODO: If we want to support thumb1 then we'll need to deal with optional +// CPSR defs that need to be added before the remaining operands. See s_cc_out +// for descriptions why. +const MachineInstrBuilder & +ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { + MachineInstr *MI = &*MIB; + + // Do we use a predicate? or... + // Are we NEON in ARM mode and have a predicate operand? If so, I know + // we're not predicable but add it anyways. + if (TII.isPredicable(MI) || isARMNEONPred(MI)) + AddDefaultPred(MIB); + + // Do we optionally set a predicate? Preds is size > 0 iff the predicate + // defines CPSR. All other OptionalDefines in ARM are the CCR register. + bool CPSR = false; + if (DefinesOptionalPredicate(MI, &CPSR)) { + if (CPSR) + AddDefaultT1CC(MIB); + else + AddDefaultCC(MIB); + } + return MIB; +} + +unsigned ARMFastISel::FastEmitInst_(unsigned MachineInstOpcode, + const TargetRegisterClass* RC) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)); + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_rr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_rrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + unsigned Op2, bool Op2IsKill) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addReg(Op2, Op2IsKill * RegState::Kill)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addReg(Op2, Op2IsKill * RegState::Kill)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_ri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + uint64_t Imm) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addImm(Imm)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addImm(Imm)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_rf(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + const ConstantFP *FPImm) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addFPImm(FPImm)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addFPImm(FPImm)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_rri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + uint64_t Imm) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addImm(Imm)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addImm(Imm)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_i(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addImm(Imm)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addImm(Imm)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_ii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm1, uint64_t Imm2) { + unsigned ResultReg = createResultReg(RC); + const MCInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addImm(Imm1).addImm(Imm2)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addImm(Imm1).addImm(Imm2)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), + ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + +unsigned ARMFastISel::FastEmitInst_extractsubreg(MVT RetVT, + unsigned Op0, bool Op0IsKill, + uint32_t Idx) { + unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT)); + assert(TargetRegisterInfo::isVirtualRegister(Op0) && + "Cannot yet extract from physregs"); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(TargetOpcode::COPY), ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill), Idx)); + return ResultReg; +} + +// TODO: Don't worry about 64-bit now, but when this is fixed remove the +// checks from the various callers. +unsigned ARMFastISel::ARMMoveToFPReg(EVT VT, unsigned SrcReg) { + if (VT == MVT::f64) return 0; + + unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VMOVRS), MoveReg) + .addReg(SrcReg)); + return MoveReg; +} + +unsigned ARMFastISel::ARMMoveToIntReg(EVT VT, unsigned SrcReg) { + if (VT == MVT::i64) return 0; + + unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VMOVSR), MoveReg) + .addReg(SrcReg)); + return MoveReg; +} + +// For double width floating point we need to materialize two constants +// (the high and the low) into integer registers then use a move to get +// the combined constant into an FP reg. +unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, EVT VT) { + const APFloat Val = CFP->getValueAPF(); + bool is64bit = VT == MVT::f64; + + // This checks to see if we can use VFP3 instructions to materialize + // a constant, otherwise we have to go through the constant pool. + if (TLI.isFPImmLegal(Val, VT)) { + int Imm; + unsigned Opc; + if (is64bit) { + Imm = ARM_AM::getFP64Imm(Val); + Opc = ARM::FCONSTD; + } else { + Imm = ARM_AM::getFP32Imm(Val); + Opc = ARM::FCONSTS; + } + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), + DestReg) + .addImm(Imm)); + return DestReg; + } + + // Require VFP2 for loading fp constants. + if (!Subtarget->hasVFP2()) return false; + + // MachineConstantPool wants an explicit alignment. + unsigned Align = TD.getPrefTypeAlignment(CFP->getType()); + if (Align == 0) { + // TODO: Figure out if this is correct. + Align = TD.getTypeAllocSize(CFP->getType()); + } + unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align); + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS; + + // The extra reg is for addrmode5. + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), + DestReg) + .addConstantPoolIndex(Idx) + .addReg(0)); + return DestReg; +} + +unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, EVT VT) { + + // For now 32-bit only. + if (VT != MVT::i32) return false; + + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + + // If we can do this in a single instruction without a constant pool entry + // do so now. + const ConstantInt *CI = cast<ConstantInt>(C); + if (Subtarget->hasV6T2Ops() && isUInt<16>(CI->getSExtValue())) { + unsigned Opc = isThumb ? ARM::t2MOVi16 : ARM::MOVi16; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), DestReg) + .addImm(CI->getSExtValue())); + return DestReg; + } + + // MachineConstantPool wants an explicit alignment. + unsigned Align = TD.getPrefTypeAlignment(C->getType()); + if (Align == 0) { + // TODO: Figure out if this is correct. + Align = TD.getTypeAllocSize(C->getType()); + } + unsigned Idx = MCP.getConstantPoolIndex(C, Align); + + if (isThumb) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::t2LDRpci), DestReg) + .addConstantPoolIndex(Idx)); + else + // The extra immediate is for addrmode2. + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::LDRcp), DestReg) + .addConstantPoolIndex(Idx) + .addImm(0)); + + return DestReg; +} + +unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) { + // For now 32-bit only. + if (VT != MVT::i32) return 0; + + Reloc::Model RelocM = TM.getRelocationModel(); + + // TODO: Need more magic for ARM PIC. + if (!isThumb && (RelocM == Reloc::PIC_)) return 0; + + // MachineConstantPool wants an explicit alignment. + unsigned Align = TD.getPrefTypeAlignment(GV->getType()); + if (Align == 0) { + // TODO: Figure out if this is correct. + Align = TD.getTypeAllocSize(GV->getType()); + } + + // Grab index. + unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb() ? 4 : 8); + unsigned Id = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id, + ARMCP::CPValue, + PCAdj); + unsigned Idx = MCP.getConstantPoolIndex(CPV, Align); + + // Load value. + MachineInstrBuilder MIB; + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + if (isThumb) { + unsigned Opc = (RelocM != Reloc::PIC_) ? ARM::t2LDRpci : ARM::t2LDRpci_pic; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg) + .addConstantPoolIndex(Idx); + if (RelocM == Reloc::PIC_) + MIB.addImm(Id); + } else { + // The extra immediate is for addrmode2. + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp), + DestReg) + .addConstantPoolIndex(Idx) + .addImm(0); + } + AddOptionalDefs(MIB); + + if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) { + unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); + if (isThumb) + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::t2LDRi12), NewDestReg) + .addReg(DestReg) + .addImm(0); + else + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRi12), + NewDestReg) + .addReg(DestReg) + .addImm(0); + DestReg = NewDestReg; + AddOptionalDefs(MIB); + } + + return DestReg; +} + +unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) { + EVT VT = TLI.getValueType(C->getType(), true); + + // Only handle simple types. + if (!VT.isSimple()) return 0; + + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) + return ARMMaterializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + return ARMMaterializeGV(GV, VT); + else if (isa<ConstantInt>(C)) + return ARMMaterializeInt(C, VT); + + return 0; +} + +unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) { + // Don't handle dynamic allocas. + if (!FuncInfo.StaticAllocaMap.count(AI)) return 0; + + MVT VT; + if (!isLoadTypeLegal(AI->getType(), VT)) return false; + + DenseMap<const AllocaInst*, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + + // This will get lowered later into the correct offsets and registers + // via rewriteXFrameIndex. + if (SI != FuncInfo.StaticAllocaMap.end()) { + TargetRegisterClass* RC = TLI.getRegClassFor(VT); + unsigned ResultReg = createResultReg(RC); + unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, *FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg) + .addFrameIndex(SI->second) + .addImm(0)); + return ResultReg; + } + + return 0; +} + +bool ARMFastISel::isTypeLegal(Type *Ty, MVT &VT) { + EVT evt = TLI.getValueType(Ty, true); + + // Only handle simple types. + if (evt == MVT::Other || !evt.isSimple()) return false; + VT = evt.getSimpleVT(); + + // Handle all legal types, i.e. a register that will directly hold this + // value. + return TLI.isTypeLegal(VT); +} + +bool ARMFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) { + if (isTypeLegal(Ty, VT)) return true; + + // If this is a type than can be sign or zero-extended to a basic operation + // go ahead and accept it now. + if (VT == MVT::i8 || VT == MVT::i16) + return true; + + return false; +} + +// Computes the address to get to an object. +bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { + // Some boilerplate from the X86 FastISel. + const User *U = NULL; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(Obj)) { + // Don't walk into other basic blocks unless the object is an alloca from + // another block, otherwise it may not have a virtual register assigned. + if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) { + Opcode = C->getOpcode(); + U = C; + } + + if (PointerType *Ty = dyn_cast<PointerType>(Obj->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + switch (Opcode) { + default: + break; + case Instruction::BitCast: { + // Look through bitcasts. + return ARMComputeAddress(U->getOperand(0), Addr); + } + case Instruction::IntToPtr: { + // Look past no-op inttoptrs. + if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + return ARMComputeAddress(U->getOperand(0), Addr); + break; + } + case Instruction::PtrToInt: { + // Look past no-op ptrtoints. + if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + return ARMComputeAddress(U->getOperand(0), Addr); + break; + } + case Instruction::GetElementPtr: { + Address SavedAddr = Addr; + int TmpOffset = Addr.Offset; + + // Iterate through the GEP folding the constants into offsets where + // we can. + gep_type_iterator GTI = gep_type_begin(U); + for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); + i != e; ++i, ++GTI) { + const Value *Op = *i; + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = TD.getStructLayout(STy); + unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); + TmpOffset += SL->getElementOffset(Idx); + } else { + uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + TmpOffset += CI->getSExtValue() * S; + break; + } + if (isa<AddOperator>(Op) && + (!isa<Instruction>(Op) || + FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()] + == FuncInfo.MBB) && + isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { + // An add (in the same block) with a constant operand. Fold the + // constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + TmpOffset += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + // Unsupported + goto unsupported_gep; + } + } + } + + // Try to grab the base operand now. + Addr.Offset = TmpOffset; + if (ARMComputeAddress(U->getOperand(0), Addr)) return true; + + // We failed, restore everything and try the other options. + Addr = SavedAddr; + + unsupported_gep: + break; + } + case Instruction::Alloca: { + const AllocaInst *AI = cast<AllocaInst>(Obj); + DenseMap<const AllocaInst*, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + if (SI != FuncInfo.StaticAllocaMap.end()) { + Addr.BaseType = Address::FrameIndexBase; + Addr.Base.FI = SI->second; + return true; + } + break; + } + } + + // Materialize the global variable's address into a reg which can + // then be used later to load the variable. + if (const GlobalValue *GV = dyn_cast<GlobalValue>(Obj)) { + unsigned Tmp = ARMMaterializeGV(GV, TLI.getValueType(Obj->getType())); + if (Tmp == 0) return false; + + Addr.Base.Reg = Tmp; + return true; + } + + // Try to get this in a register if nothing else has worked. + if (Addr.Base.Reg == 0) Addr.Base.Reg = getRegForValue(Obj); + return Addr.Base.Reg != 0; +} + +void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT) { + + assert(VT.isSimple() && "Non-simple types are invalid here!"); + + bool needsLowering = false; + switch (VT.getSimpleVT().SimpleTy) { + default: + assert(false && "Unhandled load/store type!"); + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + // Integer loads/stores handle 12-bit offsets. + needsLowering = ((Addr.Offset & 0xfff) != Addr.Offset); + break; + case MVT::f32: + case MVT::f64: + // Floating point operands handle 8-bit offsets. + needsLowering = ((Addr.Offset & 0xff) != Addr.Offset); + break; + } + + // If this is a stack pointer and the offset needs to be simplified then + // put the alloca address into a register, set the base type back to + // register and continue. This should almost never happen. + if (needsLowering && Addr.BaseType == Address::FrameIndexBase) { + TargetRegisterClass *RC = isThumb ? ARM::tGPRRegisterClass : + ARM::GPRRegisterClass; + unsigned ResultReg = createResultReg(RC); + unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, *FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg) + .addFrameIndex(Addr.Base.FI) + .addImm(0)); + Addr.Base.Reg = ResultReg; + Addr.BaseType = Address::RegBase; + } + + // Since the offset is too large for the load/store instruction + // get the reg+offset into a register. + if (needsLowering) { + Addr.Base.Reg = FastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg, + /*Op0IsKill*/false, Addr.Offset, MVT::i32); + Addr.Offset = 0; + } +} + +void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, + const MachineInstrBuilder &MIB, + unsigned Flags) { + // addrmode5 output depends on the selection dag addressing dividing the + // offset by 4 that it then later multiplies. Do this here as well. + if (VT.getSimpleVT().SimpleTy == MVT::f32 || + VT.getSimpleVT().SimpleTy == MVT::f64) + Addr.Offset /= 4; + + // Frame base works a bit differently. Handle it separately. + if (Addr.BaseType == Address::FrameIndexBase) { + int FI = Addr.Base.FI; + int Offset = Addr.Offset; + MachineMemOperand *MMO = + FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(FI, Offset), + Flags, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + // Now add the rest of the operands. + MIB.addFrameIndex(FI); + + // ARM halfword load/stores need an additional operand. + if (!isThumb && VT.getSimpleVT().SimpleTy == MVT::i16) MIB.addReg(0); + + MIB.addImm(Addr.Offset); + MIB.addMemOperand(MMO); + } else { + // Now add the rest of the operands. + MIB.addReg(Addr.Base.Reg); + + // ARM halfword load/stores need an additional operand. + if (!isThumb && VT.getSimpleVT().SimpleTy == MVT::i16) MIB.addReg(0); + + MIB.addImm(Addr.Offset); + } + AddOptionalDefs(MIB); +} + +bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr) { + + assert(VT.isSimple() && "Non-simple types are invalid here!"); + unsigned Opc; + TargetRegisterClass *RC; + switch (VT.getSimpleVT().SimpleTy) { + // This is mostly going to be Neon/vector support. + default: return false; + case MVT::i16: + Opc = isThumb ? ARM::t2LDRHi12 : ARM::LDRH; + RC = ARM::GPRRegisterClass; + break; + case MVT::i8: + Opc = isThumb ? ARM::t2LDRBi12 : ARM::LDRBi12; + RC = ARM::GPRRegisterClass; + break; + case MVT::i32: + Opc = isThumb ? ARM::t2LDRi12 : ARM::LDRi12; + RC = ARM::GPRRegisterClass; + break; + case MVT::f32: + Opc = ARM::VLDRS; + RC = TLI.getRegClassFor(VT); + break; + case MVT::f64: + Opc = ARM::VLDRD; + RC = TLI.getRegClassFor(VT); + break; + } + // Simplify this down to something we can handle. + ARMSimplifyAddress(Addr, VT); + + // Create the base instruction, then add the operands. + ResultReg = createResultReg(RC); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg); + AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad); + return true; +} + +bool ARMFastISel::SelectLoad(const Instruction *I) { + // Atomic loads need special handling. + if (cast<LoadInst>(I)->isAtomic()) + return false; + + // Verify we have a legal type before going any further. + MVT VT; + if (!isLoadTypeLegal(I->getType(), VT)) + return false; + + // See if we can handle this address. + Address Addr; + if (!ARMComputeAddress(I->getOperand(0), Addr)) return false; + + unsigned ResultReg; + if (!ARMEmitLoad(VT, ResultReg, Addr)) return false; + UpdateValueMap(I, ResultReg); + return true; +} + +bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr) { + unsigned StrOpc; + switch (VT.getSimpleVT().SimpleTy) { + // This is mostly going to be Neon/vector support. + default: return false; + case MVT::i1: { + unsigned Res = createResultReg(isThumb ? ARM::tGPRRegisterClass : + ARM::GPRRegisterClass); + unsigned Opc = isThumb ? ARM::t2ANDri : ARM::ANDri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), Res) + .addReg(SrcReg).addImm(1)); + SrcReg = Res; + } // Fallthrough here. + case MVT::i8: + StrOpc = isThumb ? ARM::t2STRBi12 : ARM::STRBi12; + break; + case MVT::i16: + StrOpc = isThumb ? ARM::t2STRHi12 : ARM::STRH; + break; + case MVT::i32: + StrOpc = isThumb ? ARM::t2STRi12 : ARM::STRi12; + break; + case MVT::f32: + if (!Subtarget->hasVFP2()) return false; + StrOpc = ARM::VSTRS; + break; + case MVT::f64: + if (!Subtarget->hasVFP2()) return false; + StrOpc = ARM::VSTRD; + break; + } + // Simplify this down to something we can handle. + ARMSimplifyAddress(Addr, VT); + + // Create the base instruction, then add the operands. + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(StrOpc)) + .addReg(SrcReg, getKillRegState(true)); + AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOStore); + return true; +} + +bool ARMFastISel::SelectStore(const Instruction *I) { + Value *Op0 = I->getOperand(0); + unsigned SrcReg = 0; + + // Atomic stores need special handling. + if (cast<StoreInst>(I)->isAtomic()) + return false; + + // Verify we have a legal type before going any further. + MVT VT; + if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT)) + return false; + + // Get the value to be stored into a register. + SrcReg = getRegForValue(Op0); + if (SrcReg == 0) return false; + + // See if we can handle this address. + Address Addr; + if (!ARMComputeAddress(I->getOperand(1), Addr)) + return false; + + if (!ARMEmitStore(VT, SrcReg, Addr)) return false; + return true; +} + +static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) { + switch (Pred) { + // Needs two compares... + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UEQ: + default: + // AL is our "false" for now. The other two need more compares. + return ARMCC::AL; + case CmpInst::ICMP_EQ: + case CmpInst::FCMP_OEQ: + return ARMCC::EQ; + case CmpInst::ICMP_SGT: + case CmpInst::FCMP_OGT: + return ARMCC::GT; + case CmpInst::ICMP_SGE: + case CmpInst::FCMP_OGE: + return ARMCC::GE; + case CmpInst::ICMP_UGT: + case CmpInst::FCMP_UGT: + return ARMCC::HI; + case CmpInst::FCMP_OLT: + return ARMCC::MI; + case CmpInst::ICMP_ULE: + case CmpInst::FCMP_OLE: + return ARMCC::LS; + case CmpInst::FCMP_ORD: + return ARMCC::VC; + case CmpInst::FCMP_UNO: + return ARMCC::VS; + case CmpInst::FCMP_UGE: + return ARMCC::PL; + case CmpInst::ICMP_SLT: + case CmpInst::FCMP_ULT: + return ARMCC::LT; + case CmpInst::ICMP_SLE: + case CmpInst::FCMP_ULE: + return ARMCC::LE; + case CmpInst::FCMP_UNE: + case CmpInst::ICMP_NE: + return ARMCC::NE; + case CmpInst::ICMP_UGE: + return ARMCC::HS; + case CmpInst::ICMP_ULT: + return ARMCC::LO; + } +} + +bool ARMFastISel::SelectBranch(const Instruction *I) { + const BranchInst *BI = cast<BranchInst>(I); + MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + // Simple branch support. + + // If we can, avoid recomputing the compare - redoing it could lead to wonky + // behavior. + // TODO: Factor this out. + if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { + MVT SourceVT; + Type *Ty = CI->getOperand(0)->getType(); + if (CI->hasOneUse() && (CI->getParent() == I->getParent()) + && isTypeLegal(Ty, SourceVT)) { + bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy()); + if (isFloat && !Subtarget->hasVFP2()) + return false; + + unsigned CmpOpc; + switch (SourceVT.SimpleTy) { + default: return false; + // TODO: Verify compares. + case MVT::f32: + CmpOpc = ARM::VCMPES; + break; + case MVT::f64: + CmpOpc = ARM::VCMPED; + break; + case MVT::i32: + CmpOpc = isThumb ? ARM::t2CMPrr : ARM::CMPrr; + break; + } + + // Get the compare predicate. + // Try to take advantage of fallthrough opportunities. + CmpInst::Predicate Predicate = CI->getPredicate(); + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + ARMCC::CondCodes ARMPred = getComparePred(Predicate); + + // We may not handle every CC for now. + if (ARMPred == ARMCC::AL) return false; + + unsigned Arg1 = getRegForValue(CI->getOperand(0)); + if (Arg1 == 0) return false; + + unsigned Arg2 = getRegForValue(CI->getOperand(1)); + if (Arg2 == 0) return false; + + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(CmpOpc)) + .addReg(Arg1).addReg(Arg2)); + + // For floating point we need to move the result to a comparison register + // that we can then use for branches. + if (isFloat) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::FMSTAT))); + + unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc)) + .addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR); + FastEmitBranch(FBB, DL); + FuncInfo.MBB->addSuccessor(TBB); + return true; + } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) { + unsigned TstOpc = isThumb ? ARM::t2TSTri : ARM::TSTri; + unsigned OpReg = getRegForValue(TI->getOperand(0)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TstOpc)) + .addReg(OpReg).addImm(1)); + + unsigned CCMode = ARMCC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CCMode = ARMCC::EQ; + } + + unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc)) + .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); + + FastEmitBranch(FBB, DL); + FuncInfo.MBB->addSuccessor(TBB); + return true; + } + } + + unsigned CmpReg = getRegForValue(BI->getCondition()); + if (CmpReg == 0) return false; + + // We've been divorced from our compare! Our block was split, and + // now our compare lives in a predecessor block. We musn't + // re-compare here, as the children of the compare aren't guaranteed + // live across the block boundary (we *could* check for this). + // Regardless, the compare has been done in the predecessor block, + // and it left a value for us in a virtual register. Ergo, we test + // the one-bit value left in the virtual register. + unsigned TstOpc = isThumb ? ARM::t2TSTri : ARM::TSTri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TstOpc)) + .addReg(CmpReg).addImm(1)); + + unsigned CCMode = ARMCC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CCMode = ARMCC::EQ; + } + + unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc)) + .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); + FastEmitBranch(FBB, DL); + FuncInfo.MBB->addSuccessor(TBB); + return true; +} + +bool ARMFastISel::SelectCmp(const Instruction *I) { + const CmpInst *CI = cast<CmpInst>(I); + + MVT VT; + Type *Ty = CI->getOperand(0)->getType(); + if (!isTypeLegal(Ty, VT)) + return false; + + bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy()); + if (isFloat && !Subtarget->hasVFP2()) + return false; + + unsigned CmpOpc; + unsigned CondReg; + switch (VT.SimpleTy) { + default: return false; + // TODO: Verify compares. + case MVT::f32: + CmpOpc = ARM::VCMPES; + CondReg = ARM::FPSCR; + break; + case MVT::f64: + CmpOpc = ARM::VCMPED; + CondReg = ARM::FPSCR; + break; + case MVT::i32: + CmpOpc = isThumb ? ARM::t2CMPrr : ARM::CMPrr; + CondReg = ARM::CPSR; + break; + } + + // Get the compare predicate. + ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate()); + + // We may not handle every CC for now. + if (ARMPred == ARMCC::AL) return false; + + unsigned Arg1 = getRegForValue(CI->getOperand(0)); + if (Arg1 == 0) return false; + + unsigned Arg2 = getRegForValue(CI->getOperand(1)); + if (Arg2 == 0) return false; + + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc)) + .addReg(Arg1).addReg(Arg2)); + + // For floating point we need to move the result to a comparison register + // that we can then use for branches. + if (isFloat) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::FMSTAT))); + + // Now set a register based on the comparison. Explicitly set the predicates + // here. + unsigned MovCCOpc = isThumb ? ARM::t2MOVCCi : ARM::MOVCCi; + TargetRegisterClass *RC = isThumb ? ARM::rGPRRegisterClass + : ARM::GPRRegisterClass; + unsigned DestReg = createResultReg(RC); + Constant *Zero + = ConstantInt::get(Type::getInt32Ty(*Context), 0); + unsigned ZeroReg = TargetMaterializeConstant(Zero); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), DestReg) + .addReg(ZeroReg).addImm(1) + .addImm(ARMPred).addReg(CondReg); + + UpdateValueMap(I, DestReg); + return true; +} + +bool ARMFastISel::SelectFPExt(const Instruction *I) { + // Make sure we have VFP and that we're extending float to double. + if (!Subtarget->hasVFP2()) return false; + + Value *V = I->getOperand(0); + if (!I->getType()->isDoubleTy() || + !V->getType()->isFloatTy()) return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) return false; + + unsigned Result = createResultReg(ARM::DPRRegisterClass); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VCVTDS), Result) + .addReg(Op)); + UpdateValueMap(I, Result); + return true; +} + +bool ARMFastISel::SelectFPTrunc(const Instruction *I) { + // Make sure we have VFP and that we're truncating double to float. + if (!Subtarget->hasVFP2()) return false; + + Value *V = I->getOperand(0); + if (!(I->getType()->isFloatTy() && + V->getType()->isDoubleTy())) return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) return false; + + unsigned Result = createResultReg(ARM::SPRRegisterClass); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VCVTSD), Result) + .addReg(Op)); + UpdateValueMap(I, Result); + return true; +} + +bool ARMFastISel::SelectSIToFP(const Instruction *I) { + // Make sure we have VFP. + if (!Subtarget->hasVFP2()) return false; + + MVT DstVT; + Type *Ty = I->getType(); + if (!isTypeLegal(Ty, DstVT)) + return false; + + // FIXME: Handle sign-extension where necessary. + if (!I->getOperand(0)->getType()->isIntegerTy(32)) + return false; + + unsigned Op = getRegForValue(I->getOperand(0)); + if (Op == 0) return false; + + // The conversion routine works on fp-reg to fp-reg and the operand above + // was an integer, move it to the fp registers if possible. + unsigned FP = ARMMoveToFPReg(MVT::f32, Op); + if (FP == 0) return false; + + unsigned Opc; + if (Ty->isFloatTy()) Opc = ARM::VSITOS; + else if (Ty->isDoubleTy()) Opc = ARM::VSITOD; + else return false; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), + ResultReg) + .addReg(FP)); + UpdateValueMap(I, ResultReg); + return true; +} + +bool ARMFastISel::SelectFPToSI(const Instruction *I) { + // Make sure we have VFP. + if (!Subtarget->hasVFP2()) return false; + + MVT DstVT; + Type *RetTy = I->getType(); + if (!isTypeLegal(RetTy, DstVT)) + return false; + + unsigned Op = getRegForValue(I->getOperand(0)); + if (Op == 0) return false; + + unsigned Opc; + Type *OpTy = I->getOperand(0)->getType(); + if (OpTy->isFloatTy()) Opc = ARM::VTOSIZS; + else if (OpTy->isDoubleTy()) Opc = ARM::VTOSIZD; + else return false; + + // f64->s32 or f32->s32 both need an intermediate f32 reg. + unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), + ResultReg) + .addReg(Op)); + + // This result needs to be in an integer register, but the conversion only + // takes place in fp-regs. + unsigned IntReg = ARMMoveToIntReg(DstVT, ResultReg); + if (IntReg == 0) return false; + + UpdateValueMap(I, IntReg); + return true; +} + +bool ARMFastISel::SelectSelect(const Instruction *I) { + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + // Things need to be register sized for register moves. + if (VT != MVT::i32) return false; + const TargetRegisterClass *RC = TLI.getRegClassFor(VT); + + unsigned CondReg = getRegForValue(I->getOperand(0)); + if (CondReg == 0) return false; + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) return false; + unsigned Op2Reg = getRegForValue(I->getOperand(2)); + if (Op2Reg == 0) return false; + + unsigned CmpOpc = isThumb ? ARM::t2TSTri : ARM::TSTri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc)) + .addReg(CondReg).addImm(1)); + unsigned ResultReg = createResultReg(RC); + unsigned MovCCOpc = isThumb ? ARM::t2MOVCCr : ARM::MOVCCr; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg) + .addReg(Op1Reg).addReg(Op2Reg) + .addImm(ARMCC::EQ).addReg(ARM::CPSR); + UpdateValueMap(I, ResultReg); + return true; +} + +bool ARMFastISel::SelectSDiv(const Instruction *I) { + MVT VT; + Type *Ty = I->getType(); + if (!isTypeLegal(Ty, VT)) + return false; + + // If we have integer div support we should have selected this automagically. + // In case we have a real miss go ahead and return false and we'll pick + // it up later. + if (Subtarget->hasDivide()) return false; + + // Otherwise emit a libcall. + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + if (VT == MVT::i8) + LC = RTLIB::SDIV_I8; + else if (VT == MVT::i16) + LC = RTLIB::SDIV_I16; + else if (VT == MVT::i32) + LC = RTLIB::SDIV_I32; + else if (VT == MVT::i64) + LC = RTLIB::SDIV_I64; + else if (VT == MVT::i128) + LC = RTLIB::SDIV_I128; + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!"); + + return ARMEmitLibcall(I, LC); +} + +bool ARMFastISel::SelectSRem(const Instruction *I) { + MVT VT; + Type *Ty = I->getType(); + if (!isTypeLegal(Ty, VT)) + return false; + + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + if (VT == MVT::i8) + LC = RTLIB::SREM_I8; + else if (VT == MVT::i16) + LC = RTLIB::SREM_I16; + else if (VT == MVT::i32) + LC = RTLIB::SREM_I32; + else if (VT == MVT::i64) + LC = RTLIB::SREM_I64; + else if (VT == MVT::i128) + LC = RTLIB::SREM_I128; + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!"); + + return ARMEmitLibcall(I, LC); +} + +bool ARMFastISel::SelectBinaryOp(const Instruction *I, unsigned ISDOpcode) { + EVT VT = TLI.getValueType(I->getType(), true); + + // We can get here in the case when we want to use NEON for our fp + // operations, but can't figure out how to. Just use the vfp instructions + // if we have them. + // FIXME: It'd be nice to use NEON instructions. + Type *Ty = I->getType(); + bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy()); + if (isFloat && !Subtarget->hasVFP2()) + return false; + + unsigned Op1 = getRegForValue(I->getOperand(0)); + if (Op1 == 0) return false; + + unsigned Op2 = getRegForValue(I->getOperand(1)); + if (Op2 == 0) return false; + + unsigned Opc; + bool is64bit = VT == MVT::f64 || VT == MVT::i64; + switch (ISDOpcode) { + default: return false; + case ISD::FADD: + Opc = is64bit ? ARM::VADDD : ARM::VADDS; + break; + case ISD::FSUB: + Opc = is64bit ? ARM::VSUBD : ARM::VSUBS; + break; + case ISD::FMUL: + Opc = is64bit ? ARM::VMULD : ARM::VMULS; + break; + } + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg) + .addReg(Op1).addReg(Op2)); + UpdateValueMap(I, ResultReg); + return true; +} + +// Call Handling Code + +bool ARMFastISel::FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, + EVT SrcVT, unsigned &ResultReg) { + unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, + Src, /*TODO: Kill=*/false); + + if (RR != 0) { + ResultReg = RR; + return true; + } else + return false; +} + +// This is largely taken directly from CCAssignFnForNode - we don't support +// varargs in FastISel so that part has been removed. +// TODO: We may not support all of this. +CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, bool Return) { + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention"); + case CallingConv::Fast: + // Ignore fastcc. Silence compiler warnings. + (void)RetFastCC_ARM_APCS; + (void)FastCC_ARM_APCS; + // Fallthrough + case CallingConv::C: + // Use target triple & subtarget features to do actual dispatch. + if (Subtarget->isAAPCS_ABI()) { + if (Subtarget->hasVFP2() && + FloatABIType == FloatABI::Hard) + return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + else + return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); + } else + return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + case CallingConv::ARM_AAPCS_VFP: + return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + case CallingConv::ARM_AAPCS: + return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); + case CallingConv::ARM_APCS: + return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + } +} + +bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, + SmallVectorImpl<unsigned> &ArgRegs, + SmallVectorImpl<MVT> &ArgVTs, + SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, + SmallVectorImpl<unsigned> &RegArgs, + CallingConv::ID CC, + unsigned &NumBytes) { + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC, false)); + + // Get a count of how many bytes are to be pushed on the stack. + NumBytes = CCInfo.getNextStackOffset(); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(AdjStackDown)) + .addImm(NumBytes)); + + // Process the args. + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + unsigned Arg = ArgRegs[VA.getValNo()]; + MVT ArgVT = ArgVTs[VA.getValNo()]; + + // We don't handle NEON/vector parameters yet. + if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64) + return false; + + // Handle arg promotion, etc. + switch (VA.getLocInfo()) { + case CCValAssign::Full: break; + case CCValAssign::SExt: { + bool Emitted = FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + assert(Emitted && "Failed to emit a sext!"); (void)Emitted; + Emitted = true; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::ZExt: { + bool Emitted = FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + assert(Emitted && "Failed to emit a zext!"); (void)Emitted; + Emitted = true; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::AExt: { + bool Emitted = FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + if (!Emitted) + Emitted = FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + if (!Emitted) + Emitted = FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + + assert(Emitted && "Failed to emit a aext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::BCvt: { + unsigned BC = FastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, Arg, + /*TODO: Kill=*/false); + assert(BC != 0 && "Failed to emit a bitcast!"); + Arg = BC; + ArgVT = VA.getLocVT(); + break; + } + default: llvm_unreachable("Unknown arg promotion!"); + } + + // Now copy/store arg to correct locations. + if (VA.isRegLoc() && !VA.needsCustom()) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + VA.getLocReg()) + .addReg(Arg); + RegArgs.push_back(VA.getLocReg()); + } else if (VA.needsCustom()) { + // TODO: We need custom lowering for vector (v2f64) args. + if (VA.getLocVT() != MVT::f64) return false; + + CCValAssign &NextVA = ArgLocs[++i]; + + // TODO: Only handle register args for now. + if(!(VA.isRegLoc() && NextVA.isRegLoc())) return false; + + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VMOVRRD), VA.getLocReg()) + .addReg(NextVA.getLocReg(), RegState::Define) + .addReg(Arg)); + RegArgs.push_back(VA.getLocReg()); + RegArgs.push_back(NextVA.getLocReg()); + } else { + assert(VA.isMemLoc()); + // Need to store on the stack. + Address Addr; + Addr.BaseType = Address::RegBase; + Addr.Base.Reg = ARM::SP; + Addr.Offset = VA.getLocMemOffset(); + + if (!ARMEmitStore(ArgVT, Arg, Addr)) return false; + } + } + return true; +} + +bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, + const Instruction *I, CallingConv::ID CC, + unsigned &NumBytes) { + // Issue CALLSEQ_END + unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(AdjStackUp)) + .addImm(NumBytes).addImm(0)); + + // Now the return value. + if (RetVT != MVT::isVoid) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true)); + + // Copy all of the result registers out of their specified physreg. + if (RVLocs.size() == 2 && RetVT == MVT::f64) { + // For this move we copy into two registers and then move into the + // double fp reg we want. + EVT DestVT = RVLocs[0].getValVT(); + TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT); + unsigned ResultReg = createResultReg(DstRC); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(ARM::VMOVDRR), ResultReg) + .addReg(RVLocs[0].getLocReg()) + .addReg(RVLocs[1].getLocReg())); + + UsedRegs.push_back(RVLocs[0].getLocReg()); + UsedRegs.push_back(RVLocs[1].getLocReg()); + + // Finally update the result. + UpdateValueMap(I, ResultReg); + } else { + assert(RVLocs.size() == 1 &&"Can't handle non-double multi-reg retvals!"); + EVT CopyVT = RVLocs[0].getValVT(); + TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT); + + unsigned ResultReg = createResultReg(DstRC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(RVLocs[0].getLocReg()); + UsedRegs.push_back(RVLocs[0].getLocReg()); + + // Finally update the result. + UpdateValueMap(I, ResultReg); + } + } + + return true; +} + +bool ARMFastISel::SelectRet(const Instruction *I) { + const ReturnInst *Ret = cast<ReturnInst>(I); + const Function &F = *I->getParent()->getParent(); + + if (!FuncInfo.CanLowerReturn) + return false; + + if (F.isVarArg()) + return false; + + CallingConv::ID CC = F.getCallingConv(); + if (Ret->getNumOperands() > 0) { + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(), + Outs, TLI); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ValLocs; + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,I->getContext()); + CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */)); + + const Value *RV = Ret->getOperand(0); + unsigned Reg = getRegForValue(RV); + if (Reg == 0) + return false; + + // Only handle a single return value for now. + if (ValLocs.size() != 1) + return false; + + CCValAssign &VA = ValLocs[0]; + + // Don't bother handling odd stuff for now. + if (VA.getLocInfo() != CCValAssign::Full) + return false; + // Only handle register returns for now. + if (!VA.isRegLoc()) + return false; + // TODO: For now, don't try to handle cases where getLocInfo() + // says Full but the types don't match. + if (TLI.getValueType(RV->getType()) != VA.getValVT()) + return false; + + // Make the copy. + unsigned SrcReg = Reg + VA.getValNo(); + unsigned DstReg = VA.getLocReg(); + const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); + // Avoid a cross-class copy. This is very unlikely. + if (!SrcRC->contains(DstReg)) + return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + DstReg).addReg(SrcReg); + + // Mark the register as live out of the function. + MRI.addLiveOut(VA.getLocReg()); + } + + unsigned RetOpc = isThumb ? ARM::tBX_RET : ARM::BX_RET; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(RetOpc))); + return true; +} + +unsigned ARMFastISel::ARMSelectCallOp(const GlobalValue *GV) { + + // Darwin needs the r9 versions of the opcodes. + bool isDarwin = Subtarget->isTargetDarwin(); + if (isThumb) { + return isDarwin ? ARM::tBLr9 : ARM::tBL; + } else { + return isDarwin ? ARM::BLr9 : ARM::BL; + } +} + +// A quick function that will emit a call for a named libcall in F with the +// vector of passed arguments for the Instruction in I. We can assume that we +// can emit a call for any libcall we can produce. This is an abridged version +// of the full call infrastructure since we won't need to worry about things +// like computed function pointers or strange arguments at call sites. +// TODO: Try to unify this and the normal call bits for ARM, then try to unify +// with X86. +bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { + CallingConv::ID CC = TLI.getLibcallCallingConv(Call); + + // Handle *simple* calls for now. + Type *RetTy = I->getType(); + MVT RetVT; + if (RetTy->isVoidTy()) + RetVT = MVT::isVoid; + else if (!isTypeLegal(RetTy, RetVT)) + return false; + + // TODO: For now if we have long calls specified we don't handle the call. + if (EnableARMLongCalls) return false; + + // Set up the argument vectors. + SmallVector<Value*, 8> Args; + SmallVector<unsigned, 8> ArgRegs; + SmallVector<MVT, 8> ArgVTs; + SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; + Args.reserve(I->getNumOperands()); + ArgRegs.reserve(I->getNumOperands()); + ArgVTs.reserve(I->getNumOperands()); + ArgFlags.reserve(I->getNumOperands()); + for (unsigned i = 0; i < I->getNumOperands(); ++i) { + Value *Op = I->getOperand(i); + unsigned Arg = getRegForValue(Op); + if (Arg == 0) return false; + + Type *ArgTy = Op->getType(); + MVT ArgVT; + if (!isTypeLegal(ArgTy, ArgVT)) return false; + + ISD::ArgFlagsTy Flags; + unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy); + Flags.setOrigAlign(OriginalAlignment); + + Args.push_back(Op); + ArgRegs.push_back(Arg); + ArgVTs.push_back(ArgVT); + ArgFlags.push_back(Flags); + } + + // Handle the arguments now that we've gotten them. + SmallVector<unsigned, 4> RegArgs; + unsigned NumBytes; + if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) + return false; + + // Issue the call, BLr9 for darwin, BL otherwise. + // TODO: Turn this into the table of arm call ops. + MachineInstrBuilder MIB; + unsigned CallOpc = ARMSelectCallOp(NULL); + if(isThumb) + // Explicitly adding the predicate here. + MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(CallOpc))) + .addExternalSymbol(TLI.getLibcallName(Call)); + else + // Explicitly adding the predicate here. + MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(CallOpc)) + .addExternalSymbol(TLI.getLibcallName(Call))); + + // Add implicit physical register uses to the call. + for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) + MIB.addReg(RegArgs[i]); + + // Finish off the call including any return values. + SmallVector<unsigned, 4> UsedRegs; + if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) return false; + + // Set all unused physreg defs as dead. + static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + + return true; +} + +bool ARMFastISel::SelectCall(const Instruction *I) { + const CallInst *CI = cast<CallInst>(I); + const Value *Callee = CI->getCalledValue(); + + // Can't handle inline asm or worry about intrinsics yet. + if (isa<InlineAsm>(Callee) || isa<IntrinsicInst>(CI)) return false; + + // Only handle global variable Callees. + const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); + if (!GV) + return false; + + // Check the calling convention. + ImmutableCallSite CS(CI); + CallingConv::ID CC = CS.getCallingConv(); + + // TODO: Avoid some calling conventions? + + // Let SDISel handle vararg functions. + PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); + FunctionType *FTy = cast<FunctionType>(PT->getElementType()); + if (FTy->isVarArg()) + return false; + + // Handle *simple* calls for now. + Type *RetTy = I->getType(); + MVT RetVT; + if (RetTy->isVoidTy()) + RetVT = MVT::isVoid; + else if (!isTypeLegal(RetTy, RetVT)) + return false; + + // TODO: For now if we have long calls specified we don't handle the call. + if (EnableARMLongCalls) return false; + + // Set up the argument vectors. + SmallVector<Value*, 8> Args; + SmallVector<unsigned, 8> ArgRegs; + SmallVector<MVT, 8> ArgVTs; + SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; + Args.reserve(CS.arg_size()); + ArgRegs.reserve(CS.arg_size()); + ArgVTs.reserve(CS.arg_size()); + ArgFlags.reserve(CS.arg_size()); + for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); + i != e; ++i) { + unsigned Arg = getRegForValue(*i); + + if (Arg == 0) + return false; + ISD::ArgFlagsTy Flags; + unsigned AttrInd = i - CS.arg_begin() + 1; + if (CS.paramHasAttr(AttrInd, Attribute::SExt)) + Flags.setSExt(); + if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) + Flags.setZExt(); + + // FIXME: Only handle *easy* calls for now. + if (CS.paramHasAttr(AttrInd, Attribute::InReg) || + CS.paramHasAttr(AttrInd, Attribute::StructRet) || + CS.paramHasAttr(AttrInd, Attribute::Nest) || + CS.paramHasAttr(AttrInd, Attribute::ByVal)) + return false; + + Type *ArgTy = (*i)->getType(); + MVT ArgVT; + if (!isTypeLegal(ArgTy, ArgVT)) + return false; + unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy); + Flags.setOrigAlign(OriginalAlignment); + + Args.push_back(*i); + ArgRegs.push_back(Arg); + ArgVTs.push_back(ArgVT); + ArgFlags.push_back(Flags); + } + + // Handle the arguments now that we've gotten them. + SmallVector<unsigned, 4> RegArgs; + unsigned NumBytes; + if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) + return false; + + // Issue the call, BLr9 for darwin, BL otherwise. + // TODO: Turn this into the table of arm call ops. + MachineInstrBuilder MIB; + unsigned CallOpc = ARMSelectCallOp(GV); + // Explicitly adding the predicate here. + if(isThumb) + // Explicitly adding the predicate here. + MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(CallOpc))) + .addGlobalAddress(GV, 0, 0); + else + // Explicitly adding the predicate here. + MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(CallOpc)) + .addGlobalAddress(GV, 0, 0)); + + // Add implicit physical register uses to the call. + for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) + MIB.addReg(RegArgs[i]); + + // Finish off the call including any return values. + SmallVector<unsigned, 4> UsedRegs; + if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) return false; + + // Set all unused physreg defs as dead. + static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + + return true; + +} + +bool ARMFastISel::SelectIntCast(const Instruction *I) { + // On ARM, in general, integer casts don't involve legal types; this code + // handles promotable integers. The high bits for a type smaller than + // the register size are assumed to be undefined. + Type *DestTy = I->getType(); + Value *Op = I->getOperand(0); + Type *SrcTy = Op->getType(); + + EVT SrcVT, DestVT; + SrcVT = TLI.getValueType(SrcTy, true); + DestVT = TLI.getValueType(DestTy, true); + + if (isa<TruncInst>(I)) { + if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8) + return false; + if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1) + return false; + + unsigned SrcReg = getRegForValue(Op); + if (!SrcReg) return false; + + // Because the high bits are undefined, a truncate doesn't generate + // any code. + UpdateValueMap(I, SrcReg); + return true; + } + if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8) + return false; + + unsigned Opc; + bool isZext = isa<ZExtInst>(I); + bool isBoolZext = false; + if (!SrcVT.isSimple()) + return false; + switch (SrcVT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i16: + if (!Subtarget->hasV6Ops()) return false; + if (isZext) + Opc = isThumb ? ARM::t2UXTH : ARM::UXTH; + else + Opc = isThumb ? ARM::t2SXTH : ARM::SXTH; + break; + case MVT::i8: + if (!Subtarget->hasV6Ops()) return false; + if (isZext) + Opc = isThumb ? ARM::t2UXTB : ARM::UXTB; + else + Opc = isThumb ? ARM::t2SXTB : ARM::SXTB; + break; + case MVT::i1: + if (isZext) { + Opc = isThumb ? ARM::t2ANDri : ARM::ANDri; + isBoolZext = true; + break; + } + return false; + } + + // FIXME: We could save an instruction in many cases by special-casing + // load instructions. + unsigned SrcReg = getRegForValue(Op); + if (!SrcReg) return false; + + unsigned DestReg = createResultReg(TLI.getRegClassFor(MVT::i32)); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg) + .addReg(SrcReg); + if (isBoolZext) + MIB.addImm(1); + else + MIB.addImm(0); + AddOptionalDefs(MIB); + UpdateValueMap(I, DestReg); + return true; +} + +// TODO: SoftFP support. +bool ARMFastISel::TargetSelectInstruction(const Instruction *I) { + + switch (I->getOpcode()) { + case Instruction::Load: + return SelectLoad(I); + case Instruction::Store: + return SelectStore(I); + case Instruction::Br: + return SelectBranch(I); + case Instruction::ICmp: + case Instruction::FCmp: + return SelectCmp(I); + case Instruction::FPExt: + return SelectFPExt(I); + case Instruction::FPTrunc: + return SelectFPTrunc(I); + case Instruction::SIToFP: + return SelectSIToFP(I); + case Instruction::FPToSI: + return SelectFPToSI(I); + case Instruction::FAdd: + return SelectBinaryOp(I, ISD::FADD); + case Instruction::FSub: + return SelectBinaryOp(I, ISD::FSUB); + case Instruction::FMul: + return SelectBinaryOp(I, ISD::FMUL); + case Instruction::SDiv: + return SelectSDiv(I); + case Instruction::SRem: + return SelectSRem(I); + case Instruction::Call: + return SelectCall(I); + case Instruction::Select: + return SelectSelect(I); + case Instruction::Ret: + return SelectRet(I); + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + return SelectIntCast(I); + default: break; + } + return false; +} + +namespace llvm { + llvm::FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo) { + // Completely untested on non-darwin. + const TargetMachine &TM = funcInfo.MF->getTarget(); + + // Darwin and thumb1 only for now. + const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>(); + if (Subtarget->isTargetDarwin() && !Subtarget->isThumb1Only() && + !DisableARMFastISel) + return new ARMFastISel(funcInfo); + return 0; + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp new file mode 100644 index 0000000..2d1de6f --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -0,0 +1,1087 @@ +//=======- ARMFrameLowering.cpp - ARM Frame Information --------*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#include "ARMFrameLowering.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. This is true if the function has variable sized allocas +/// or if frame pointer elimination is disabled. +bool ARMFrameLowering::hasFP(const MachineFunction &MF) const { + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + + // Mac OS X requires FP not to be clobbered for backtracing purpose. + if (STI.isTargetDarwin()) + return true; + + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Always eliminate non-leaf frame pointers. + return ((DisableFramePointerElim(MF) && MFI->hasCalls()) || + RegInfo->needsStackRealignment(MF) || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken()); +} + +/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is +/// not required, we reserve argument space for call sites in the function +/// immediately on entry to the current function. This eliminates the need for +/// add/sub sp brackets around call sites. Returns true if the call frame is +/// included as part of the stack frame. +bool ARMFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + unsigned CFSize = FFI->getMaxCallFrameSize(); + // It's not always a good idea to include the call frame as part of the + // stack frame. ARM (especially Thumb) has small immediate offset to + // address the stack frame. So a large call frame can cause poor codegen + // and may even makes it impossible to scavenge a register. + if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12 + return false; + + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +/// canSimplifyCallFramePseudos - If there is a reserved call frame, the +/// call frame pseudos can be simplified. Unlike most targets, having a FP +/// is not sufficient here since we still may reference some objects via SP +/// even when FP is available in Thumb2 mode. +bool +ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { + return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects(); +} + +static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { + for (unsigned i = 0; CSRegs[i]; ++i) + if (Reg == CSRegs[i]) + return true; + return false; +} + +static bool isCSRestore(MachineInstr *MI, + const ARMBaseInstrInfo &TII, + const unsigned *CSRegs) { + // Integer spill area is handled with "pop". + if (MI->getOpcode() == ARM::LDMIA_RET || + MI->getOpcode() == ARM::t2LDMIA_RET || + MI->getOpcode() == ARM::LDMIA_UPD || + MI->getOpcode() == ARM::t2LDMIA_UPD || + MI->getOpcode() == ARM::VLDMDIA_UPD) { + // The first two operands are predicates. The last two are + // imp-def and imp-use of SP. Check everything in between. + for (int i = 5, e = MI->getNumOperands(); i != e; ++i) + if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs)) + return false; + return true; + } + if ((MI->getOpcode() == ARM::LDR_POST_IMM || + MI->getOpcode() == ARM::LDR_POST_REG || + MI->getOpcode() == ARM::t2LDR_POST) && + isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) && + MI->getOperand(1).getReg() == ARM::SP) + return true; + + return false; +} + +static void +emitSPUpdate(bool isARM, + MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + DebugLoc dl, const ARMBaseInstrInfo &TII, + int NumBytes, unsigned MIFlags = MachineInstr::NoFlags) { + if (isARM) + emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, + ARMCC::AL, 0, TII, MIFlags); + else + emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, + ARMCC::AL, 0, TII, MIFlags); +} + +void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const ARMBaseRegisterInfo *RegInfo = + static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo()); + const ARMBaseInstrInfo &TII = + *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo()); + assert(!AFI->isThumb1OnlyFunction() && + "This emitPrologue does not support Thumb1!"); + bool isARM = !AFI->isThumbFunction(); + unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + unsigned NumBytes = MFI->getStackSize(); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + // Determine the sizes of each callee-save spill areas and record which frame + // belongs to which callee-save spill areas. + unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; + int FramePtrSpillFI = 0; + + // Allocate the vararg register save area. This is not counted in NumBytes. + if (VARegSaveSize) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize, + MachineInstr::FrameSetup); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, + MachineInstr::FrameSetup); + return; + } + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + int FI = CSI[i].getFrameIdx(); + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + break; + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + if (STI.isTargetDarwin()) { + AFI->addGPRCalleeSavedArea2Frame(FI); + GPRCS2Size += 4; + } else { + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + } + break; + default: + AFI->addDPRCalleeSavedAreaFrame(FI); + DPRCSSize += 8; + } + } + + // Move past area 1. + if (GPRCS1Size > 0) MBBI++; + + // Set FP to point to the stack slot that contains the previous FP. + // For Darwin, FP is R7, which has now been stored in spill area 1. + // Otherwise, if this is not Darwin, all the callee-saved registers go + // into spill area 1, including the FP in R11. In either case, it is + // now safe to emit this assignment. + bool HasFP = hasFP(MF); + if (HasFP) { + unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr) + .addFrameIndex(FramePtrSpillFI).addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + AddDefaultCC(AddDefaultPred(MIB)); + } + + // Move past area 2. + if (GPRCS2Size > 0) MBBI++; + + // Determine starting offsets of spill areas. + unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); + unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; + unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; + if (HasFP) + AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + + NumBytes); + AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); + AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); + AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); + + // Move past area 3. + if (DPRCSSize > 0) { + MBBI++; + // Since vpush register list cannot have gaps, there may be multiple vpush + // instructions in the prologue. + while (MBBI->getOpcode() == ARM::VSTMDDB_UPD) + MBBI++; + } + + NumBytes = DPRCSOffset; + if (NumBytes) { + // Adjust SP after all the callee-save spills. + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, + MachineInstr::FrameSetup); + if (HasFP && isARM) + // Restore from fp only in ARM mode: e.g. sub sp, r7, #24 + // Note it's not safe to do this in Thumb2 mode because it would have + // taken two instructions: + // mov sp, r7 + // sub sp, #24 + // If an interrupt is taken between the two instructions, then sp is in + // an inconsistent state (pointing to the middle of callee-saved area). + // The interrupt handler can end up clobbering the registers. + AFI->setShouldRestoreSPFromFP(true); + } + + if (STI.isTargetELF() && hasFP(MF)) + MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - + AFI->getFramePtrSpillOffset()); + + AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); + AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); + AFI->setDPRCalleeSavedAreaSize(DPRCSSize); + + // If we need dynamic stack realignment, do it here. Be paranoid and make + // sure if we also have VLAs, we have a base pointer for frame access. + if (RegInfo->needsStackRealignment(MF)) { + unsigned MaxAlign = MFI->getMaxAlignment(); + assert (!AFI->isThumb1OnlyFunction()); + if (!AFI->isThumbFunction()) { + // Emit bic sp, sp, MaxAlign + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, + TII.get(ARM::BICri), ARM::SP) + .addReg(ARM::SP, RegState::Kill) + .addImm(MaxAlign-1))); + } else { + // We cannot use sp as source/dest register here, thus we're emitting the + // following sequence: + // mov r4, sp + // bic r4, r4, MaxAlign + // mov sp, r4 + // FIXME: It will be better just to find spare register here. + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4) + .addReg(ARM::SP, RegState::Kill)); + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, + TII.get(ARM::t2BICri), ARM::R4) + .addReg(ARM::R4, RegState::Kill) + .addImm(MaxAlign-1))); + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP) + .addReg(ARM::R4, RegState::Kill)); + } + + AFI->setShouldRestoreSPFromFP(true); + } + + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + // FIXME: Clarify FrameSetup flags here. + if (RegInfo->hasBasePointer(MF)) { + if (isARM) + BuildMI(MBB, MBBI, dl, + TII.get(ARM::MOVr), RegInfo->getBaseRegister()) + .addReg(ARM::SP) + .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + else + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), + RegInfo->getBaseRegister()) + .addReg(ARM::SP)); + } + + // If the frame has variable sized objects then the epilogue must restore + // the sp from fp. We can assume there's an FP here since hasFP already + // checks for hasVarSizedObjects. + if (MFI->hasVarSizedObjects()) + AFI->setShouldRestoreSPFromFP(true); +} + +void ARMFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->getDesc().isReturn() && + "Can only insert epilog into returning blocks"); + unsigned RetOpcode = MBBI->getOpcode(); + DebugLoc dl = MBBI->getDebugLoc(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + const ARMBaseInstrInfo &TII = + *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo()); + assert(!AFI->isThumb1OnlyFunction() && + "This emitEpilogue does not support Thumb1!"); + bool isARM = !AFI->isThumbFunction(); + + unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + int NumBytes = (int)MFI->getStackSize(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); + } else { + // Unwind MBBI to point to first LDR / VLDRD. + const unsigned *CSRegs = RegInfo->getCalleeSavedRegs(); + if (MBBI != MBB.begin()) { + do + --MBBI; + while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs)); + if (!isCSRestore(MBBI, TII, CSRegs)) + ++MBBI; + } + + // Move SP to start of FP callee save spill area. + NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + + AFI->getGPRCalleeSavedArea2Size() + + AFI->getDPRCalleeSavedAreaSize()); + + // Reset SP based on frame pointer only if the stack frame extends beyond + // frame pointer stack slot or target is ELF and the function has FP. + if (AFI->shouldRestoreSPFromFP()) { + NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; + if (NumBytes) { + if (isARM) + emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, + ARMCC::AL, 0, TII); + else { + // It's not possible to restore SP from FP in a single instruction. + // For Darwin, this looks like: + // mov sp, r7 + // sub sp, #24 + // This is bad, if an interrupt is taken after the mov, sp is in an + // inconsistent state. + // Use the first callee-saved register as a scratch register. + assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) && + "No scratch register to restore SP from FP!"); + emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, + ARMCC::AL, 0, TII); + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), + ARM::SP) + .addReg(ARM::R4)); + } + } else { + // Thumb2 or ARM. + if (isARM) + BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP) + .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + else + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), + ARM::SP) + .addReg(FramePtr)); + } + } else if (NumBytes) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); + + // Increment past our save areas. + if (AFI->getDPRCalleeSavedAreaSize()) { + MBBI++; + // Since vpop register list cannot have gaps, there may be multiple vpop + // instructions in the epilogue. + while (MBBI->getOpcode() == ARM::VLDMDIA_UPD) + MBBI++; + } + if (AFI->getGPRCalleeSavedArea2Size()) MBBI++; + if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; + } + + if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND || + RetOpcode == ARM::TCRETURNri || RetOpcode == ARM::TCRETURNriND) { + // Tail call return: adjust the stack pointer and jump to callee. + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + + // Jump to label or value in register. + if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND) { + unsigned TCOpcode = (RetOpcode == ARM::TCRETURNdi) + ? (STI.isThumb() ? ARM::tTAILJMPd : ARM::TAILJMPd) + : (STI.isThumb() ? ARM::tTAILJMPdND : ARM::TAILJMPdND); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); + if (JumpTarget.isGlobal()) + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + else { + assert(JumpTarget.isSymbol()); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } + + // Add the default predicate in Thumb mode. + if (STI.isThumb()) MIB.addImm(ARMCC::AL).addReg(0); + } else if (RetOpcode == ARM::TCRETURNri) { + BuildMI(MBB, MBBI, dl, + TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)). + addReg(JumpTarget.getReg(), RegState::Kill); + } else if (RetOpcode == ARM::TCRETURNriND) { + BuildMI(MBB, MBBI, dl, + TII.get(STI.isThumb() ? ARM::tTAILJMPrND : ARM::TAILJMPrND)). + addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = prior(MBBI); + for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) + NewMI->addOperand(MBBI->getOperand(i)); + + // Delete the pseudo instruction TCRETURN. + MBB.erase(MBBI); + MBBI = NewMI; + } + + if (VARegSaveSize) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize); +} + +/// getFrameIndexReference - Provide a base+offset reference to an FI slot for +/// debug info. It's the same as what we use for resolving the code-gen +/// references for now. FIXME: This can go wrong when references are +/// SP-relative and simple call frames aren't used. +int +ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + return ResolveFrameIndexReference(MF, FI, FrameReg, 0); +} + +int +ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg, + int SPAdj) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const ARMBaseRegisterInfo *RegInfo = + static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo()); + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); + int FPOffset = Offset - AFI->getFramePtrSpillOffset(); + bool isFixed = MFI->isFixedObjectIndex(FI); + + FrameReg = ARM::SP; + Offset += SPAdj; + if (AFI->isGPRCalleeSavedArea1Frame(FI)) + return Offset - AFI->getGPRCalleeSavedArea1Offset(); + else if (AFI->isGPRCalleeSavedArea2Frame(FI)) + return Offset - AFI->getGPRCalleeSavedArea2Offset(); + else if (AFI->isDPRCalleeSavedAreaFrame(FI)) + return Offset - AFI->getDPRCalleeSavedAreaOffset(); + + // When dynamically realigning the stack, use the frame pointer for + // parameters, and the stack/base pointer for locals. + if (RegInfo->needsStackRealignment(MF)) { + assert (hasFP(MF) && "dynamic stack realignment without a FP!"); + if (isFixed) { + FrameReg = RegInfo->getFrameRegister(MF); + Offset = FPOffset; + } else if (MFI->hasVarSizedObjects()) { + assert(RegInfo->hasBasePointer(MF) && + "VLAs and dynamic stack alignment, but missing base pointer!"); + FrameReg = RegInfo->getBaseRegister(); + } + return Offset; + } + + // If there is a frame pointer, use it when we can. + if (hasFP(MF) && AFI->hasStackFrame()) { + // Use frame pointer to reference fixed objects. Use it for locals if + // there are VLAs (and thus the SP isn't reliable as a base). + if (isFixed || (MFI->hasVarSizedObjects() && + !RegInfo->hasBasePointer(MF))) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } else if (MFI->hasVarSizedObjects()) { + assert(RegInfo->hasBasePointer(MF) && "missing base pointer!"); + if (AFI->isThumb2Function()) { + // Try to use the frame pointer if we can, else use the base pointer + // since it's available. This is handy for the emergency spill slot, in + // particular. + if (FPOffset >= -255 && FPOffset < 0) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + } + } else if (AFI->isThumb2Function()) { + // Use add <rd>, sp, #<imm8> + // ldr <rd>, [sp, #<imm8>] + // if at all possible to save space. + if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020) + return Offset; + // In Thumb2 mode, the negative offset is very limited. Try to avoid + // out of range references. ldr <rt>,[<rn>, #-<imm8>] + if (FPOffset >= -255 && FPOffset < 0) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + } else if (Offset > (FPOffset < 0 ? -FPOffset : FPOffset)) { + // Otherwise, use SP or FP, whichever is closer to the stack slot. + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + } + // Use the base pointer if we have one. + if (RegInfo->hasBasePointer(MF)) + FrameReg = RegInfo->getBaseRegister(); + return Offset; +} + +int ARMFrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + unsigned FrameReg; + return getFrameIndexReference(MF, FI, FrameReg); +} + +void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + unsigned StmOpc, unsigned StrOpc, + bool NoGap, + bool(*Func)(unsigned, bool), + unsigned MIFlags) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + DebugLoc DL; + if (MI != MBB.end()) DL = MI->getDebugLoc(); + + SmallVector<std::pair<unsigned,bool>, 4> Regs; + unsigned i = CSI.size(); + while (i != 0) { + unsigned LastReg = 0; + for (; i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (!(Func)(Reg, STI.isTargetDarwin())) continue; + + // Add the callee-saved register as live-in unless it's LR and + // @llvm.returnaddress is called. If LR is returned for + // @llvm.returnaddress then it's already added to the function and + // entry block live-in sets. + bool isKill = true; + if (Reg == ARM::LR) { + if (MF.getFrameInfo()->isReturnAddressTaken() && + MF.getRegInfo().isLiveIn(Reg)) + isKill = false; + } + + if (isKill) + MBB.addLiveIn(Reg); + + // If NoGap is true, push consecutive registers and then leave the rest + // for other instructions. e.g. + // vpush {d8, d10, d11} -> vpush {d8}, vpush {d10, d11} + if (NoGap && LastReg && LastReg != Reg-1) + break; + LastReg = Reg; + Regs.push_back(std::make_pair(Reg, isKill)); + } + + if (Regs.empty()) + continue; + if (Regs.size() > 1 || StrOpc== 0) { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP) + .addReg(ARM::SP).setMIFlags(MIFlags)); + for (unsigned i = 0, e = Regs.size(); i < e; ++i) + MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second)); + } else if (Regs.size() == 1) { + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc), + ARM::SP) + .addReg(Regs[0].first, getKillRegState(Regs[0].second)) + .addReg(ARM::SP).setMIFlags(MIFlags) + .addImm(-4); + AddDefaultPred(MIB); + } + Regs.clear(); + } +} + +void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + unsigned LdmOpc, unsigned LdrOpc, + bool isVarArg, bool NoGap, + bool(*Func)(unsigned, bool)) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + DebugLoc DL = MI->getDebugLoc(); + unsigned RetOpcode = MI->getOpcode(); + bool isTailCall = (RetOpcode == ARM::TCRETURNdi || + RetOpcode == ARM::TCRETURNdiND || + RetOpcode == ARM::TCRETURNri || + RetOpcode == ARM::TCRETURNriND); + + SmallVector<unsigned, 4> Regs; + unsigned i = CSI.size(); + while (i != 0) { + unsigned LastReg = 0; + bool DeleteRet = false; + for (; i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (!(Func)(Reg, STI.isTargetDarwin())) continue; + + if (Reg == ARM::LR && !isTailCall && !isVarArg && STI.hasV5TOps()) { + Reg = ARM::PC; + LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; + // Fold the return instruction into the LDM. + DeleteRet = true; + } + + // If NoGap is true, pop consecutive registers and then leave the rest + // for other instructions. e.g. + // vpop {d8, d10, d11} -> vpop {d8}, vpop {d10, d11} + if (NoGap && LastReg && LastReg != Reg-1) + break; + + LastReg = Reg; + Regs.push_back(Reg); + } + + if (Regs.empty()) + continue; + if (Regs.size() > 1 || LdrOpc == 0) { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP) + .addReg(ARM::SP)); + for (unsigned i = 0, e = Regs.size(); i < e; ++i) + MIB.addReg(Regs[i], getDefRegState(true)); + if (DeleteRet) { + MIB->copyImplicitOps(&*MI); + MI->eraseFromParent(); + } + MI = MIB; + } else if (Regs.size() == 1) { + // If we adjusted the reg to PC from LR above, switch it back here. We + // only do that for LDM. + if (Regs[0] == ARM::PC) + Regs[0] = ARM::LR; + MachineInstrBuilder MIB = + BuildMI(MBB, MI, DL, TII.get(LdrOpc), Regs[0]) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::SP); + // ARM mode needs an extra reg0 here due to addrmode2. Will go away once + // that refactoring is complete (eventually). + if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) { + MIB.addReg(0); + MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift)); + } else + MIB.addImm(4); + AddDefaultPred(MIB); + } + Regs.clear(); + } +} + +bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD; + unsigned PushOneOpc = AFI->isThumbFunction() ? + ARM::t2STR_PRE : ARM::STR_PRE_IMM; + unsigned FltOpc = ARM::VSTMDDB_UPD; + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, + MachineInstr::FrameSetup); + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, + MachineInstr::FrameSetup); + emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register, + MachineInstr::FrameSetup); + + return true; +} + +bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + bool isVarArg = AFI->getVarArgsRegSaveSize() > 0; + + unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD; + unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST :ARM::LDR_POST_IMM; + unsigned FltOpc = ARM::VLDMDIA_UPD; + emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register); + emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, + &isARMArea2Register); + emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, + &isARMArea1Register); + + return true; +} + +// FIXME: Make generic? +static unsigned GetFunctionSizeInBytes(const MachineFunction &MF, + const ARMBaseInstrInfo &TII) { + unsigned FnSize = 0; + for (MachineFunction::const_iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + const MachineBasicBlock &MBB = *MBBI; + for (MachineBasicBlock::const_iterator I = MBB.begin(),E = MBB.end(); + I != E; ++I) + FnSize += TII.GetInstSizeInBytes(I); + } + return FnSize; +} + +/// estimateStackSize - Estimate and return the size of the frame. +/// FIXME: Make generic? +static unsigned estimateStackSize(MachineFunction &MF) { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + unsigned MaxAlign = MFI->getMaxAlignment(); + int Offset = 0; + + // This code is very, very similar to PEI::calculateFrameObjectOffsets(). + // It really should be refactored to share code. Until then, changes + // should keep in mind that there's tight coupling between the two. + + for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) { + int FixedOff = -MFI->getObjectOffset(i); + if (FixedOff > Offset) Offset = FixedOff; + } + for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isDeadObjectIndex(i)) + continue; + Offset += MFI->getObjectSize(i); + unsigned Align = MFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = (Offset+Align-1)/Align*Align; + + MaxAlign = std::max(Align, MaxAlign); + } + + if (MFI->adjustsStack() && TFI->hasReservedCallFrame(MF)) + Offset += MFI->getMaxCallFrameSize(); + + // Round up the size to a multiple of the alignment. If the function has + // any calls or alloca's, align to the target's StackAlignment value to + // ensure that the callee's frame or the alloca data is suitably aligned; + // otherwise, for leaf functions, align to the TransientStackAlignment + // value. + unsigned StackAlign; + if (MFI->adjustsStack() || MFI->hasVarSizedObjects() || + (RegInfo->needsStackRealignment(MF) && MFI->getObjectIndexEnd() != 0)) + StackAlign = TFI->getStackAlignment(); + else + StackAlign = TFI->getTransientStackAlignment(); + + // If the frame pointer is eliminated, all frame offsets will be relative to + // SP not FP. Align to MaxAlign so this works. + StackAlign = std::max(StackAlign, MaxAlign); + unsigned AlignMask = StackAlign - 1; + Offset = (Offset + AlignMask) & ~uint64_t(AlignMask); + + return (unsigned)Offset; +} + +/// estimateRSStackSizeLimit - Look at each instruction that references stack +/// frames and return the stack size limit beyond which some of these +/// instructions will require a scratch register during their expansion later. +// FIXME: Move to TII? +static unsigned estimateRSStackSizeLimit(MachineFunction &MF, + const TargetFrameLowering *TFI) { + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned Limit = (1 << 12) - 1; + for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) { + for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); + I != E; ++I) { + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + if (!I->getOperand(i).isFI()) continue; + + // When using ADDri to get the address of a stack object, 255 is the + // largest offset guaranteed to fit in the immediate offset. + if (I->getOpcode() == ARM::ADDri) { + Limit = std::min(Limit, (1U << 8) - 1); + break; + } + + // Otherwise check the addressing mode. + switch (I->getDesc().TSFlags & ARMII::AddrModeMask) { + case ARMII::AddrMode3: + case ARMII::AddrModeT2_i8: + Limit = std::min(Limit, (1U << 8) - 1); + break; + case ARMII::AddrMode5: + case ARMII::AddrModeT2_i8s4: + Limit = std::min(Limit, ((1U << 8) - 1) * 4); + break; + case ARMII::AddrModeT2_i12: + // i12 supports only positive offset so these will be converted to + // i8 opcodes. See llvm::rewriteT2FrameIndex. + if (TFI->hasFP(MF) && AFI->hasStackFrame()) + Limit = std::min(Limit, (1U << 8) - 1); + break; + case ARMII::AddrMode4: + case ARMII::AddrMode6: + // Addressing modes 4 & 6 (load/store) instructions can't encode an + // immediate offset for stack references. + return 0; + default: + break; + } + break; // At most one FI per instruction + } + } + } + + return Limit; +} + +void +ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const { + // This tells PEI to spill the FP as if it is any other callee-save register + // to take advantage the eliminateFrameIndex machinery. This also ensures it + // is spilled in the order specified by getCalleeSavedRegs() to make it easier + // to combine multiple loads / stores. + bool CanEliminateFrame = true; + bool CS1Spilled = false; + bool LRSpilled = false; + unsigned NumGPRSpills = 0; + SmallVector<unsigned, 4> UnspilledCS1GPRs; + SmallVector<unsigned, 4> UnspilledCS2GPRs; + const ARMBaseRegisterInfo *RegInfo = + static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo()); + const ARMBaseInstrInfo &TII = + *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo()); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + // Spill R4 if Thumb2 function requires stack realignment - it will be used as + // scratch register. Also spill R4 if Thumb2 function has varsized objects, + // since it's not always possible to restore sp from fp in a single + // instruction. + // FIXME: It will be better just to find spare register here. + if (AFI->isThumb2Function() && + (MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(MF))) + MF.getRegInfo().setPhysRegUsed(ARM::R4); + + if (AFI->isThumb1OnlyFunction()) { + // Spill LR if Thumb1 function uses variable length argument lists. + if (AFI->getVarArgsRegSaveSize() > 0) + MF.getRegInfo().setPhysRegUsed(ARM::LR); + + // Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know + // for sure what the stack size will be, but for this, an estimate is good + // enough. If there anything changes it, it'll be a spill, which implies + // we've used all the registers and so R4 is already used, so not marking + // it here will be OK. + // FIXME: It will be better just to find spare register here. + unsigned StackSize = estimateStackSize(MF); + if (MFI->hasVarSizedObjects() || StackSize > 508) + MF.getRegInfo().setPhysRegUsed(ARM::R4); + } + + // Spill the BasePtr if it's used. + if (RegInfo->hasBasePointer(MF)) + MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); + + // Don't spill FP if the frame can be eliminated. This is determined + // by scanning the callee-save registers to see if any is used. + const unsigned *CSRegs = RegInfo->getCalleeSavedRegs(); + for (unsigned i = 0; CSRegs[i]; ++i) { + unsigned Reg = CSRegs[i]; + bool Spilled = false; + if (MF.getRegInfo().isPhysRegUsed(Reg)) { + Spilled = true; + CanEliminateFrame = false; + } else { + // Check alias registers too. + for (const unsigned *Aliases = + RegInfo->getAliasSet(Reg); *Aliases; ++Aliases) { + if (MF.getRegInfo().isPhysRegUsed(*Aliases)) { + Spilled = true; + CanEliminateFrame = false; + } + } + } + + if (!ARM::GPRRegisterClass->contains(Reg)) + continue; + + if (Spilled) { + NumGPRSpills++; + + if (!STI.isTargetDarwin()) { + if (Reg == ARM::LR) + LRSpilled = true; + CS1Spilled = true; + continue; + } + + // Keep track if LR and any of R4, R5, R6, and R7 is spilled. + switch (Reg) { + case ARM::LR: + LRSpilled = true; + // Fallthrough + case ARM::R4: case ARM::R5: + case ARM::R6: case ARM::R7: + CS1Spilled = true; + break; + default: + break; + } + } else { + if (!STI.isTargetDarwin()) { + UnspilledCS1GPRs.push_back(Reg); + continue; + } + + switch (Reg) { + case ARM::R4: case ARM::R5: + case ARM::R6: case ARM::R7: + case ARM::LR: + UnspilledCS1GPRs.push_back(Reg); + break; + default: + UnspilledCS2GPRs.push_back(Reg); + break; + } + } + } + + bool ForceLRSpill = false; + if (!LRSpilled && AFI->isThumb1OnlyFunction()) { + unsigned FnSize = GetFunctionSizeInBytes(MF, TII); + // Force LR to be spilled if the Thumb function size is > 2048. This enables + // use of BL to implement far jump. If it turns out that it's not needed + // then the branch fix up path will undo it. + if (FnSize >= (1 << 11)) { + CanEliminateFrame = false; + ForceLRSpill = true; + } + } + + // If any of the stack slot references may be out of range of an immediate + // offset, make sure a register (or a spill slot) is available for the + // register scavenger. Note that if we're indexing off the frame pointer, the + // effective stack size is 4 bytes larger since the FP points to the stack + // slot of the previous FP. Also, if we have variable sized objects in the + // function, stack slot references will often be negative, and some of + // our instructions are positive-offset only, so conservatively consider + // that case to want a spill slot (or register) as well. Similarly, if + // the function adjusts the stack pointer during execution and the + // adjustments aren't already part of our stack size estimate, our offset + // calculations may be off, so be conservative. + // FIXME: We could add logic to be more precise about negative offsets + // and which instructions will need a scratch register for them. Is it + // worth the effort and added fragility? + bool BigStack = + (RS && + (estimateStackSize(MF) + ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >= + estimateRSStackSizeLimit(MF, this))) + || MFI->hasVarSizedObjects() + || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF)); + + bool ExtraCSSpill = false; + if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) { + AFI->setHasStackFrame(true); + + // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled. + // Spill LR as well so we can fold BX_RET to the registers restore (LDM). + if (!LRSpilled && CS1Spilled) { + MF.getRegInfo().setPhysRegUsed(ARM::LR); + NumGPRSpills++; + UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(), + UnspilledCS1GPRs.end(), (unsigned)ARM::LR)); + ForceLRSpill = false; + ExtraCSSpill = true; + } + + if (hasFP(MF)) { + MF.getRegInfo().setPhysRegUsed(FramePtr); + NumGPRSpills++; + } + + // If stack and double are 8-byte aligned and we are spilling an odd number + // of GPRs, spill one extra callee save GPR so we won't have to pad between + // the integer and double callee save areas. + unsigned TargetAlign = getStackAlignment(); + if (TargetAlign == 8 && (NumGPRSpills & 1)) { + if (CS1Spilled && !UnspilledCS1GPRs.empty()) { + for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) { + unsigned Reg = UnspilledCS1GPRs[i]; + // Don't spill high register if the function is thumb1 + if (!AFI->isThumb1OnlyFunction() || + isARMLowRegister(Reg) || Reg == ARM::LR) { + MF.getRegInfo().setPhysRegUsed(Reg); + if (!RegInfo->isReservedReg(MF, Reg)) + ExtraCSSpill = true; + break; + } + } + } else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) { + unsigned Reg = UnspilledCS2GPRs.front(); + MF.getRegInfo().setPhysRegUsed(Reg); + if (!RegInfo->isReservedReg(MF, Reg)) + ExtraCSSpill = true; + } + } + + // Estimate if we might need to scavenge a register at some point in order + // to materialize a stack offset. If so, either spill one additional + // callee-saved register or reserve a special spill slot to facilitate + // register scavenging. Thumb1 needs a spill slot for stack pointer + // adjustments also, even when the frame itself is small. + if (BigStack && !ExtraCSSpill) { + // If any non-reserved CS register isn't spilled, just spill one or two + // extra. That should take care of it! + unsigned NumExtras = TargetAlign / 4; + SmallVector<unsigned, 2> Extras; + while (NumExtras && !UnspilledCS1GPRs.empty()) { + unsigned Reg = UnspilledCS1GPRs.back(); + UnspilledCS1GPRs.pop_back(); + if (!RegInfo->isReservedReg(MF, Reg) && + (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) || + Reg == ARM::LR)) { + Extras.push_back(Reg); + NumExtras--; + } + } + // For non-Thumb1 functions, also check for hi-reg CS registers + if (!AFI->isThumb1OnlyFunction()) { + while (NumExtras && !UnspilledCS2GPRs.empty()) { + unsigned Reg = UnspilledCS2GPRs.back(); + UnspilledCS2GPRs.pop_back(); + if (!RegInfo->isReservedReg(MF, Reg)) { + Extras.push_back(Reg); + NumExtras--; + } + } + } + if (Extras.size() && NumExtras == 0) { + for (unsigned i = 0, e = Extras.size(); i != e; ++i) { + MF.getRegInfo().setPhysRegUsed(Extras[i]); + } + } else if (!AFI->isThumb1OnlyFunction()) { + // note: Thumb1 functions spill to R12, not the stack. Reserve a slot + // closest to SP or frame pointer. + const TargetRegisterClass *RC = ARM::GPRRegisterClass; + RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); + } + } + } + + if (ForceLRSpill) { + MF.getRegInfo().setPhysRegUsed(ARM::LR); + AFI->setLRIsSpilledForFarJump(true); + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h new file mode 100644 index 0000000..61bb8af --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -0,0 +1,76 @@ +//==-- ARMTargetFrameLowering.h - Define frame lowering for ARM --*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef ARM_FRAMEINFO_H +#define ARM_FRAMEINFO_H + +#include "ARM.h" +#include "ARMSubtarget.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + class ARMSubtarget; + +class ARMFrameLowering : public TargetFrameLowering { +protected: + const ARMSubtarget &STI; + +public: + explicit ARMFrameLowering(const ARMSubtarget &sti) + : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4), + STI(sti) { + } + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool hasFP(const MachineFunction &MF) const; + bool hasReservedCallFrame(const MachineFunction &MF) const; + bool canSimplifyCallFramePseudos(const MachineFunction &MF) const; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const; + int ResolveFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg, int SPAdj) const; + int getFrameIndexOffset(const MachineFunction &MF, int FI) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const; + + private: + void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc, + unsigned StrOpc, bool NoGap, + bool(*Func)(unsigned, bool), + unsigned MIFlags = 0) const; + void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc, + unsigned LdrOpc, bool isVarArg, bool NoGap, + bool(*Func)(unsigned, bool)) const; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMGlobalMerge.cpp b/contrib/llvm/lib/Target/ARM/ARMGlobalMerge.cpp new file mode 100644 index 0000000..5f863ea --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMGlobalMerge.cpp @@ -0,0 +1,219 @@ +//===-- ARMGlobalMerge.cpp - Internal globals merging --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This pass merges globals with internal linkage into one. This way all the +// globals which were merged into a biggest one can be addressed using offsets +// from the same base pointer (no need for separate base pointer for each of the +// global). Such a transformation can significantly reduce the register pressure +// when many globals are involved. +// +// For example, consider the code which touches several global variables at +// once: +// +// static int foo[N], bar[N], baz[N]; +// +// for (i = 0; i < N; ++i) { +// foo[i] = bar[i] * baz[i]; +// } +// +// On ARM the addresses of 3 arrays should be kept in the registers, thus +// this code has quite large register pressure (loop body): +// +// ldr r1, [r5], #4 +// ldr r2, [r6], #4 +// mul r1, r2, r1 +// str r1, [r0], #4 +// +// Pass converts the code to something like: +// +// static struct { +// int foo[N]; +// int bar[N]; +// int baz[N]; +// } merged; +// +// for (i = 0; i < N; ++i) { +// merged.foo[i] = merged.bar[i] * merged.baz[i]; +// } +// +// and in ARM code this becomes: +// +// ldr r0, [r5, #40] +// ldr r1, [r5, #80] +// mul r0, r1, r0 +// str r0, [r5], #4 +// +// note that we saved 2 registers here almostly "for free". +// ===---------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-global-merge" +#include "ARM.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Attributes.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +using namespace llvm; + +namespace { + class ARMGlobalMerge : public FunctionPass { + /// TLI - Keep a pointer of a TargetLowering to consult for determining + /// target type sizes. + const TargetLowering *TLI; + + bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, + Module &M, bool isConst) const; + + public: + static char ID; // Pass identification, replacement for typeid. + explicit ARMGlobalMerge(const TargetLowering *tli) + : FunctionPass(ID), TLI(tli) {} + + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + const char *getPassName() const { + return "Merge internal globals"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } + + struct GlobalCmp { + const TargetData *TD; + + GlobalCmp(const TargetData *td) : TD(td) { } + + bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) { + Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); + Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType(); + + return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2)); + } + }; + }; +} // end anonymous namespace + +char ARMGlobalMerge::ID = 0; + +bool ARMGlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, + Module &M, bool isConst) const { + const TargetData *TD = TLI->getTargetData(); + + // FIXME: Infer the maximum possible offset depending on the actual users + // (these max offsets are different for the users inside Thumb or ARM + // functions) + unsigned MaxOffset = TLI->getMaximalGlobalOffset(); + + // FIXME: Find better heuristics + std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD)); + + Type *Int32Ty = Type::getInt32Ty(M.getContext()); + + for (size_t i = 0, e = Globals.size(); i != e; ) { + size_t j = 0; + uint64_t MergedSize = 0; + std::vector<Type*> Tys; + std::vector<Constant*> Inits; + for (j = i; j != e; ++j) { + Type *Ty = Globals[j]->getType()->getElementType(); + MergedSize += TD->getTypeAllocSize(Ty); + if (MergedSize > MaxOffset) { + break; + } + Tys.push_back(Ty); + Inits.push_back(Globals[j]->getInitializer()); + } + + StructType *MergedTy = StructType::get(M.getContext(), Tys); + Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); + GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, + GlobalValue::InternalLinkage, + MergedInit, "_MergedGlobals"); + for (size_t k = i; k < j; ++k) { + Constant *Idx[2] = { + ConstantInt::get(Int32Ty, 0), + ConstantInt::get(Int32Ty, k-i) + }; + Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx); + Globals[k]->replaceAllUsesWith(GEP); + Globals[k]->eraseFromParent(); + } + i = j; + } + + return true; +} + + +bool ARMGlobalMerge::doInitialization(Module &M) { + SmallVector<GlobalVariable*, 16> Globals, ConstGlobals, BSSGlobals; + const TargetData *TD = TLI->getTargetData(); + unsigned MaxOffset = TLI->getMaximalGlobalOffset(); + bool Changed = false; + + // Grab all non-const globals. + for (Module::global_iterator I = M.global_begin(), + E = M.global_end(); I != E; ++I) { + // Merge is safe for "normal" internal globals only + if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection()) + continue; + + // Ignore fancy-aligned globals for now. + unsigned Alignment = I->getAlignment(); + Type *Ty = I->getType()->getElementType(); + if (Alignment > TD->getABITypeAlignment(Ty)) + continue; + + // Ignore all 'special' globals. + if (I->getName().startswith("llvm.") || + I->getName().startswith(".llvm.")) + continue; + + if (TD->getTypeAllocSize(Ty) < MaxOffset) { + const TargetLoweringObjectFile &TLOF = TLI->getObjFileLowering(); + if (TLOF.getKindForGlobal(I, TLI->getTargetMachine()).isBSSLocal()) + BSSGlobals.push_back(I); + else if (I->isConstant()) + ConstGlobals.push_back(I); + else + Globals.push_back(I); + } + } + + if (Globals.size() > 1) + Changed |= doMerge(Globals, M, false); + if (BSSGlobals.size() > 1) + Changed |= doMerge(BSSGlobals, M, false); + + // FIXME: This currently breaks the EH processing due to way how the + // typeinfo detection works. We might want to detect the TIs and ignore + // them in the future. + // if (ConstGlobals.size() > 1) + // Changed |= doMerge(ConstGlobals, M, true); + + return Changed; +} + +bool ARMGlobalMerge::runOnFunction(Function &F) { + return false; +} + +FunctionPass *llvm::createARMGlobalMergePass(const TargetLowering *tli) { + return new ARMGlobalMerge(tli); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp new file mode 100644 index 0000000..787f6a2 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -0,0 +1,120 @@ +//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARMHazardRecognizer.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Target/TargetRegisterInfo.h" +using namespace llvm; + +static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI, + const TargetRegisterInfo &TRI) { + // FIXME: Detect integer instructions properly. + const MCInstrDesc &MCID = MI->getDesc(); + unsigned Domain = MCID.TSFlags & ARMII::DomainMask; + if (MCID.mayStore()) + return false; + unsigned Opcode = MCID.getOpcode(); + if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) + return false; + if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON)) + return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI); + return false; +} + +ScheduleHazardRecognizer::HazardType +ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { + assert(Stalls == 0 && "ARM hazards don't support scoreboard lookahead"); + + MachineInstr *MI = SU->getInstr(); + + if (!MI->isDebugValue()) { + if (ITBlockSize && MI != ITBlockMIs[ITBlockSize-1]) + return Hazard; + + // Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following + // a VMLA / VMLS will cause 4 cycle stall. + const MCInstrDesc &MCID = MI->getDesc(); + if (LastMI && (MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) { + MachineInstr *DefMI = LastMI; + const MCInstrDesc &LastMCID = LastMI->getDesc(); + // Skip over one non-VFP / NEON instruction. + if (!LastMCID.isBarrier() && + // On A9, AGU and NEON/FPU are muxed. + !(STI.isCortexA9() && (LastMCID.mayLoad() || LastMCID.mayStore())) && + (LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) { + MachineBasicBlock::iterator I = LastMI; + if (I != LastMI->getParent()->begin()) { + I = llvm::prior(I); + DefMI = &*I; + } + } + + if (TII.isFpMLxInstruction(DefMI->getOpcode()) && + (TII.canCauseFpMLxStall(MI->getOpcode()) || + hasRAWHazard(DefMI, MI, TRI))) { + // Try to schedule another instruction for the next 4 cycles. + if (FpMLxStalls == 0) + FpMLxStalls = 4; + return Hazard; + } + } + } + + return ScoreboardHazardRecognizer::getHazardType(SU, Stalls); +} + +void ARMHazardRecognizer::Reset() { + LastMI = 0; + FpMLxStalls = 0; + ITBlockSize = 0; + ScoreboardHazardRecognizer::Reset(); +} + +void ARMHazardRecognizer::EmitInstruction(SUnit *SU) { + MachineInstr *MI = SU->getInstr(); + unsigned Opcode = MI->getOpcode(); + if (ITBlockSize) { + --ITBlockSize; + } else if (Opcode == ARM::t2IT) { + unsigned Mask = MI->getOperand(1).getImm(); + unsigned NumTZ = CountTrailingZeros_32(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + ITBlockSize = 4 - NumTZ; + MachineBasicBlock::iterator I = MI; + for (unsigned i = 0; i < ITBlockSize; ++i) { + // Advance to the next instruction, skipping any dbg_value instructions. + do { + ++I; + } while (I->isDebugValue()); + ITBlockMIs[ITBlockSize-1-i] = &*I; + } + } + + if (!MI->isDebugValue()) { + LastMI = MI; + FpMLxStalls = 0; + } + + ScoreboardHazardRecognizer::EmitInstruction(SU); +} + +void ARMHazardRecognizer::AdvanceCycle() { + if (FpMLxStalls && --FpMLxStalls == 0) + // Stalled for 4 cycles but still can't schedule any other instructions. + LastMI = 0; + ScoreboardHazardRecognizer::AdvanceCycle(); +} + +void ARMHazardRecognizer::RecedeCycle() { + llvm_unreachable("reverse ARM hazard checking unsupported"); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h new file mode 100644 index 0000000..2bc218d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h @@ -0,0 +1,54 @@ +//===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines hazard recognizers for scheduling ARM functions. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMHAZARDRECOGNIZER_H +#define ARMHAZARDRECOGNIZER_H + +#include "llvm/CodeGen/ScoreboardHazardRecognizer.h" + +namespace llvm { + +class ARMBaseInstrInfo; +class ARMBaseRegisterInfo; +class ARMSubtarget; +class MachineInstr; + +class ARMHazardRecognizer : public ScoreboardHazardRecognizer { + const ARMBaseInstrInfo &TII; + const ARMBaseRegisterInfo &TRI; + const ARMSubtarget &STI; + + MachineInstr *LastMI; + unsigned FpMLxStalls; + unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled. + MachineInstr *ITBlockMIs[4]; + +public: + ARMHazardRecognizer(const InstrItineraryData *ItinData, + const ARMBaseInstrInfo &tii, + const ARMBaseRegisterInfo &tri, + const ARMSubtarget &sti, + const ScheduleDAG *DAG) : + ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"), TII(tii), + TRI(tri), STI(sti), LastMI(0), ITBlockSize(0) {} + + virtual HazardType getHazardType(SUnit *SU, int Stalls); + virtual void Reset(); + virtual void EmitInstruction(SUnit *SU); + virtual void AdvanceCycle(); + virtual void RecedeCycle(); +}; + +} // end namespace llvm + +#endif // ARMHAZARDRECOGNIZER_H diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp new file mode 100644 index 0000000..5ee009c --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -0,0 +1,3211 @@ +//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the ARM target. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-isel" +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMTargetMachine.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +static cl::opt<bool> +DisableShifterOp("disable-shifter-op", cl::Hidden, + cl::desc("Disable isel of shifter-op"), + cl::init(false)); + +static cl::opt<bool> +CheckVMLxHazard("check-vmlx-hazard", cl::Hidden, + cl::desc("Check fp vmla / vmls hazard at isel time"), + cl::init(true)); + +static cl::opt<bool> +DisableARMIntABS("disable-arm-int-abs", cl::Hidden, + cl::desc("Enable / disable ARM integer abs transform"), + cl::init(false)); + +//===--------------------------------------------------------------------===// +/// ARMDAGToDAGISel - ARM specific code to select ARM machine +/// instructions for SelectionDAG operations. +/// +namespace { + +enum AddrMode2Type { + AM2_BASE, // Simple AM2 (+-imm12) + AM2_SHOP // Shifter-op AM2 +}; + +class ARMDAGToDAGISel : public SelectionDAGISel { + ARMBaseTargetMachine &TM; + const ARMBaseInstrInfo *TII; + + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. + const ARMSubtarget *Subtarget; + +public: + explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, + CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), TM(tm), + TII(static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo())), + Subtarget(&TM.getSubtarget<ARMSubtarget>()) { + } + + virtual const char *getPassName() const { + return "ARM Instruction Selection"; + } + + /// getI32Imm - Return a target constant of type i32 with the specified + /// value. + inline SDValue getI32Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + } + + SDNode *Select(SDNode *N); + + + bool hasNoVMLxHazardUse(SDNode *N) const; + bool isShifterOpProfitable(const SDValue &Shift, + ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt); + bool SelectRegShifterOperand(SDValue N, SDValue &A, + SDValue &B, SDValue &C, + bool CheckProfitability = true); + bool SelectImmShifterOperand(SDValue N, SDValue &A, + SDValue &B, bool CheckProfitability = true); + bool SelectShiftRegShifterOperand(SDValue N, SDValue &A, + SDValue &B, SDValue &C) { + // Don't apply the profitability check + return SelectRegShifterOperand(N, A, B, C, false); + } + bool SelectShiftImmShifterOperand(SDValue N, SDValue &A, + SDValue &B) { + // Don't apply the profitability check + return SelectImmShifterOperand(N, A, B, false); + } + + bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); + bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc); + + AddrMode2Type SelectAddrMode2Worker(SDValue N, SDValue &Base, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode2Base(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &Opc) { + return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_BASE; + } + + bool SelectAddrMode2ShOp(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &Opc) { + return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_SHOP; + } + + bool SelectAddrMode2(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &Opc) { + SelectAddrMode2Worker(N, Base, Offset, Opc); +// return SelectAddrMode2ShOp(N, Base, Offset, Opc); + // This always matches one way or another. + return true; + } + + bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc); + bool SelectAddrOffsetNone(SDValue N, SDValue &Base); + bool SelectAddrMode3(SDValue N, SDValue &Base, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode3Offset(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc); + bool SelectAddrMode5(SDValue N, SDValue &Base, + SDValue &Offset); + bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align); + bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset); + + bool SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label); + + // Thumb Addressing Modes: + bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset); + bool SelectThumbAddrModeRI(SDValue N, SDValue &Base, SDValue &Offset, + unsigned Scale); + bool SelectThumbAddrModeRI5S1(SDValue N, SDValue &Base, SDValue &Offset); + bool SelectThumbAddrModeRI5S2(SDValue N, SDValue &Base, SDValue &Offset); + bool SelectThumbAddrModeRI5S4(SDValue N, SDValue &Base, SDValue &Offset); + bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base, + SDValue &OffImm); + bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm); + + // Thumb 2 Addressing Modes: + bool SelectT2ShifterOperandReg(SDValue N, + SDValue &BaseReg, SDValue &Opc); + bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); + bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, + SDValue &OffImm); + bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, + SDValue &OffImm); + bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base, + SDValue &OffReg, SDValue &ShImm); + + inline bool is_so_imm(unsigned Imm) const { + return ARM_AM::getSOImmVal(Imm) != -1; + } + + inline bool is_so_imm_not(unsigned Imm) const { + return ARM_AM::getSOImmVal(~Imm) != -1; + } + + inline bool is_t2_so_imm(unsigned Imm) const { + return ARM_AM::getT2SOImmVal(Imm) != -1; + } + + inline bool is_t2_so_imm_not(unsigned Imm) const { + return ARM_AM::getT2SOImmVal(~Imm) != -1; + } + + // Include the pieces autogenerated from the target description. +#include "ARMGenDAGISel.inc" + +private: + /// SelectARMIndexedLoad - Indexed (pre/post inc/dec) load matching code for + /// ARM. + SDNode *SelectARMIndexedLoad(SDNode *N); + SDNode *SelectT2IndexedLoad(SDNode *N); + + /// SelectVLD - Select NEON load intrinsics. NumVecs should be + /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for + /// loads of D registers and even subregs and odd subregs of Q registers. + /// For NumVecs <= 2, QOpcodes1 is not used. + SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, + unsigned *QOpcodes0, unsigned *QOpcodes1); + + /// SelectVST - Select NEON store intrinsics. NumVecs should + /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for + /// stores of D registers and even subregs and odd subregs of Q registers. + /// For NumVecs <= 2, QOpcodes1 is not used. + SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, + unsigned *QOpcodes0, unsigned *QOpcodes1); + + /// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should + /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// load/store of D registers and Q registers. + SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, + bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes); + + /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs + /// should be 2, 3 or 4. The opcode array specifies the instructions used + /// for loading D registers. (Q registers are not supported.) + SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, + unsigned *Opcodes); + + /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, + /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be + /// generated to force the table registers to be consecutive. + SDNode *SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc); + + /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM. + SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned); + + /// SelectCMOVOp - Select CMOV instructions for ARM. + SDNode *SelectCMOVOp(SDNode *N); + SDNode *SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, + SDValue InFlag); + SDNode *SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, + SDValue InFlag); + SDNode *SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, + SDValue InFlag); + SDNode *SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, + SDValue InFlag); + + // Select special operations if node forms integer ABS pattern + SDNode *SelectABSOp(SDNode *N); + + SDNode *SelectConcatVector(SDNode *N); + + SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. + virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector<SDValue> &OutOps); + + // Form pairs of consecutive S, D, or Q registers. + SDNode *PairSRegs(EVT VT, SDValue V0, SDValue V1); + SDNode *PairDRegs(EVT VT, SDValue V0, SDValue V1); + SDNode *PairQRegs(EVT VT, SDValue V0, SDValue V1); + + // Form sequences of 4 consecutive S, D, or Q registers. + SDNode *QuadSRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); + SDNode *QuadDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); + SDNode *QuadQRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); + + // Get the alignment operand for a NEON VLD or VST instruction. + SDValue GetVLDSTAlign(SDValue Align, unsigned NumVecs, bool is64BitVector); +}; +} + +/// isInt32Immediate - This method tests to see if the node is a 32-bit constant +/// operand. If so Imm will receive the 32-bit value. +static bool isInt32Immediate(SDNode *N, unsigned &Imm) { + if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) { + Imm = cast<ConstantSDNode>(N)->getZExtValue(); + return true; + } + return false; +} + +// isInt32Immediate - This method tests to see if a constant operand. +// If so Imm will receive the 32 bit value. +static bool isInt32Immediate(SDValue N, unsigned &Imm) { + return isInt32Immediate(N.getNode(), Imm); +} + +// isOpcWithIntImmediate - This method tests to see if the node is a specific +// opcode and that it has a immediate integer right operand. +// If so Imm will receive the 32 bit value. +static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) { + return N->getOpcode() == Opc && + isInt32Immediate(N->getOperand(1).getNode(), Imm); +} + +/// \brief Check whether a particular node is a constant value representable as +/// (N * Scale) where (N in [\arg RangeMin, \arg RangeMax). +/// +/// \param ScaledConstant [out] - On success, the pre-scaled constant value. +static bool isScaledConstantInRange(SDValue Node, int Scale, + int RangeMin, int RangeMax, + int &ScaledConstant) { + assert(Scale > 0 && "Invalid scale!"); + + // Check that this is a constant. + const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Node); + if (!C) + return false; + + ScaledConstant = (int) C->getZExtValue(); + if ((ScaledConstant % Scale) != 0) + return false; + + ScaledConstant /= Scale; + return ScaledConstant >= RangeMin && ScaledConstant < RangeMax; +} + +/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS +/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at +/// least on current ARM implementations) which should be avoidded. +bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const { + if (OptLevel == CodeGenOpt::None) + return true; + + if (!CheckVMLxHazard) + return true; + + if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9()) + return true; + + if (!N->hasOneUse()) + return false; + + SDNode *Use = *N->use_begin(); + if (Use->getOpcode() == ISD::CopyToReg) + return true; + if (Use->isMachineOpcode()) { + const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode()); + if (MCID.mayStore()) + return true; + unsigned Opcode = MCID.getOpcode(); + if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) + return true; + // vmlx feeding into another vmlx. We actually want to unfold + // the use later in the MLxExpansion pass. e.g. + // vmla + // vmla (stall 8 cycles) + // + // vmul (5 cycles) + // vadd (5 cycles) + // vmla + // This adds up to about 18 - 19 cycles. + // + // vmla + // vmul (stall 4 cycles) + // vadd adds up to about 14 cycles. + return TII->isFpMLxInstruction(Opcode); + } + + return false; +} + +bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift, + ARM_AM::ShiftOpc ShOpcVal, + unsigned ShAmt) { + if (!Subtarget->isCortexA9()) + return true; + if (Shift.hasOneUse()) + return true; + // R << 2 is free. + return ShOpcVal == ARM_AM::lsl && ShAmt == 2; +} + +bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, + SDValue &BaseReg, + SDValue &Opc, + bool CheckProfitability) { + if (DisableShifterOp) + return false; + + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); + + // Don't match base register only case. That is matched to a separate + // lower complexity pattern with explicit register operand. + if (ShOpcVal == ARM_AM::no_shift) return false; + + BaseReg = N.getOperand(0); + unsigned ShImmVal = 0; + ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!RHS) return false; + ShImmVal = RHS->getZExtValue() & 31; + Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), + MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N, + SDValue &BaseReg, + SDValue &ShReg, + SDValue &Opc, + bool CheckProfitability) { + if (DisableShifterOp) + return false; + + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); + + // Don't match base register only case. That is matched to a separate + // lower complexity pattern with explicit register operand. + if (ShOpcVal == ARM_AM::no_shift) return false; + + BaseReg = N.getOperand(0); + unsigned ShImmVal = 0; + ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (RHS) return false; + + ShReg = N.getOperand(1); + if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, ShImmVal)) + return false; + Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), + MVT::i32); + return true; +} + + +bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, + SDValue &Base, + SDValue &OffImm) { + // Match simple R + imm12 operands. + + // Base only. + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + !CurDAG->isBaseWithConstantOffset(N)) { + if (N.getOpcode() == ISD::FrameIndex) { + // Match frame index. + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + Base = N.getOperand(0); + } else + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned) + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + } + + // Base only. + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + + + +bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &Opc) { + if (N.getOpcode() == ISD::MUL && + (!Subtarget->isCortexA9() || N.hasOneUse())) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + // X * [3,5,9] -> X + X * [2,4,8] etc. + int RHSC = (int)RHS->getZExtValue(); + if (RHSC & 1) { + RHSC = RHSC & ~1; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + if (isPowerOf2_32(RHSC)) { + unsigned ShAmt = Log2_32(RHSC); + Base = Offset = N.getOperand(0); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, + ARM_AM::lsl), + MVT::i32); + return true; + } + } + } + } + + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + // ISD::OR that is equivalent to an ISD::ADD. + !CurDAG->isBaseWithConstantOffset(N)) + return false; + + // Leave simple R +/- imm12 operands for LDRi12 + if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1, + -0x1000+1, 0x1000, RHSC)) // 12 bits. + return false; + } + + // Otherwise this is R +/- [possibly shifted] R. + ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::SUB ? ARM_AM::sub:ARM_AM::add; + ARM_AM::ShiftOpc ShOpcVal = + ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode()); + unsigned ShAmt = 0; + + Base = N.getOperand(0); + Offset = N.getOperand(1); + + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt)) + Offset = N.getOperand(1).getOperand(0); + else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + // Try matching (R shl C) + (R). + if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift && + !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) { + ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode()); + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't + // fold it. + if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) { + Offset = N.getOperand(0).getOperand(0); + Base = N.getOperand(1); + } else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + } + + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), + MVT::i32); + return true; +} + + + + +//----- + +AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, + SDValue &Base, + SDValue &Offset, + SDValue &Opc) { + if (N.getOpcode() == ISD::MUL && + (!Subtarget->isCortexA9() || N.hasOneUse())) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + // X * [3,5,9] -> X + X * [2,4,8] etc. + int RHSC = (int)RHS->getZExtValue(); + if (RHSC & 1) { + RHSC = RHSC & ~1; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + if (isPowerOf2_32(RHSC)) { + unsigned ShAmt = Log2_32(RHSC); + Base = Offset = N.getOperand(0); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, + ARM_AM::lsl), + MVT::i32); + return AM2_SHOP; + } + } + } + } + + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + // ISD::OR that is equivalent to an ADD. + !CurDAG->isBaseWithConstantOffset(N)) { + Base = N; + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } else if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + Base = N.getOperand(0); + } + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0, + ARM_AM::no_shift), + MVT::i32); + return AM2_BASE; + } + + // Match simple R +/- imm12 operands. + if (N.getOpcode() != ISD::SUB) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1, + -0x1000+1, 0x1000, RHSC)) { // 12 bits. + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + Offset = CurDAG->getRegister(0, MVT::i32); + + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC, + ARM_AM::no_shift), + MVT::i32); + return AM2_BASE; + } + } + + if (Subtarget->isCortexA9() && !N.hasOneUse()) { + // Compute R +/- (R << N) and reuse it. + Base = N; + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0, + ARM_AM::no_shift), + MVT::i32); + return AM2_BASE; + } + + // Otherwise this is R +/- [possibly shifted] R. + ARM_AM::AddrOpc AddSub = N.getOpcode() != ISD::SUB ? ARM_AM::add:ARM_AM::sub; + ARM_AM::ShiftOpc ShOpcVal = + ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode()); + unsigned ShAmt = 0; + + Base = N.getOperand(0); + Offset = N.getOperand(1); + + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt)) + Offset = N.getOperand(1).getOperand(0); + else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + // Try matching (R shl C) + (R). + if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift && + !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) { + ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode()); + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't + // fold it. + if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) { + Offset = N.getOperand(0).getOperand(0); + Base = N.getOperand(1); + } else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + } + + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), + MVT::i32); + return AM2_SHOP; +} + +bool ARMDAGToDAGISel::SelectAddrMode2OffsetReg(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc) { + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) + ? ARM_AM::add : ARM_AM::sub; + int Val; + if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) + return false; + + Offset = N; + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); + unsigned ShAmt = 0; + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (isShifterOpProfitable(N, ShOpcVal, ShAmt)) + Offset = N.getOperand(0); + else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), + MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc) { + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) + ? ARM_AM::add : ARM_AM::sub; + int Val; + if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits. + if (AddSub == ARM_AM::sub) Val *= -1; + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(Val, MVT::i32); + return true; + } + + return false; +} + + +bool ARMDAGToDAGISel::SelectAddrMode2OffsetImm(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc) { + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) + ? ARM_AM::add : ARM_AM::sub; + int Val; + if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits. + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val, + ARM_AM::no_shift), + MVT::i32); + return true; + } + + return false; +} + +bool ARMDAGToDAGISel::SelectAddrOffsetNone(SDValue N, SDValue &Base) { + Base = N; + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N, + SDValue &Base, SDValue &Offset, + SDValue &Opc) { + if (N.getOpcode() == ISD::SUB) { + // X - C is canonicalize to X + -C, no need to handle it here. + Base = N.getOperand(0); + Offset = N.getOperand(1); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0),MVT::i32); + return true; + } + + if (!CurDAG->isBaseWithConstantOffset(N)) { + Base = N; + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32); + return true; + } + + // If the RHS is +/- imm8, fold into addr mode. + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1, + -256 + 1, 256, RHSC)) { // 8 bits. + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + Offset = CurDAG->getRegister(0, MVT::i32); + + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = -RHSC; + } + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC),MVT::i32); + return true; + } + + Base = N.getOperand(0); + Offset = N.getOperand(1); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N, + SDValue &Offset, SDValue &Opc) { + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) + ? ARM_AM::add : ARM_AM::sub; + int Val; + if (isScaledConstantInRange(N, /*Scale=*/1, 0, 256, Val)) { // 12 bits. + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), MVT::i32); + return true; + } + + Offset = N; + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, 0), MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, + SDValue &Base, SDValue &Offset) { + if (!CurDAG->isBaseWithConstantOffset(N)) { + Base = N; + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } else if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + Base = N.getOperand(0); + } + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), + MVT::i32); + return true; + } + + // If the RHS is +/- imm8, fold into addr mode. + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, + -256 + 1, 256, RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = -RHSC; + } + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC), + MVT::i32); + return true; + } + + Base = N; + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), + MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr, + SDValue &Align) { + Addr = N; + + unsigned Alignment = 0; + if (LSBaseSDNode *LSN = dyn_cast<LSBaseSDNode>(Parent)) { + // This case occurs only for VLD1-lane/dup and VST1-lane instructions. + // The maximum alignment is equal to the memory size being referenced. + unsigned LSNAlign = LSN->getAlignment(); + unsigned MemSize = LSN->getMemoryVT().getSizeInBits() / 8; + if (LSNAlign > MemSize && MemSize > 1) + Alignment = MemSize; + } else { + // All other uses of addrmode6 are for intrinsics. For now just record + // the raw alignment value; it will be refined later based on the legal + // alignment operands for the intrinsic. + Alignment = cast<MemIntrinsicSDNode>(Parent)->getAlignment(); + } + + Align = CurDAG->getTargetConstant(Alignment, MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode6Offset(SDNode *Op, SDValue N, + SDValue &Offset) { + LSBaseSDNode *LdSt = cast<LSBaseSDNode>(Op); + ISD::MemIndexedMode AM = LdSt->getAddressingMode(); + if (AM != ISD::POST_INC) + return false; + Offset = N; + if (ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N)) { + if (NC->getZExtValue() * 8 == LdSt->getMemoryVT().getSizeInBits()) + Offset = CurDAG->getRegister(0, MVT::i32); + } + return true; +} + +bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N, + SDValue &Offset, SDValue &Label) { + if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) { + Offset = N.getOperand(0); + SDValue N1 = N.getOperand(1); + Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(), + MVT::i32); + return true; + } + + return false; +} + + +//===----------------------------------------------------------------------===// +// Thumb Addressing Modes +//===----------------------------------------------------------------------===// + +bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N, + SDValue &Base, SDValue &Offset){ + if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) { + ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N); + if (!NC || !NC->isNullValue()) + return false; + + Base = Offset = N; + return true; + } + + Base = N.getOperand(0); + Offset = N.getOperand(1); + return true; +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeRI(SDValue N, SDValue &Base, + SDValue &Offset, unsigned Scale) { + if (Scale == 4) { + SDValue TmpBase, TmpOffImm; + if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm)) + return false; // We want to select tLDRspi / tSTRspi instead. + + if (N.getOpcode() == ARMISD::Wrapper && + N.getOperand(0).getOpcode() == ISD::TargetConstantPool) + return false; // We want to select tLDRpci instead. + } + + if (!CurDAG->isBaseWithConstantOffset(N)) + return false; + + // Thumb does not have [sp, r] address mode. + RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); + RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1)); + if ((LHSR && LHSR->getReg() == ARM::SP) || + (RHSR && RHSR->getReg() == ARM::SP)) + return false; + + // FIXME: Why do we explicitly check for a match here and then return false? + // Presumably to allow something else to match, but shouldn't this be + // documented? + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) + return false; + + Base = N.getOperand(0); + Offset = N.getOperand(1); + return true; +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeRI5S1(SDValue N, + SDValue &Base, + SDValue &Offset) { + return SelectThumbAddrModeRI(N, Base, Offset, 1); +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeRI5S2(SDValue N, + SDValue &Base, + SDValue &Offset) { + return SelectThumbAddrModeRI(N, Base, Offset, 2); +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeRI5S4(SDValue N, + SDValue &Base, + SDValue &Offset) { + return SelectThumbAddrModeRI(N, Base, Offset, 4); +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, + SDValue &Base, SDValue &OffImm) { + if (Scale == 4) { + SDValue TmpBase, TmpOffImm; + if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm)) + return false; // We want to select tLDRspi / tSTRspi instead. + + if (N.getOpcode() == ARMISD::Wrapper && + N.getOperand(0).getOpcode() == ISD::TargetConstantPool) + return false; // We want to select tLDRpci instead. + } + + if (!CurDAG->isBaseWithConstantOffset(N)) { + if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + Base = N.getOperand(0); + } else { + Base = N; + } + + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); + RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1)); + if ((LHSR && LHSR->getReg() == ARM::SP) || + (RHSR && RHSR->getReg() == ARM::SP)) { + ConstantSDNode *LHS = dyn_cast<ConstantSDNode>(N.getOperand(0)); + ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1)); + unsigned LHSC = LHS ? LHS->getZExtValue() : 0; + unsigned RHSC = RHS ? RHS->getZExtValue() : 0; + + // Thumb does not have [sp, #imm5] address mode for non-zero imm5. + if (LHSC != 0 || RHSC != 0) return false; + + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + // If the RHS is + imm5 * scale, fold into addr mode. + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) { + Base = N.getOperand(0); + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + + Base = N.getOperand(0); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base, + SDValue &OffImm) { + return SelectThumbAddrModeImm5S(N, 4, Base, OffImm); +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base, + SDValue &OffImm) { + return SelectThumbAddrModeImm5S(N, 2, Base, OffImm); +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base, + SDValue &OffImm) { + return SelectThumbAddrModeImm5S(N, 1, Base, OffImm); +} + +bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, + SDValue &Base, SDValue &OffImm) { + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + if (!CurDAG->isBaseWithConstantOffset(N)) + return false; + + RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); + if (N.getOperand(0).getOpcode() == ISD::FrameIndex || + (LHSR && LHSR->getReg() == ARM::SP)) { + // If the RHS is + imm8 * scale, fold into addr mode. + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + } + + return false; +} + + +//===----------------------------------------------------------------------===// +// Thumb 2 Addressing Modes +//===----------------------------------------------------------------------===// + + +bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue N, SDValue &BaseReg, + SDValue &Opc) { + if (DisableShifterOp) + return false; + + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); + + // Don't match base register only case. That is matched to a separate + // lower complexity pattern with explicit register operand. + if (ShOpcVal == ARM_AM::no_shift) return false; + + BaseReg = N.getOperand(0); + unsigned ShImmVal = 0; + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + ShImmVal = RHS->getZExtValue() & 31; + Opc = getI32Imm(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal)); + return true; + } + + return false; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, + SDValue &Base, SDValue &OffImm) { + // Match simple R + imm12 operands. + + // Base only. + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + !CurDAG->isBaseWithConstantOffset(N)) { + if (N.getOpcode() == ISD::FrameIndex) { + // Match frame index. + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::TargetConstantPool) + return false; // We want to select t2LDRpci instead. + } else + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + if (SelectT2AddrModeImm8(N, Base, OffImm)) + // Let t2LDRi8 handle (R - imm8). + return false; + + int RHSC = (int)RHS->getZExtValue(); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned) + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + } + + // Base only. + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, + SDValue &Base, SDValue &OffImm) { + // Match simple R - imm8 operands. + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && + !CurDAG->isBaseWithConstantOffset(N)) + return false; + + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getSExtValue(); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative) + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + } + + return false; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, + SDValue &OffImm){ + unsigned Opcode = Op->getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + int RHSC; + if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x100, RHSC)) { // 8 bits. + OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) + ? CurDAG->getTargetConstant(RHSC, MVT::i32) + : CurDAG->getTargetConstant(-RHSC, MVT::i32); + return true; + } + + return false; +} + +bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, + SDValue &Base, + SDValue &OffReg, SDValue &ShImm) { + // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12. + if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) + return false; + + // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC >= 0 && RHSC < 0x1000) // 12 bits (unsigned) + return false; + else if (RHSC < 0 && RHSC >= -255) // 8 bits + return false; + } + + // Look for (R + R) or (R + (R << [1,2,3])). + unsigned ShAmt = 0; + Base = N.getOperand(0); + OffReg = N.getOperand(1); + + // Swap if it is ((R << c) + R). + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg.getOpcode()); + if (ShOpcVal != ARM_AM::lsl) { + ShOpcVal = ARM_AM::getShiftOpcForNode(Base.getOpcode()); + if (ShOpcVal == ARM_AM::lsl) + std::swap(Base, OffReg); + } + + if (ShOpcVal == ARM_AM::lsl) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(OffReg.getOperand(1))) { + ShAmt = Sh->getZExtValue(); + if (ShAmt < 4 && isShifterOpProfitable(OffReg, ShOpcVal, ShAmt)) + OffReg = OffReg.getOperand(0); + else { + ShAmt = 0; + ShOpcVal = ARM_AM::no_shift; + } + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + ShImm = CurDAG->getTargetConstant(ShAmt, MVT::i32); + + return true; +} + +//===--------------------------------------------------------------------===// + +/// getAL - Returns a ARMCC::AL immediate node. +static inline SDValue getAL(SelectionDAG *CurDAG) { + return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, MVT::i32); +} + +SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) { + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return NULL; + + EVT LoadedVT = LD->getMemoryVT(); + SDValue Offset, AMOpc; + bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + unsigned Opcode = 0; + bool Match = false; + if (LoadedVT == MVT::i32 && isPre && + SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) { + Opcode = ARM::LDR_PRE_IMM; + Match = true; + } else if (LoadedVT == MVT::i32 && !isPre && + SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) { + Opcode = ARM::LDR_POST_IMM; + Match = true; + } else if (LoadedVT == MVT::i32 && + SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) { + Opcode = isPre ? ARM::LDR_PRE_REG : ARM::LDR_POST_REG; + Match = true; + + } else if (LoadedVT == MVT::i16 && + SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = (LD->getExtensionType() == ISD::SEXTLOAD) + ? (isPre ? ARM::LDRSH_PRE : ARM::LDRSH_POST) + : (isPre ? ARM::LDRH_PRE : ARM::LDRH_POST); + } else if (LoadedVT == MVT::i8 || LoadedVT == MVT::i1) { + if (LD->getExtensionType() == ISD::SEXTLOAD) { + if (SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST; + } + } else { + if (isPre && + SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = ARM::LDRB_PRE_IMM; + } else if (!isPre && + SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = ARM::LDRB_POST_IMM; + } else if (SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = isPre ? ARM::LDRB_PRE_REG : ARM::LDRB_POST_REG; + } + } + } + + if (Match) { + if (Opcode == ARM::LDR_PRE_IMM || Opcode == ARM::LDRB_PRE_IMM) { + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[]= { Base, AMOpc, getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), Chain }; + return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, + MVT::i32, MVT::Other, Ops, 5); + } else { + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), Chain }; + return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, + MVT::i32, MVT::Other, Ops, 6); + } + } + + return NULL; +} + +SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return NULL; + + EVT LoadedVT = LD->getMemoryVT(); + bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; + SDValue Offset; + bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + unsigned Opcode = 0; + bool Match = false; + if (SelectT2AddrModeImm8Offset(N, LD->getOffset(), Offset)) { + switch (LoadedVT.getSimpleVT().SimpleTy) { + case MVT::i32: + Opcode = isPre ? ARM::t2LDR_PRE : ARM::t2LDR_POST; + break; + case MVT::i16: + if (isSExtLd) + Opcode = isPre ? ARM::t2LDRSH_PRE : ARM::t2LDRSH_POST; + else + Opcode = isPre ? ARM::t2LDRH_PRE : ARM::t2LDRH_POST; + break; + case MVT::i8: + case MVT::i1: + if (isSExtLd) + Opcode = isPre ? ARM::t2LDRSB_PRE : ARM::t2LDRSB_POST; + else + Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST; + break; + default: + return NULL; + } + Match = true; + } + + if (Match) { + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[]= { Base, Offset, getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), Chain }; + return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, MVT::i32, + MVT::Other, Ops, 5); + } + + return NULL; +} + +/// PairSRegs - Form a D register from a pair of S registers. +/// +SDNode *ARMDAGToDAGISel::PairSRegs(EVT VT, SDValue V0, SDValue V1) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue RegClass = + CurDAG->getTargetConstant(ARM::DPR_VFP2RegClassID, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5); +} + +/// PairDRegs - Form a quad register from a pair of D registers. +/// +SDNode *ARMDAGToDAGISel::PairDRegs(EVT VT, SDValue V0, SDValue V1) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5); +} + +/// PairQRegs - Form 4 consecutive D registers from a pair of Q registers. +/// +SDNode *ARMDAGToDAGISel::PairQRegs(EVT VT, SDValue V0, SDValue V1) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5); +} + +/// QuadSRegs - Form 4 consecutive S registers. +/// +SDNode *ARMDAGToDAGISel::QuadSRegs(EVT VT, SDValue V0, SDValue V1, + SDValue V2, SDValue V3) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue RegClass = + CurDAG->getTargetConstant(ARM::QPR_VFP2RegClassID, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(ARM::ssub_2, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, + V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9); +} + +/// QuadDRegs - Form 4 consecutive D registers. +/// +SDNode *ARMDAGToDAGISel::QuadDRegs(EVT VT, SDValue V0, SDValue V1, + SDValue V2, SDValue V3) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, + V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9); +} + +/// QuadQRegs - Form 4 consecutive Q registers. +/// +SDNode *ARMDAGToDAGISel::QuadQRegs(EVT VT, SDValue V0, SDValue V1, + SDValue V2, SDValue V3) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue RegClass = CurDAG->getTargetConstant(ARM::QQQQPRRegClassID, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(ARM::qsub_2, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(ARM::qsub_3, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, + V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9); +} + +/// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand +/// of a NEON VLD or VST instruction. The supported values depend on the +/// number of registers being loaded. +SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, unsigned NumVecs, + bool is64BitVector) { + unsigned NumRegs = NumVecs; + if (!is64BitVector && NumVecs < 3) + NumRegs *= 2; + + unsigned Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); + if (Alignment >= 32 && NumRegs == 4) + Alignment = 32; + else if (Alignment >= 16 && (NumRegs == 2 || NumRegs == 4)) + Alignment = 16; + else if (Alignment >= 8) + Alignment = 8; + else + Alignment = 0; + + return CurDAG->getTargetConstant(Alignment, MVT::i32); +} + +SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes0, + unsigned *QOpcodes1) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + DebugLoc dl = N->getDebugLoc(); + + SDValue MemAddr, Align; + unsigned AddrOpIdx = isUpdating ? 1 : 2; + if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) + return NULL; + + SDValue Chain = N->getOperand(0); + EVT VT = N->getValueType(0); + bool is64BitVector = VT.is64BitVector(); + Align = GetVLDSTAlign(Align, NumVecs, is64BitVector); + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vld type"); + // Double-register operations: + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + case MVT::v1i64: OpcodeIndex = 3; break; + // Quad-register operations: + case MVT::v16i8: OpcodeIndex = 0; break; + case MVT::v8i16: OpcodeIndex = 1; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VLD1"); + break; + } + + EVT ResTy; + if (NumVecs == 1) + ResTy = VT; + else { + unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; + if (!is64BitVector) + ResTyElts *= 2; + ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts); + } + std::vector<EVT> ResTys; + ResTys.push_back(ResTy); + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + + SDValue Pred = getAL(CurDAG); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDNode *VLd; + SmallVector<SDValue, 7> Ops; + + // Double registers and VLD1/VLD2 quad registers are directly supported. + if (is64BitVector || NumVecs <= 2) { + unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : + QOpcodes0[OpcodeIndex]); + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + } + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + + } else { + // Otherwise, quad registers are loaded with two separate instructions, + // where one loads the even registers and the other loads the odd registers. + EVT AddrTy = MemAddr.getValueType(); + + // Load the even subregs. This is always an updating load, so that it + // provides the address to the second load for the odd subregs. + SDValue ImplDef = + SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0); + const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain }; + SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, + ResTy, AddrTy, MVT::Other, OpsA, 7); + Chain = SDValue(VLdA, 2); + + // Load the odd subregs. + Ops.push_back(SDValue(VLdA, 1)); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + assert(isa<ConstantSDNode>(Inc.getNode()) && + "only constant post-increment update allowed for VLD3/4"); + (void)Inc; + Ops.push_back(Reg0); + } + Ops.push_back(SDValue(VLdA, 0)); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, + Ops.data(), Ops.size()); + } + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1); + + if (NumVecs == 1) + return VLd; + + // Extract out the subregisters. + SDValue SuperReg = SDValue(VLd, 0); + assert(ARM::dsub_7 == ARM::dsub_0+7 && + ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0); + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2)); + return NULL; +} + +SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, unsigned *QOpcodes0, + unsigned *QOpcodes1) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); + DebugLoc dl = N->getDebugLoc(); + + SDValue MemAddr, Align; + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) + if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) + return NULL; + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + + SDValue Chain = N->getOperand(0); + EVT VT = N->getOperand(Vec0Idx).getValueType(); + bool is64BitVector = VT.is64BitVector(); + Align = GetVLDSTAlign(Align, NumVecs, is64BitVector); + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vst type"); + // Double-register operations: + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + case MVT::v1i64: OpcodeIndex = 3; break; + // Quad-register operations: + case MVT::v16i8: OpcodeIndex = 0; break; + case MVT::v8i16: OpcodeIndex = 1; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VST1"); + break; + } + + std::vector<EVT> ResTys; + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + + SDValue Pred = getAL(CurDAG); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SmallVector<SDValue, 7> Ops; + + // Double registers and VST1/VST2 quad registers are directly supported. + if (is64BitVector || NumVecs <= 2) { + SDValue SrcReg; + if (NumVecs == 1) { + SrcReg = N->getOperand(Vec0Idx); + } else if (is64BitVector) { + // Form a REG_SEQUENCE to force register allocation. + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue V1 = N->getOperand(Vec0Idx + 1); + if (NumVecs == 2) + SrcReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); + else { + SDValue V2 = N->getOperand(Vec0Idx + 2); + // If it's a vst3, form a quad D-register and leave the last part as + // an undef. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : N->getOperand(Vec0Idx + 3); + SrcReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + } + } else { + // Form a QQ register. + SDValue Q0 = N->getOperand(Vec0Idx); + SDValue Q1 = N->getOperand(Vec0Idx + 1); + SrcReg = SDValue(PairQRegs(MVT::v4i64, Q0, Q1), 0); + } + + unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : + QOpcodes0[OpcodeIndex]); + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + } + Ops.push_back(SrcReg); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + SDNode *VSt = + CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + + // Transfer memoperands. + cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1); + + return VSt; + } + + // Otherwise, quad registers are stored with two separate instructions, + // where one stores the even registers and the other stores the odd registers. + + // Form the QQQQ REG_SEQUENCE. + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue V1 = N->getOperand(Vec0Idx + 1); + SDValue V2 = N->getOperand(Vec0Idx + 2); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) + : N->getOperand(Vec0Idx + 3); + SDValue RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); + + // Store the even D registers. This is always an updating store, so that it + // provides the address to the second store for the odd subregs. + const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain }; + SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, + MemAddr.getValueType(), + MVT::Other, OpsA, 7); + cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1); + Chain = SDValue(VStA, 1); + + // Store the odd D registers. + Ops.push_back(SDValue(VStA, 0)); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + assert(isa<ConstantSDNode>(Inc.getNode()) && + "only constant post-increment update allowed for VST3/4"); + (void)Inc; + Ops.push_back(Reg0); + } + Ops.push_back(RegSeq); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, + Ops.data(), Ops.size()); + cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1); + return VStB; +} + +SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, + bool isUpdating, unsigned NumVecs, + unsigned *DOpcodes, + unsigned *QOpcodes) { + assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); + DebugLoc dl = N->getDebugLoc(); + + SDValue MemAddr, Align; + unsigned AddrOpIdx = isUpdating ? 1 : 2; + unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) + if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) + return NULL; + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + + SDValue Chain = N->getOperand(0); + unsigned Lane = + cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); + EVT VT = N->getOperand(Vec0Idx).getValueType(); + bool is64BitVector = VT.is64BitVector(); + + unsigned Alignment = 0; + if (NumVecs != 3) { + Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); + unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8; + if (Alignment > NumBytes) + Alignment = NumBytes; + if (Alignment < 8 && Alignment < NumBytes) + Alignment = 0; + // Alignment must be a power of two; make sure of that. + Alignment = (Alignment & -Alignment); + if (Alignment == 1) + Alignment = 0; + } + Align = CurDAG->getTargetConstant(Alignment, MVT::i32); + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vld/vst lane type"); + // Double-register operations: + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + // Quad-register operations: + case MVT::v8i16: OpcodeIndex = 0; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 1; break; + } + + std::vector<EVT> ResTys; + if (IsLoad) { + unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; + if (!is64BitVector) + ResTyElts *= 2; + ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), + MVT::i64, ResTyElts)); + } + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + + SDValue Pred = getAL(CurDAG); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + + SmallVector<SDValue, 8> Ops; + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(AddrOpIdx + 1); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + } + + SDValue SuperReg; + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue V1 = N->getOperand(Vec0Idx + 1); + if (NumVecs == 2) { + if (is64BitVector) + SuperReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); + else + SuperReg = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); + } else { + SDValue V2 = N->getOperand(Vec0Idx + 2); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) + : N->getOperand(Vec0Idx + 3); + if (is64BitVector) + SuperReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + else + SuperReg = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); + } + Ops.push_back(SuperReg); + Ops.push_back(getI32Imm(Lane)); + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + + unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : + QOpcodes[OpcodeIndex]); + SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, + Ops.data(), Ops.size()); + cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1); + if (!IsLoad) + return VLdLn; + + // Extract the subregisters. + SuperReg = SDValue(VLdLn, 0); + assert(ARM::dsub_7 == ARM::dsub_0+7 && + ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); + return NULL; +} + +SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, + unsigned NumVecs, unsigned *Opcodes) { + assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); + DebugLoc dl = N->getDebugLoc(); + + SDValue MemAddr, Align; + if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align)) + return NULL; + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + + SDValue Chain = N->getOperand(0); + EVT VT = N->getValueType(0); + + unsigned Alignment = 0; + if (NumVecs != 3) { + Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); + unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8; + if (Alignment > NumBytes) + Alignment = NumBytes; + if (Alignment < 8 && Alignment < NumBytes) + Alignment = 0; + // Alignment must be a power of two; make sure of that. + Alignment = (Alignment & -Alignment); + if (Alignment == 1) + Alignment = 0; + } + Align = CurDAG->getTargetConstant(Alignment, MVT::i32); + + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vld-dup type"); + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + } + + SDValue Pred = getAL(CurDAG); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue SuperReg; + unsigned Opc = Opcodes[OpcodeIndex]; + SmallVector<SDValue, 6> Ops; + Ops.push_back(MemAddr); + Ops.push_back(Align); + if (isUpdating) { + SDValue Inc = N->getOperand(2); + Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc); + } + Ops.push_back(Pred); + Ops.push_back(Reg0); + Ops.push_back(Chain); + + unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; + std::vector<EVT> ResTys; + ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts)); + if (isUpdating) + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + SDNode *VLdDup = + CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1); + SuperReg = SDValue(VLdDup, 0); + + // Extract the subregisters. + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + unsigned SubIdx = ARM::dsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); + if (isUpdating) + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); + return NULL; +} + +SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, + unsigned Opc) { + assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range"); + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + unsigned FirstTblReg = IsExt ? 2 : 1; + + // Form a REG_SEQUENCE to force register allocation. + SDValue RegSeq; + SDValue V0 = N->getOperand(FirstTblReg + 0); + SDValue V1 = N->getOperand(FirstTblReg + 1); + if (NumVecs == 2) + RegSeq = SDValue(PairDRegs(MVT::v16i8, V0, V1), 0); + else { + SDValue V2 = N->getOperand(FirstTblReg + 2); + // If it's a vtbl3, form a quad D-register and leave the last part as + // an undef. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) + : N->getOperand(FirstTblReg + 3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + } + + SmallVector<SDValue, 6> Ops; + if (IsExt) + Ops.push_back(N->getOperand(1)); + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(FirstTblReg + NumVecs)); + Ops.push_back(getAL(CurDAG)); // predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register + return CurDAG->getMachineNode(Opc, dl, VT, Ops.data(), Ops.size()); +} + +SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, + bool isSigned) { + if (!Subtarget->hasV6T2Ops()) + return NULL; + + unsigned Opc = isSigned ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX) + : (Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX); + + + // For unsigned extracts, check for a shift right and mask + unsigned And_imm = 0; + if (N->getOpcode() == ISD::AND) { + if (isOpcWithIntImmediate(N, ISD::AND, And_imm)) { + + // The immediate is a mask of the low bits iff imm & (imm+1) == 0 + if (And_imm & (And_imm + 1)) + return NULL; + + unsigned Srl_imm = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, + Srl_imm)) { + assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!"); + + // Note: The width operand is encoded as width-1. + unsigned Width = CountTrailingOnes_32(And_imm) - 1; + unsigned LSB = Srl_imm; + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0).getOperand(0), + CurDAG->getTargetConstant(LSB, MVT::i32), + CurDAG->getTargetConstant(Width, MVT::i32), + getAL(CurDAG), Reg0 }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); + } + } + return NULL; + } + + // Otherwise, we're looking for a shift of a shift + unsigned Shl_imm = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { + assert(Shl_imm > 0 && Shl_imm < 32 && "bad amount in shift node!"); + unsigned Srl_imm = 0; + if (isInt32Immediate(N->getOperand(1), Srl_imm)) { + assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!"); + // Note: The width operand is encoded as width-1. + unsigned Width = 32 - Srl_imm - 1; + int LSB = Srl_imm - Shl_imm; + if (LSB < 0) + return NULL; + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0).getOperand(0), + CurDAG->getTargetConstant(LSB, MVT::i32), + CurDAG->getTargetConstant(Width, MVT::i32), + getAL(CurDAG), Reg0 }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); + } + } + return NULL; +} + +SDNode *ARMDAGToDAGISel:: +SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { + SDValue CPTmp0; + SDValue CPTmp1; + if (SelectT2ShifterOperandReg(TrueVal, CPTmp0, CPTmp1)) { + unsigned SOVal = cast<ConstantSDNode>(CPTmp1)->getZExtValue(); + unsigned SOShOp = ARM_AM::getSORegShOp(SOVal); + unsigned Opc = 0; + switch (SOShOp) { + case ARM_AM::lsl: Opc = ARM::t2MOVCClsl; break; + case ARM_AM::lsr: Opc = ARM::t2MOVCClsr; break; + case ARM_AM::asr: Opc = ARM::t2MOVCCasr; break; + case ARM_AM::ror: Opc = ARM::t2MOVCCror; break; + default: + llvm_unreachable("Unknown so_reg opcode!"); + break; + } + SDValue SOShImm = + CurDAG->getTargetConstant(ARM_AM::getSORegOffset(SOVal), MVT::i32); + SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); + SDValue Ops[] = { FalseVal, CPTmp0, SOShImm, CC, CCR, InFlag }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32,Ops, 6); + } + return 0; +} + +SDNode *ARMDAGToDAGISel:: +SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { + SDValue CPTmp0; + SDValue CPTmp1; + SDValue CPTmp2; + if (SelectImmShifterOperand(TrueVal, CPTmp0, CPTmp2)) { + SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); + SDValue Ops[] = { FalseVal, CPTmp0, CPTmp2, CC, CCR, InFlag }; + return CurDAG->SelectNodeTo(N, ARM::MOVCCsi, MVT::i32, Ops, 6); + } + + if (SelectRegShifterOperand(TrueVal, CPTmp0, CPTmp1, CPTmp2)) { + SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); + SDValue Ops[] = { FalseVal, CPTmp0, CPTmp1, CPTmp2, CC, CCR, InFlag }; + return CurDAG->SelectNodeTo(N, ARM::MOVCCsr, MVT::i32, Ops, 7); + } + return 0; +} + +SDNode *ARMDAGToDAGISel:: +SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { + ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal); + if (!T) + return 0; + + unsigned Opc = 0; + unsigned TrueImm = T->getZExtValue(); + if (is_t2_so_imm(TrueImm)) { + Opc = ARM::t2MOVCCi; + } else if (TrueImm <= 0xffff) { + Opc = ARM::t2MOVCCi16; + } else if (is_t2_so_imm_not(TrueImm)) { + TrueImm = ~TrueImm; + Opc = ARM::t2MVNCCi; + } else if (TrueVal.getNode()->hasOneUse() && Subtarget->hasV6T2Ops()) { + // Large immediate. + Opc = ARM::t2MOVCCi32imm; + } + + if (Opc) { + SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32); + SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); + SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); + } + + return 0; +} + +SDNode *ARMDAGToDAGISel:: +SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal, + ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) { + ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal); + if (!T) + return 0; + + unsigned Opc = 0; + unsigned TrueImm = T->getZExtValue(); + bool isSoImm = is_so_imm(TrueImm); + if (isSoImm) { + Opc = ARM::MOVCCi; + } else if (Subtarget->hasV6T2Ops() && TrueImm <= 0xffff) { + Opc = ARM::MOVCCi16; + } else if (is_so_imm_not(TrueImm)) { + TrueImm = ~TrueImm; + Opc = ARM::MVNCCi; + } else if (TrueVal.getNode()->hasOneUse() && + (Subtarget->hasV6T2Ops() || ARM_AM::isSOImmTwoPartVal(TrueImm))) { + // Large immediate. + Opc = ARM::MOVCCi32imm; + } + + if (Opc) { + SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32); + SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32); + SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); + } + + return 0; +} + +SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) { + EVT VT = N->getValueType(0); + SDValue FalseVal = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue CC = N->getOperand(2); + SDValue CCR = N->getOperand(3); + SDValue InFlag = N->getOperand(4); + assert(CC.getOpcode() == ISD::Constant); + assert(CCR.getOpcode() == ISD::Register); + ARMCC::CondCodes CCVal = + (ARMCC::CondCodes)cast<ConstantSDNode>(CC)->getZExtValue(); + + if (!Subtarget->isThumb1Only() && VT == MVT::i32) { + // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc) + // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc) + // Pattern complexity = 18 cost = 1 size = 0 + SDValue CPTmp0; + SDValue CPTmp1; + SDValue CPTmp2; + if (Subtarget->isThumb()) { + SDNode *Res = SelectT2CMOVShiftOp(N, FalseVal, TrueVal, + CCVal, CCR, InFlag); + if (!Res) + Res = SelectT2CMOVShiftOp(N, TrueVal, FalseVal, + ARMCC::getOppositeCondition(CCVal), CCR, InFlag); + if (Res) + return Res; + } else { + SDNode *Res = SelectARMCMOVShiftOp(N, FalseVal, TrueVal, + CCVal, CCR, InFlag); + if (!Res) + Res = SelectARMCMOVShiftOp(N, TrueVal, FalseVal, + ARMCC::getOppositeCondition(CCVal), CCR, InFlag); + if (Res) + return Res; + } + + // Pattern: (ARMcmov:i32 GPR:i32:$false, + // (imm:i32)<<P:Pred_so_imm>>:$true, + // (imm:i32):$cc) + // Emits: (MOVCCi:i32 GPR:i32:$false, + // (so_imm:i32 (imm:i32):$true), (imm:i32):$cc) + // Pattern complexity = 10 cost = 1 size = 0 + if (Subtarget->isThumb()) { + SDNode *Res = SelectT2CMOVImmOp(N, FalseVal, TrueVal, + CCVal, CCR, InFlag); + if (!Res) + Res = SelectT2CMOVImmOp(N, TrueVal, FalseVal, + ARMCC::getOppositeCondition(CCVal), CCR, InFlag); + if (Res) + return Res; + } else { + SDNode *Res = SelectARMCMOVImmOp(N, FalseVal, TrueVal, + CCVal, CCR, InFlag); + if (!Res) + Res = SelectARMCMOVImmOp(N, TrueVal, FalseVal, + ARMCC::getOppositeCondition(CCVal), CCR, InFlag); + if (Res) + return Res; + } + } + + // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) + // Emits: (MOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + // + // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) + // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) + // Pattern complexity = 6 cost = 11 size = 0 + // + // Also VMOVScc and VMOVDcc. + SDValue Tmp2 = CurDAG->getTargetConstant(CCVal, MVT::i32); + SDValue Ops[] = { FalseVal, TrueVal, Tmp2, CCR, InFlag }; + unsigned Opc = 0; + switch (VT.getSimpleVT().SimpleTy) { + default: assert(false && "Illegal conditional move type!"); + break; + case MVT::i32: + Opc = Subtarget->isThumb() + ? (Subtarget->hasThumb2() ? ARM::t2MOVCCr : ARM::tMOVCCr_pseudo) + : ARM::MOVCCr; + break; + case MVT::f32: + Opc = ARM::VMOVScc; + break; + case MVT::f64: + Opc = ARM::VMOVDcc; + break; + } + return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5); +} + +/// Target-specific DAG combining for ISD::XOR. +/// Target-independent combining lowers SELECT_CC nodes of the form +/// select_cc setg[ge] X, 0, X, -X +/// select_cc setgt X, -1, X, -X +/// select_cc setl[te] X, 0, -X, X +/// select_cc setlt X, 1, -X, X +/// which represent Integer ABS into: +/// Y = sra (X, size(X)-1); xor (add (X, Y), Y) +/// ARM instruction selection detects the latter and matches it to +/// ARM::ABS or ARM::t2ABS machine node. +SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){ + SDValue XORSrc0 = N->getOperand(0); + SDValue XORSrc1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + if (DisableARMIntABS) + return NULL; + + if (Subtarget->isThumb1Only()) + return NULL; + + if (XORSrc0.getOpcode() != ISD::ADD || + XORSrc1.getOpcode() != ISD::SRA) + return NULL; + + SDValue ADDSrc0 = XORSrc0.getOperand(0); + SDValue ADDSrc1 = XORSrc0.getOperand(1); + SDValue SRASrc0 = XORSrc1.getOperand(0); + SDValue SRASrc1 = XORSrc1.getOperand(1); + ConstantSDNode *SRAConstant = dyn_cast<ConstantSDNode>(SRASrc1); + EVT XType = SRASrc0.getValueType(); + unsigned Size = XType.getSizeInBits() - 1; + + if (ADDSrc1 == XORSrc1 && + ADDSrc0 == SRASrc0 && + XType.isInteger() && + SRAConstant != NULL && + Size == SRAConstant->getZExtValue()) { + + unsigned Opcode = ARM::ABS; + if (Subtarget->isThumb2()) + Opcode = ARM::t2ABS; + + return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0); + } + + return NULL; +} + +SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) { + // The only time a CONCAT_VECTORS operation can have legal types is when + // two 64-bit vectors are concatenated to a 128-bit vector. + EVT VT = N->getValueType(0); + if (!VT.is128BitVector() || N->getNumOperands() != 2) + llvm_unreachable("unexpected CONCAT_VECTORS"); + return PairDRegs(VT, N->getOperand(0), N->getOperand(1)); +} + +SDNode *ARMDAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { + SmallVector<SDValue, 6> Ops; + Ops.push_back(Node->getOperand(1)); // Ptr + Ops.push_back(Node->getOperand(2)); // Low part of Val1 + Ops.push_back(Node->getOperand(3)); // High part of Val1 + if (Opc == ARM::ATOMCMPXCHG6432) { + Ops.push_back(Node->getOperand(4)); // Low part of Val2 + Ops.push_back(Node->getOperand(5)); // High part of Val2 + } + Ops.push_back(Node->getOperand(0)); // Chain + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); + SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), + MVT::i32, MVT::i32, MVT::Other, + Ops.data() ,Ops.size()); + cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1); + return ResNode; +} + +SDNode *ARMDAGToDAGISel::Select(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + + if (N->isMachineOpcode()) + return NULL; // Already selected. + + switch (N->getOpcode()) { + default: break; + case ISD::XOR: { + // Select special operations if XOR node forms integer ABS pattern + SDNode *ResNode = SelectABSOp(N); + if (ResNode) + return ResNode; + // Other cases are autogenerated. + break; + } + case ISD::Constant: { + unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); + bool UseCP = true; + if (Subtarget->hasThumb2()) + // Thumb2-aware targets have the MOVT instruction, so all immediates can + // be done with MOV + MOVT, at worst. + UseCP = 0; + else { + if (Subtarget->isThumb()) { + UseCP = (Val > 255 && // MOV + ~Val > 255 && // MOV + MVN + !ARM_AM::isThumbImmShiftedVal(Val)); // MOV + LSL + } else + UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV + ARM_AM::getSOImmVal(~Val) == -1 && // MVN + !ARM_AM::isSOImmTwoPartVal(Val)); // two instrs. + } + + if (UseCP) { + SDValue CPIdx = + CurDAG->getTargetConstantPool(ConstantInt::get( + Type::getInt32Ty(*CurDAG->getContext()), Val), + TLI.getPointerTy()); + + SDNode *ResNode; + if (Subtarget->isThumb1Only()) { + SDValue Pred = getAL(CurDAG); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() }; + ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other, + Ops, 4); + } else { + SDValue Ops[] = { + CPIdx, + CurDAG->getTargetConstant(0, MVT::i32), + getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getEntryNode() + }; + ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, + Ops, 5); + } + ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0)); + return NULL; + } + + // Other cases are autogenerated. + break; + } + case ISD::FrameIndex: { + // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm. + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + if (Subtarget->isThumb1Only()) { + SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, Ops, 4); + } else { + unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ? + ARM::t2ADDri : ARM::ADDri); + SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); + } + } + case ISD::SRL: + if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false)) + return I; + break; + case ISD::SRA: + if (SDNode *I = SelectV6T2BitfieldExtractOp(N, true)) + return I; + break; + case ISD::MUL: + if (Subtarget->isThumb1Only()) + break; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + unsigned RHSV = C->getZExtValue(); + if (!RHSV) break; + if (isPowerOf2_32(RHSV-1)) { // 2^n+1? + unsigned ShImm = Log2_32(RHSV-1); + if (ShImm >= 32) + break; + SDValue V = N->getOperand(0); + ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm); + SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + if (Subtarget->isThumb()) { + SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; + return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops, 6); + } else { + SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; + return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops, 7); + } + } + if (isPowerOf2_32(RHSV+1)) { // 2^n-1? + unsigned ShImm = Log2_32(RHSV+1); + if (ShImm >= 32) + break; + SDValue V = N->getOperand(0); + ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm); + SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + if (Subtarget->isThumb()) { + SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; + return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 6); + } else { + SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; + return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops, 7); + } + } + } + break; + case ISD::AND: { + // Check for unsigned bitfield extract + if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false)) + return I; + + // (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits + // of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits + // are entirely contributed by c2 and lower 16-bits are entirely contributed + // by x. That's equal to (or (and x, 0xffff), (and c1, 0xffff0000)). + // Select it to: "movt x, ((c1 & 0xffff) >> 16) + EVT VT = N->getValueType(0); + if (VT != MVT::i32) + break; + unsigned Opc = (Subtarget->isThumb() && Subtarget->hasThumb2()) + ? ARM::t2MOVTi16 + : (Subtarget->hasV6T2Ops() ? ARM::MOVTi16 : 0); + if (!Opc) + break; + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + if (!N1C) + break; + if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) { + SDValue N2 = N0.getOperand(1); + ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2); + if (!N2C) + break; + unsigned N1CVal = N1C->getZExtValue(); + unsigned N2CVal = N2C->getZExtValue(); + if ((N1CVal & 0xffff0000U) == (N2CVal & 0xffff0000U) && + (N1CVal & 0xffffU) == 0xffffU && + (N2CVal & 0xffffU) == 0x0U) { + SDValue Imm16 = CurDAG->getTargetConstant((N2CVal & 0xFFFF0000U) >> 16, + MVT::i32); + SDValue Ops[] = { N0.getOperand(0), Imm16, + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(Opc, dl, VT, Ops, 4); + } + } + break; + } + case ARMISD::VMOVRRD: + return CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32, + N->getOperand(0), getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32)); + case ISD::UMUL_LOHI: { + if (Subtarget->isThumb1Only()) + break; + if (Subtarget->isThumb()) { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32,Ops,4); + } else { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? + ARM::UMULL : ARM::UMULLv5, + dl, MVT::i32, MVT::i32, Ops, 5); + } + } + case ISD::SMUL_LOHI: { + if (Subtarget->isThumb1Only()) + break; + if (Subtarget->isThumb()) { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32,Ops,4); + } else { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getMachineNode(Subtarget->hasV6Ops() ? + ARM::SMULL : ARM::SMULLv5, + dl, MVT::i32, MVT::i32, Ops, 5); + } + } + case ISD::LOAD: { + SDNode *ResNode = 0; + if (Subtarget->isThumb() && Subtarget->hasThumb2()) + ResNode = SelectT2IndexedLoad(N); + else + ResNode = SelectARMIndexedLoad(N); + if (ResNode) + return ResNode; + // Other cases are autogenerated. + break; + } + case ARMISD::BRCOND: { + // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) + // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + + // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) + // Emits: (tBcc:void (bb:Other):$dst, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + + // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) + // Emits: (t2Bcc:void (bb:Other):$dst, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + + unsigned Opc = Subtarget->isThumb() ? + ((Subtarget->hasThumb2()) ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc; + SDValue Chain = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + SDValue N3 = N->getOperand(3); + SDValue InFlag = N->getOperand(4); + assert(N1.getOpcode() == ISD::BasicBlock); + assert(N2.getOpcode() == ISD::Constant); + assert(N3.getOpcode() == ISD::Register); + + SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getZExtValue()), + MVT::i32); + SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag }; + SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, + MVT::Glue, Ops, 5); + Chain = SDValue(ResNode, 0); + if (N->getNumValues() == 2) { + InFlag = SDValue(ResNode, 1); + ReplaceUses(SDValue(N, 1), InFlag); + } + ReplaceUses(SDValue(N, 0), + SDValue(Chain.getNode(), Chain.getResNo())); + return NULL; + } + case ARMISD::CMOV: + return SelectCMOVOp(N); + case ARMISD::VZIP: { + unsigned Opc = 0; + EVT VT = N->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::v8i8: Opc = ARM::VZIPd8; break; + case MVT::v4i16: Opc = ARM::VZIPd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VZIPd32; break; + case MVT::v16i8: Opc = ARM::VZIPq8; break; + case MVT::v8i16: Opc = ARM::VZIPq16; break; + case MVT::v4f32: + case MVT::v4i32: Opc = ARM::VZIPq32; break; + } + SDValue Pred = getAL(CurDAG); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; + return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4); + } + case ARMISD::VUZP: { + unsigned Opc = 0; + EVT VT = N->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::v8i8: Opc = ARM::VUZPd8; break; + case MVT::v4i16: Opc = ARM::VUZPd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VUZPd32; break; + case MVT::v16i8: Opc = ARM::VUZPq8; break; + case MVT::v8i16: Opc = ARM::VUZPq16; break; + case MVT::v4f32: + case MVT::v4i32: Opc = ARM::VUZPq32; break; + } + SDValue Pred = getAL(CurDAG); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; + return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4); + } + case ARMISD::VTRN: { + unsigned Opc = 0; + EVT VT = N->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::v8i8: Opc = ARM::VTRNd8; break; + case MVT::v4i16: Opc = ARM::VTRNd16; break; + case MVT::v2f32: + case MVT::v2i32: Opc = ARM::VTRNd32; break; + case MVT::v16i8: Opc = ARM::VTRNq8; break; + case MVT::v8i16: Opc = ARM::VTRNq16; break; + case MVT::v4f32: + case MVT::v4i32: Opc = ARM::VTRNq32; break; + } + SDValue Pred = getAL(CurDAG); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; + return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4); + } + case ARMISD::BUILD_VECTOR: { + EVT VecVT = N->getValueType(0); + EVT EltVT = VecVT.getVectorElementType(); + unsigned NumElts = VecVT.getVectorNumElements(); + if (EltVT == MVT::f64) { + assert(NumElts == 2 && "unexpected type for BUILD_VECTOR"); + return PairDRegs(VecVT, N->getOperand(0), N->getOperand(1)); + } + assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR"); + if (NumElts == 2) + return PairSRegs(VecVT, N->getOperand(0), N->getOperand(1)); + assert(NumElts == 4 && "unexpected type for BUILD_VECTOR"); + return QuadSRegs(VecVT, N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3)); + } + + case ARMISD::VLD2DUP: { + unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd16Pseudo, + ARM::VLD2DUPd32Pseudo }; + return SelectVLDDup(N, false, 2, Opcodes); + } + + case ARMISD::VLD3DUP: { + unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo, + ARM::VLD3DUPd32Pseudo }; + return SelectVLDDup(N, false, 3, Opcodes); + } + + case ARMISD::VLD4DUP: { + unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo, + ARM::VLD4DUPd32Pseudo }; + return SelectVLDDup(N, false, 4, Opcodes); + } + + case ARMISD::VLD2DUP_UPD: { + unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo_UPD, ARM::VLD2DUPd16Pseudo_UPD, + ARM::VLD2DUPd32Pseudo_UPD }; + return SelectVLDDup(N, true, 2, Opcodes); + } + + case ARMISD::VLD3DUP_UPD: { + unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd16Pseudo_UPD, + ARM::VLD3DUPd32Pseudo_UPD }; + return SelectVLDDup(N, true, 3, Opcodes); + } + + case ARMISD::VLD4DUP_UPD: { + unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd16Pseudo_UPD, + ARM::VLD4DUPd32Pseudo_UPD }; + return SelectVLDDup(N, true, 4, Opcodes); + } + + case ARMISD::VLD1_UPD: { + unsigned DOpcodes[] = { ARM::VLD1d8_UPD, ARM::VLD1d16_UPD, + ARM::VLD1d32_UPD, ARM::VLD1d64_UPD }; + unsigned QOpcodes[] = { ARM::VLD1q8Pseudo_UPD, ARM::VLD1q16Pseudo_UPD, + ARM::VLD1q32Pseudo_UPD, ARM::VLD1q64Pseudo_UPD }; + return SelectVLD(N, true, 1, DOpcodes, QOpcodes, 0); + } + + case ARMISD::VLD2_UPD: { + unsigned DOpcodes[] = { ARM::VLD2d8Pseudo_UPD, ARM::VLD2d16Pseudo_UPD, + ARM::VLD2d32Pseudo_UPD, ARM::VLD1q64Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VLD2q8Pseudo_UPD, ARM::VLD2q16Pseudo_UPD, + ARM::VLD2q32Pseudo_UPD }; + return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0); + } + + case ARMISD::VLD3_UPD: { + unsigned DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, ARM::VLD3d16Pseudo_UPD, + ARM::VLD3d32Pseudo_UPD, ARM::VLD1d64TPseudo_UPD }; + unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, + ARM::VLD3q16Pseudo_UPD, + ARM::VLD3q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD, + ARM::VLD3q16oddPseudo_UPD, + ARM::VLD3q32oddPseudo_UPD }; + return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VLD4_UPD: { + unsigned DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD, + ARM::VLD4d32Pseudo_UPD, ARM::VLD1d64QPseudo_UPD }; + unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, + ARM::VLD4q16oddPseudo_UPD, + ARM::VLD4q32oddPseudo_UPD }; + return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VLD2LN_UPD: { + unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd16Pseudo_UPD, + ARM::VLD2LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD, + ARM::VLD2LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes); + } + + case ARMISD::VLD3LN_UPD: { + unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd16Pseudo_UPD, + ARM::VLD3LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD, + ARM::VLD3LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes); + } + + case ARMISD::VLD4LN_UPD: { + unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd16Pseudo_UPD, + ARM::VLD4LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD, + ARM::VLD4LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes); + } + + case ARMISD::VST1_UPD: { + unsigned DOpcodes[] = { ARM::VST1d8_UPD, ARM::VST1d16_UPD, + ARM::VST1d32_UPD, ARM::VST1d64_UPD }; + unsigned QOpcodes[] = { ARM::VST1q8Pseudo_UPD, ARM::VST1q16Pseudo_UPD, + ARM::VST1q32Pseudo_UPD, ARM::VST1q64Pseudo_UPD }; + return SelectVST(N, true, 1, DOpcodes, QOpcodes, 0); + } + + case ARMISD::VST2_UPD: { + unsigned DOpcodes[] = { ARM::VST2d8Pseudo_UPD, ARM::VST2d16Pseudo_UPD, + ARM::VST2d32Pseudo_UPD, ARM::VST1q64Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VST2q8Pseudo_UPD, ARM::VST2q16Pseudo_UPD, + ARM::VST2q32Pseudo_UPD }; + return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0); + } + + case ARMISD::VST3_UPD: { + unsigned DOpcodes[] = { ARM::VST3d8Pseudo_UPD, ARM::VST3d16Pseudo_UPD, + ARM::VST3d32Pseudo_UPD, ARM::VST1d64TPseudo_UPD }; + unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD, + ARM::VST3q16Pseudo_UPD, + ARM::VST3q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD, + ARM::VST3q16oddPseudo_UPD, + ARM::VST3q32oddPseudo_UPD }; + return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VST4_UPD: { + unsigned DOpcodes[] = { ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD, + ARM::VST4d32Pseudo_UPD, ARM::VST1d64QPseudo_UPD }; + unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, + ARM::VST4q16Pseudo_UPD, + ARM::VST4q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, + ARM::VST4q16oddPseudo_UPD, + ARM::VST4q32oddPseudo_UPD }; + return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case ARMISD::VST2LN_UPD: { + unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd16Pseudo_UPD, + ARM::VST2LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD, + ARM::VST2LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes); + } + + case ARMISD::VST3LN_UPD: { + unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd16Pseudo_UPD, + ARM::VST3LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD, + ARM::VST3LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes); + } + + case ARMISD::VST4LN_UPD: { + unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd16Pseudo_UPD, + ARM::VST4LNd32Pseudo_UPD }; + unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD, + ARM::VST4LNq32Pseudo_UPD }; + return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes); + } + + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: + break; + + case Intrinsic::arm_ldrexd: { + SDValue MemAddr = N->getOperand(2); + DebugLoc dl = N->getDebugLoc(); + SDValue Chain = N->getOperand(0); + + unsigned NewOpc = ARM::LDREXD; + if (Subtarget->isThumb() && Subtarget->hasThumb2()) + NewOpc = ARM::t2LDREXD; + + // arm_ldrexd returns a i64 value in {i32, i32} + std::vector<EVT> ResTys; + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + + // place arguments in the right order + SmallVector<SDValue, 7> Ops; + Ops.push_back(MemAddr); + Ops.push_back(getAL(CurDAG)); + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); + Ops.push_back(Chain); + SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops.data(), + Ops.size()); + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1); + + // Until there's support for specifing explicit register constraints + // like the use of even/odd register pair, hardcode ldrexd to always + // use the pair [R0, R1] to hold the load result. + Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ARM::R0, + SDValue(Ld, 0), SDValue(0,0)); + Chain = CurDAG->getCopyToReg(Chain, dl, ARM::R1, + SDValue(Ld, 1), Chain.getValue(1)); + + // Remap uses. + SDValue Glue = Chain.getValue(1); + if (!SDValue(N, 0).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + ARM::R0, MVT::i32, Glue); + Glue = Result.getValue(2); + ReplaceUses(SDValue(N, 0), Result); + } + if (!SDValue(N, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + ARM::R1, MVT::i32, Glue); + Glue = Result.getValue(2); + ReplaceUses(SDValue(N, 1), Result); + } + + ReplaceUses(SDValue(N, 2), SDValue(Ld, 2)); + return NULL; + } + + case Intrinsic::arm_strexd: { + DebugLoc dl = N->getDebugLoc(); + SDValue Chain = N->getOperand(0); + SDValue Val0 = N->getOperand(2); + SDValue Val1 = N->getOperand(3); + SDValue MemAddr = N->getOperand(4); + + // Until there's support for specifing explicit register constraints + // like the use of even/odd register pair, hardcode strexd to always + // use the pair [R2, R3] to hold the i64 (i32, i32) value to be stored. + Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ARM::R2, Val0, + SDValue(0, 0)); + Chain = CurDAG->getCopyToReg(Chain, dl, ARM::R3, Val1, Chain.getValue(1)); + + SDValue Glue = Chain.getValue(1); + Val0 = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + ARM::R2, MVT::i32, Glue); + Glue = Val0.getValue(1); + Val1 = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + ARM::R3, MVT::i32, Glue); + + // Store exclusive double return a i32 value which is the return status + // of the issued store. + std::vector<EVT> ResTys; + ResTys.push_back(MVT::i32); + ResTys.push_back(MVT::Other); + + // place arguments in the right order + SmallVector<SDValue, 7> Ops; + Ops.push_back(Val0); + Ops.push_back(Val1); + Ops.push_back(MemAddr); + Ops.push_back(getAL(CurDAG)); + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); + Ops.push_back(Chain); + + unsigned NewOpc = ARM::STREXD; + if (Subtarget->isThumb() && Subtarget->hasThumb2()) + NewOpc = ARM::t2STREXD; + + SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops.data(), + Ops.size()); + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); + + return St; + } + + case Intrinsic::arm_neon_vld1: { + unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, + ARM::VLD1d32, ARM::VLD1d64 }; + unsigned QOpcodes[] = { ARM::VLD1q8Pseudo, ARM::VLD1q16Pseudo, + ARM::VLD1q32Pseudo, ARM::VLD1q64Pseudo }; + return SelectVLD(N, false, 1, DOpcodes, QOpcodes, 0); + } + + case Intrinsic::arm_neon_vld2: { + unsigned DOpcodes[] = { ARM::VLD2d8Pseudo, ARM::VLD2d16Pseudo, + ARM::VLD2d32Pseudo, ARM::VLD1q64Pseudo }; + unsigned QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo, + ARM::VLD2q32Pseudo }; + return SelectVLD(N, false, 2, DOpcodes, QOpcodes, 0); + } + + case Intrinsic::arm_neon_vld3: { + unsigned DOpcodes[] = { ARM::VLD3d8Pseudo, ARM::VLD3d16Pseudo, + ARM::VLD3d32Pseudo, ARM::VLD1d64TPseudo }; + unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD, + ARM::VLD3q16Pseudo_UPD, + ARM::VLD3q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo, + ARM::VLD3q16oddPseudo, + ARM::VLD3q32oddPseudo }; + return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vld4: { + unsigned DOpcodes[] = { ARM::VLD4d8Pseudo, ARM::VLD4d16Pseudo, + ARM::VLD4d32Pseudo, ARM::VLD1d64QPseudo }; + unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo, + ARM::VLD4q16oddPseudo, + ARM::VLD4q32oddPseudo }; + return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vld2lane: { + unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo, ARM::VLD2LNd16Pseudo, + ARM::VLD2LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq32Pseudo }; + return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes); + } + + case Intrinsic::arm_neon_vld3lane: { + unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo, ARM::VLD3LNd16Pseudo, + ARM::VLD3LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq32Pseudo }; + return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes); + } + + case Intrinsic::arm_neon_vld4lane: { + unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo, ARM::VLD4LNd16Pseudo, + ARM::VLD4LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq32Pseudo }; + return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes); + } + + case Intrinsic::arm_neon_vst1: { + unsigned DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, + ARM::VST1d32, ARM::VST1d64 }; + unsigned QOpcodes[] = { ARM::VST1q8Pseudo, ARM::VST1q16Pseudo, + ARM::VST1q32Pseudo, ARM::VST1q64Pseudo }; + return SelectVST(N, false, 1, DOpcodes, QOpcodes, 0); + } + + case Intrinsic::arm_neon_vst2: { + unsigned DOpcodes[] = { ARM::VST2d8Pseudo, ARM::VST2d16Pseudo, + ARM::VST2d32Pseudo, ARM::VST1q64Pseudo }; + unsigned QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo, + ARM::VST2q32Pseudo }; + return SelectVST(N, false, 2, DOpcodes, QOpcodes, 0); + } + + case Intrinsic::arm_neon_vst3: { + unsigned DOpcodes[] = { ARM::VST3d8Pseudo, ARM::VST3d16Pseudo, + ARM::VST3d32Pseudo, ARM::VST1d64TPseudo }; + unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD, + ARM::VST3q16Pseudo_UPD, + ARM::VST3q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo, + ARM::VST3q16oddPseudo, + ARM::VST3q32oddPseudo }; + return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vst4: { + unsigned DOpcodes[] = { ARM::VST4d8Pseudo, ARM::VST4d16Pseudo, + ARM::VST4d32Pseudo, ARM::VST1d64QPseudo }; + unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, + ARM::VST4q16Pseudo_UPD, + ARM::VST4q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo, + ARM::VST4q16oddPseudo, + ARM::VST4q32oddPseudo }; + return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + } + + case Intrinsic::arm_neon_vst2lane: { + unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo, ARM::VST2LNd16Pseudo, + ARM::VST2LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo, ARM::VST2LNq32Pseudo }; + return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes); + } + + case Intrinsic::arm_neon_vst3lane: { + unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo, ARM::VST3LNd16Pseudo, + ARM::VST3LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo, ARM::VST3LNq32Pseudo }; + return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes); + } + + case Intrinsic::arm_neon_vst4lane: { + unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo, ARM::VST4LNd16Pseudo, + ARM::VST4LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo, ARM::VST4LNq32Pseudo }; + return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes); + } + } + break; + } + + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + break; + + case Intrinsic::arm_neon_vtbl2: + return SelectVTBL(N, false, 2, ARM::VTBL2Pseudo); + case Intrinsic::arm_neon_vtbl3: + return SelectVTBL(N, false, 3, ARM::VTBL3Pseudo); + case Intrinsic::arm_neon_vtbl4: + return SelectVTBL(N, false, 4, ARM::VTBL4Pseudo); + + case Intrinsic::arm_neon_vtbx2: + return SelectVTBL(N, true, 2, ARM::VTBX2Pseudo); + case Intrinsic::arm_neon_vtbx3: + return SelectVTBL(N, true, 3, ARM::VTBX3Pseudo); + case Intrinsic::arm_neon_vtbx4: + return SelectVTBL(N, true, 4, ARM::VTBX4Pseudo); + } + break; + } + + case ARMISD::VTBL1: { + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + SmallVector<SDValue, 6> Ops; + + Ops.push_back(N->getOperand(0)); + Ops.push_back(N->getOperand(1)); + Ops.push_back(getAL(CurDAG)); // Predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register + return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops.data(), Ops.size()); + } + case ARMISD::VTBL2: { + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + // Form a REG_SEQUENCE to force register allocation. + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + SDValue RegSeq = SDValue(PairDRegs(MVT::v16i8, V0, V1), 0); + + SmallVector<SDValue, 6> Ops; + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(2)); + Ops.push_back(getAL(CurDAG)); // Predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register + return CurDAG->getMachineNode(ARM::VTBL2Pseudo, dl, VT, + Ops.data(), Ops.size()); + } + + case ISD::CONCAT_VECTORS: + return SelectConcatVector(N); + + case ARMISD::ATOMOR64_DAG: + return SelectAtomic64(N, ARM::ATOMOR6432); + case ARMISD::ATOMXOR64_DAG: + return SelectAtomic64(N, ARM::ATOMXOR6432); + case ARMISD::ATOMADD64_DAG: + return SelectAtomic64(N, ARM::ATOMADD6432); + case ARMISD::ATOMSUB64_DAG: + return SelectAtomic64(N, ARM::ATOMSUB6432); + case ARMISD::ATOMNAND64_DAG: + return SelectAtomic64(N, ARM::ATOMNAND6432); + case ARMISD::ATOMAND64_DAG: + return SelectAtomic64(N, ARM::ATOMAND6432); + case ARMISD::ATOMSWAP64_DAG: + return SelectAtomic64(N, ARM::ATOMSWAP6432); + case ARMISD::ATOMCMPXCHG64_DAG: + return SelectAtomic64(N, ARM::ATOMCMPXCHG6432); + } + + return SelectCode(N); +} + +bool ARMDAGToDAGISel:: +SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, + std::vector<SDValue> &OutOps) { + assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); + // Require the address to be in a register. That is safe for all ARM + // variants and it is hard to do anything much smarter without knowing + // how the operand is used. + OutOps.push_back(Op); + return false; +} + +/// createARMISelDag - This pass converts a legalized DAG into a +/// ARM-specific DAG, ready for instruction scheduling. +/// +FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new ARMDAGToDAGISel(TM, OptLevel); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp new file mode 100644 index 0000000..e44e356 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -0,0 +1,8835 @@ +//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that ARM uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-isel" +#include "ARM.h" +#include "ARMCallingConv.h" +#include "ARMConstantPoolValue.h" +#include "ARMISelLowering.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMPerfectShuffle.h" +#include "ARMRegisterInfo.h" +#include "ARMSubtarget.h" +#include "ARMTargetMachine.h" +#include "ARMTargetObjectFile.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/GlobalValue.h" +#include "llvm/Instruction.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Type.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/IntrinsicLowering.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/VectorExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include <sstream> +using namespace llvm; + +STATISTIC(NumTailCalls, "Number of tail calls"); +STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); + +// This option should go away when tail calls fully work. +static cl::opt<bool> +EnableARMTailCalls("arm-tail-calls", cl::Hidden, + cl::desc("Generate tail calls (TEMPORARY OPTION)."), + cl::init(false)); + +cl::opt<bool> +EnableARMLongCalls("arm-long-calls", cl::Hidden, + cl::desc("Generate calls via indirect call instructions"), + cl::init(false)); + +static cl::opt<bool> +ARMInterworking("arm-interworking", cl::Hidden, + cl::desc("Enable / disable ARM interworking (for debugging only)"), + cl::init(true)); + +namespace llvm { + class ARMCCState : public CCState { + public: + ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, + const TargetMachine &TM, SmallVector<CCValAssign, 16> &locs, + LLVMContext &C, ParmContext PC) + : CCState(CC, isVarArg, MF, TM, locs, C) { + assert(((PC == Call) || (PC == Prologue)) && + "ARMCCState users must specify whether their context is call" + "or prologue generation."); + CallOrPrologue = PC; + } + }; +} + +// The APCS parameter registers. +static const unsigned GPRArgRegs[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3 +}; + +void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, + EVT PromotedBitwiseVT) { + if (VT != PromotedLdStVT) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::LOAD, VT.getSimpleVT(), + PromotedLdStVT.getSimpleVT()); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::STORE, VT.getSimpleVT(), + PromotedLdStVT.getSimpleVT()); + } + + EVT ElemTy = VT.getVectorElementType(); + if (ElemTy != MVT::i64 && ElemTy != MVT::f64) + setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); + if (ElemTy != MVT::i32) { + setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Expand); + } + setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Legal); + setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); + if (VT.isInteger()) { + setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); + setLoadExtAction(ISD::SEXTLOAD, VT.getSimpleVT(), Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT.getSimpleVT(), Expand); + for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) + setTruncStoreAction(VT.getSimpleVT(), + (MVT::SimpleValueType)InnerVT, Expand); + } + setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); + + // Promote all bit-wise operations. + if (VT.isInteger() && VT != PromotedBitwiseVT) { + setOperationAction(ISD::AND, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::AND, VT.getSimpleVT(), + PromotedBitwiseVT.getSimpleVT()); + setOperationAction(ISD::OR, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::OR, VT.getSimpleVT(), + PromotedBitwiseVT.getSimpleVT()); + setOperationAction(ISD::XOR, VT.getSimpleVT(), Promote); + AddPromotedToType (ISD::XOR, VT.getSimpleVT(), + PromotedBitwiseVT.getSimpleVT()); + } + + // Neon does not support vector divide/remainder operations. + setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); +} + +void ARMTargetLowering::addDRTypeForNEON(EVT VT) { + addRegisterClass(VT, ARM::DPRRegisterClass); + addTypeForNEON(VT, MVT::f64, MVT::v2i32); +} + +void ARMTargetLowering::addQRTypeForNEON(EVT VT) { + addRegisterClass(VT, ARM::QPRRegisterClass); + addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); +} + +static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { + if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin()) + return new TargetLoweringObjectFileMachO(); + + return new ARMElfTargetObjectFile(); +} + +ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) + : TargetLowering(TM, createTLOF(TM)) { + Subtarget = &TM.getSubtarget<ARMSubtarget>(); + RegInfo = TM.getRegisterInfo(); + Itins = TM.getInstrItineraryData(); + + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + if (Subtarget->isTargetDarwin()) { + // Uses VFP for Thumb libfuncs if available. + if (Subtarget->isThumb() && Subtarget->hasVFP2()) { + // Single-precision floating-point arithmetic. + setLibcallName(RTLIB::ADD_F32, "__addsf3vfp"); + setLibcallName(RTLIB::SUB_F32, "__subsf3vfp"); + setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp"); + setLibcallName(RTLIB::DIV_F32, "__divsf3vfp"); + + // Double-precision floating-point arithmetic. + setLibcallName(RTLIB::ADD_F64, "__adddf3vfp"); + setLibcallName(RTLIB::SUB_F64, "__subdf3vfp"); + setLibcallName(RTLIB::MUL_F64, "__muldf3vfp"); + setLibcallName(RTLIB::DIV_F64, "__divdf3vfp"); + + // Single-precision comparisons. + setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp"); + setLibcallName(RTLIB::UNE_F32, "__nesf2vfp"); + setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp"); + setLibcallName(RTLIB::OLE_F32, "__lesf2vfp"); + setLibcallName(RTLIB::OGE_F32, "__gesf2vfp"); + setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp"); + setLibcallName(RTLIB::UO_F32, "__unordsf2vfp"); + setLibcallName(RTLIB::O_F32, "__unordsf2vfp"); + + setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE); + setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ); + + // Double-precision comparisons. + setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp"); + setLibcallName(RTLIB::UNE_F64, "__nedf2vfp"); + setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp"); + setLibcallName(RTLIB::OLE_F64, "__ledf2vfp"); + setLibcallName(RTLIB::OGE_F64, "__gedf2vfp"); + setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp"); + setLibcallName(RTLIB::UO_F64, "__unorddf2vfp"); + setLibcallName(RTLIB::O_F64, "__unorddf2vfp"); + + setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE); + setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ); + + // Floating-point to integer conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp"); + setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp"); + setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp"); + setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp"); + + // Conversions between floating types. + setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp"); + setLibcallName(RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp"); + + // Integer to floating-point conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + // FIXME: There appears to be some naming inconsistency in ARM libgcc: + // e.g., __floatunsidf vs. __floatunssidfvfp. + setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp"); + setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp"); + setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp"); + setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp"); + } + } + + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, 0); + setLibcallName(RTLIB::SRL_I128, 0); + setLibcallName(RTLIB::SRA_I128, 0); + + if (Subtarget->isAAPCS_ABI()) { + // Double-precision floating-point arithmetic helper functions + // RTABI chapter 4.1.2, Table 2 + setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd"); + setLibcallName(RTLIB::DIV_F64, "__aeabi_ddiv"); + setLibcallName(RTLIB::MUL_F64, "__aeabi_dmul"); + setLibcallName(RTLIB::SUB_F64, "__aeabi_dsub"); + setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::ARM_AAPCS); + + // Double-precision floating-point comparison helper functions + // RTABI chapter 4.1.2, Table 3 + setLibcallName(RTLIB::OEQ_F64, "__aeabi_dcmpeq"); + setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE); + setLibcallName(RTLIB::UNE_F64, "__aeabi_dcmpeq"); + setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETEQ); + setLibcallName(RTLIB::OLT_F64, "__aeabi_dcmplt"); + setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE); + setLibcallName(RTLIB::OLE_F64, "__aeabi_dcmple"); + setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE); + setLibcallName(RTLIB::OGE_F64, "__aeabi_dcmpge"); + setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE); + setLibcallName(RTLIB::OGT_F64, "__aeabi_dcmpgt"); + setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE); + setLibcallName(RTLIB::UO_F64, "__aeabi_dcmpun"); + setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE); + setLibcallName(RTLIB::O_F64, "__aeabi_dcmpun"); + setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ); + setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UO_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::O_F64, CallingConv::ARM_AAPCS); + + // Single-precision floating-point arithmetic helper functions + // RTABI chapter 4.1.2, Table 4 + setLibcallName(RTLIB::ADD_F32, "__aeabi_fadd"); + setLibcallName(RTLIB::DIV_F32, "__aeabi_fdiv"); + setLibcallName(RTLIB::MUL_F32, "__aeabi_fmul"); + setLibcallName(RTLIB::SUB_F32, "__aeabi_fsub"); + setLibcallCallingConv(RTLIB::ADD_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::DIV_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::MUL_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SUB_F32, CallingConv::ARM_AAPCS); + + // Single-precision floating-point comparison helper functions + // RTABI chapter 4.1.2, Table 5 + setLibcallName(RTLIB::OEQ_F32, "__aeabi_fcmpeq"); + setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE); + setLibcallName(RTLIB::UNE_F32, "__aeabi_fcmpeq"); + setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETEQ); + setLibcallName(RTLIB::OLT_F32, "__aeabi_fcmplt"); + setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE); + setLibcallName(RTLIB::OLE_F32, "__aeabi_fcmple"); + setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE); + setLibcallName(RTLIB::OGE_F32, "__aeabi_fcmpge"); + setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE); + setLibcallName(RTLIB::OGT_F32, "__aeabi_fcmpgt"); + setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE); + setLibcallName(RTLIB::UO_F32, "__aeabi_fcmpun"); + setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE); + setLibcallName(RTLIB::O_F32, "__aeabi_fcmpun"); + setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ); + setLibcallCallingConv(RTLIB::OEQ_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UNE_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OLT_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OLE_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OGE_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OGT_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UO_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::O_F32, CallingConv::ARM_AAPCS); + + // Floating-point to integer conversions. + // RTABI chapter 4.1.2, Table 6 + setLibcallName(RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz"); + setLibcallName(RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz"); + setLibcallName(RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz"); + setLibcallName(RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz"); + setLibcallName(RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz"); + setLibcallName(RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz"); + setLibcallName(RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz"); + setLibcallName(RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz"); + setLibcallCallingConv(RTLIB::FPTOSINT_F64_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOUINT_F64_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOSINT_F64_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOSINT_F32_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOUINT_F32_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOSINT_F32_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::ARM_AAPCS); + + // Conversions between floating types. + // RTABI chapter 4.1.2, Table 7 + setLibcallName(RTLIB::FPROUND_F64_F32, "__aeabi_d2f"); + setLibcallName(RTLIB::FPEXT_F32_F64, "__aeabi_f2d"); + setLibcallCallingConv(RTLIB::FPROUND_F64_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPEXT_F32_F64, CallingConv::ARM_AAPCS); + + // Integer to floating-point conversions. + // RTABI chapter 4.1.2, Table 8 + setLibcallName(RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d"); + setLibcallName(RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d"); + setLibcallName(RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d"); + setLibcallName(RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d"); + setLibcallName(RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f"); + setLibcallName(RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f"); + setLibcallName(RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f"); + setLibcallName(RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f"); + setLibcallCallingConv(RTLIB::SINTTOFP_I32_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UINTTOFP_I32_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SINTTOFP_I64_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UINTTOFP_I64_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SINTTOFP_I32_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UINTTOFP_I32_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SINTTOFP_I64_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UINTTOFP_I64_F32, CallingConv::ARM_AAPCS); + + // Long long helper functions + // RTABI chapter 4.2, Table 9 + setLibcallName(RTLIB::MUL_I64, "__aeabi_lmul"); + setLibcallName(RTLIB::SDIV_I64, "__aeabi_ldivmod"); + setLibcallName(RTLIB::UDIV_I64, "__aeabi_uldivmod"); + setLibcallName(RTLIB::SHL_I64, "__aeabi_llsl"); + setLibcallName(RTLIB::SRL_I64, "__aeabi_llsr"); + setLibcallName(RTLIB::SRA_I64, "__aeabi_lasr"); + setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SHL_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SRL_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SRA_I64, CallingConv::ARM_AAPCS); + + // Integer division functions + // RTABI chapter 4.3.1 + setLibcallName(RTLIB::SDIV_I8, "__aeabi_idiv"); + setLibcallName(RTLIB::SDIV_I16, "__aeabi_idiv"); + setLibcallName(RTLIB::SDIV_I32, "__aeabi_idiv"); + setLibcallName(RTLIB::UDIV_I8, "__aeabi_uidiv"); + setLibcallName(RTLIB::UDIV_I16, "__aeabi_uidiv"); + setLibcallName(RTLIB::UDIV_I32, "__aeabi_uidiv"); + setLibcallCallingConv(RTLIB::SDIV_I8, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIV_I16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS); + + // Memory operations + // RTABI chapter 4.3.4 + setLibcallName(RTLIB::MEMCPY, "__aeabi_memcpy"); + setLibcallName(RTLIB::MEMMOVE, "__aeabi_memmove"); + setLibcallName(RTLIB::MEMSET, "__aeabi_memset"); + } + + // Use divmod compiler-rt calls for iOS 5.0 and later. + if (Subtarget->getTargetTriple().getOS() == Triple::IOS && + !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) { + setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); + setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); + } + + if (Subtarget->isThumb1Only()) + addRegisterClass(MVT::i32, ARM::tGPRRegisterClass); + else + addRegisterClass(MVT::i32, ARM::GPRRegisterClass); + if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { + addRegisterClass(MVT::f32, ARM::SPRRegisterClass); + if (!Subtarget->isFPOnlySP()) + addRegisterClass(MVT::f64, ARM::DPRRegisterClass); + + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + } + + if (Subtarget->hasNEON()) { + addDRTypeForNEON(MVT::v2f32); + addDRTypeForNEON(MVT::v8i8); + addDRTypeForNEON(MVT::v4i16); + addDRTypeForNEON(MVT::v2i32); + addDRTypeForNEON(MVT::v1i64); + + addQRTypeForNEON(MVT::v4f32); + addQRTypeForNEON(MVT::v2f64); + addQRTypeForNEON(MVT::v16i8); + addQRTypeForNEON(MVT::v8i16); + addQRTypeForNEON(MVT::v4i32); + addQRTypeForNEON(MVT::v2i64); + + // v2f64 is legal so that QR subregs can be extracted as f64 elements, but + // neither Neon nor VFP support any arithmetic operations on it. + setOperationAction(ISD::FADD, MVT::v2f64, Expand); + setOperationAction(ISD::FSUB, MVT::v2f64, Expand); + setOperationAction(ISD::FMUL, MVT::v2f64, Expand); + setOperationAction(ISD::FDIV, MVT::v2f64, Expand); + setOperationAction(ISD::FREM, MVT::v2f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); + setOperationAction(ISD::SETCC, MVT::v2f64, Expand); + setOperationAction(ISD::FNEG, MVT::v2f64, Expand); + setOperationAction(ISD::FABS, MVT::v2f64, Expand); + setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); + setOperationAction(ISD::FSIN, MVT::v2f64, Expand); + setOperationAction(ISD::FCOS, MVT::v2f64, Expand); + setOperationAction(ISD::FPOWI, MVT::v2f64, Expand); + setOperationAction(ISD::FPOW, MVT::v2f64, Expand); + setOperationAction(ISD::FLOG, MVT::v2f64, Expand); + setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); + setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); + setOperationAction(ISD::FEXP, MVT::v2f64, Expand); + setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); + setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); + setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); + setOperationAction(ISD::FRINT, MVT::v2f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); + setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); + + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + + // Neon does not support some operations on v1i64 and v2i64 types. + setOperationAction(ISD::MUL, MVT::v1i64, Expand); + // Custom handling for some quad-vector types to detect VMULL. + setOperationAction(ISD::MUL, MVT::v8i16, Custom); + setOperationAction(ISD::MUL, MVT::v4i32, Custom); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); + // Custom handling for some vector types to avoid expensive expansions + setOperationAction(ISD::SDIV, MVT::v4i16, Custom); + setOperationAction(ISD::SDIV, MVT::v8i8, Custom); + setOperationAction(ISD::UDIV, MVT::v4i16, Custom); + setOperationAction(ISD::UDIV, MVT::v8i8, Custom); + setOperationAction(ISD::SETCC, MVT::v1i64, Expand); + setOperationAction(ISD::SETCC, MVT::v2i64, Expand); + // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with + // a destination type that is wider than the source. + setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); + + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::FP_TO_UINT); + setTargetDAGCombine(ISD::FDIV); + } + + computeRegisterProperties(); + + // ARM does not have f32 extending load. + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + + // ARM does not have i1 sign extending load. + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + + // ARM supports all 4 flavors of integer indexed load / store. + if (!Subtarget->isThumb1Only()) { + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, MVT::i1, Legal); + setIndexedLoadAction(im, MVT::i8, Legal); + setIndexedLoadAction(im, MVT::i16, Legal); + setIndexedLoadAction(im, MVT::i32, Legal); + setIndexedStoreAction(im, MVT::i1, Legal); + setIndexedStoreAction(im, MVT::i8, Legal); + setIndexedStoreAction(im, MVT::i16, Legal); + setIndexedStoreAction(im, MVT::i32, Legal); + } + } + + // i64 operation support. + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); + if (Subtarget->isThumb1Only()) { + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + } + if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() + || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP())) + setOperationAction(ISD::MULHS, MVT::i32, Expand); + + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL, MVT::i64, Custom); + setOperationAction(ISD::SRA, MVT::i64, Custom); + + if (!Subtarget->isThumb1Only()) { + // FIXME: We should do this for Thumb1 as well. + setOperationAction(ISD::ADDC, MVT::i32, Custom); + setOperationAction(ISD::ADDE, MVT::i32, Custom); + setOperationAction(ISD::SUBC, MVT::i32, Custom); + setOperationAction(ISD::SUBE, MVT::i32, Custom); + } + + // ARM does not have ROTL. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::CTTZ, MVT::i32, Custom); + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) + setOperationAction(ISD::CTLZ, MVT::i32, Expand); + + // Only ARMv6 has BSWAP. + if (!Subtarget->hasV6Ops()) + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + + // These are expanded into libcalls. + if (!Subtarget->hasDivide() || !Subtarget->isThumb2()) { + // v7M has a hardware divider + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + } + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool, MVT::i32, Custom); + setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); + setOperationAction(ISD::BlockAddress, MVT::i32, Custom); + + setOperationAction(ISD::TRAP, MVT::Other, Legal); + + // Use the default implementation. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); + setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); + setExceptionPointerRegister(ARM::R0); + setExceptionSelectorRegister(ARM::R1); + + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use + // the default expansion. + // FIXME: This should be checking for v6k, not just v6. + if (Subtarget->hasDataBarrier() || + (Subtarget->hasV6Ops() && !Subtarget->isThumb())) { + // membarrier needs custom lowering; the rest are legal and handled + // normally. + setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); + // Custom lowering for 64-bit ops + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); + // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc. + setInsertFencesForAtomic(true); + } else { + // Set them all for expansion, which will force libcalls. + setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); + // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the + // Unordered/Monotonic case. + setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); + setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); + // Since the libcalls include locking, fold in the fences + setShouldFoldAtomicFences(true); + } + + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + + // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. + if (!Subtarget->hasV6Ops()) { + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); + } + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { + // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR + // iff target supports vfp2. + setOperationAction(ISD::BITCAST, MVT::i64, Custom); + setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); + } + + // We want to custom lower some of our intrinsics. + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + if (Subtarget->isTargetDarwin()) { + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_DISPATCHSETUP, MVT::Other, Custom); + setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); + } + + setOperationAction(ISD::SETCC, MVT::i32, Expand); + setOperationAction(ISD::SETCC, MVT::f32, Expand); + setOperationAction(ISD::SETCC, MVT::f64, Expand); + setOperationAction(ISD::SELECT, MVT::i32, Custom); + setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); + + setOperationAction(ISD::BRCOND, MVT::Other, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Custom); + setOperationAction(ISD::BR_CC, MVT::f32, Custom); + setOperationAction(ISD::BR_CC, MVT::f64, Custom); + setOperationAction(ISD::BR_JT, MVT::Other, Custom); + + // We don't support sin/cos/fmod/copysign/pow + setOperationAction(ISD::FSIN, MVT::f64, Expand); + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f32, Expand); + if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + } + setOperationAction(ISD::FPOW, MVT::f64, Expand); + setOperationAction(ISD::FPOW, MVT::f32, Expand); + + setOperationAction(ISD::FMA, MVT::f64, Expand); + setOperationAction(ISD::FMA, MVT::f32, Expand); + + // Various VFP goodness + if (!UseSoftFloat && !Subtarget->isThumb1Only()) { + // int <-> fp are custom expanded into bit_convert + ARMISD ops. + if (Subtarget->hasVFP2()) { + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + } + // Special handling for half-precision FP. + if (!Subtarget->hasFP16()) { + setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand); + setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand); + } + } + + // We have target-specific dag combine patterns for the following nodes: + // ARMISD::VMOVRRD - No need to call setTargetDAGCombine + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::MUL); + + if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON()) + setTargetDAGCombine(ISD::OR); + if (Subtarget->hasNEON()) + setTargetDAGCombine(ISD::AND); + + setStackPointerRegisterToSaveRestore(ARM::SP); + + if (UseSoftFloat || Subtarget->isThumb1Only() || !Subtarget->hasVFP2()) + setSchedulingPreference(Sched::RegPressure); + else + setSchedulingPreference(Sched::Hybrid); + + //// temporary - rewrite interface to use type + maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1; + + // On ARM arguments smaller than 4 bytes are extended, so all arguments + // are at least 4 bytes aligned. + setMinStackArgumentAlignment(4); + + benefitFromCodePlacementOpt = true; + + setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); +} + +// FIXME: It might make sense to define the representative register class as the +// nearest super-register that has a non-null superset. For example, DPR_VFP2 is +// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, +// SPR's representative would be DPR_VFP2. This should work well if register +// pressure tracking were modified such that a register use would increment the +// pressure of the register class's representative and all of it's super +// classes' representatives transitively. We have not implemented this because +// of the difficulty prior to coalescing of modeling operand register classes +// due to the common occurrence of cross class copies and subregister insertions +// and extractions. +std::pair<const TargetRegisterClass*, uint8_t> +ARMTargetLowering::findRepresentativeClass(EVT VT) const{ + const TargetRegisterClass *RRC = 0; + uint8_t Cost = 1; + switch (VT.getSimpleVT().SimpleTy) { + default: + return TargetLowering::findRepresentativeClass(VT); + // Use DPR as representative register class for all floating point + // and vector types. Since there are 32 SPR registers and 32 DPR registers so + // the cost is 1 for both f32 and f64. + case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: + case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: + RRC = ARM::DPRRegisterClass; + // When NEON is used for SP, only half of the register file is available + // because operations that define both SP and DP results will be constrained + // to the VFP2 class (D0-D15). We currently model this constraint prior to + // coalescing by double-counting the SP regs. See the FIXME above. + if (Subtarget->useNEONForSinglePrecisionFP()) + Cost = 2; + break; + case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: + case MVT::v4f32: case MVT::v2f64: + RRC = ARM::DPRRegisterClass; + Cost = 2; + break; + case MVT::v4i64: + RRC = ARM::DPRRegisterClass; + Cost = 4; + break; + case MVT::v8i64: + RRC = ARM::DPRRegisterClass; + Cost = 8; + break; + } + return std::make_pair(RRC, Cost); +} + +const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return 0; + case ARMISD::Wrapper: return "ARMISD::Wrapper"; + case ARMISD::WrapperDYN: return "ARMISD::WrapperDYN"; + case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; + case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; + case ARMISD::CALL: return "ARMISD::CALL"; + case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; + case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; + case ARMISD::tCALL: return "ARMISD::tCALL"; + case ARMISD::BRCOND: return "ARMISD::BRCOND"; + case ARMISD::BR_JT: return "ARMISD::BR_JT"; + case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; + case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; + case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; + case ARMISD::CMP: return "ARMISD::CMP"; + case ARMISD::CMPZ: return "ARMISD::CMPZ"; + case ARMISD::CMPFP: return "ARMISD::CMPFP"; + case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; + case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; + case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; + case ARMISD::CMOV: return "ARMISD::CMOV"; + + case ARMISD::RBIT: return "ARMISD::RBIT"; + + case ARMISD::FTOSI: return "ARMISD::FTOSI"; + case ARMISD::FTOUI: return "ARMISD::FTOUI"; + case ARMISD::SITOF: return "ARMISD::SITOF"; + case ARMISD::UITOF: return "ARMISD::UITOF"; + + case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; + case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; + case ARMISD::RRX: return "ARMISD::RRX"; + + case ARMISD::ADDC: return "ARMISD::ADDC"; + case ARMISD::ADDE: return "ARMISD::ADDE"; + case ARMISD::SUBC: return "ARMISD::SUBC"; + case ARMISD::SUBE: return "ARMISD::SUBE"; + + case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; + case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; + + case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; + case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_DISPATCHSETUP:return "ARMISD::EH_SJLJ_DISPATCHSETUP"; + + case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; + + case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; + + case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; + + case ARMISD::MEMBARRIER: return "ARMISD::MEMBARRIER"; + case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; + + case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; + + case ARMISD::VCEQ: return "ARMISD::VCEQ"; + case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; + case ARMISD::VCGE: return "ARMISD::VCGE"; + case ARMISD::VCGEZ: return "ARMISD::VCGEZ"; + case ARMISD::VCLEZ: return "ARMISD::VCLEZ"; + case ARMISD::VCGEU: return "ARMISD::VCGEU"; + case ARMISD::VCGT: return "ARMISD::VCGT"; + case ARMISD::VCGTZ: return "ARMISD::VCGTZ"; + case ARMISD::VCLTZ: return "ARMISD::VCLTZ"; + case ARMISD::VCGTU: return "ARMISD::VCGTU"; + case ARMISD::VTST: return "ARMISD::VTST"; + + case ARMISD::VSHL: return "ARMISD::VSHL"; + case ARMISD::VSHRs: return "ARMISD::VSHRs"; + case ARMISD::VSHRu: return "ARMISD::VSHRu"; + case ARMISD::VSHLLs: return "ARMISD::VSHLLs"; + case ARMISD::VSHLLu: return "ARMISD::VSHLLu"; + case ARMISD::VSHLLi: return "ARMISD::VSHLLi"; + case ARMISD::VSHRN: return "ARMISD::VSHRN"; + case ARMISD::VRSHRs: return "ARMISD::VRSHRs"; + case ARMISD::VRSHRu: return "ARMISD::VRSHRu"; + case ARMISD::VRSHRN: return "ARMISD::VRSHRN"; + case ARMISD::VQSHLs: return "ARMISD::VQSHLs"; + case ARMISD::VQSHLu: return "ARMISD::VQSHLu"; + case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu"; + case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs"; + case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu"; + case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu"; + case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs"; + case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu"; + case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; + case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; + case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; + case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; + case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; + case ARMISD::VDUP: return "ARMISD::VDUP"; + case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; + case ARMISD::VEXT: return "ARMISD::VEXT"; + case ARMISD::VREV64: return "ARMISD::VREV64"; + case ARMISD::VREV32: return "ARMISD::VREV32"; + case ARMISD::VREV16: return "ARMISD::VREV16"; + case ARMISD::VZIP: return "ARMISD::VZIP"; + case ARMISD::VUZP: return "ARMISD::VUZP"; + case ARMISD::VTRN: return "ARMISD::VTRN"; + case ARMISD::VTBL1: return "ARMISD::VTBL1"; + case ARMISD::VTBL2: return "ARMISD::VTBL2"; + case ARMISD::VMULLs: return "ARMISD::VMULLs"; + case ARMISD::VMULLu: return "ARMISD::VMULLu"; + case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; + case ARMISD::FMAX: return "ARMISD::FMAX"; + case ARMISD::FMIN: return "ARMISD::FMIN"; + case ARMISD::BFI: return "ARMISD::BFI"; + case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; + case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; + case ARMISD::VBSL: return "ARMISD::VBSL"; + case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; + case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; + case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; + case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; + case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; + case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; + case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; + case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; + case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; + case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD"; + case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; + case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; + case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; + case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; + case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; + case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; + case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; + case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; + case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; + case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; + } +} + +EVT ARMTargetLowering::getSetCCResultType(EVT VT) const { + if (!VT.isVector()) return getPointerTy(); + return VT.changeVectorElementTypeToInteger(); +} + +/// getRegClassFor - Return the register class that should be used for the +/// specified value type. +TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const { + // Map v4i64 to QQ registers but do not make the type legal. Similarly map + // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to + // load / store 4 to 8 consecutive D registers. + if (Subtarget->hasNEON()) { + if (VT == MVT::v4i64) + return ARM::QQPRRegisterClass; + else if (VT == MVT::v8i64) + return ARM::QQQQPRRegisterClass; + } + return TargetLowering::getRegClassFor(VT); +} + +// Create a fast isel object. +FastISel * +ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { + return ARM::createFastISel(funcInfo); +} + +/// getMaximalGlobalOffset - Returns the maximal possible offset which can +/// be used for loads / stores from the global. +unsigned ARMTargetLowering::getMaximalGlobalOffset() const { + return (Subtarget->isThumb1Only() ? 127 : 4095); +} + +Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { + unsigned NumVals = N->getNumValues(); + if (!NumVals) + return Sched::RegPressure; + + for (unsigned i = 0; i != NumVals; ++i) { + EVT VT = N->getValueType(i); + if (VT == MVT::Glue || VT == MVT::Other) + continue; + if (VT.isFloatingPoint() || VT.isVector()) + return Sched::Latency; + } + + if (!N->isMachineOpcode()) + return Sched::RegPressure; + + // Load are scheduled for latency even if there instruction itinerary + // is not available. + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); + + if (MCID.getNumDefs() == 0) + return Sched::RegPressure; + if (!Itins->isEmpty() && + Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) + return Sched::Latency; + + return Sched::RegPressure; +} + +//===----------------------------------------------------------------------===// +// Lowering Code +//===----------------------------------------------------------------------===// + +/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC +static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Unknown condition code!"); + case ISD::SETNE: return ARMCC::NE; + case ISD::SETEQ: return ARMCC::EQ; + case ISD::SETGT: return ARMCC::GT; + case ISD::SETGE: return ARMCC::GE; + case ISD::SETLT: return ARMCC::LT; + case ISD::SETLE: return ARMCC::LE; + case ISD::SETUGT: return ARMCC::HI; + case ISD::SETUGE: return ARMCC::HS; + case ISD::SETULT: return ARMCC::LO; + case ISD::SETULE: return ARMCC::LS; + } +} + +/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. +static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, + ARMCC::CondCodes &CondCode2) { + CondCode2 = ARMCC::AL; + switch (CC) { + default: llvm_unreachable("Unknown FP condition!"); + case ISD::SETEQ: + case ISD::SETOEQ: CondCode = ARMCC::EQ; break; + case ISD::SETGT: + case ISD::SETOGT: CondCode = ARMCC::GT; break; + case ISD::SETGE: + case ISD::SETOGE: CondCode = ARMCC::GE; break; + case ISD::SETOLT: CondCode = ARMCC::MI; break; + case ISD::SETOLE: CondCode = ARMCC::LS; break; + case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; + case ISD::SETO: CondCode = ARMCC::VC; break; + case ISD::SETUO: CondCode = ARMCC::VS; break; + case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; + case ISD::SETUGT: CondCode = ARMCC::HI; break; + case ISD::SETUGE: CondCode = ARMCC::PL; break; + case ISD::SETLT: + case ISD::SETULT: CondCode = ARMCC::LT; break; + case ISD::SETLE: + case ISD::SETULE: CondCode = ARMCC::LE; break; + case ISD::SETNE: + case ISD::SETUNE: CondCode = ARMCC::NE; break; + } +} + +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// + +#include "ARMGenCallingConv.inc" + +/// CCAssignFnForNode - Selects the correct CCAssignFn for a the +/// given CallingConvention value. +CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, + bool Return, + bool isVarArg) const { + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention"); + case CallingConv::Fast: + if (Subtarget->hasVFP2() && !isVarArg) { + if (!Subtarget->isAAPCS_ABI()) + return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); + // For AAPCS ABI targets, just use VFP variant of the calling convention. + return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); + } + // Fallthrough + case CallingConv::C: { + // Use target triple & subtarget features to do actual dispatch. + if (!Subtarget->isAAPCS_ABI()) + return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); + else if (Subtarget->hasVFP2() && + FloatABIType == FloatABI::Hard && !isVarArg) + return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); + return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); + } + case CallingConv::ARM_AAPCS_VFP: + return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); + case CallingConv::ARM_AAPCS: + return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); + case CallingConv::ARM_APCS: + return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); + } +} + +/// LowerCallResult - Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. +SDValue +ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + // Assign locations to each value returned by this call. + SmallVector<CCValAssign, 16> RVLocs; + ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext(), Call); + CCInfo.AnalyzeCallResult(Ins, + CCAssignFnForNode(CallConv, /* Return*/ true, + isVarArg)); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + + SDValue Val; + if (VA.needsCustom()) { + // Handle f64 or half of a v2f64. + SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, + InFlag); + Chain = Lo.getValue(1); + InFlag = Lo.getValue(2); + VA = RVLocs[++i]; // skip ahead to next loc + SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, + InFlag); + Chain = Hi.getValue(1); + InFlag = Hi.getValue(2); + Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); + + if (VA.getLocVT() == MVT::v2f64) { + SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, + DAG.getConstant(0, MVT::i32)); + + VA = RVLocs[++i]; // skip ahead to next loc + Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); + Chain = Lo.getValue(1); + InFlag = Lo.getValue(2); + VA = RVLocs[++i]; // skip ahead to next loc + Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); + Chain = Hi.getValue(1); + InFlag = Hi.getValue(2); + Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, + DAG.getConstant(1, MVT::i32)); + } + } else { + Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), + InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + } + + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); + break; + } + + InVals.push_back(Val); + } + + return Chain; +} + +/// LowerMemOpCallTo - Store the argument to the stack. +SDValue +ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, + SDValue StackPtr, SDValue Arg, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + return DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(LocMemOffset), + false, false, 0); +} + +void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, + SDValue Chain, SDValue &Arg, + RegsToPassVector &RegsToPass, + CCValAssign &VA, CCValAssign &NextVA, + SDValue &StackPtr, + SmallVector<SDValue, 8> &MemOpChains, + ISD::ArgFlagsTy Flags) const { + + SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), Arg); + RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd)); + + if (NextVA.isRegLoc()) + RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1))); + else { + assert(NextVA.isMemLoc()); + if (StackPtr.getNode() == 0) + StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1), + dl, DAG, NextVA, + Flags)); + } +} + +/// LowerCall - Lowering a call into a callseq_start <- +/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter +/// nodes. +SDValue +ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool &isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool IsSibCall = false; + // Disable tail calls if they're not supported. + if (!EnableARMTailCalls && !Subtarget->supportsTailCall()) + isTailCall = false; + if (isTailCall) { + // Check if it's really possible to do a tail call. + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), + Outs, OutVals, Ins, DAG); + // We don't support GuaranteedTailCallOpt for ARM, only automatically + // detected sibcalls. + if (isTailCall) { + ++NumTailCalls; + IsSibCall = true; + } + } + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext(), Call); + CCInfo.AnalyzeCallOperands(Outs, + CCAssignFnForNode(CallConv, /* Return*/ false, + isVarArg)); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + + // For tail calls, memory operands are available in our caller's stack. + if (IsSibCall) + NumBytes = 0; + + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog pass + if (!IsSibCall) + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + + SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + + RegsToPassVector RegsToPass; + SmallVector<SDValue, 8> MemOpChains; + + // Walk the register/memloc assignments, inserting copies/loads. In the case + // of tail call optimization, arguments are handled later. + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); + i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + bool isByVal = Flags.isByVal(); + + // Promote the value if needed. + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + break; + } + + // f64 and v2f64 might be passed in i32 pairs and must be split into pieces + if (VA.needsCustom()) { + if (VA.getLocVT() == MVT::v2f64) { + SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(0, MVT::i32)); + SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(1, MVT::i32)); + + PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, + VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); + + VA = ArgLocs[++i]; // skip ahead to next loc + if (VA.isRegLoc()) { + PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, + VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); + } else { + assert(VA.isMemLoc()); + + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, + dl, DAG, VA, Flags)); + } + } else { + PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], + StackPtr, MemOpChains, Flags); + } + } else if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else if (isByVal) { + assert(VA.isMemLoc()); + unsigned offset = 0; + + // True if this byval aggregate will be split between registers + // and memory. + if (CCInfo.isFirstByValRegValid()) { + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + unsigned int i, j; + for (i = 0, j = CCInfo.getFirstByValReg(); j < ARM::R4; i++, j++) { + SDValue Const = DAG.getConstant(4*i, MVT::i32); + SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); + SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, + MachinePointerInfo(), + false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(j, Load)); + } + offset = ARM::R4 - CCInfo.getFirstByValReg(); + CCInfo.clearFirstByValReg(); + } + + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset); + SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, + StkPtrOff); + SDValue SrcOffset = DAG.getIntPtrConstant(4*offset); + SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); + SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, + MVT::i32); + // TODO: Disable AlwaysInline when it becomes possible + // to emit a nested call sequence. + MemOpChains.push_back(DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, + Flags.getByValAlign(), + /*isVolatile=*/false, + /*AlwaysInline=*/true, + MachinePointerInfo(0), + MachinePointerInfo(0))); + + } else if (!IsSibCall) { + assert(VA.isMemLoc()); + + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, + dl, DAG, VA, Flags)); + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOpChains[0], MemOpChains.size()); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + // Tail call byval lowering might overwrite argument registers so in case of + // tail call optimization the copies to registers are lowered later. + if (!isTailCall) + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + // For tail calls lower the arguments to the 'real' stack slot. + if (isTailCall) { + // Force all the incoming stack arguments to be loaded from the stack + // before any new outgoing arguments are stored to the stack, because the + // outgoing stack slots may alias the incoming argument stack slots, and + // the alias isn't otherwise explicit. This is slightly more conservative + // than necessary, because it means that each store effectively depends + // on every argument instead of just those arguments it would clobber. + + // Do not flag preceding copytoreg stuff together with the following stuff. + InFlag = SDValue(); + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + InFlag =SDValue(); + } + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every + // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol + // node so that legalize doesn't hack it. + bool isDirect = false; + bool isARMFunc = false; + bool isLocalARMFunc = false; + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + if (EnableARMLongCalls) { + assert (getTargetMachine().getRelocationModel() == Reloc::Static + && "long-calls with non-static relocation model!"); + // Handle a global address or an external symbol. If it's not one of + // those, the target's already in a register, so we don't need to do + // anything extra. + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + // Create a constant pool entry for the callee address + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); + + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad(getPointerTy(), dl, + DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, 0); + } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { + const char *Sym = S->getSymbol(); + + // Create a constant pool entry for the callee address + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, + ARMPCLabelIndex, 0); + // Get the address of the callee into a register + SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad(getPointerTy(), dl, + DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, 0); + } + } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + isDirect = true; + bool isExt = GV->isDeclaration() || GV->isWeakForLinker(); + bool isStub = (isExt && Subtarget->isTargetDarwin()) && + getTargetMachine().getRelocationModel() != Reloc::Static; + isARMFunc = !Subtarget->isThumb() || isStub; + // ARM call to a local ARM function is predicable. + isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking); + // tBX takes a register source operand. + if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad(getPointerTy(), dl, + DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, 0); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Callee = DAG.getNode(ARMISD::PIC_ADD, dl, + getPointerTy(), Callee, PICLabel); + } else { + // On ELF targets for PIC code, direct calls should go through the PLT + unsigned OpFlags = 0; + if (Subtarget->isTargetELF() && + getTargetMachine().getRelocationModel() == Reloc::PIC_) + OpFlags = ARMII::MO_PLT; + Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); + } + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + isDirect = true; + bool isStub = Subtarget->isTargetDarwin() && + getTargetMachine().getRelocationModel() != Reloc::Static; + isARMFunc = !Subtarget->isThumb() || isStub; + // tBX takes a register source operand. + const char *Sym = S->getSymbol(); + if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, + ARMPCLabelIndex, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Callee = DAG.getLoad(getPointerTy(), dl, + DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, 0); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Callee = DAG.getNode(ARMISD::PIC_ADD, dl, + getPointerTy(), Callee, PICLabel); + } else { + unsigned OpFlags = 0; + // On ELF targets for PIC code, direct calls should go through the PLT + if (Subtarget->isTargetELF() && + getTargetMachine().getRelocationModel() == Reloc::PIC_) + OpFlags = ARMII::MO_PLT; + Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags); + } + } + + // FIXME: handle tail calls differently. + unsigned CallOpc; + if (Subtarget->isThumb()) { + if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) + CallOpc = ARMISD::CALL_NOLINK; + else + CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL; + } else { + CallOpc = (isDirect || Subtarget->hasV5TOps()) + ? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL) + : ARMISD::CALL_NOLINK; + } + + std::vector<SDValue> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + if (InFlag.getNode()) + Ops.push_back(InFlag); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + if (isTailCall) + return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + + // Returns a chain and a flag for retval copy to use. + Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size()); + InFlag = Chain.getValue(1); + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(0, true), InFlag); + if (!Ins.empty()) + InFlag = Chain.getValue(1); + + // Handle result values, copying them out of physregs into vregs that we + // return. + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, + dl, DAG, InVals); +} + +/// HandleByVal - Every parameter *after* a byval parameter is passed +/// on the stack. Remember the next parameter register to allocate, +/// and then confiscate the rest of the parameter registers to insure +/// this. +void +llvm::ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const { + unsigned reg = State->AllocateReg(GPRArgRegs, 4); + assert((State->getCallOrPrologue() == Prologue || + State->getCallOrPrologue() == Call) && + "unhandled ParmContext"); + if ((!State->isFirstByValRegValid()) && + (ARM::R0 <= reg) && (reg <= ARM::R3)) { + State->setFirstByValReg(reg); + // At a call site, a byval parameter that is split between + // registers and memory needs its size truncated here. In a + // function prologue, such byval parameters are reassembled in + // memory, and are not truncated. + if (State->getCallOrPrologue() == Call) { + unsigned excess = 4 * (ARM::R4 - reg); + assert(size >= excess && "expected larger existing stack allocation"); + size -= excess; + } + } + // Confiscate any remaining parameter registers to preclude their + // assignment to subsequent parameters. + while (State->AllocateReg(GPRArgRegs, 4)) + ; +} + +/// MatchingStackOffset - Return true if the given stack call argument is +/// already available in the same position (relatively) of the caller's +/// incoming argument stack. +static +bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, + MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, + const ARMInstrInfo *TII) { + unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + int FI = INT_MAX; + if (Arg.getOpcode() == ISD::CopyFromReg) { + unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(VR)) + return false; + MachineInstr *Def = MRI->getVRegDef(VR); + if (!Def) + return false; + if (!Flags.isByVal()) { + if (!TII->isLoadFromStackSlot(Def, FI)) + return false; + } else { + return false; + } + } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { + if (Flags.isByVal()) + // ByVal argument is passed in as a pointer but it's now being + // dereferenced. e.g. + // define @foo(%struct.X* %A) { + // tail call @bar(%struct.X* byval %A) + // } + return false; + SDValue Ptr = Ld->getBasePtr(); + FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); + if (!FINode) + return false; + FI = FINode->getIndex(); + } else + return false; + + assert(FI != INT_MAX); + if (!MFI->isFixedObjectIndex(FI)) + return false; + return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); +} + +/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// for tail call optimization. Targets which want to do tail call +/// optimization should implement this function. +bool +ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const { + const Function *CallerF = DAG.getMachineFunction().getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. + + // Do not sibcall optimize vararg calls unless the call site is not passing + // any arguments. + if (isVarArg && !Outs.empty()) + return false; + + // Also avoid sibcall optimization if either caller or callee uses struct + // return semantics. + if (isCalleeStructRet || isCallerStructRet) + return false; + + // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo:: + // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as + // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation + // support in the assembler and linker to be used. This would need to be + // fixed to fully support tail calls in Thumb1. + // + // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take + // LR. This means if we need to reload LR, it takes an extra instructions, + // which outweighs the value of the tail call; but here we don't know yet + // whether LR is going to be used. Probably the right approach is to + // generate the tail call here and turn it back into CALL/RET in + // emitEpilogue if LR is used. + + // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, + // but we need to make sure there are enough registers; the only valid + // registers are the 4 used for parameters. We don't currently do this + // case. + if (Subtarget->isThumb1Only()) + return false; + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector<CCValAssign, 16> RVLocs1; + ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs1, *DAG.getContext(), Call); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg)); + + SmallVector<CCValAssign, 16> RVLocs2; + ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs2, *DAG.getContext(), Call); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg)); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // If the callee takes no arguments then go on to check the results of the + // call. + if (!Outs.empty()) { + // Check if stack adjustment is needed. For now, do not do this if any + // argument is passed on the stack. + SmallVector<CCValAssign, 16> ArgLocs; + ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext(), Call); + CCInfo.AnalyzeCallOperands(Outs, + CCAssignFnForNode(CalleeCC, false, isVarArg)); + if (CCInfo.getNextStackOffset()) { + MachineFunction &MF = DAG.getMachineFunction(); + + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const ARMInstrInfo *TII = + ((ARMTargetMachine&)getTargetMachine()).getInstrInfo(); + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); + i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + EVT RegVT = VA.getLocVT(); + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + if (VA.getLocInfo() == CCValAssign::Indirect) + return false; + if (VA.needsCustom()) { + // f64 and vector types are split into multiple registers or + // register/stack-slot combinations. The types will not match + // the registers; give up on memory f64 refs until we figure + // out what to do about this. + if (!VA.isRegLoc()) + return false; + if (!ArgLocs[++i].isRegLoc()) + return false; + if (RegVT == MVT::v2f64) { + if (!ArgLocs[++i].isRegLoc()) + return false; + if (!ArgLocs[++i].isRegLoc()) + return false; + } + } else if (!VA.isRegLoc()) { + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, + MFI, MRI, TII)) + return false; + } + } + } + } + + return true; +} + +SDValue +ARMTargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const { + + // CCValAssign - represent the assignment of the return value to a location. + SmallVector<CCValAssign, 16> RVLocs; + + // CCState - Info about the registers and stack slots. + ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext(), Call); + + // Analyze outgoing return values. + CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true, + isVarArg)); + + // If this is the first return lowered for this function, add + // the regs to the liveout set for the function. + if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { + for (unsigned i = 0; i != RVLocs.size(); ++i) + if (RVLocs[i].isRegLoc()) + DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); + } + + SDValue Flag; + + // Copy the result values into the output registers. + for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + SDValue Arg = OutVals[realRVLocIdx]; + + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + break; + } + + if (VA.needsCustom()) { + if (VA.getLocVT() == MVT::v2f64) { + // Extract the first half and return it in two registers. + SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(0, MVT::i32)); + SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), Half); + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag); + Flag = Chain.getValue(1); + VA = RVLocs[++i]; // skip ahead to next loc + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(1), Flag); + Flag = Chain.getValue(1); + VA = RVLocs[++i]; // skip ahead to next loc + + // Extract the 2nd half and fall through to handle it as an f64 value. + Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(1, MVT::i32)); + } + // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is + // available. + SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag); + Flag = Chain.getValue(1); + VA = RVLocs[++i]; // skip ahead to next loc + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1), + Flag); + } else + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); + + // Guarantee that all emitted copies are + // stuck together, avoiding something bad. + Flag = Chain.getValue(1); + } + + SDValue result; + if (Flag.getNode()) + result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain, Flag); + else // Return Void + result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain); + + return result; +} + +bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + unsigned NumCopies = 0; + SDNode* Copies[2]; + SDNode *Use = *N->use_begin(); + if (Use->getOpcode() == ISD::CopyToReg) { + Copies[NumCopies++] = Use; + } else if (Use->getOpcode() == ARMISD::VMOVRRD) { + // f64 returned in a pair of GPRs. + for (SDNode::use_iterator UI = Use->use_begin(), UE = Use->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != ISD::CopyToReg) + return false; + Copies[UI.getUse().getResNo()] = *UI; + ++NumCopies; + } + } else if (Use->getOpcode() == ISD::BITCAST) { + // f32 returned in a single GPR. + if (!Use->hasNUsesOfValue(1, 0)) + return false; + Use = *Use->use_begin(); + if (Use->getOpcode() != ISD::CopyToReg || !Use->hasNUsesOfValue(1, 0)) + return false; + Copies[NumCopies++] = Use; + } else { + return false; + } + + if (NumCopies != 1 && NumCopies != 2) + return false; + + bool HasRet = false; + for (unsigned i = 0; i < NumCopies; ++i) { + SDNode *Copy = Copies[i]; + for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() == ISD::CopyToReg) { + SDNode *Use = *UI; + if (Use == Copies[0] || Use == Copies[1]) + continue; + return false; + } + if (UI->getOpcode() != ARMISD::RET_FLAG) + return false; + HasRet = true; + } + } + + return HasRet; +} + +bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + if (!EnableARMTailCalls) + return false; + + if (!CI->isTailCall()) + return false; + + return !Subtarget->isThumb1Only(); +} + +// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as +// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is +// one of the above mentioned nodes. It has to be wrapped because otherwise +// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only +// be used to form addressing mode. These wrapped nodes will be selected +// into MOVi. +static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { + EVT PtrVT = Op.getValueType(); + // FIXME there is no actual debug info here + DebugLoc dl = Op.getDebugLoc(); + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + SDValue Res; + if (CP->isMachineConstantPoolEntry()) + Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, + CP->getAlignment()); + else + Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, + CP->getAlignment()); + return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); +} + +unsigned ARMTargetLowering::getJumpTableEncoding() const { + return MachineJumpTableInfo::EK_Inline; +} + +SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = 0; + DebugLoc DL = Op.getDebugLoc(); + EVT PtrVT = getPointerTy(); + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + SDValue CPAddr; + if (RelocM == Reloc::Static) { + CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); + } else { + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, + ARMCP::CPBlockAddress, PCAdj); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + } + CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); + SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, 0); + if (RelocM == Reloc::Static) + return Result; + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); +} + +// Lower ISD::GlobalTLSAddress using the "general dynamic" model +SDValue +ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, + SelectionDAG &DAG) const { + DebugLoc dl = GA->getDebugLoc(); + EVT PtrVT = getPointerTy(); + unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, + ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); + SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); + Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, + MachinePointerInfo::getConstantPool(), + false, false, 0); + SDValue Chain = Argument.getValue(1); + + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); + + // call __tls_get_addr. + ArgListTy Args; + ArgListEntry Entry; + Entry.Node = Argument; + Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); + Args.push_back(Entry); + // FIXME: is there useful debug info available here? + std::pair<SDValue, SDValue> CallResult = + LowerCallTo(Chain, (Type *) Type::getInt32Ty(*DAG.getContext()), + false, false, false, false, + 0, CallingConv::C, false, /*isReturnValueUsed=*/true, + DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl); + return CallResult.first; +} + +// Lower ISD::GlobalTLSAddress using the "initial exec" or +// "local exec" model. +SDValue +ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, + SelectionDAG &DAG) const { + const GlobalValue *GV = GA->getGlobal(); + DebugLoc dl = GA->getDebugLoc(); + SDValue Offset; + SDValue Chain = DAG.getEntryNode(); + EVT PtrVT = getPointerTy(); + // Get the Thread Pointer + SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); + + if (GV->isDeclaration()) { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + // Initial exec model. + unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, + ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, + true); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); + Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(), + false, false, 0); + Chain = Offset.getValue(1); + + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); + + Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(), + false, false, 0); + } else { + // local exec model + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); + Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(), + false, false, 0); + } + + // The address of the thread local variable is the add of the thread + // pointer with the offset of the variable. + return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); +} + +SDValue +ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + // TODO: implement the "local dynamic" model + assert(Subtarget->isTargetELF() && + "TLS not implemented for non-ELF targets"); + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + // If the relocation model is PIC, use the "General Dynamic" TLS Model, + // otherwise use the "Local Exec" TLS Model + if (getTargetMachine().getRelocationModel() == Reloc::PIC_) + return LowerToTLSGeneralDynamicModel(GA, DAG); + else + return LowerToTLSExecModels(GA, DAG); +} + +SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(); + DebugLoc dl = Op.getDebugLoc(); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + if (RelocM == Reloc::PIC_) { + bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GV, + UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, 0); + SDValue Chain = Result.getValue(1); + SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); + Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT); + if (!UseGOTOFF) + Result = DAG.getLoad(PtrVT, dl, Chain, Result, + MachinePointerInfo::getGOT(), false, false, 0); + return Result; + } + + // If we have T2 ops, we can materialize the address directly via movt/movw + // pair. This is always cheaper. + if (Subtarget->useMovt()) { + ++NumMovwMovt; + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes. + return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, + DAG.getTargetGlobalAddress(GV, dl, PtrVT)); + } else { + SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, 0); + } +} + +SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(); + DebugLoc dl = Op.getDebugLoc(); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + // FIXME: Enable this for static codegen when tool issues are fixed. + if (Subtarget->useMovt() && RelocM != Reloc::Static) { + ++NumMovwMovt; + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes. + if (RelocM == Reloc::Static) + return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, + DAG.getTargetGlobalAddress(GV, dl, PtrVT)); + + unsigned Wrapper = (RelocM == Reloc::PIC_) + ? ARMISD::WrapperPIC : ARMISD::WrapperDYN; + SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, + DAG.getTargetGlobalAddress(GV, dl, PtrVT)); + if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) + Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(), false, false, 0); + return Result; + } + + unsigned ARMPCLabelIndex = 0; + SDValue CPAddr; + if (RelocM == Reloc::Static) { + CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); + } else { + ARMPCLabelIndex = AFI->createPICLabelUId(); + unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, + PCAdj); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + } + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + + SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, 0); + SDValue Chain = Result.getValue(1); + + if (RelocM == Reloc::PIC_) { + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + } + + if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) + Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(), + false, false, 0); + + return Result; +} + +SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetELF() && + "GLOBAL OFFSET TABLE not implemented for non-ELF targets"); + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + EVT PtrVT = getPointerTy(); + DebugLoc dl = Op.getDebugLoc(); + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = + ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_", + ARMPCLabelIndex, PCAdj); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, 0); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); +} + +SDValue +ARMTargetLowering::LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG) + const { + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other, + Op.getOperand(0), Op.getOperand(1)); +} + +SDValue +ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + SDValue Val = DAG.getConstant(0, MVT::i32); + return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, + DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), + Op.getOperand(1), Val); +} + +SDValue +ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), + Op.getOperand(1), DAG.getConstant(0, MVT::i32)); +} + +SDValue +ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) const { + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + DebugLoc dl = Op.getDebugLoc(); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::arm_thread_pointer: { + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); + } + case Intrinsic::eh_sjlj_lsda: { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + EVT PtrVT = getPointerTy(); + DebugLoc dl = Op.getDebugLoc(); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + SDValue CPAddr; + unsigned PCAdj = (RelocM != Reloc::PIC_) + ? 0 : (Subtarget->isThumb() ? 4 : 8); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex, + ARMCP::CPLSDA, PCAdj); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + SDValue Result = + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, 0); + + if (RelocM == Reloc::PIC_) { + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + } + return Result; + } + case Intrinsic::arm_neon_vmulls: + case Intrinsic::arm_neon_vmullu: { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) + ? ARMISD::VMULLs : ARMISD::VMULLu; + return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + } +} + +static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + DebugLoc dl = Op.getDebugLoc(); + if (!Subtarget->hasDataBarrier()) { + // Some ARMv6 cpus can support data barriers with an mcr instruction. + // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get + // here. + assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && + "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); + return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(0, MVT::i32)); + } + + SDValue Op5 = Op.getOperand(5); + bool isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue() != 0; + unsigned isLL = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + unsigned isLS = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + bool isOnlyStoreBarrier = (isLL == 0 && isLS == 0); + + ARM_MB::MemBOpt DMBOpt; + if (isDeviceBarrier) + DMBOpt = isOnlyStoreBarrier ? ARM_MB::ST : ARM_MB::SY; + else + DMBOpt = isOnlyStoreBarrier ? ARM_MB::ISHST : ARM_MB::ISH; + return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(DMBOpt, MVT::i32)); +} + + +static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + // FIXME: handle "fence singlethread" more efficiently. + DebugLoc dl = Op.getDebugLoc(); + if (!Subtarget->hasDataBarrier()) { + // Some ARMv6 cpus can support data barriers with an mcr instruction. + // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get + // here. + assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && + "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); + return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(0, MVT::i32)); + } + + return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(ARM_MB::ISH, MVT::i32)); +} + +static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + // ARM pre v5TE and Thumb1 does not have preload instructions. + if (!(Subtarget->isThumb2() || + (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) + // Just preserve the chain. + return Op.getOperand(0); + + DebugLoc dl = Op.getDebugLoc(); + unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; + if (!isRead && + (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) + // ARMv7 with MP extension has PLDW. + return Op.getOperand(0); + + unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); + if (Subtarget->isThumb()) { + // Invert the bits. + isRead = ~isRead & 1; + isData = ~isData & 1; + } + + return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), + Op.getOperand(1), DAG.getConstant(isRead, MVT::i32), + DAG.getConstant(isData, MVT::i32)); +} + +static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); + + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. + DebugLoc dl = Op.getDebugLoc(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); +} + +SDValue +ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, + SDValue &Root, SelectionDAG &DAG, + DebugLoc dl) const { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + TargetRegisterClass *RC; + if (AFI->isThumb1OnlyFunction()) + RC = ARM::tGPRRegisterClass; + else + RC = ARM::GPRRegisterClass; + + // Transform the arguments stored in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); + + SDValue ArgValue2; + if (NextVA.isMemLoc()) { + MachineFrameInfo *MFI = MF.getFrameInfo(); + int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true); + + // Create load node to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, 0); + } else { + Reg = MF.addLiveIn(NextVA.getLocReg(), RC); + ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); + } + + return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); +} + +void +ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, + unsigned &VARegSize, unsigned &VARegSaveSize) + const { + unsigned NumGPRs; + if (CCInfo.isFirstByValRegValid()) + NumGPRs = ARM::R4 - CCInfo.getFirstByValReg(); + else { + unsigned int firstUnalloced; + firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs, + sizeof(GPRArgRegs) / + sizeof(GPRArgRegs[0])); + NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0; + } + + unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); + VARegSize = NumGPRs * 4; + VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1); +} + +// The remaining GPRs hold either the beginning of variable-argument +// data, or the beginning of an aggregate passed by value (usuall +// byval). Either way, we allocate stack slots adjacent to the data +// provided by our caller, and store the unallocated registers there. +// If this is a variadic function, the va_list pointer will begin with +// these values; otherwise, this reassembles a (byval) structure that +// was split between registers and memory. +void +ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc dl, SDValue &Chain, + unsigned ArgOffset) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned firstRegToSaveIndex; + if (CCInfo.isFirstByValRegValid()) + firstRegToSaveIndex = CCInfo.getFirstByValReg() - ARM::R0; + else { + firstRegToSaveIndex = CCInfo.getFirstUnallocated + (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); + } + + unsigned VARegSize, VARegSaveSize; + computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); + if (VARegSaveSize) { + // If this function is vararg, store any remaining integer argument regs + // to their spots on the stack so that they may be loaded by deferencing + // the result of va_next. + AFI->setVarArgsRegSaveSize(VARegSaveSize); + AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(VARegSaveSize, + ArgOffset + VARegSaveSize + - VARegSize, + false)); + SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), + getPointerTy()); + + SmallVector<SDValue, 4> MemOps; + for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) { + TargetRegisterClass *RC; + if (AFI->isThumb1OnlyFunction()) + RC = ARM::tGPRRegisterClass; + else + RC = ARM::GPRRegisterClass; + + unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()), + false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, + DAG.getConstant(4, getPointerTy())); + } + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOps[0], MemOps.size()); + } else + // This will point to the next argument passed via stack. + AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true)); +} + +SDValue +ARMTargetLowering::LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> + &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) + const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + // Assign locations to all of the incoming arguments. + SmallVector<CCValAssign, 16> ArgLocs; + ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext(), Prologue); + CCInfo.AnalyzeFormalArguments(Ins, + CCAssignFnForNode(CallConv, /* Return*/ false, + isVarArg)); + + SmallVector<SDValue, 16> ArgValues; + int lastInsIndex = -1; + + SDValue ArgValue; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + + // Arguments stored in registers. + if (VA.isRegLoc()) { + EVT RegVT = VA.getLocVT(); + + if (VA.needsCustom()) { + // f64 and vector types are split up into multiple registers or + // combinations of registers and stack slots. + if (VA.getLocVT() == MVT::v2f64) { + SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + VA = ArgLocs[++i]; // skip ahead to next loc + SDValue ArgValue2; + if (VA.isMemLoc()) { + int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, 0); + } else { + ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], + Chain, DAG, dl); + } + ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue1, DAG.getIntPtrConstant(0)); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue2, DAG.getIntPtrConstant(1)); + } else + ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); + + } else { + TargetRegisterClass *RC; + + if (RegVT == MVT::f32) + RC = ARM::SPRRegisterClass; + else if (RegVT == MVT::f64) + RC = ARM::DPRRegisterClass; + else if (RegVT == MVT::v2f64) + RC = ARM::QPRRegisterClass; + else if (RegVT == MVT::i32) + RC = (AFI->isThumb1OnlyFunction() ? + ARM::tGPRRegisterClass : ARM::GPRRegisterClass); + else + llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); + + // Transform the arguments in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + } + + // If this is an 8 or 16-bit value, it is really passed promoted + // to 32 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::BCvt: + ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); + break; + case CCValAssign::SExt: + ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + break; + case CCValAssign::ZExt: + ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + break; + } + + InVals.push_back(ArgValue); + + } else { // VA.isRegLoc() + + // sanity check + assert(VA.isMemLoc()); + assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); + + int index = ArgLocs[i].getValNo(); + + // Some Ins[] entries become multiple ArgLoc[] entries. + // Process them only once. + if (index != lastInsIndex) + { + ISD::ArgFlagsTy Flags = Ins[index].Flags; + // FIXME: For now, all byval parameter objects are marked mutable. + // This can be changed with more analysis. + // In case of tail call optimization mark all arguments mutable. + // Since they could be overwritten by lowering of arguments in case of + // a tail call. + if (Flags.isByVal()) { + unsigned VARegSize, VARegSaveSize; + computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0); + unsigned Bytes = Flags.getByValSize() - VARegSize; + if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. + int FI = MFI->CreateFixedObject(Bytes, + VA.getLocMemOffset(), false); + InVals.push_back(DAG.getFrameIndex(FI, getPointerTy())); + } else { + int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, + VA.getLocMemOffset(), true); + + // Create load nodes to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, 0)); + } + lastInsIndex = index; + } + } + } + + // varargs + if (isVarArg) + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset()); + + return Chain; +} + +/// isFloatingPointZero - Return true if this is +0.0. +static bool isFloatingPointZero(SDValue Op) { + if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) + return CFP->getValueAPF().isPosZero(); + else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { + // Maybe this has already been legalized into the constant pool? + if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { + SDValue WrapperOp = Op.getOperand(1).getOperand(0); + if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) + return CFP->getValueAPF().isPosZero(); + } + } + return false; +} + +/// Returns appropriate ARM CMP (cmp) and corresponding condition code for +/// the given operands. +SDValue +ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDValue &ARMcc, SelectionDAG &DAG, + DebugLoc dl) const { + if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { + unsigned C = RHSC->getZExtValue(); + if (!isLegalICmpImmediate(C)) { + // Constant does not fit, try adjusting it by one? + switch (CC) { + default: break; + case ISD::SETLT: + case ISD::SETGE: + if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { + CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; + RHS = DAG.getConstant(C-1, MVT::i32); + } + break; + case ISD::SETULT: + case ISD::SETUGE: + if (C != 0 && isLegalICmpImmediate(C-1)) { + CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; + RHS = DAG.getConstant(C-1, MVT::i32); + } + break; + case ISD::SETLE: + case ISD::SETGT: + if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { + CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; + RHS = DAG.getConstant(C+1, MVT::i32); + } + break; + case ISD::SETULE: + case ISD::SETUGT: + if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { + CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; + RHS = DAG.getConstant(C+1, MVT::i32); + } + break; + } + } + } + + ARMCC::CondCodes CondCode = IntCCToARMCC(CC); + ARMISD::NodeType CompareType; + switch (CondCode) { + default: + CompareType = ARMISD::CMP; + break; + case ARMCC::EQ: + case ARMCC::NE: + // Uses only Z Flag + CompareType = ARMISD::CMPZ; + break; + } + ARMcc = DAG.getConstant(CondCode, MVT::i32); + return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); +} + +/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. +SDValue +ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, + DebugLoc dl) const { + SDValue Cmp; + if (!isFloatingPointZero(RHS)) + Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); + else + Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS); + return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); +} + +/// duplicateCmp - Glue values can have only one use, so this function +/// duplicates a comparison node. +SDValue +ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { + unsigned Opc = Cmp.getOpcode(); + DebugLoc DL = Cmp.getDebugLoc(); + if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) + return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); + + assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); + Cmp = Cmp.getOperand(0); + Opc = Cmp.getOpcode(); + if (Opc == ARMISD::CMPFP) + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); + else { + assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); + } + return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); +} + +SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + SDValue Cond = Op.getOperand(0); + SDValue SelectTrue = Op.getOperand(1); + SDValue SelectFalse = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); + + // Convert: + // + // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) + // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) + // + if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { + const ConstantSDNode *CMOVTrue = + dyn_cast<ConstantSDNode>(Cond.getOperand(0)); + const ConstantSDNode *CMOVFalse = + dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + + if (CMOVTrue && CMOVFalse) { + unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); + unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); + + SDValue True; + SDValue False; + if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { + True = SelectTrue; + False = SelectFalse; + } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { + True = SelectFalse; + False = SelectTrue; + } + + if (True.getNode() && False.getNode()) { + EVT VT = Op.getValueType(); + SDValue ARMcc = Cond.getOperand(2); + SDValue CCR = Cond.getOperand(3); + SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); + assert(True.getValueType() == VT); + return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp); + } + } + } + + return DAG.getSelectCC(dl, Cond, + DAG.getConstant(0, Cond.getValueType()), + SelectTrue, SelectFalse, ISD::SETNE); +} + +SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); + SDValue TrueVal = Op.getOperand(2); + SDValue FalseVal = Op.getOperand(3); + DebugLoc dl = Op.getDebugLoc(); + + if (LHS.getValueType() == MVT::i32) { + SDValue ARMcc; + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp); + } + + ARMCC::CondCodes CondCode, CondCode2; + FPCCToARMCC(CC, CondCode, CondCode2); + + SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, + ARMcc, CCR, Cmp); + if (CondCode2 != ARMCC::AL) { + SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32); + // FIXME: Needs another CMP because flag can have but one use. + SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); + Result = DAG.getNode(ARMISD::CMOV, dl, VT, + Result, TrueVal, ARMcc2, CCR, Cmp2); + } + return Result; +} + +/// canChangeToInt - Given the fp compare operand, return true if it is suitable +/// to morph to an integer compare sequence. +static bool canChangeToInt(SDValue Op, bool &SeenZero, + const ARMSubtarget *Subtarget) { + SDNode *N = Op.getNode(); + if (!N->hasOneUse()) + // Otherwise it requires moving the value from fp to integer registers. + return false; + if (!N->getNumValues()) + return false; + EVT VT = Op.getValueType(); + if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) + // f32 case is generally profitable. f64 case only makes sense when vcmpe + + // vmrs are very slow, e.g. cortex-a8. + return false; + + if (isFloatingPointZero(Op)) { + SeenZero = true; + return true; + } + return ISD::isNormalLoad(N); +} + +static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { + if (isFloatingPointZero(Op)) + return DAG.getConstant(0, MVT::i32); + + if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) + return DAG.getLoad(MVT::i32, Op.getDebugLoc(), + Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->getAlignment()); + + llvm_unreachable("Unknown VFP cmp argument!"); +} + +static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, + SDValue &RetVal1, SDValue &RetVal2) { + if (isFloatingPointZero(Op)) { + RetVal1 = DAG.getConstant(0, MVT::i32); + RetVal2 = DAG.getConstant(0, MVT::i32); + return; + } + + if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { + SDValue Ptr = Ld->getBasePtr(); + RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(), + Ld->getChain(), Ptr, + Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->getAlignment()); + + EVT PtrType = Ptr.getValueType(); + unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); + SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(), + PtrType, Ptr, DAG.getConstant(4, PtrType)); + RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(), + Ld->getChain(), NewPtr, + Ld->getPointerInfo().getWithOffset(4), + Ld->isVolatile(), Ld->isNonTemporal(), + NewAlign); + return; + } + + llvm_unreachable("Unknown VFP cmp argument!"); +} + +/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some +/// f32 and even f64 comparisons to integer ones. +SDValue +ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); + DebugLoc dl = Op.getDebugLoc(); + + bool SeenZero = false; + if (canChangeToInt(LHS, SeenZero, Subtarget) && + canChangeToInt(RHS, SeenZero, Subtarget) && + // If one of the operand is zero, it's safe to ignore the NaN case since + // we only care about equality comparisons. + (SeenZero || (DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS)))) { + // If unsafe fp math optimization is enabled and there are no other uses of + // the CMP operands, and the condition code is EQ or NE, we can optimize it + // to an integer comparison. + if (CC == ISD::SETOEQ) + CC = ISD::SETEQ; + else if (CC == ISD::SETUNE) + CC = ISD::SETNE; + + SDValue ARMcc; + if (LHS.getValueType() == MVT::f32) { + LHS = bitcastf32Toi32(LHS, DAG); + RHS = bitcastf32Toi32(RHS, DAG); + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, + Chain, Dest, ARMcc, CCR, Cmp); + } + + SDValue LHS1, LHS2; + SDValue RHS1, RHS2; + expandf64Toi32(LHS, DAG, LHS1, LHS2); + expandf64Toi32(RHS, DAG, RHS1, RHS2); + ARMCC::CondCodes CondCode = IntCCToARMCC(CC); + ARMcc = DAG.getConstant(CondCode, MVT::i32); + SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; + return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7); + } + + return SDValue(); +} + +SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); + DebugLoc dl = Op.getDebugLoc(); + + if (LHS.getValueType() == MVT::i32) { + SDValue ARMcc; + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, + Chain, Dest, ARMcc, CCR, Cmp); + } + + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + + if (UnsafeFPMath && + (CC == ISD::SETEQ || CC == ISD::SETOEQ || + CC == ISD::SETNE || CC == ISD::SETUNE)) { + SDValue Result = OptimizeVFPBrcond(Op, DAG); + if (Result.getNode()) + return Result; + } + + ARMCC::CondCodes CondCode, CondCode2; + FPCCToARMCC(CC, CondCode, CondCode2); + + SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32); + SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; + SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5); + if (CondCode2 != ARMCC::AL) { + ARMcc = DAG.getConstant(CondCode2, MVT::i32); + SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; + Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5); + } + return Res; +} + +SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Table = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); + + EVT PTy = getPointerTy(); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); + ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); + SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy); + SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); + Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId); + Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy)); + SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table); + if (Subtarget->isThumb2()) { + // Thumb2 uses a two-level jump. That is, it jumps into the jump table + // which does another jump to the destination. This also makes it easier + // to translate it to TBB / TBH later. + // FIXME: This might not work if the function is extremely large. + return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, + Addr, Op.getOperand(2), JTI, UId); + } + if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { + Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, + MachinePointerInfo::getJumpTable(), + false, false, 0); + Chain = Addr.getValue(1); + Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); + return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); + } else { + Addr = DAG.getLoad(PTy, dl, Chain, Addr, + MachinePointerInfo::getJumpTable(), false, false, 0); + Chain = Addr.getValue(1); + return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); + } +} + +static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); + unsigned Opc; + + switch (Op.getOpcode()) { + default: + assert(0 && "Invalid opcode!"); + case ISD::FP_TO_SINT: + Opc = ARMISD::FTOSI; + break; + case ISD::FP_TO_UINT: + Opc = ARMISD::FTOUI; + break; + } + Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0)); + return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); +} + +static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + + assert(Op.getOperand(0).getValueType() == MVT::v4i16 && + "Invalid type for custom lowering!"); + if (VT != MVT::v4f32) + return DAG.UnrollVectorOp(Op.getNode()); + + unsigned CastOpc; + unsigned Opc; + switch (Op.getOpcode()) { + default: + assert(0 && "Invalid opcode!"); + case ISD::SINT_TO_FP: + CastOpc = ISD::SIGN_EXTEND; + Opc = ISD::SINT_TO_FP; + break; + case ISD::UINT_TO_FP: + CastOpc = ISD::ZERO_EXTEND; + Opc = ISD::UINT_TO_FP; + break; + } + + Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0)); + return DAG.getNode(Opc, dl, VT, Op); +} + +static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + if (VT.isVector()) + return LowerVectorINT_TO_FP(Op, DAG); + + DebugLoc dl = Op.getDebugLoc(); + unsigned Opc; + + switch (Op.getOpcode()) { + default: + assert(0 && "Invalid opcode!"); + case ISD::SINT_TO_FP: + Opc = ARMISD::SITOF; + break; + case ISD::UINT_TO_FP: + Opc = ARMISD::UITOF; + break; + } + + Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0)); + return DAG.getNode(Opc, dl, VT, Op); +} + +SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { + // Implement fcopysign with a fabs and a conditional fneg. + SDValue Tmp0 = Op.getOperand(0); + SDValue Tmp1 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + EVT SrcVT = Tmp1.getValueType(); + bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || + Tmp0.getOpcode() == ARMISD::VMOVDRR; + bool UseNEON = !InGPR && Subtarget->hasNEON(); + + if (UseNEON) { + // Use VBSL to copy the sign bit. + unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); + SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, + DAG.getTargetConstant(EncodedVal, MVT::i32)); + EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; + if (VT == MVT::f64) + Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, + DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), + DAG.getConstant(32, MVT::i32)); + else /*if (VT == MVT::f32)*/ + Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); + if (SrcVT == MVT::f32) { + Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); + if (VT == MVT::f64) + Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, + DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), + DAG.getConstant(32, MVT::i32)); + } else if (VT == MVT::f32) + Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), + DAG.getConstant(32, MVT::i32)); + Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); + Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); + + SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), + MVT::i32); + AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); + SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, + DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); + + SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, + DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), + DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); + if (VT == MVT::f32) { + Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, + DAG.getConstant(0, MVT::i32)); + } else { + Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); + } + + return Res; + } + + // Bitcast operand 1 to i32. + if (SrcVT == MVT::f64) + Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), + &Tmp1, 1).getValue(1); + Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); + + // Or in the signbit with integer operations. + SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32); + SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32); + Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); + if (VT == MVT::f32) { + Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, + DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); + return DAG.getNode(ISD::BITCAST, dl, MVT::f32, + DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); + } + + // f64: Or the high part with signbit and then combine two parts. + Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), + &Tmp0, 1); + SDValue Lo = Tmp0.getValue(0); + SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); + Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); + return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); +} + +SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + if (Depth) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = DAG.getConstant(4, MVT::i32); + return DAG.getLoad(VT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), + MachinePointerInfo(), false, false, 0); + } + + // Return LR, which contains the return address. Mark it an implicit live-in. + unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); +} + +SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin()) + ? ARM::R7 : ARM::R11; + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), + false, false, 0); + return FrameAddr; +} + +/// ExpandBITCAST - If the target supports VFP, this function is called to +/// expand a bit convert where either the source or destination type is i64 to +/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 +/// operand type is illegal (e.g., v2f32 for a target that doesn't support +/// vectors), since the legalizer won't know what to do with that. +static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + DebugLoc dl = N->getDebugLoc(); + SDValue Op = N->getOperand(0); + + // This function is only supposed to be called for i64 types, either as the + // source or destination of the bit convert. + EVT SrcVT = Op.getValueType(); + EVT DstVT = N->getValueType(0); + assert((SrcVT == MVT::i64 || DstVT == MVT::i64) && + "ExpandBITCAST called for non-i64 type"); + + // Turn i64->f64 into VMOVDRR. + if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, + DAG.getConstant(0, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, + DAG.getConstant(1, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, DstVT, + DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); + } + + // Turn f64->i64 into VMOVRRD. + if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { + SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), &Op, 1); + // Merge the pieces into a single i64 value. + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); + } + + return SDValue(); +} + +/// getZeroVector - Returns a vector of specified type with all zero elements. +/// Zero vectors are used to represent vector negation and in those cases +/// will be implemented with the NEON VNEG instruction. However, VNEG does +/// not support i64 elements, so sometimes the zero vectors will need to be +/// explicitly constructed. Regardless, use a canonical VMOV to create the +/// zero vector. +static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { + assert(VT.isVector() && "Expected a vector type"); + // The canonical modified immediate encoding of a zero vector is....0! + SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32); + EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; + SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); + return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); +} + +/// LowerShiftRightParts - Lower SRA_PARTS, which returns two +/// i32 values and take a 2 x i32 value to shift plus a shift amount. +SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue ARMcc; + unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; + + assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, MVT::i32), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE, + ARMcc, DAG, dl); + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, + CCR, Cmp); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, 2, dl); +} + +/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two +/// i32 values and take a 2 x i32 value to shift plus a shift amount. +SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue ARMcc; + + assert(Op.getOpcode() == ISD::SHL_PARTS); + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, MVT::i32), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE, + ARMcc, DAG, dl); + SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc, + CCR, Cmp); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, 2, dl); +} + +SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, + SelectionDAG &DAG) const { + // The rounding mode is in bits 23:22 of the FPSCR. + // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 + // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) + // so that the shift + and get folded into a bitfield extract. + DebugLoc dl = Op.getDebugLoc(); + SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, + DAG.getConstant(Intrinsic::arm_get_fpscr, + MVT::i32)); + SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, + DAG.getConstant(1U << 22, MVT::i32)); + SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, + DAG.getConstant(22, MVT::i32)); + return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, + DAG.getConstant(3, MVT::i32)); +} + +static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + DebugLoc dl = N->getDebugLoc(); + + if (!ST->hasV6T2Ops()) + return SDValue(); + + SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0)); + return DAG.getNode(ISD::CTLZ, dl, VT, rbit); +} + +static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + DebugLoc dl = N->getDebugLoc(); + + if (!VT.isVector()) + return SDValue(); + + // Lower vector shifts on NEON to use VSHL. + assert(ST->hasNEON() && "unexpected vector shift"); + + // Left shifts translate directly to the vshiftu intrinsic. + if (N->getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32), + N->getOperand(0), N->getOperand(1)); + + assert((N->getOpcode() == ISD::SRA || + N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); + + // NEON uses the same intrinsics for both left and right shifts. For + // right shifts, the shift amounts are negative, so negate the vector of + // shift amounts. + EVT ShiftVT = N->getOperand(1).getValueType(); + SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, + getZeroVector(ShiftVT, DAG, dl), + N->getOperand(1)); + Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? + Intrinsic::arm_neon_vshifts : + Intrinsic::arm_neon_vshiftu); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(vshiftInt, MVT::i32), + N->getOperand(0), NegatedCount); +} + +static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + DebugLoc dl = N->getDebugLoc(); + + // We can get here for a node like i32 = ISD::SHL i32, i64 + if (VT != MVT::i64) + return SDValue(); + + assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && + "Unknown shift to lower!"); + + // We only lower SRA, SRL of 1 here, all others use generic lowering. + if (!isa<ConstantSDNode>(N->getOperand(1)) || + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1) + return SDValue(); + + // If we are in thumb mode, we don't have RRX. + if (ST->isThumb1Only()) return SDValue(); + + // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), + DAG.getConstant(0, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), + DAG.getConstant(1, MVT::i32)); + + // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and + // captures the result into a carry flag. + unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; + Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), &Hi, 1); + + // The low part is an ARMISD::RRX operand, which shifts the carry in. + Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); + + // Merge the pieces into a single i64 value. + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); +} + +static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { + SDValue TmpOp0, TmpOp1; + bool Invert = false; + bool Swap = false; + unsigned Opc = 0; + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + EVT VT = Op.getValueType(); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + DebugLoc dl = Op.getDebugLoc(); + + if (Op.getOperand(1).getValueType().isFloatingPoint()) { + switch (SetCCOpcode) { + default: llvm_unreachable("Illegal FP comparison"); break; + case ISD::SETUNE: + case ISD::SETNE: Invert = true; // Fallthrough + case ISD::SETOEQ: + case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETOLT: + case ISD::SETLT: Swap = true; // Fallthrough + case ISD::SETOGT: + case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETOLE: + case ISD::SETLE: Swap = true; // Fallthrough + case ISD::SETOGE: + case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETUGE: Swap = true; // Fallthrough + case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; + case ISD::SETUGT: Swap = true; // Fallthrough + case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; + case ISD::SETUEQ: Invert = true; // Fallthrough + case ISD::SETONE: + // Expand this to (OLT | OGT). + TmpOp0 = Op0; + TmpOp1 = Op1; + Opc = ISD::OR; + Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0); + Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1); + break; + case ISD::SETUO: Invert = true; // Fallthrough + case ISD::SETO: + // Expand this to (OLT | OGE). + TmpOp0 = Op0; + TmpOp1 = Op1; + Opc = ISD::OR; + Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0); + Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1); + break; + } + } else { + // Integer comparisons. + switch (SetCCOpcode) { + default: llvm_unreachable("Illegal integer comparison"); break; + case ISD::SETNE: Invert = true; + case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETLT: Swap = true; + case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETLE: Swap = true; + case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETULT: Swap = true; + case ISD::SETUGT: Opc = ARMISD::VCGTU; break; + case ISD::SETULE: Swap = true; + case ISD::SETUGE: Opc = ARMISD::VCGEU; break; + } + + // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). + if (Opc == ARMISD::VCEQ) { + + SDValue AndOp; + if (ISD::isBuildVectorAllZeros(Op1.getNode())) + AndOp = Op0; + else if (ISD::isBuildVectorAllZeros(Op0.getNode())) + AndOp = Op1; + + // Ignore bitconvert. + if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) + AndOp = AndOp.getOperand(0); + + if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { + Opc = ARMISD::VTST; + Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0)); + Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1)); + Invert = !Invert; + } + } + } + + if (Swap) + std::swap(Op0, Op1); + + // If one of the operands is a constant vector zero, attempt to fold the + // comparison to a specialized compare-against-zero form. + SDValue SingleOp; + if (ISD::isBuildVectorAllZeros(Op1.getNode())) + SingleOp = Op0; + else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { + if (Opc == ARMISD::VCGE) + Opc = ARMISD::VCLEZ; + else if (Opc == ARMISD::VCGT) + Opc = ARMISD::VCLTZ; + SingleOp = Op1; + } + + SDValue Result; + if (SingleOp.getNode()) { + switch (Opc) { + case ARMISD::VCEQ: + Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break; + case ARMISD::VCGE: + Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break; + case ARMISD::VCLEZ: + Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break; + case ARMISD::VCGT: + Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break; + case ARMISD::VCLTZ: + Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break; + default: + Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + } + } else { + Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + } + + if (Invert) + Result = DAG.getNOT(dl, Result, VT); + + return Result; +} + +/// isNEONModifiedImm - Check if the specified splat value corresponds to a +/// valid vector constant for a NEON instruction with a "modified immediate" +/// operand (e.g., VMOV). If so, return the encoded value. +static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, + unsigned SplatBitSize, SelectionDAG &DAG, + EVT &VT, bool is128Bits, NEONModImmType type) { + unsigned OpCmode, Imm; + + // SplatBitSize is set to the smallest size that splats the vector, so a + // zero vector will always have SplatBitSize == 8. However, NEON modified + // immediate instructions others than VMOV do not support the 8-bit encoding + // of a zero vector, and the default encoding of zero is supposed to be the + // 32-bit version. + if (SplatBits == 0) + SplatBitSize = 32; + + switch (SplatBitSize) { + case 8: + if (type != VMOVModImm) + return SDValue(); + // Any 1-byte value is OK. Op=0, Cmode=1110. + assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); + OpCmode = 0xe; + Imm = SplatBits; + VT = is128Bits ? MVT::v16i8 : MVT::v8i8; + break; + + case 16: + // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. + VT = is128Bits ? MVT::v8i16 : MVT::v4i16; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x00nn: Op=x, Cmode=100x. + OpCmode = 0x8; + Imm = SplatBits; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0xnn00: Op=x, Cmode=101x. + OpCmode = 0xa; + Imm = SplatBits >> 8; + break; + } + return SDValue(); + + case 32: + // NEON's 32-bit VMOV supports splat values where: + // * only one byte is nonzero, or + // * the least significant byte is 0xff and the second byte is nonzero, or + // * the least significant 2 bytes are 0xff and the third is nonzero. + VT = is128Bits ? MVT::v4i32 : MVT::v2i32; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x000000nn: Op=x, Cmode=000x. + OpCmode = 0; + Imm = SplatBits; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0x0000nn00: Op=x, Cmode=001x. + OpCmode = 0x2; + Imm = SplatBits >> 8; + break; + } + if ((SplatBits & ~0xff0000) == 0) { + // Value = 0x00nn0000: Op=x, Cmode=010x. + OpCmode = 0x4; + Imm = SplatBits >> 16; + break; + } + if ((SplatBits & ~0xff000000) == 0) { + // Value = 0xnn000000: Op=x, Cmode=011x. + OpCmode = 0x6; + Imm = SplatBits >> 24; + break; + } + + // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC + if (type == OtherModImm) return SDValue(); + + if ((SplatBits & ~0xffff) == 0 && + ((SplatBits | SplatUndef) & 0xff) == 0xff) { + // Value = 0x0000nnff: Op=x, Cmode=1100. + OpCmode = 0xc; + Imm = SplatBits >> 8; + SplatBits |= 0xff; + break; + } + + if ((SplatBits & ~0xffffff) == 0 && + ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { + // Value = 0x00nnffff: Op=x, Cmode=1101. + OpCmode = 0xd; + Imm = SplatBits >> 16; + SplatBits |= 0xffff; + break; + } + + // Note: there are a few 32-bit splat values (specifically: 00ffff00, + // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not + // VMOV.I32. A (very) minor optimization would be to replicate the value + // and fall through here to test for a valid 64-bit splat. But, then the + // caller would also need to check and handle the change in size. + return SDValue(); + + case 64: { + if (type != VMOVModImm) + return SDValue(); + // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. + uint64_t BitMask = 0xff; + uint64_t Val = 0; + unsigned ImmMask = 1; + Imm = 0; + for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { + if (((SplatBits | SplatUndef) & BitMask) == BitMask) { + Val |= BitMask; + Imm |= ImmMask; + } else if ((SplatBits & BitMask) != 0) { + return SDValue(); + } + BitMask <<= 8; + ImmMask <<= 1; + } + // Op=1, Cmode=1110. + OpCmode = 0x1e; + SplatBits = Val; + VT = is128Bits ? MVT::v2i64 : MVT::v1i64; + break; + } + + default: + llvm_unreachable("unexpected size for isNEONModifiedImm"); + return SDValue(); + } + + unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); + return DAG.getTargetConstant(EncodedVal, MVT::i32); +} + +static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT, + bool &ReverseVEXT, unsigned &Imm) { + unsigned NumElts = VT.getVectorNumElements(); + ReverseVEXT = false; + + // Assume that the first shuffle index is not UNDEF. Fail if it is. + if (M[0] < 0) + return false; + + Imm = M[0]; + + // If this is a VEXT shuffle, the immediate value is the index of the first + // element. The other shuffle indices must be the successive elements after + // the first one. + unsigned ExpectedElt = Imm; + for (unsigned i = 1; i < NumElts; ++i) { + // Increment the expected index. If it wraps around, it may still be + // a VEXT but the source vectors must be swapped. + ExpectedElt += 1; + if (ExpectedElt == NumElts * 2) { + ExpectedElt = 0; + ReverseVEXT = true; + } + + if (M[i] < 0) continue; // ignore UNDEF indices + if (ExpectedElt != static_cast<unsigned>(M[i])) + return false; + } + + // Adjust the index value if the source operands will be swapped. + if (ReverseVEXT) + Imm -= NumElts; + + return true; +} + +/// isVREVMask - Check if a vector shuffle corresponds to a VREV +/// instruction with the specified blocksize. (The order of the elements +/// within each block of the vector is reversed.) +static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT, + unsigned BlockSize) { + assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && + "Only possible block sizes for VREV are: 16, 32, 64"); + + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned BlockElts = M[0] + 1; + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSz; + + if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + if (M[i] < 0) continue; // ignore UNDEF indices + if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) + return false; + } + + return true; +} + +static bool isVTBLMask(const SmallVectorImpl<int> &M, EVT VT) { + // We can handle <8 x i8> vector shuffles. If the index in the mask is out of + // range, then 0 is placed into the resulting vector. So pretty much any mask + // of 8 elements can work here. + return VT == MVT::v8i8 && M.size() == 8; +} + +static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || + (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult)) + return false; + } + return true; +} + +/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. +static bool isVTRN_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || + (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult)) + return false; + } + return true; +} + +static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i != NumElts; ++i) { + if (M[i] < 0) continue; // ignore UNDEF indices + if ((unsigned) M[i] != 2 * i + WhichResult) + return false; + } + + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. + if (VT.is64BitVector() && EltSz == 32) + return false; + + return true; +} + +/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, +static bool isVUZP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned Half = VT.getVectorNumElements() / 2; + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned j = 0; j != 2; ++j) { + unsigned Idx = WhichResult; + for (unsigned i = 0; i != Half; ++i) { + int MIdx = M[i + j * Half]; + if (MIdx >= 0 && (unsigned) MIdx != Idx) + return false; + Idx += 2; + } + } + + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. + if (VT.is64BitVector() && EltSz == 32) + return false; + + return true; +} + +static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned) M[i] != Idx) || + (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts)) + return false; + Idx += 1; + } + + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. + if (VT.is64BitVector() && EltSz == 32) + return false; + + return true; +} + +/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. +static bool isVZIP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT, + unsigned &WhichResult) { + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned) M[i] != Idx) || + (M[i+1] >= 0 && (unsigned) M[i+1] != Idx)) + return false; + Idx += 1; + } + + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. + if (VT.is64BitVector() && EltSz == 32) + return false; + + return true; +} + +// If N is an integer constant that can be moved into a register in one +// instruction, return an SDValue of such a constant (will become a MOV +// instruction). Otherwise return null. +static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, + const ARMSubtarget *ST, DebugLoc dl) { + uint64_t Val; + if (!isa<ConstantSDNode>(N)) + return SDValue(); + Val = cast<ConstantSDNode>(N)->getZExtValue(); + + if (ST->isThumb1Only()) { + if (Val <= 255 || ~Val <= 255) + return DAG.getConstant(Val, MVT::i32); + } else { + if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) + return DAG.getConstant(Val, MVT::i32); + } + return SDValue(); +} + +// If this is a case we can't handle, return null and let the default +// expansion code take care of it. +SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const { + BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + // Check if an immediate VMOV works. + EVT VmovVT; + SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, VmovVT, VT.is128BitVector(), + VMOVModImm); + if (Val.getNode()) { + SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); + } + + // Try an immediate VMVN. + uint64_t NegatedImm = (~SplatBits).getZExtValue(); + Val = isNEONModifiedImm(NegatedImm, + SplatUndef.getZExtValue(), SplatBitSize, + DAG, VmovVT, VT.is128BitVector(), + VMVNModImm); + if (Val.getNode()) { + SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); + } + } + } + + // Scan through the operands to see if only one value is used. + unsigned NumElts = VT.getVectorNumElements(); + bool isOnlyLowElement = true; + bool usesOnlyOneValue = true; + bool isConstant = true; + SDValue Value; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (i > 0) + isOnlyLowElement = false; + if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) + isConstant = false; + + if (!Value.getNode()) + Value = V; + else if (V != Value) + usesOnlyOneValue = false; + } + + if (!Value.getNode()) + return DAG.getUNDEF(VT); + + if (isOnlyLowElement) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + + // Use VDUP for non-constant splats. For f32 constant splats, reduce to + // i32 and try again. + if (usesOnlyOneValue && EltSize <= 32) { + if (!isConstant) + return DAG.getNode(ARMISD::VDUP, dl, VT, Value); + if (VT.getVectorElementType().isFloatingPoint()) { + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, + Op.getOperand(i))); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts); + Val = LowerBUILD_VECTOR(Val, DAG, ST); + if (Val.getNode()) + return DAG.getNode(ISD::BITCAST, dl, VT, Val); + } + SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); + if (Val.getNode()) + return DAG.getNode(ARMISD::VDUP, dl, VT, Val); + } + + // If all elements are constants and the case above didn't get hit, fall back + // to the default expansion, which will generate a load from the constant + // pool. + if (isConstant) + return SDValue(); + + // Empirical tests suggest this is rarely worth it for vectors of length <= 2. + if (NumElts >= 4) { + SDValue shuffle = ReconstructShuffle(Op, DAG); + if (shuffle != SDValue()) + return shuffle; + } + + // Vectors with 32- or 64-bit elements can be built by directly assigning + // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands + // will be legalized. + if (EltSize >= 32) { + // Do the expansion with floating-point types, since that is what the VFP + // registers are defined to use, and since i64 is not legal. + EVT EltVT = EVT::getFloatingPointVT(EltSize); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); + return DAG.getNode(ISD::BITCAST, dl, VT, Val); + } + + return SDValue(); +} + +// Gather data to see if the operation can be modelled as a +// shuffle in combination with VEXTs. +SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + + SmallVector<SDValue, 2> SourceVecs; + SmallVector<unsigned, 2> MinElts; + SmallVector<unsigned, 2> MaxElts; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { + // A shuffle can only come from building a vector from various + // elements of other vectors. + return SDValue(); + } else if (V.getOperand(0).getValueType().getVectorElementType() != + VT.getVectorElementType()) { + // This code doesn't know how to handle shuffles where the vector + // element types do not match (this happens because type legalization + // promotes the return type of EXTRACT_VECTOR_ELT). + // FIXME: It might be appropriate to extend this code to handle + // mismatched types. + return SDValue(); + } + + // Record this extraction against the appropriate vector if possible... + SDValue SourceVec = V.getOperand(0); + unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); + bool FoundSource = false; + for (unsigned j = 0; j < SourceVecs.size(); ++j) { + if (SourceVecs[j] == SourceVec) { + if (MinElts[j] > EltNo) + MinElts[j] = EltNo; + if (MaxElts[j] < EltNo) + MaxElts[j] = EltNo; + FoundSource = true; + break; + } + } + + // Or record a new source if not... + if (!FoundSource) { + SourceVecs.push_back(SourceVec); + MinElts.push_back(EltNo); + MaxElts.push_back(EltNo); + } + } + + // Currently only do something sane when at most two source vectors + // involved. + if (SourceVecs.size() > 2) + return SDValue(); + + SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; + int VEXTOffsets[2] = {0, 0}; + + // This loop extracts the usage patterns of the source vectors + // and prepares appropriate SDValues for a shuffle if possible. + for (unsigned i = 0; i < SourceVecs.size(); ++i) { + if (SourceVecs[i].getValueType() == VT) { + // No VEXT necessary + ShuffleSrcs[i] = SourceVecs[i]; + VEXTOffsets[i] = 0; + continue; + } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { + // It probably isn't worth padding out a smaller vector just to + // break it down again in a shuffle. + return SDValue(); + } + + // Since only 64-bit and 128-bit vectors are legal on ARM and + // we've eliminated the other cases... + assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts && + "unexpected vector sizes in ReconstructShuffle"); + + if (MaxElts[i] - MinElts[i] >= NumElts) { + // Span too large for a VEXT to cope + return SDValue(); + } + + if (MinElts[i] >= NumElts) { + // The extraction can just take the second half + VEXTOffsets[i] = NumElts; + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], + DAG.getIntPtrConstant(NumElts)); + } else if (MaxElts[i] < NumElts) { + // The extraction can just take the first half + VEXTOffsets[i] = 0; + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], + DAG.getIntPtrConstant(0)); + } else { + // An actual VEXT is needed + VEXTOffsets[i] = MinElts[i]; + SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], + DAG.getIntPtrConstant(0)); + SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], + DAG.getIntPtrConstant(NumElts)); + ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2, + DAG.getConstant(VEXTOffsets[i], MVT::i32)); + } + } + + SmallVector<int, 8> Mask; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Entry = Op.getOperand(i); + if (Entry.getOpcode() == ISD::UNDEF) { + Mask.push_back(-1); + continue; + } + + SDValue ExtractVec = Entry.getOperand(0); + int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i) + .getOperand(1))->getSExtValue(); + if (ExtractVec == SourceVecs[0]) { + Mask.push_back(ExtractElt - VEXTOffsets[0]); + } else { + Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); + } + } + + // Final check before we try to produce nonsense... + if (isShuffleMaskLegal(Mask, VT)) + return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], + &Mask[0]); + + return SDValue(); +} + +/// isShuffleMaskLegal - Targets can use this to indicate that they only +/// support *some* VECTOR_SHUFFLE operations, those with specific masks. +/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values +/// are assumed to be legal. +bool +ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, + EVT VT) const { + if (VT.getVectorNumElements() == 4 && + (VT.is128BitVector() || VT.is64BitVector())) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (M[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = M[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = + PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return true; + } + + bool ReverseVEXT; + unsigned Imm, WhichResult; + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + return (EltSize >= 32 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + isVREVMask(M, VT, 64) || + isVREVMask(M, VT, 32) || + isVREVMask(M, VT, 16) || + isVEXTMask(M, VT, ReverseVEXT, Imm) || + isVTBLMask(M, VT) || + isVTRNMask(M, VT, WhichResult) || + isVUZPMask(M, VT, WhichResult) || + isVZIPMask(M, VT, WhichResult) || + isVTRN_v_undef_Mask(M, VT, WhichResult) || + isVUZP_v_undef_Mask(M, VT, WhichResult) || + isVZIP_v_undef_Mask(M, VT, WhichResult)); +} + +/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit +/// the specified operations to build the shuffle. +static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, + SDValue RHS, SelectionDAG &DAG, + DebugLoc dl) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); + + enum { + OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> + OP_VREV, + OP_VDUP0, + OP_VDUP1, + OP_VDUP2, + OP_VDUP3, + OP_VEXT1, + OP_VEXT2, + OP_VEXT3, + OP_VUZPL, // VUZP, left result + OP_VUZPR, // VUZP, right result + OP_VZIPL, // VZIP, left result + OP_VZIPR, // VZIP, right result + OP_VTRNL, // VTRN, left result + OP_VTRNR // VTRN, right result + }; + + if (OpNum == OP_COPY) { + if (LHSID == (1*9+2)*9+3) return LHS; + assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); + return RHS; + } + + SDValue OpLHS, OpRHS; + OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + EVT VT = OpLHS.getValueType(); + + switch (OpNum) { + default: llvm_unreachable("Unknown shuffle opcode!"); + case OP_VREV: + // VREV divides the vector in half and swaps within the half. + if (VT.getVectorElementType() == MVT::i32 || + VT.getVectorElementType() == MVT::f32) + return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); + // vrev <4 x i16> -> VREV32 + if (VT.getVectorElementType() == MVT::i16) + return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); + // vrev <4 x i8> -> VREV16 + assert(VT.getVectorElementType() == MVT::i8); + return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); + case OP_VDUP0: + case OP_VDUP1: + case OP_VDUP2: + case OP_VDUP3: + return DAG.getNode(ARMISD::VDUPLANE, dl, VT, + OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32)); + case OP_VEXT1: + case OP_VEXT2: + case OP_VEXT3: + return DAG.getNode(ARMISD::VEXT, dl, VT, + OpLHS, OpRHS, + DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32)); + case OP_VUZPL: + case OP_VUZPR: + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); + case OP_VZIPL: + case OP_VZIPR: + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); + case OP_VTRNL: + case OP_VTRNR: + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); + } +} + +static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, + SmallVectorImpl<int> &ShuffleMask, + SelectionDAG &DAG) { + // Check to see if we can use the VTBL instruction. + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + DebugLoc DL = Op.getDebugLoc(); + + SmallVector<SDValue, 8> VTBLMask; + for (SmallVectorImpl<int>::iterator + I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) + VTBLMask.push_back(DAG.getConstant(*I, MVT::i32)); + + if (V2.getNode()->getOpcode() == ISD::UNDEF) + return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, + &VTBLMask[0], 8)); + + return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, + &VTBLMask[0], 8)); +} + +static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); + SmallVector<int, 8> ShuffleMask; + + // Convert shuffles that are directly supported on NEON to target-specific + // DAG nodes, instead of keeping them as shuffles and matching them again + // during code selection. This is more efficient and avoids the possibility + // of inconsistencies between legalization and selection. + // FIXME: floating-point vectors should be canonicalized to integer vectors + // of the same time so that they get CSEd properly. + SVN->getMask(ShuffleMask); + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize <= 32) { + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. + if (Lane == -1) Lane = 0; + + if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { + return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + } + return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, + DAG.getConstant(Lane, MVT::i32)); + } + + bool ReverseVEXT; + unsigned Imm; + if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { + if (ReverseVEXT) + std::swap(V1, V2); + return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, + DAG.getConstant(Imm, MVT::i32)); + } + + if (isVREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(ARMISD::VREV64, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(ARMISD::VREV32, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(ARMISD::VREV16, dl, VT, V1); + + // Check for Neon shuffles that modify both input vectors in place. + // If both results are used, i.e., if there are two shuffles with the same + // source operands and with masks corresponding to both results of one of + // these operations, DAG memoization will ensure that a single node is + // used for both shuffles. + unsigned WhichResult; + if (isVTRNMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVUZPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVZIPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + + if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + } + + // If the shuffle is not directly supported and it has 4 elements, use + // the PerfectShuffle-generated table to synthesize it from other shuffles. + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 4) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (ShuffleMask[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = ShuffleMask[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = + PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + + // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. + if (EltSize >= 32) { + // Do the expansion with floating-point types, since that is what the VFP + // registers are defined to use, and since i64 is not legal. + EVT EltVT = EVT::getFloatingPointVT(EltSize); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); + V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < NumElts; ++i) { + if (ShuffleMask[i] < 0) + Ops.push_back(DAG.getUNDEF(EltVT)); + else + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, + ShuffleMask[i] < (int)NumElts ? V1 : V2, + DAG.getConstant(ShuffleMask[i] & (NumElts-1), + MVT::i32))); + } + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); + return DAG.getNode(ISD::BITCAST, dl, VT, Val); + } + + if (VT == MVT::v8i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG); + if (NewOp.getNode()) + return NewOp; + } + + return SDValue(); +} + +static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { + // EXTRACT_VECTOR_ELT is legal only for immediate indexes. + SDValue Lane = Op.getOperand(1); + if (!isa<ConstantSDNode>(Lane)) + return SDValue(); + + SDValue Vec = Op.getOperand(0); + if (Op.getValueType() == MVT::i32 && + Vec.getValueType().getVectorElementType().getSizeInBits() < 32) { + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); + } + + return Op; +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { + // The only time a CONCAT_VECTORS operation can have legal types is when + // two 64-bit vectors are concatenated to a 128-bit vector. + assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && + "unexpected CONCAT_VECTORS"); + DebugLoc dl = Op.getDebugLoc(); + SDValue Val = DAG.getUNDEF(MVT::v2f64); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + if (Op0.getOpcode() != ISD::UNDEF) + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, + DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), + DAG.getIntPtrConstant(0)); + if (Op1.getOpcode() != ISD::UNDEF) + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, + DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), + DAG.getIntPtrConstant(1)); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); +} + +/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each +/// element has been zero/sign-extended, depending on the isSigned parameter, +/// from an integer type half its size. +static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, + bool isSigned) { + // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. + EVT VT = N->getValueType(0); + if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { + SDNode *BVN = N->getOperand(0).getNode(); + if (BVN->getValueType(0) != MVT::v4i32 || + BVN->getOpcode() != ISD::BUILD_VECTOR) + return false; + unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + unsigned HiElt = 1 - LoElt; + ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); + ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); + ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); + ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); + if (!Lo0 || !Hi0 || !Lo1 || !Hi1) + return false; + if (isSigned) { + if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && + Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) + return true; + } else { + if (Hi0->isNullValue() && Hi1->isNullValue()) + return true; + } + return false; + } + + if (N->getOpcode() != ISD::BUILD_VECTOR) + return false; + + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + SDNode *Elt = N->getOperand(i).getNode(); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + unsigned HalfSize = EltSize / 2; + if (isSigned) { + int64_t SExtVal = C->getSExtValue(); + if ((SExtVal >> HalfSize) != (SExtVal >> EltSize)) + return false; + } else { + if ((C->getZExtValue() >> HalfSize) != 0) + return false; + } + continue; + } + return false; + } + + return true; +} + +/// isSignExtended - Check if a node is a vector value that is sign-extended +/// or a constant BUILD_VECTOR with sign-extended elements. +static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) + return true; + if (isExtendedBUILD_VECTOR(N, DAG, true)) + return true; + return false; +} + +/// isZeroExtended - Check if a node is a vector value that is zero-extended +/// or a constant BUILD_VECTOR with zero-extended elements. +static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) + return true; + if (isExtendedBUILD_VECTOR(N, DAG, false)) + return true; + return false; +} + +/// SkipExtension - For a node that is a SIGN_EXTEND, ZERO_EXTEND, extending +/// load, or BUILD_VECTOR with extended elements, return the unextended value. +static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) + return N->getOperand(0); + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(), + LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), + LD->isNonTemporal(), LD->getAlignment()); + // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will + // have been legalized as a BITCAST from v4i32. + if (N->getOpcode() == ISD::BITCAST) { + SDNode *BVN = N->getOperand(0).getNode(); + assert(BVN->getOpcode() == ISD::BUILD_VECTOR && + BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); + unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32, + BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); + } + // Construct a new BUILD_VECTOR with elements truncated to half the size. + assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); + EVT VT = N->getValueType(0); + unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; + unsigned NumElts = VT.getVectorNumElements(); + MVT TruncVT = MVT::getIntegerVT(EltSize); + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i != NumElts; ++i) { + ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); + const APInt &CInt = C->getAPIntValue(); + Ops.push_back(DAG.getConstant(CInt.trunc(EltSize), TruncVT)); + } + return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), + MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts); +} + +static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::ADD || Opcode == ISD::SUB) { + SDNode *N0 = N->getOperand(0).getNode(); + SDNode *N1 = N->getOperand(1).getNode(); + return N0->hasOneUse() && N1->hasOneUse() && + isSignExtended(N0, DAG) && isSignExtended(N1, DAG); + } + return false; +} + +static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::ADD || Opcode == ISD::SUB) { + SDNode *N0 = N->getOperand(0).getNode(); + SDNode *N1 = N->getOperand(1).getNode(); + return N0->hasOneUse() && N1->hasOneUse() && + isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); + } + return false; +} + +static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { + // Multiplications are only custom-lowered for 128-bit vectors so that + // VMULL can be detected. Otherwise v2i64 multiplications are not legal. + EVT VT = Op.getValueType(); + assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL"); + SDNode *N0 = Op.getOperand(0).getNode(); + SDNode *N1 = Op.getOperand(1).getNode(); + unsigned NewOpc = 0; + bool isMLA = false; + bool isN0SExt = isSignExtended(N0, DAG); + bool isN1SExt = isSignExtended(N1, DAG); + if (isN0SExt && isN1SExt) + NewOpc = ARMISD::VMULLs; + else { + bool isN0ZExt = isZeroExtended(N0, DAG); + bool isN1ZExt = isZeroExtended(N1, DAG); + if (isN0ZExt && isN1ZExt) + NewOpc = ARMISD::VMULLu; + else if (isN1SExt || isN1ZExt) { + // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these + // into (s/zext A * s/zext C) + (s/zext B * s/zext C) + if (isN1SExt && isAddSubSExt(N0, DAG)) { + NewOpc = ARMISD::VMULLs; + isMLA = true; + } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { + NewOpc = ARMISD::VMULLu; + isMLA = true; + } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { + std::swap(N0, N1); + NewOpc = ARMISD::VMULLu; + isMLA = true; + } + } + + if (!NewOpc) { + if (VT == MVT::v2i64) + // Fall through to expand this. It is not legal. + return SDValue(); + else + // Other vector multiplications are legal. + return Op; + } + } + + // Legalize to a VMULL instruction. + DebugLoc DL = Op.getDebugLoc(); + SDValue Op0; + SDValue Op1 = SkipExtension(N1, DAG); + if (!isMLA) { + Op0 = SkipExtension(N0, DAG); + assert(Op0.getValueType().is64BitVector() && + Op1.getValueType().is64BitVector() && + "unexpected types for extended operands to VMULL"); + return DAG.getNode(NewOpc, DL, VT, Op0, Op1); + } + + // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during + // isel lowering to take advantage of no-stall back to back vmul + vmla. + // vmull q0, d4, d6 + // vmlal q0, d5, d6 + // is faster than + // vaddl q0, d4, d5 + // vmovl q1, d6 + // vmul q0, q0, q1 + SDValue N00 = SkipExtension(N0->getOperand(0).getNode(), DAG); + SDValue N01 = SkipExtension(N0->getOperand(1).getNode(), DAG); + EVT Op1VT = Op1.getValueType(); + return DAG.getNode(N0->getOpcode(), DL, VT, + DAG.getNode(NewOpc, DL, VT, + DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), + DAG.getNode(NewOpc, DL, VT, + DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); +} + +static SDValue +LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) { + // Convert to float + // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); + // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); + X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); + Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); + X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); + Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); + // Get reciprocal estimate. + // float4 recip = vrecpeq_f32(yf); + Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y); + // Because char has a smaller range than uchar, we can actually get away + // without any newton steps. This requires that we use a weird bias + // of 0xb000, however (again, this has been exhaustively tested). + // float4 result = as_float4(as_int4(xf*recip) + 0xb000); + X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); + X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); + Y = DAG.getConstant(0xb000, MVT::i32); + Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y); + X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); + X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); + // Convert back to short. + X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); + X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); + return X; +} + +static SDValue +LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) { + SDValue N2; + // Convert to float. + // float4 yf = vcvt_f32_s32(vmovl_s16(y)); + // float4 xf = vcvt_f32_s32(vmovl_s16(x)); + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); + N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); + N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); + + // Use reciprocal estimate and one refinement step. + // float4 recip = vrecpeq_f32(yf); + // recip *= vrecpsq_f32(yf, recip); + N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1); + N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), + N1, N2); + N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); + // Because short has a smaller range than ushort, we can actually get away + // with only a single newton step. This requires that we use a weird bias + // of 89, however (again, this has been exhaustively tested). + // float4 result = as_float4(as_int4(xf*recip) + 0x89); + N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); + N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); + N1 = DAG.getConstant(0x89, MVT::i32); + N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); + N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); + N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); + // Convert back to integer and return. + // return vmovn_s32(vcvt_s32_f32(result)); + N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); + N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); + return N0; +} + +static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + assert((VT == MVT::v4i16 || VT == MVT::v8i8) && + "unexpected type for custom-lowering ISD::SDIV"); + + DebugLoc dl = Op.getDebugLoc(); + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + SDValue N2, N3; + + if (VT == MVT::v8i8) { + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); + + N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, + DAG.getIntPtrConstant(4)); + N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, + DAG.getIntPtrConstant(4)); + N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, + DAG.getIntPtrConstant(0)); + N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, + DAG.getIntPtrConstant(0)); + + N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 + N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 + + N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); + N0 = LowerCONCAT_VECTORS(N0, DAG); + + N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); + return N0; + } + return LowerSDIV_v4i16(N0, N1, dl, DAG); +} + +static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + assert((VT == MVT::v4i16 || VT == MVT::v8i8) && + "unexpected type for custom-lowering ISD::UDIV"); + + DebugLoc dl = Op.getDebugLoc(); + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + SDValue N2, N3; + + if (VT == MVT::v8i8) { + N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); + N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); + + N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, + DAG.getIntPtrConstant(4)); + N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, + DAG.getIntPtrConstant(4)); + N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, + DAG.getIntPtrConstant(0)); + N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, + DAG.getIntPtrConstant(0)); + + N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 + N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 + + N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); + N0 = LowerCONCAT_VECTORS(N0, DAG); + + N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, + DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32), + N0); + return N0; + } + + // v4i16 sdiv ... Convert to float. + // float4 yf = vcvt_f32_s32(vmovl_u16(y)); + // float4 xf = vcvt_f32_s32(vmovl_u16(x)); + N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); + N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); + N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); + SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); + + // Use reciprocal estimate and two refinement steps. + // float4 recip = vrecpeq_f32(yf); + // recip *= vrecpsq_f32(yf, recip); + // recip *= vrecpsq_f32(yf, recip); + N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), BN1); + N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), + BN1, N2); + N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); + N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), + BN1, N2); + N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); + // Simply multiplying by the reciprocal estimate can leave us a few ulps + // too low, so we add 2 ulps (exhaustive testing shows that this is enough, + // and that it will never cause us to return an answer too large). + // float4 result = as_float4(as_int4(xf*recip) + 2); + N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); + N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); + N1 = DAG.getConstant(2, MVT::i32); + N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); + N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); + N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); + // Convert back to integer and return. + // return vmovn_u32(vcvt_s32_f32(result)); + N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); + N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); + return N0; +} + +static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getNode()->getValueType(0); + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + unsigned Opc; + bool ExtraOp = false; + switch (Op.getOpcode()) { + default: assert(0 && "Invalid code"); + case ISD::ADDC: Opc = ARMISD::ADDC; break; + case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break; + case ISD::SUBC: Opc = ARMISD::SUBC; break; + case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break; + } + + if (!ExtraOp) + return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + Op.getOperand(1), Op.getOperand(2)); +} + +static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { + // Monotonic load/store is legal for all targets + if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic) + return Op; + + // Aquire/Release load/store is not legal for targets without a + // dmb or equivalent available. + return SDValue(); +} + + +static void +ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results, + SelectionDAG &DAG, unsigned NewOp) { + DebugLoc dl = Node->getDebugLoc(); + assert (Node->getValueType(0) == MVT::i64 && + "Only know how to expand i64 atomics"); + + SmallVector<SDValue, 6> Ops; + Ops.push_back(Node->getOperand(0)); // Chain + Ops.push_back(Node->getOperand(1)); // Ptr + // Low part of Val1 + Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Node->getOperand(2), DAG.getIntPtrConstant(0))); + // High part of Val1 + Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Node->getOperand(2), DAG.getIntPtrConstant(1))); + if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) { + // High part of Val1 + Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Node->getOperand(3), DAG.getIntPtrConstant(0))); + // High part of Val2 + Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Node->getOperand(3), DAG.getIntPtrConstant(1))); + } + SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); + SDValue Result = + DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64, + cast<MemSDNode>(Node)->getMemOperand()); + SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) }; + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); + Results.push_back(Result.getValue(2)); +} + +SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: llvm_unreachable("Don't know how to custom lower this!"); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); + case ISD::GlobalAddress: + return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) : + LowerGlobalAddressELF(Op, DAG); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::BR_CC: return LowerBR_CC(Op, DAG); + case ISD::BR_JT: return LowerBR_JT(Op, DAG); + case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG, Subtarget); + case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); + case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); + case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); + case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::EH_SJLJ_DISPATCHSETUP: return LowerEH_SJLJ_DISPATCHSETUP(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, + Subtarget); + case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); + case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); + case ISD::SRL_PARTS: + case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); + case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget); + case ISD::SETCC: return LowerVSETCC(Op, DAG); + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); + case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::UDIV: return LowerUDIV(Op, DAG); + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); + } + return SDValue(); +} + +/// ReplaceNodeResults - Replace the results of node with an illegal result +/// type with new values built out of custom code. +void ARMTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const { + SDValue Res; + switch (N->getOpcode()) { + default: + llvm_unreachable("Don't know how to custom expand this!"); + break; + case ISD::BITCAST: + Res = ExpandBITCAST(N, DAG); + break; + case ISD::SRL: + case ISD::SRA: + Res = Expand64BitShift(N, DAG, Subtarget); + break; + case ISD::ATOMIC_LOAD_ADD: + ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG); + return; + case ISD::ATOMIC_LOAD_AND: + ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG); + return; + case ISD::ATOMIC_LOAD_NAND: + ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG); + return; + case ISD::ATOMIC_LOAD_OR: + ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG); + return; + case ISD::ATOMIC_LOAD_SUB: + ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG); + return; + case ISD::ATOMIC_LOAD_XOR: + ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG); + return; + case ISD::ATOMIC_SWAP: + ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG); + return; + case ISD::ATOMIC_CMP_SWAP: + ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG); + return; + } + if (Res.getNode()) + Results.push_back(Res); +} + +//===----------------------------------------------------------------------===// +// ARM Scheduler Hooks +//===----------------------------------------------------------------------===// + +MachineBasicBlock * +ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size) const { + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptr = MI->getOperand(1).getReg(); + unsigned oldval = MI->getOperand(2).getReg(); + unsigned newval = MI->getOperand(3).getReg(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + bool isThumb2 = Subtarget->isThumb2(); + + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + unsigned scratch = + MRI.createVirtualRegister(isThumb2 ? ARM::rGPRRegisterClass + : ARM::GPRRegisterClass); + + if (isThumb2) { + MRI.constrainRegClass(dest, ARM::rGPRRegisterClass); + MRI.constrainRegClass(oldval, ARM::rGPRRegisterClass); + MRI.constrainRegClass(newval, ARM::rGPRRegisterClass); + } + + unsigned ldrOpc, strOpc; + switch (Size) { + default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); + case 1: + ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; + strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB; + break; + case 2: + ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; + strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH; + break; + case 4: + ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX; + strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX; + break; + } + + MachineFunction *MF = BB->getParent(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; // insert the new blocks after the current block + + MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, loop1MBB); + MF->insert(It, loop2MBB); + MF->insert(It, exitMBB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + // thisMBB: + // ... + // fallthrough --> loop1MBB + BB->addSuccessor(loop1MBB); + + // loop1MBB: + // ldrex dest, [ptr] + // cmp dest, oldval + // bne exitMBB + BB = loop1MBB; + MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); + if (ldrOpc == ARM::t2LDREX) + MIB.addImm(0); + AddDefaultPred(MIB); + AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(dest).addReg(oldval)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + BB->addSuccessor(loop2MBB); + BB->addSuccessor(exitMBB); + + // loop2MBB: + // strex scratch, newval, [ptr] + // cmp scratch, #0 + // bne loop1MBB + BB = loop2MBB; + MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr); + if (strOpc == ARM::t2STREX) + MIB.addImm(0); + AddDefaultPred(MIB); + AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(scratch).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(loop1MBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + BB->addSuccessor(loop1MBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + + MI->eraseFromParent(); // The instruction is gone now. + + return BB; +} + +MachineBasicBlock * +ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, + unsigned Size, unsigned BinOpcode) const { + // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction *MF = BB->getParent(); + MachineFunction::iterator It = BB; + ++It; + + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptr = MI->getOperand(1).getReg(); + unsigned incr = MI->getOperand(2).getReg(); + DebugLoc dl = MI->getDebugLoc(); + bool isThumb2 = Subtarget->isThumb2(); + + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + if (isThumb2) { + MRI.constrainRegClass(dest, ARM::rGPRRegisterClass); + MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass); + } + + unsigned ldrOpc, strOpc; + switch (Size) { + default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); + case 1: + ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; + strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB; + break; + case 2: + ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; + strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH; + break; + case 4: + ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX; + strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX; + break; + } + + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, loopMBB); + MF->insert(It, exitMBB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + TargetRegisterClass *TRC = + isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + unsigned scratch = MRI.createVirtualRegister(TRC); + unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC); + + // thisMBB: + // ... + // fallthrough --> loopMBB + BB->addSuccessor(loopMBB); + + // loopMBB: + // ldrex dest, ptr + // <binop> scratch2, dest, incr + // strex scratch, scratch2, ptr + // cmp scratch, #0 + // bne- loopMBB + // fallthrough --> exitMBB + BB = loopMBB; + MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); + if (ldrOpc == ARM::t2LDREX) + MIB.addImm(0); + AddDefaultPred(MIB); + if (BinOpcode) { + // operand order needs to go the other way for NAND + if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr) + AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2). + addReg(incr).addReg(dest)).addReg(0); + else + AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2). + addReg(dest).addReg(incr)).addReg(0); + } + + MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr); + if (strOpc == ARM::t2STREX) + MIB.addImm(0); + AddDefaultPred(MIB); + AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(scratch).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + + MI->eraseFromParent(); // The instruction is gone now. + + return BB; +} + +MachineBasicBlock * +ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size, + bool signExtend, + ARMCC::CondCodes Cond) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction *MF = BB->getParent(); + MachineFunction::iterator It = BB; + ++It; + + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptr = MI->getOperand(1).getReg(); + unsigned incr = MI->getOperand(2).getReg(); + unsigned oldval = dest; + DebugLoc dl = MI->getDebugLoc(); + bool isThumb2 = Subtarget->isThumb2(); + + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + if (isThumb2) { + MRI.constrainRegClass(dest, ARM::rGPRRegisterClass); + MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass); + } + + unsigned ldrOpc, strOpc, extendOpc; + switch (Size) { + default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); + case 1: + ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB; + strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB; + extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB; + break; + case 2: + ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH; + strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH; + extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH; + break; + case 4: + ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX; + strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX; + extendOpc = 0; + break; + } + + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, loopMBB); + MF->insert(It, exitMBB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + TargetRegisterClass *TRC = + isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + unsigned scratch = MRI.createVirtualRegister(TRC); + unsigned scratch2 = MRI.createVirtualRegister(TRC); + + // thisMBB: + // ... + // fallthrough --> loopMBB + BB->addSuccessor(loopMBB); + + // loopMBB: + // ldrex dest, ptr + // (sign extend dest, if required) + // cmp dest, incr + // cmov.cond scratch2, dest, incr + // strex scratch, scratch2, ptr + // cmp scratch, #0 + // bne- loopMBB + // fallthrough --> exitMBB + BB = loopMBB; + MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); + if (ldrOpc == ARM::t2LDREX) + MIB.addImm(0); + AddDefaultPred(MIB); + + // Sign extend the value, if necessary. + if (signExtend && extendOpc) { + oldval = MRI.createVirtualRegister(ARM::GPRRegisterClass); + AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval) + .addReg(dest) + .addImm(0)); + } + + // Build compare and cmov instructions. + AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(oldval).addReg(incr)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2) + .addReg(oldval).addReg(incr).addImm(Cond).addReg(ARM::CPSR); + + MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr); + if (strOpc == ARM::t2STREX) + MIB.addImm(0); + AddDefaultPred(MIB); + AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(scratch).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + + MI->eraseFromParent(); // The instruction is gone now. + + return BB; +} + +MachineBasicBlock * +ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, + unsigned Op1, unsigned Op2, + bool NeedsCarry, bool IsCmpxchg) const { + // This also handles ATOMIC_SWAP, indicated by Op1==0. + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction *MF = BB->getParent(); + MachineFunction::iterator It = BB; + ++It; + + unsigned destlo = MI->getOperand(0).getReg(); + unsigned desthi = MI->getOperand(1).getReg(); + unsigned ptr = MI->getOperand(2).getReg(); + unsigned vallo = MI->getOperand(3).getReg(); + unsigned valhi = MI->getOperand(4).getReg(); + DebugLoc dl = MI->getDebugLoc(); + bool isThumb2 = Subtarget->isThumb2(); + + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + if (isThumb2) { + MRI.constrainRegClass(destlo, ARM::rGPRRegisterClass); + MRI.constrainRegClass(desthi, ARM::rGPRRegisterClass); + MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass); + } + + unsigned ldrOpc = isThumb2 ? ARM::t2LDREXD : ARM::LDREXD; + unsigned strOpc = isThumb2 ? ARM::t2STREXD : ARM::STREXD; + + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *contBB = 0, *cont2BB = 0; + if (IsCmpxchg) { + contBB = MF->CreateMachineBasicBlock(LLVM_BB); + cont2BB = MF->CreateMachineBasicBlock(LLVM_BB); + } + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, loopMBB); + if (IsCmpxchg) { + MF->insert(It, contBB); + MF->insert(It, cont2BB); + } + MF->insert(It, exitMBB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + TargetRegisterClass *TRC = + isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + unsigned storesuccess = MRI.createVirtualRegister(TRC); + + // thisMBB: + // ... + // fallthrough --> loopMBB + BB->addSuccessor(loopMBB); + + // loopMBB: + // ldrexd r2, r3, ptr + // <binopa> r0, r2, incr + // <binopb> r1, r3, incr + // strexd storesuccess, r0, r1, ptr + // cmp storesuccess, #0 + // bne- loopMBB + // fallthrough --> exitMBB + // + // Note that the registers are explicitly specified because there is not any + // way to force the register allocator to allocate a register pair. + // + // FIXME: The hardcoded registers are not necessary for Thumb2, but we + // need to properly enforce the restriction that the two output registers + // for ldrexd must be different. + BB = loopMBB; + // Load + AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc)) + .addReg(ARM::R2, RegState::Define) + .addReg(ARM::R3, RegState::Define).addReg(ptr)); + // Copy r2/r3 into dest. (This copy will normally be coalesced.) + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo).addReg(ARM::R2); + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi).addReg(ARM::R3); + + if (IsCmpxchg) { + // Add early exit + for (unsigned i = 0; i < 2; i++) { + AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : + ARM::CMPrr)) + .addReg(i == 0 ? destlo : desthi) + .addReg(i == 0 ? vallo : valhi)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + BB->addSuccessor(exitMBB); + BB->addSuccessor(i == 0 ? contBB : cont2BB); + BB = (i == 0 ? contBB : cont2BB); + } + + // Copy to physregs for strexd + unsigned setlo = MI->getOperand(5).getReg(); + unsigned sethi = MI->getOperand(6).getReg(); + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(setlo); + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(sethi); + } else if (Op1) { + // Perform binary operation + AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), ARM::R0) + .addReg(destlo).addReg(vallo)) + .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry)); + AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), ARM::R1) + .addReg(desthi).addReg(valhi)).addReg(0); + } else { + // Copy to physregs for strexd + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(vallo); + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(valhi); + } + + // Store + AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess) + .addReg(ARM::R0).addReg(ARM::R1).addReg(ptr)); + // Cmp+jump + AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(storesuccess).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + + MI->eraseFromParent(); // The instruction is gone now. + + return BB; +} + +/// EmitBasePointerRecalculation - For functions using a base pointer, we +/// rematerialize it (via the frame pointer). +void ARMTargetLowering:: +EmitBasePointerRecalculation(MachineInstr *MI, MachineBasicBlock *MBB, + MachineBasicBlock *DispatchBB) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); + MachineFunction &MF = *MI->getParent()->getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); + + if (!RI.hasBasePointer(MF)) return; + + MachineBasicBlock::iterator MBBI = MI; + + int32_t NumBytes = AFI->getFramePtrSpillOffset(); + unsigned FramePtr = RI.getFrameRegister(MF); + assert(MF.getTarget().getFrameLowering()->hasFP(MF) && + "Base pointer without frame pointer?"); + + if (AFI->isThumb2Function()) + llvm::emitT2RegPlusImmediate(*MBB, MBBI, MI->getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, ARMCC::AL, 0, *AII); + else if (AFI->isThumbFunction()) + llvm::emitThumbRegPlusImmediate(*MBB, MBBI, MI->getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, *AII, RI); + else + llvm::emitARMRegPlusImmediate(*MBB, MBBI, MI->getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, ARMCC::AL, 0, *AII); + + if (!RI.needsStackRealignment(MF)) return; + + // If there's dynamic realignment, adjust for it. + MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned MaxAlign = MFI->getMaxAlignment(); + assert(!AFI->isThumb1OnlyFunction()); + + // Emit bic r6, r6, MaxAlign + unsigned bicOpc = AFI->isThumbFunction() ? ARM::t2BICri : ARM::BICri; + AddDefaultCC( + AddDefaultPred( + BuildMI(*MBB, MBBI, MI->getDebugLoc(), TII->get(bicOpc), ARM::R6) + .addReg(ARM::R6, RegState::Kill) + .addImm(MaxAlign - 1))); +} + +/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and +/// registers the function context. +void ARMTargetLowering:: +SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, + MachineBasicBlock *DispatchBB, int FI) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + MachineConstantPool *MCP = MF->getConstantPool(); + ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); + const Function *F = MF->getFunction(); + + bool isThumb = Subtarget->isThumb(); + bool isThumb2 = Subtarget->isThumb2(); + + unsigned PCLabelId = AFI->createPICLabelUId(); + unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; + ARMConstantPoolValue *CPV = + ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj); + unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); + + const TargetRegisterClass *TRC = + isThumb ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + + // Grab constant pool and fixed stack memory operands. + MachineMemOperand *CPMMO = + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(), + MachineMemOperand::MOLoad, 4, 4); + + MachineMemOperand *FIMMOSt = + MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), + MachineMemOperand::MOStore, 4, 4); + + EmitBasePointerRecalculation(MI, MBB, DispatchBB); + + // Load the address of the dispatch MBB into the jump buffer. + if (isThumb2) { + // Incoming value: jbuf + // ldr.n r5, LCPI1_1 + // orr r5, r5, #1 + // add r5, pc + // str r5, [$jbuf, #+4] ; &jbuf[1] + unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) + .addConstantPoolIndex(CPI) + .addMemOperand(CPMMO)); + // Set the low bit because of thumb mode. + unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + AddDefaultCC( + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) + .addReg(NewVReg1, RegState::Kill) + .addImm(0x01))); + unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) + .addReg(NewVReg2, RegState::Kill) + .addImm(PCLabelId); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) + .addReg(NewVReg3, RegState::Kill) + .addFrameIndex(FI) + .addImm(36) // &jbuf[1] :: pc + .addMemOperand(FIMMOSt)); + } else if (isThumb) { + // Incoming value: jbuf + // ldr.n r1, LCPI1_4 + // add r1, pc + // mov r2, #1 + // orrs r1, r2 + // add r2, $jbuf, #+4 ; &jbuf[1] + // str r1, [r2] + unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) + .addConstantPoolIndex(CPI) + .addMemOperand(CPMMO)); + unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) + .addReg(NewVReg1, RegState::Kill) + .addImm(PCLabelId); + // Set the low bit because of thumb mode. + unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) + .addReg(ARM::CPSR, RegState::Define) + .addImm(1)); + unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) + .addReg(ARM::CPSR, RegState::Define) + .addReg(NewVReg2, RegState::Kill) + .addReg(NewVReg3, RegState::Kill)); + unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tADDrSPi), NewVReg5) + .addFrameIndex(FI) + .addImm(36)); // &jbuf[1] :: pc + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) + .addReg(NewVReg4, RegState::Kill) + .addReg(NewVReg5, RegState::Kill) + .addImm(0) + .addMemOperand(FIMMOSt)); + } else { + // Incoming value: jbuf + // ldr r1, LCPI1_1 + // add r1, pc, r1 + // str r1, [$jbuf, #+4] ; &jbuf[1] + unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) + .addConstantPoolIndex(CPI) + .addImm(0) + .addMemOperand(CPMMO)); + unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) + .addReg(NewVReg1, RegState::Kill) + .addImm(PCLabelId)); + AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) + .addReg(NewVReg2, RegState::Kill) + .addFrameIndex(FI) + .addImm(36) // &jbuf[1] :: pc + .addMemOperand(FIMMOSt)); + } +} + +MachineBasicBlock *ARMTargetLowering:: +EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); + MachineFrameInfo *MFI = MF->getFrameInfo(); + int FI = MFI->getFunctionContextIndex(); + + const TargetRegisterClass *TRC = + Subtarget->isThumb() ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass; + + // Get a mapping of the call site numbers to all of the landing pads they're + // associated with. + DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad; + unsigned MaxCSNum = 0; + MachineModuleInfo &MMI = MF->getMMI(); + for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) { + if (!BB->isLandingPad()) continue; + + // FIXME: We should assert that the EH_LABEL is the first MI in the landing + // pad. + for (MachineBasicBlock::iterator + II = BB->begin(), IE = BB->end(); II != IE; ++II) { + if (!II->isEHLabel()) continue; + + MCSymbol *Sym = II->getOperand(0).getMCSymbol(); + if (!MMI.hasCallSiteLandingPad(Sym)) continue; + + SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym); + for (SmallVectorImpl<unsigned>::iterator + CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); + CSI != CSE; ++CSI) { + CallSiteNumToLPad[*CSI].push_back(BB); + MaxCSNum = std::max(MaxCSNum, *CSI); + } + break; + } + } + + // Get an ordered list of the machine basic blocks for the jump table. + std::vector<MachineBasicBlock*> LPadList; + SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs; + LPadList.reserve(CallSiteNumToLPad.size()); + for (unsigned I = 1; I <= MaxCSNum; ++I) { + SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; + for (SmallVectorImpl<MachineBasicBlock*>::iterator + II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { + LPadList.push_back(*II); + InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); + } + } + + assert(!LPadList.empty() && + "No landing pad destinations for the dispatch jump table!"); + + // Create the jump table and associated information. + MachineJumpTableInfo *JTI = + MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); + unsigned MJTI = JTI->createJumpTableIndex(LPadList); + unsigned UId = AFI->createJumpTableUId(); + + // Create the MBBs for the dispatch code. + + // Shove the dispatch's address into the return slot in the function context. + MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); + DispatchBB->setIsLandingPad(); + + MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); + BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP)); + DispatchBB->addSuccessor(TrapBB); + + MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); + DispatchBB->addSuccessor(DispContBB); + + // Insert and renumber MBBs. + MachineBasicBlock *Last = &MF->back(); + MF->insert(MF->end(), DispatchBB); + MF->insert(MF->end(), DispContBB); + MF->insert(MF->end(), TrapBB); + MF->RenumberBlocks(Last); + + // Insert code into the entry block that creates and registers the function + // context. + SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); + + MachineMemOperand *FIMMOLd = + MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), + MachineMemOperand::MOLoad | + MachineMemOperand::MOVolatile, 4, 4); + + if (Subtarget->isThumb2()) { + unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) + .addFrameIndex(FI) + .addImm(4) + .addMemOperand(FIMMOLd)); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) + .addReg(NewVReg1) + .addImm(LPadList.size())); + BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) + .addMBB(TrapBB) + .addImm(ARMCC::HI) + .addReg(ARM::CPSR); + + unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg2) + .addJumpTableIndex(MJTI) + .addImm(UId)); + + unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + AddDefaultCC( + AddDefaultPred( + BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg3) + .addReg(NewVReg2, RegState::Kill) + .addReg(NewVReg1) + .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); + + BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) + .addReg(NewVReg3, RegState::Kill) + .addReg(NewVReg1) + .addJumpTableIndex(MJTI) + .addImm(UId); + } else if (Subtarget->isThumb()) { + unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) + .addFrameIndex(FI) + .addImm(1) + .addMemOperand(FIMMOLd)); + + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) + .addReg(NewVReg1) + .addImm(LPadList.size())); + BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) + .addMBB(TrapBB) + .addImm(ARMCC::HI) + .addReg(ARM::CPSR); + + unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) + .addReg(ARM::CPSR, RegState::Define) + .addReg(NewVReg1) + .addImm(2)); + + unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) + .addJumpTableIndex(MJTI) + .addImm(UId)); + + unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) + .addReg(ARM::CPSR, RegState::Define) + .addReg(NewVReg2, RegState::Kill) + .addReg(NewVReg3)); + + MachineMemOperand *JTMMOLd = + MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), + MachineMemOperand::MOLoad, 4, 4); + + unsigned NewVReg5 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) + .addReg(NewVReg4, RegState::Kill) + .addImm(0) + .addMemOperand(JTMMOLd)); + + unsigned NewVReg6 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) + .addReg(ARM::CPSR, RegState::Define) + .addReg(NewVReg5, RegState::Kill) + .addReg(NewVReg3)); + + BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) + .addReg(NewVReg6, RegState::Kill) + .addJumpTableIndex(MJTI) + .addImm(UId); + } else { + unsigned NewVReg1 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) + .addFrameIndex(FI) + .addImm(4) + .addMemOperand(FIMMOLd)); + AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) + .addReg(NewVReg1) + .addImm(LPadList.size())); + BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) + .addMBB(TrapBB) + .addImm(ARMCC::HI) + .addReg(ARM::CPSR); + + unsigned NewVReg2 = MRI->createVirtualRegister(TRC); + AddDefaultCC( + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg2) + .addReg(NewVReg1) + .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); + unsigned NewVReg3 = MRI->createVirtualRegister(TRC); + AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg3) + .addJumpTableIndex(MJTI) + .addImm(UId)); + + MachineMemOperand *JTMMOLd = + MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), + MachineMemOperand::MOLoad, 4, 4); + unsigned NewVReg4 = MRI->createVirtualRegister(TRC); + AddDefaultPred( + BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg4) + .addReg(NewVReg2, RegState::Kill) + .addReg(NewVReg3) + .addImm(0) + .addMemOperand(JTMMOLd)); + + BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) + .addReg(NewVReg4, RegState::Kill) + .addReg(NewVReg3) + .addJumpTableIndex(MJTI) + .addImm(UId); + } + + // Add the jump table entries as successors to the MBB. + MachineBasicBlock *PrevMBB = 0; + for (std::vector<MachineBasicBlock*>::iterator + I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { + MachineBasicBlock *CurMBB = *I; + if (PrevMBB != CurMBB) + DispContBB->addSuccessor(CurMBB); + PrevMBB = CurMBB; + } + + const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); + const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); + const unsigned *SavedRegs = RI.getCalleeSavedRegs(MF); + for (SmallPtrSet<MachineBasicBlock*, 64>::iterator + I = InvokeBBs.begin(), E = InvokeBBs.end(); I != E; ++I) { + MachineBasicBlock *BB = *I; + + // Remove the landing pad successor from the invoke block and replace it + // with the new dispatch block. + for (MachineBasicBlock::succ_iterator + SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) { + MachineBasicBlock *SMBB = *SI; + if (SMBB->isLandingPad()) { + BB->removeSuccessor(SMBB); + SMBB->setIsLandingPad(false); + } + } + + BB->addSuccessor(DispatchBB); + + // Find the invoke call and mark all of the callee-saved registers as + // 'implicit defined' so that they're spilled. This prevents code from + // moving instructions to before the EH block, where they will never be + // executed. + for (MachineBasicBlock::reverse_iterator + II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { + if (!II->getDesc().isCall()) continue; + + DenseMap<unsigned, bool> DefRegs; + for (MachineInstr::mop_iterator + OI = II->operands_begin(), OE = II->operands_end(); + OI != OE; ++OI) { + if (!OI->isReg()) continue; + DefRegs[OI->getReg()] = true; + } + + MachineInstrBuilder MIB(&*II); + + for (unsigned i = 0; SavedRegs[i] != 0; ++i) { + if (!TRC->contains(SavedRegs[i])) continue; + if (!DefRegs[SavedRegs[i]]) + MIB.addReg(SavedRegs[i], RegState::ImplicitDefine | RegState::Dead); + } + + break; + } + } + + // The instruction is gone now. + MI->eraseFromParent(); + + return MBB; +} + +static +MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) + if (*I != Succ) + return *I; + llvm_unreachable("Expecting a BB with two successors!"); +} + +MachineBasicBlock * +ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + bool isThumb2 = Subtarget->isThumb2(); + switch (MI->getOpcode()) { + default: { + MI->dump(); + llvm_unreachable("Unexpected instr type to insert"); + } + // The Thumb2 pre-indexed stores have the same MI operands, they just + // define them differently in the .td files from the isel patterns, so + // they need pseudos. + case ARM::t2STR_preidx: + MI->setDesc(TII->get(ARM::t2STR_PRE)); + return BB; + case ARM::t2STRB_preidx: + MI->setDesc(TII->get(ARM::t2STRB_PRE)); + return BB; + case ARM::t2STRH_preidx: + MI->setDesc(TII->get(ARM::t2STRH_PRE)); + return BB; + + case ARM::STRi_preidx: + case ARM::STRBi_preidx: { + unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ? + ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM; + // Decode the offset. + unsigned Offset = MI->getOperand(4).getImm(); + bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; + Offset = ARM_AM::getAM2Offset(Offset); + if (isSub) + Offset = -Offset; + + MachineMemOperand *MMO = *MI->memoperands_begin(); + BuildMI(*BB, MI, dl, TII->get(NewOpc)) + .addOperand(MI->getOperand(0)) // Rn_wb + .addOperand(MI->getOperand(1)) // Rt + .addOperand(MI->getOperand(2)) // Rn + .addImm(Offset) // offset (skip GPR==zero_reg) + .addOperand(MI->getOperand(5)) // pred + .addOperand(MI->getOperand(6)) + .addMemOperand(MMO); + MI->eraseFromParent(); + return BB; + } + case ARM::STRr_preidx: + case ARM::STRBr_preidx: + case ARM::STRH_preidx: { + unsigned NewOpc; + switch (MI->getOpcode()) { + default: llvm_unreachable("unexpected opcode!"); + case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; + case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; + case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; + } + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); + for (unsigned i = 0; i < MI->getNumOperands(); ++i) + MIB.addOperand(MI->getOperand(i)); + MI->eraseFromParent(); + return BB; + } + case ARM::ATOMIC_LOAD_ADD_I8: + return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr); + case ARM::ATOMIC_LOAD_ADD_I16: + return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr); + case ARM::ATOMIC_LOAD_ADD_I32: + return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr); + + case ARM::ATOMIC_LOAD_AND_I8: + return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); + case ARM::ATOMIC_LOAD_AND_I16: + return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); + case ARM::ATOMIC_LOAD_AND_I32: + return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); + + case ARM::ATOMIC_LOAD_OR_I8: + return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); + case ARM::ATOMIC_LOAD_OR_I16: + return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); + case ARM::ATOMIC_LOAD_OR_I32: + return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); + + case ARM::ATOMIC_LOAD_XOR_I8: + return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2EORrr : ARM::EORrr); + case ARM::ATOMIC_LOAD_XOR_I16: + return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2EORrr : ARM::EORrr); + case ARM::ATOMIC_LOAD_XOR_I32: + return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2EORrr : ARM::EORrr); + + case ARM::ATOMIC_LOAD_NAND_I8: + return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2BICrr : ARM::BICrr); + case ARM::ATOMIC_LOAD_NAND_I16: + return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2BICrr : ARM::BICrr); + case ARM::ATOMIC_LOAD_NAND_I32: + return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2BICrr : ARM::BICrr); + + case ARM::ATOMIC_LOAD_SUB_I8: + return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr); + case ARM::ATOMIC_LOAD_SUB_I16: + return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr); + case ARM::ATOMIC_LOAD_SUB_I32: + return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr); + + case ARM::ATOMIC_LOAD_MIN_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::LT); + case ARM::ATOMIC_LOAD_MIN_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::LT); + case ARM::ATOMIC_LOAD_MIN_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::LT); + + case ARM::ATOMIC_LOAD_MAX_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::GT); + case ARM::ATOMIC_LOAD_MAX_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::GT); + case ARM::ATOMIC_LOAD_MAX_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::GT); + + case ARM::ATOMIC_LOAD_UMIN_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::LO); + case ARM::ATOMIC_LOAD_UMIN_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::LO); + case ARM::ATOMIC_LOAD_UMIN_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::LO); + + case ARM::ATOMIC_LOAD_UMAX_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::HI); + case ARM::ATOMIC_LOAD_UMAX_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::HI); + case ARM::ATOMIC_LOAD_UMAX_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::HI); + + case ARM::ATOMIC_SWAP_I8: return EmitAtomicBinary(MI, BB, 1, 0); + case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0); + case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0); + + case ARM::ATOMIC_CMP_SWAP_I8: return EmitAtomicCmpSwap(MI, BB, 1); + case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2); + case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4); + + + case ARM::ATOMADD6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr, + isThumb2 ? ARM::t2ADCrr : ARM::ADCrr, + /*NeedsCarry*/ true); + case ARM::ATOMSUB6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, + isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, + /*NeedsCarry*/ true); + case ARM::ATOMOR6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr, + isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); + case ARM::ATOMXOR6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr, + isThumb2 ? ARM::t2EORrr : ARM::EORrr); + case ARM::ATOMAND6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr, + isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); + case ARM::ATOMSWAP6432: + return EmitAtomicBinary64(MI, BB, 0, 0, false); + case ARM::ATOMCMPXCHG6432: + return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, + isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, + /*NeedsCarry*/ false, /*IsCmpxchg*/true); + + case ARM::tMOVCCr_pseudo: { + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... + // cmpTY ccX, r1, r2 + // bCC copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) + .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + BB = sinkMBB; + BuildMI(*BB, BB->begin(), dl, + TII->get(ARM::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; + } + + case ARM::BCCi64: + case ARM::BCCZi64: { + // If there is an unconditional branch to the other successor, remove it. + BB->erase(llvm::next(MachineBasicBlock::iterator(MI)), BB->end()); + + // Compare both parts that make up the double comparison separately for + // equality. + bool RHSisZero = MI->getOpcode() == ARM::BCCZi64; + + unsigned LHS1 = MI->getOperand(1).getReg(); + unsigned LHS2 = MI->getOperand(2).getReg(); + if (RHSisZero) { + AddDefaultPred(BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(LHS1).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(LHS2).addImm(0) + .addImm(ARMCC::EQ).addReg(ARM::CPSR); + } else { + unsigned RHS1 = MI->getOperand(3).getReg(); + unsigned RHS2 = MI->getOperand(4).getReg(); + AddDefaultPred(BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(LHS1).addReg(RHS1)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(LHS2).addReg(RHS2) + .addImm(ARMCC::EQ).addReg(ARM::CPSR); + } + + MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB(); + MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); + if (MI->getOperand(0).getImm() == ARMCC::NE) + std::swap(destMBB, exitMBB); + + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); + if (isThumb2) + AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB)); + else + BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; + } + + case ARM::ABS: + case ARM::t2ABS: { + // To insert an ABS instruction, we have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // source vreg to test against 0, the destination vreg to set, + // the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + // It transforms + // V1 = ABS V0 + // into + // V2 = MOVS V0 + // BCC (branch to SinkBB if V0 >= 0) + // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) + // SinkBB: V1 = PHI(V2, V3) + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator BBI = BB; + ++BBI; + MachineFunction *Fn = BB->getParent(); + MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); + Fn->insert(BBI, RSBBB); + Fn->insert(BBI, SinkBB); + + unsigned int ABSSrcReg = MI->getOperand(1).getReg(); + unsigned int ABSDstReg = MI->getOperand(0).getReg(); + bool isThumb2 = Subtarget->isThumb2(); + MachineRegisterInfo &MRI = Fn->getRegInfo(); + // In Thumb mode S must not be specified if source register is the SP or + // PC and if destination register is the SP, so restrict register class + unsigned NewMovDstReg = MRI.createVirtualRegister( + isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass); + unsigned NewRsbDstReg = MRI.createVirtualRegister( + isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + SinkBB->splice(SinkBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + SinkBB->transferSuccessorsAndUpdatePHIs(BB); + + BB->addSuccessor(RSBBB); + BB->addSuccessor(SinkBB); + + // fall through to SinkMBB + RSBBB->addSuccessor(SinkBB); + + // insert a movs at the end of BB + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVr : ARM::MOVr), + NewMovDstReg) + .addReg(ABSSrcReg, RegState::Kill) + .addImm((unsigned)ARMCC::AL).addReg(0) + .addReg(ARM::CPSR, RegState::Define); + + // insert a bcc with opposite CC to ARMCC::MI at the end of BB + BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) + .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); + + // insert rsbri in RSBBB + // Note: BCC and rsbri will be converted into predicated rsbmi + // by if-conversion pass + BuildMI(*RSBBB, RSBBB->begin(), dl, + TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) + .addReg(NewMovDstReg, RegState::Kill) + .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + + // insert PHI in SinkBB, + // reuse ABSDstReg to not change uses of ABS instruction + BuildMI(*SinkBB, SinkBB->begin(), dl, + TII->get(ARM::PHI), ABSDstReg) + .addReg(NewRsbDstReg).addMBB(RSBBB) + .addReg(NewMovDstReg).addMBB(BB); + + // remove ABS instruction + MI->eraseFromParent(); + + // return last added BB + return SinkBB; + } + } +} + +void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const { + const MCInstrDesc &MCID = MI->getDesc(); + if (!MCID.hasPostISelHook()) { + assert(!convertAddSubFlagsOpcode(MI->getOpcode()) && + "Pseudo flag-setting opcodes must be marked with 'hasPostISelHook'"); + return; + } + + // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, + // RSC. Coming out of isel, they have an implicit CPSR def, but the optional + // operand is still set to noreg. If needed, set the optional operand's + // register to CPSR, and remove the redundant implicit def. + // + // e.g. ADCS (...opt:%noreg, CPSR<imp-def>) -> ADC (... opt:CPSR<def>). + + // Rename pseudo opcodes. + unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode()); + if (NewOpc) { + const ARMBaseInstrInfo *TII = + static_cast<const ARMBaseInstrInfo*>(getTargetMachine().getInstrInfo()); + MI->setDesc(TII->get(NewOpc)); + } + unsigned ccOutIdx = MCID.getNumOperands() - 1; + + // Any ARM instruction that sets the 's' bit should specify an optional + // "cc_out" operand in the last operand position. + if (!MCID.hasOptionalDef() || !MCID.OpInfo[ccOutIdx].isOptionalDef()) { + assert(!NewOpc && "Optional cc_out operand required"); + return; + } + // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it + // since we already have an optional CPSR def. + bool definesCPSR = false; + bool deadCPSR = false; + for (unsigned i = MCID.getNumOperands(), e = MI->getNumOperands(); + i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { + definesCPSR = true; + if (MO.isDead()) + deadCPSR = true; + MI->RemoveOperand(i); + break; + } + } + if (!definesCPSR) { + assert(!NewOpc && "Optional cc_out operand required"); + return; + } + assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); + if (deadCPSR) { + assert(!MI->getOperand(ccOutIdx).getReg() && + "expect uninitialized optional cc_out operand"); + return; + } + + // If this instruction was defined with an optional CPSR def and its dag node + // had a live implicit CPSR def, then activate the optional CPSR def. + MachineOperand &MO = MI->getOperand(ccOutIdx); + MO.setReg(ARM::CPSR); + MO.setIsDef(true); +} + +//===----------------------------------------------------------------------===// +// ARM Optimization Hooks +//===----------------------------------------------------------------------===// + +static +SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = N->getValueType(0); + unsigned Opc = N->getOpcode(); + bool isSlctCC = Slct.getOpcode() == ISD::SELECT_CC; + SDValue LHS = isSlctCC ? Slct.getOperand(2) : Slct.getOperand(1); + SDValue RHS = isSlctCC ? Slct.getOperand(3) : Slct.getOperand(2); + ISD::CondCode CC = ISD::SETCC_INVALID; + + if (isSlctCC) { + CC = cast<CondCodeSDNode>(Slct.getOperand(4))->get(); + } else { + SDValue CCOp = Slct.getOperand(0); + if (CCOp.getOpcode() == ISD::SETCC) + CC = cast<CondCodeSDNode>(CCOp.getOperand(2))->get(); + } + + bool DoXform = false; + bool InvCC = false; + assert ((Opc == ISD::ADD || (Opc == ISD::SUB && Slct == N->getOperand(1))) && + "Bad input!"); + + if (LHS.getOpcode() == ISD::Constant && + cast<ConstantSDNode>(LHS)->isNullValue()) { + DoXform = true; + } else if (CC != ISD::SETCC_INVALID && + RHS.getOpcode() == ISD::Constant && + cast<ConstantSDNode>(RHS)->isNullValue()) { + std::swap(LHS, RHS); + SDValue Op0 = Slct.getOperand(0); + EVT OpVT = isSlctCC ? Op0.getValueType() : + Op0.getOperand(0).getValueType(); + bool isInt = OpVT.isInteger(); + CC = ISD::getSetCCInverse(CC, isInt); + + if (!TLI.isCondCodeLegal(CC, OpVT)) + return SDValue(); // Inverse operator isn't legal. + + DoXform = true; + InvCC = true; + } + + if (DoXform) { + SDValue Result = DAG.getNode(Opc, RHS.getDebugLoc(), VT, OtherOp, RHS); + if (isSlctCC) + return DAG.getSelectCC(N->getDebugLoc(), OtherOp, Result, + Slct.getOperand(0), Slct.getOperand(1), CC); + SDValue CCOp = Slct.getOperand(0); + if (InvCC) + CCOp = DAG.getSetCC(Slct.getDebugLoc(), CCOp.getValueType(), + CCOp.getOperand(0), CCOp.getOperand(1), CC); + return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT, + CCOp, OtherOp, Result); + } + return SDValue(); +} + +// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction +// (only after legalization). +static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + + // Only perform optimization if after legalize, and if NEON is available. We + // also expected both operands to be BUILD_VECTORs. + if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() + || N0.getOpcode() != ISD::BUILD_VECTOR + || N1.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + // Check output type since VPADDL operand elements can only be 8, 16, or 32. + EVT VT = N->getValueType(0); + if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64) + return SDValue(); + + // Check that the vector operands are of the right form. + // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR + // operands, where N is the size of the formed vector. + // Each EXTRACT_VECTOR should have the same input vector and odd or even + // index such that we have a pair wise add pattern. + + // Grab the vector that all EXTRACT_VECTOR nodes should be referencing. + if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + SDValue Vec = N0->getOperand(0)->getOperand(0); + SDNode *V = Vec.getNode(); + unsigned nextIndex = 0; + + // For each operands to the ADD which are BUILD_VECTORs, + // check to see if each of their operands are an EXTRACT_VECTOR with + // the same vector and appropriate index. + for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) { + if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT + && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + + SDValue ExtVec0 = N0->getOperand(i); + SDValue ExtVec1 = N1->getOperand(i); + + // First operand is the vector, verify its the same. + if (V != ExtVec0->getOperand(0).getNode() || + V != ExtVec1->getOperand(0).getNode()) + return SDValue(); + + // Second is the constant, verify its correct. + ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1)); + ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1)); + + // For the constant, we want to see all the even or all the odd. + if (!C0 || !C1 || C0->getZExtValue() != nextIndex + || C1->getZExtValue() != nextIndex+1) + return SDValue(); + + // Increment index. + nextIndex+=2; + } else + return SDValue(); + } + + // Create VPADDL node. + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Build operand list. + SmallVector<SDValue, 8> Ops; + Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, + TLI.getPointerTy())); + + // Input is the vector. + Ops.push_back(Vec); + + // Get widened type and narrowed type. + MVT widenType; + unsigned numElem = VT.getVectorNumElements(); + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { + case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; + case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; + case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; + default: + assert(0 && "Invalid vector element type for padd optimization."); + } + + SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), + widenType, &Ops[0], Ops.size()); + return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp); +} + +/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with +/// operands N0 and N1. This is a helper for PerformADDCombine that is +/// called with the default operands, and if that fails, with commuted +/// operands. +static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget){ + + // Attempt to create vpaddl for this add. + SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget); + if (Result.getNode()) + return Result; + + // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) + if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) { + SDValue Result = combineSelectAndUse(N, N0, N1, DCI); + if (Result.getNode()) return Result; + } + return SDValue(); +} + +/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. +/// +static SDValue PerformADDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // First try with the default operand order. + SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget); + if (Result.getNode()) + return Result; + + // If that didn't work, try again with the operands commuted. + return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); +} + +/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. +/// +static SDValue PerformSUBCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) + if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) { + SDValue Result = combineSelectAndUse(N, N1, N0, DCI); + if (Result.getNode()) return Result; + } + + return SDValue(); +} + +/// PerformVMULCombine +/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the +/// special multiplier accumulator forwarding. +/// vmul d3, d0, d2 +/// vmla d3, d1, d2 +/// is faster than +/// vadd d3, d0, d1 +/// vmul d3, d3, d2 +static SDValue PerformVMULCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + if (!Subtarget->hasVMLxForwarding()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + unsigned Opcode = N0.getOpcode(); + if (Opcode != ISD::ADD && Opcode != ISD::SUB && + Opcode != ISD::FADD && Opcode != ISD::FSUB) { + Opcode = N1.getOpcode(); + if (Opcode != ISD::ADD && Opcode != ISD::SUB && + Opcode != ISD::FADD && Opcode != ISD::FSUB) + return SDValue(); + std::swap(N0, N1); + } + + EVT VT = N->getValueType(0); + DebugLoc DL = N->getDebugLoc(); + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + return DAG.getNode(Opcode, DL, VT, + DAG.getNode(ISD::MUL, DL, VT, N00, N1), + DAG.getNode(ISD::MUL, DL, VT, N01, N1)); +} + +static SDValue PerformMULCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + + if (Subtarget->isThumb1Only()) + return SDValue(); + + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT.is64BitVector() || VT.is128BitVector()) + return PerformVMULCombine(N, DCI, Subtarget); + if (VT != MVT::i32) + return SDValue(); + + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!C) + return SDValue(); + + uint64_t MulAmt = C->getZExtValue(); + unsigned ShiftAmt = CountTrailingZeros_64(MulAmt); + ShiftAmt = ShiftAmt & (32 - 1); + SDValue V = N->getOperand(0); + DebugLoc DL = N->getDebugLoc(); + + SDValue Res; + MulAmt >>= ShiftAmt; + if (isPowerOf2_32(MulAmt - 1)) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + Res = DAG.getNode(ISD::ADD, DL, VT, + V, DAG.getNode(ISD::SHL, DL, VT, + V, DAG.getConstant(Log2_32(MulAmt-1), + MVT::i32))); + } else if (isPowerOf2_32(MulAmt + 1)) { + // (mul x, 2^N - 1) => (sub (shl x, N), x) + Res = DAG.getNode(ISD::SUB, DL, VT, + DAG.getNode(ISD::SHL, DL, VT, + V, DAG.getConstant(Log2_32(MulAmt+1), + MVT::i32)), + V); + } else + return SDValue(); + + if (ShiftAmt != 0) + Res = DAG.getNode(ISD::SHL, DL, VT, Res, + DAG.getConstant(ShiftAmt, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + return SDValue(); +} + +static SDValue PerformANDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + + // Attempt to use immediate-form VBIC + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + + if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN && + BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + EVT VbicVT; + SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, VbicVT, VT.is128BitVector(), + OtherModImm); + if (Val.getNode()) { + SDValue Input = + DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); + SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); + } + } + } + + return SDValue(); +} + +/// PerformORCombine - Target-specific dag combine xforms for ISD::OR +static SDValue PerformORCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // Attempt to use immediate-form VORR + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + + if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN && Subtarget->hasNEON() && + BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + EVT VorrVT; + SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, VorrVT, VT.is128BitVector(), + OtherModImm); + if (Val.getNode()) { + SDValue Input = + DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); + SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); + } + } + } + + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + SDValue N1 = N->getOperand(1); + + // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. + if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && + DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + APInt SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + + BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); + APInt SplatBits0; + if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, + HasAnyUndefs) && !HasAnyUndefs) { + BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); + APInt SplatBits1; + if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, + HasAnyUndefs) && !HasAnyUndefs && + SplatBits0 == ~SplatBits1) { + // Canonicalize the vector type to make instruction selection simpler. + EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; + SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, + N0->getOperand(1), N0->getOperand(0), + N1->getOperand(0)); + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } + } + } + + // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when + // reasonable. + + // BFI is only available on V6T2+ + if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) + return SDValue(); + + DebugLoc DL = N->getDebugLoc(); + // 1) or (and A, mask), val => ARMbfi A, val, mask + // iff (val & mask) == val + // + // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask + // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) + // && mask == ~mask2 + // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) + // && ~mask == mask2 + // (i.e., copy a bitfield value into another bitfield of the same width) + + if (VT != MVT::i32) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + + // The value and the mask need to be constants so we can verify this is + // actually a bitfield set. If the mask is 0xffff, we can do better + // via a movt instruction, so don't use BFI in that case. + SDValue MaskOp = N0.getOperand(1); + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp); + if (!MaskC) + return SDValue(); + unsigned Mask = MaskC->getZExtValue(); + if (Mask == 0xffff) + return SDValue(); + SDValue Res; + // Case (1): or (and A, mask), val => ARMbfi A, val, mask + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + if (N1C) { + unsigned Val = N1C->getZExtValue(); + if ((Val & ~Mask) != Val) + return SDValue(); + + if (ARM::isBitFieldInvertedMask(Mask)) { + Val >>= CountTrailingZeros_32(~Mask); + + Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, + DAG.getConstant(Val, MVT::i32), + DAG.getConstant(Mask, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + return SDValue(); + } + } else if (N1.getOpcode() == ISD::AND) { + // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask + ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); + if (!N11C) + return SDValue(); + unsigned Mask2 = N11C->getZExtValue(); + + // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern + // as is to match. + if (ARM::isBitFieldInvertedMask(Mask) && + (Mask == ~Mask2)) { + // The pack halfword instruction works better for masks that fit it, + // so use that when it's available. + if (Subtarget->hasT2ExtractPack() && + (Mask == 0xffff || Mask == 0xffff0000)) + return SDValue(); + // 2a + unsigned amt = CountTrailingZeros_32(Mask2); + Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), + DAG.getConstant(amt, MVT::i32)); + Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, + DAG.getConstant(Mask, MVT::i32)); + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + return SDValue(); + } else if (ARM::isBitFieldInvertedMask(~Mask) && + (~Mask == Mask2)) { + // The pack halfword instruction works better for masks that fit it, + // so use that when it's available. + if (Subtarget->hasT2ExtractPack() && + (Mask2 == 0xffff || Mask2 == 0xffff0000)) + return SDValue(); + // 2b + unsigned lsb = CountTrailingZeros_32(Mask); + Res = DAG.getNode(ISD::SRL, DL, VT, N00, + DAG.getConstant(lsb, MVT::i32)); + Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, + DAG.getConstant(Mask2, MVT::i32)); + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + return SDValue(); + } + } + + if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) && + N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) && + ARM::isBitFieldInvertedMask(~Mask)) { + // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask + // where lsb(mask) == #shamt and masked bits of B are known zero. + SDValue ShAmt = N00.getOperand(1); + unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); + unsigned LSB = CountTrailingZeros_32(Mask); + if (ShAmtC != LSB) + return SDValue(); + + Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), + DAG.getConstant(~Mask, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + } + + return SDValue(); +} + +/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff +/// the bits being cleared by the AND are not demanded by the BFI. +static SDValue PerformBFICombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() == ISD::AND) { + ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); + if (!N11C) + return SDValue(); + unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); + unsigned LSB = CountTrailingZeros_32(~InvMask); + unsigned Width = (32 - CountLeadingZeros_32(~InvMask)) - LSB; + unsigned Mask = (1 << Width)-1; + unsigned Mask2 = N11C->getZExtValue(); + if ((Mask & (~Mask2)) == 0) + return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0), + N->getOperand(0), N1.getOperand(0), + N->getOperand(2)); + } + return SDValue(); +} + +/// PerformVMOVRRDCombine - Target-specific dag combine xforms for +/// ARMISD::VMOVRRD. +static SDValue PerformVMOVRRDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // vmovrrd(vmovdrr x, y) -> x,y + SDValue InDouble = N->getOperand(0); + if (InDouble.getOpcode() == ARMISD::VMOVDRR) + return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); + + // vmovrrd(load f64) -> (load i32), (load i32) + SDNode *InNode = InDouble.getNode(); + if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && + InNode->getValueType(0) == MVT::f64 && + InNode->getOperand(1).getOpcode() == ISD::FrameIndex && + !cast<LoadSDNode>(InNode)->isVolatile()) { + // TODO: Should this be done for non-FrameIndex operands? + LoadSDNode *LD = cast<LoadSDNode>(InNode); + + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = LD->getDebugLoc(); + SDValue BasePtr = LD->getBasePtr(); + SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, + LD->getPointerInfo(), LD->isVolatile(), + LD->isNonTemporal(), LD->getAlignment()); + + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, + DAG.getConstant(4, MVT::i32)); + SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, + LD->getPointerInfo(), LD->isVolatile(), + LD->isNonTemporal(), + std::min(4U, LD->getAlignment() / 2)); + + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); + SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); + DCI.RemoveFromWorklist(LD); + DAG.DeleteNode(LD); + return Result; + } + + return SDValue(); +} + +/// PerformVMOVDRRCombine - Target-specific dag combine xforms for +/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. +static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { + // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Op0.getOpcode() == ISD::BITCAST) + Op0 = Op0.getOperand(0); + if (Op1.getOpcode() == ISD::BITCAST) + Op1 = Op1.getOperand(0); + if (Op0.getOpcode() == ARMISD::VMOVRRD && + Op0.getNode() == Op1.getNode() && + Op0.getResNo() == 0 && Op1.getResNo() == 1) + return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), + N->getValueType(0), Op0.getOperand(0)); + return SDValue(); +} + +/// PerformSTORECombine - Target-specific dag combine xforms for +/// ISD::STORE. +static SDValue PerformSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // Bitcast an i64 store extracted from a vector to f64. + // Otherwise, the i64 value will be legalized to a pair of i32 values. + StoreSDNode *St = cast<StoreSDNode>(N); + SDValue StVal = St->getValue(); + if (!ISD::isNormalStore(St) || St->isVolatile()) + return SDValue(); + + if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && + StVal.getNode()->hasOneUse() && !St->isVolatile()) { + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = St->getDebugLoc(); + SDValue BasePtr = St->getBasePtr(); + SDValue NewST1 = DAG.getStore(St->getChain(), DL, + StVal.getNode()->getOperand(0), BasePtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, + DAG.getConstant(4, MVT::i32)); + return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1), + OffsetPtr, St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), + std::min(4U, St->getAlignment() / 2)); + } + + if (StVal.getValueType() != MVT::i64 || + StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + DebugLoc dl = StVal.getDebugLoc(); + SDValue IntVec = StVal.getOperand(0); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, + IntVec.getValueType().getVectorNumElements()); + SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); + SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + Vec, StVal.getOperand(1)); + dl = N->getDebugLoc(); + SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(Vec.getNode()); + DCI.AddToWorklist(ExtElt.getNode()); + DCI.AddToWorklist(V.getNode()); + return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment(), + St->getTBAAInfo()); +} + +/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node +/// are normal, non-volatile loads. If so, it is profitable to bitcast an +/// i64 vector to have f64 elements, since the value can then be loaded +/// directly into a VFP register. +static bool hasNormalLoadOperand(SDNode *N) { + unsigned NumElts = N->getValueType(0).getVectorNumElements(); + for (unsigned i = 0; i < NumElts; ++i) { + SDNode *Elt = N->getOperand(i).getNode(); + if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) + return true; + } + return false; +} + +/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for +/// ISD::BUILD_VECTOR. +static SDValue PerformBUILD_VECTORCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI){ + // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): + // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value + // into a pair of GPRs, which is fine when the value is used as a scalar, + // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. + SelectionDAG &DAG = DCI.DAG; + if (N->getNumOperands() == 2) { + SDValue RV = PerformVMOVDRRCombine(N, DAG); + if (RV.getNode()) + return RV; + } + + // Load i64 elements as f64 values so that type legalization does not split + // them up into i32 values. + EVT VT = N->getValueType(0); + if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) + return SDValue(); + DebugLoc dl = N->getDebugLoc(); + SmallVector<SDValue, 8> Ops; + unsigned NumElts = VT.getVectorNumElements(); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); + Ops.push_back(V); + // Make the DAGCombiner fold the bitcast. + DCI.AddToWorklist(V.getNode()); + } + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops.data(), NumElts); + return DAG.getNode(ISD::BITCAST, dl, VT, BV); +} + +/// PerformInsertEltCombine - Target-specific dag combine xforms for +/// ISD::INSERT_VECTOR_ELT. +static SDValue PerformInsertEltCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // Bitcast an i64 load inserted into a vector to f64. + // Otherwise, the i64 value will be legalized to a pair of i32 values. + EVT VT = N->getValueType(0); + SDNode *Elt = N->getOperand(1).getNode(); + if (VT.getVectorElementType() != MVT::i64 || + !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + DebugLoc dl = N->getDebugLoc(); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, + VT.getVectorNumElements()); + SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); + SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(Vec.getNode()); + DCI.AddToWorklist(V.getNode()); + SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, + Vec, V, N->getOperand(2)); + return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); +} + +/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for +/// ISD::VECTOR_SHUFFLE. +static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { + // The LLVM shufflevector instruction does not require the shuffle mask + // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does + // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the + // operands do not match the mask length, they are extended by concatenating + // them with undef vectors. That is probably the right thing for other + // targets, but for NEON it is better to concatenate two double-register + // size vector operands into a single quad-register size vector. Do that + // transformation here: + // shuffle(concat(v1, undef), concat(v2, undef)) -> + // shuffle(concat(v1, v2), undef) + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Op0.getOpcode() != ISD::CONCAT_VECTORS || + Op1.getOpcode() != ISD::CONCAT_VECTORS || + Op0.getNumOperands() != 2 || + Op1.getNumOperands() != 2) + return SDValue(); + SDValue Concat0Op1 = Op0.getOperand(1); + SDValue Concat1Op1 = Op1.getOperand(1); + if (Concat0Op1.getOpcode() != ISD::UNDEF || + Concat1Op1.getOpcode() != ISD::UNDEF) + return SDValue(); + // Skip the transformation if any of the types are illegal. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = N->getValueType(0); + if (!TLI.isTypeLegal(VT) || + !TLI.isTypeLegal(Concat0Op1.getValueType()) || + !TLI.isTypeLegal(Concat1Op1.getValueType())) + return SDValue(); + + SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT, + Op0.getOperand(0), Op1.getOperand(0)); + // Translate the shuffle mask. + SmallVector<int, 16> NewMask; + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfElts = NumElts/2; + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); + for (unsigned n = 0; n < NumElts; ++n) { + int MaskElt = SVN->getMaskElt(n); + int NewElt = -1; + if (MaskElt < (int)HalfElts) + NewElt = MaskElt; + else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) + NewElt = HalfElts + MaskElt - NumElts; + NewMask.push_back(NewElt); + } + return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat, + DAG.getUNDEF(VT), NewMask.data()); +} + +/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and +/// NEON load/store intrinsics to merge base address updates. +static SDValue CombineBaseUpdate(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || + N->getOpcode() == ISD::INTRINSIC_W_CHAIN); + unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); + SDValue Addr = N->getOperand(AddrOpIdx); + + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), + UE = Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD || + UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load/store. Otherwise, folding + // it would create a cycle. + if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) + continue; + + // Find the new opcode for the updating load/store. + bool isLoad = true; + bool isLaneOp = false; + unsigned NewOpc = 0; + unsigned NumVecs = 0; + if (isIntrinsic) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: assert(0 && "unexpected intrinsic for Neon base update"); + case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; break; + case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; + NumVecs = 2; break; + case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; + NumVecs = 3; break; + case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; + NumVecs = 4; break; + case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; + NumVecs = 2; isLaneOp = true; break; + case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; + NumVecs = 3; isLaneOp = true; break; + case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; + NumVecs = 4; isLaneOp = true; break; + case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLoad = false; break; + case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; + NumVecs = 2; isLoad = false; break; + case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; + NumVecs = 3; isLoad = false; break; + case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; + NumVecs = 4; isLoad = false; break; + case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; + NumVecs = 2; isLoad = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; + NumVecs = 3; isLoad = false; isLaneOp = true; break; + case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; + NumVecs = 4; isLoad = false; isLaneOp = true; break; + } + } else { + isLaneOp = true; + switch (N->getOpcode()) { + default: assert(0 && "unexpected opcode for Neon base update"); + case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; + case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; + case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; + } + } + + // Find the size of memory referenced by the load/store. + EVT VecTy; + if (isLoad) + VecTy = N->getValueType(0); + else + VecTy = N->getOperand(AddrOpIdx+1).getValueType(); + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (isLaneOp) + NumBytes /= VecTy.getVectorNumElements(); + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { + uint64_t IncVal = CInc->getZExtValue(); + if (IncVal != NumBytes) + continue; + } else if (NumBytes >= 3 * 16) { + // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two + // separate instructions that make it harder to use a non-constant update. + continue; + } + + // Create the new updating load/store node. + EVT Tys[6]; + unsigned NumResultVecs = (isLoad ? NumVecs : 0); + unsigned n; + for (n = 0; n < NumResultVecs; ++n) + Tys[n] = VecTy; + Tys[n++] = MVT::i32; + Tys[n] = MVT::Other; + SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2); + SmallVector<SDValue, 8> Ops; + Ops.push_back(N->getOperand(0)); // incoming chain + Ops.push_back(N->getOperand(AddrOpIdx)); + Ops.push_back(Inc); + for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) { + Ops.push_back(N->getOperand(i)); + } + MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys, + Ops.data(), Ops.size(), + MemInt->getMemoryVT(), + MemInt->getMemOperand()); + + // Update the uses. + std::vector<SDValue> NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) { + NewResults.push_back(SDValue(UpdN.getNode(), i)); + } + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + + break; + } + return SDValue(); +} + +/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a +/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic +/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and +/// return true. +static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + // vldN-dup instructions only support 64-bit vectors for N > 1. + if (!VT.is64BitVector()) + return false; + + // Check if the VDUPLANE operand is a vldN-dup intrinsic. + SDNode *VLD = N->getOperand(0).getNode(); + if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return false; + unsigned NumVecs = 0; + unsigned NewOpc = 0; + unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); + if (IntNo == Intrinsic::arm_neon_vld2lane) { + NumVecs = 2; + NewOpc = ARMISD::VLD2DUP; + } else if (IntNo == Intrinsic::arm_neon_vld3lane) { + NumVecs = 3; + NewOpc = ARMISD::VLD3DUP; + } else if (IntNo == Intrinsic::arm_neon_vld4lane) { + NumVecs = 4; + NewOpc = ARMISD::VLD4DUP; + } else { + return false; + } + + // First check that all the vldN-lane uses are VDUPLANEs and that the lane + // numbers match the load. + unsigned VLDLaneNo = + cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue(); + for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); + UI != UE; ++UI) { + // Ignore uses of the chain result. + if (UI.getUse().getResNo() == NumVecs) + continue; + SDNode *User = *UI; + if (User->getOpcode() != ARMISD::VDUPLANE || + VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) + return false; + } + + // Create the vldN-dup node. + EVT Tys[5]; + unsigned n; + for (n = 0; n < NumVecs; ++n) + Tys[n] = VT; + Tys[n] = MVT::Other; + SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1); + SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; + MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); + SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys, + Ops, 2, VLDMemInt->getMemoryVT(), + VLDMemInt->getMemOperand()); + + // Update the uses. + for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); + UI != UE; ++UI) { + unsigned ResNo = UI.getUse().getResNo(); + // Ignore uses of the chain result. + if (ResNo == NumVecs) + continue; + SDNode *User = *UI; + DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); + } + + // Now the vldN-lane intrinsic is dead except for its chain result. + // Update uses of the chain. + std::vector<SDValue> VLDDupResults; + for (unsigned n = 0; n < NumVecs; ++n) + VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); + VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); + DCI.CombineTo(VLD, VLDDupResults); + + return true; +} + +/// PerformVDUPLANECombine - Target-specific dag combine xforms for +/// ARMISD::VDUPLANE. +static SDValue PerformVDUPLANECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue Op = N->getOperand(0); + + // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses + // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. + if (CombineVLDDUP(N, DCI)) + return SDValue(N, 0); + + // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is + // redundant. Ignore bit_converts for now; element sizes are checked below. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) + return SDValue(); + + // Make sure the VMOV element size is not bigger than the VDUPLANE elements. + unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits(); + // The canonical VMOV for a zero vector uses a 32-bit element size. + unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned EltBits; + if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) + EltSize = 8; + EVT VT = N->getValueType(0); + if (EltSize > VT.getVectorElementType().getSizeInBits()) + return SDValue(); + + return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); +} + +// isConstVecPow2 - Return true if each vector element is a power of 2, all +// elements are the same constant, C, and Log2(C) ranges from 1 to 32. +static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C) +{ + integerPart cN; + integerPart c0 = 0; + for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements(); + I != E; I++) { + ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I)); + if (!C) + return false; + + bool isExact; + APFloat APF = C->getValueAPF(); + if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact) + != APFloat::opOK || !isExact) + return false; + + c0 = (I == 0) ? cN : c0; + if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32) + return false; + } + C = c0; + return true; +} + +/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) +/// can replace combinations of VMUL and VCVT (floating-point to integer) +/// when the VMUL has a constant operand that is a power of 2. +/// +/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): +/// vmul.f32 d16, d17, d16 +/// vcvt.s32.f32 d16, d16 +/// becomes: +/// vcvt.s32.f32 d16, d16, #3 +static SDValue PerformVCVTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + SDValue Op = N->getOperand(0); + + if (!Subtarget->hasNEON() || !Op.getValueType().isVector() || + Op.getOpcode() != ISD::FMUL) + return SDValue(); + + uint64_t C; + SDValue N0 = Op->getOperand(0); + SDValue ConstVec = Op->getOperand(1); + bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; + + if (ConstVec.getOpcode() != ISD::BUILD_VECTOR || + !isConstVecPow2(ConstVec, isSigned, C)) + return SDValue(); + + unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : + Intrinsic::arm_neon_vcvtfp2fxu; + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), + N->getValueType(0), + DAG.getConstant(IntrinsicOpcode, MVT::i32), N0, + DAG.getConstant(Log2_64(C), MVT::i32)); +} + +/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) +/// can replace combinations of VCVT (integer to floating-point) and VDIV +/// when the VDIV has a constant operand that is a power of 2. +/// +/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): +/// vcvt.f32.s32 d16, d16 +/// vdiv.f32 d16, d17, d16 +/// becomes: +/// vcvt.f32.s32 d16, d16, #3 +static SDValue PerformVDIVCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + SDValue Op = N->getOperand(0); + unsigned OpOpcode = Op.getNode()->getOpcode(); + + if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() || + (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) + return SDValue(); + + uint64_t C; + SDValue ConstVec = N->getOperand(1); + bool isSigned = OpOpcode == ISD::SINT_TO_FP; + + if (ConstVec.getOpcode() != ISD::BUILD_VECTOR || + !isConstVecPow2(ConstVec, isSigned, C)) + return SDValue(); + + unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : + Intrinsic::arm_neon_vcvtfxu2fp; + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), + Op.getValueType(), + DAG.getConstant(IntrinsicOpcode, MVT::i32), + Op.getOperand(0), DAG.getConstant(Log2_64(C), MVT::i32)); +} + +/// Getvshiftimm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift operation, where all the elements of the +/// build_vector must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, + HasAnyUndefs, ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} + +/// isVShiftLImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits for a left shift; or +/// 0 <= Value <= ElementBits for a long left shift. +static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (! getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); +} + +/// isVShiftRImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift right operation. For a shift opcode, the value +/// is positive, but for an intrinsic the value count must be negative. The +/// absolute value must be in the range: +/// 1 <= |Value| <= ElementBits for a right shift; or +/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, + int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (! getVShiftImm(Op, ElementBits, Cnt)) + return false; + if (isIntrinsic) + Cnt = -Cnt; + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); +} + +/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. +static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + // Don't do anything for most intrinsics. + break; + + // Vector shifts: check for immediate versions and lower them. + // Note: This is done during DAG combining instead of DAG legalizing because + // the build_vectors for 64-bit vector element shift counts are generally + // not legal, and it is hard to see their values after they get legalized to + // loads from a constant pool. + case Intrinsic::arm_neon_vshifts: + case Intrinsic::arm_neon_vshiftu: + case Intrinsic::arm_neon_vshiftls: + case Intrinsic::arm_neon_vshiftlu: + case Intrinsic::arm_neon_vshiftn: + case Intrinsic::arm_neon_vrshifts: + case Intrinsic::arm_neon_vrshiftu: + case Intrinsic::arm_neon_vrshiftn: + case Intrinsic::arm_neon_vqshifts: + case Intrinsic::arm_neon_vqshiftu: + case Intrinsic::arm_neon_vqshiftsu: + case Intrinsic::arm_neon_vqshiftns: + case Intrinsic::arm_neon_vqshiftnu: + case Intrinsic::arm_neon_vqshiftnsu: + case Intrinsic::arm_neon_vqrshiftns: + case Intrinsic::arm_neon_vqrshiftnu: + case Intrinsic::arm_neon_vqrshiftnsu: { + EVT VT = N->getOperand(1).getValueType(); + int64_t Cnt; + unsigned VShiftOpc = 0; + + switch (IntNo) { + case Intrinsic::arm_neon_vshifts: + case Intrinsic::arm_neon_vshiftu: + if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { + VShiftOpc = ARMISD::VSHL; + break; + } + if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { + VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? + ARMISD::VSHRs : ARMISD::VSHRu); + break; + } + return SDValue(); + + case Intrinsic::arm_neon_vshiftls: + case Intrinsic::arm_neon_vshiftlu: + if (isVShiftLImm(N->getOperand(2), VT, true, Cnt)) + break; + llvm_unreachable("invalid shift count for vshll intrinsic"); + + case Intrinsic::arm_neon_vrshifts: + case Intrinsic::arm_neon_vrshiftu: + if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) + break; + return SDValue(); + + case Intrinsic::arm_neon_vqshifts: + case Intrinsic::arm_neon_vqshiftu: + if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) + break; + return SDValue(); + + case Intrinsic::arm_neon_vqshiftsu: + if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) + break; + llvm_unreachable("invalid shift count for vqshlu intrinsic"); + + case Intrinsic::arm_neon_vshiftn: + case Intrinsic::arm_neon_vrshiftn: + case Intrinsic::arm_neon_vqshiftns: + case Intrinsic::arm_neon_vqshiftnu: + case Intrinsic::arm_neon_vqshiftnsu: + case Intrinsic::arm_neon_vqrshiftns: + case Intrinsic::arm_neon_vqrshiftnu: + case Intrinsic::arm_neon_vqrshiftnsu: + // Narrowing shifts require an immediate right shift. + if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) + break; + llvm_unreachable("invalid shift count for narrowing vector shift " + "intrinsic"); + + default: + llvm_unreachable("unhandled vector shift"); + } + + switch (IntNo) { + case Intrinsic::arm_neon_vshifts: + case Intrinsic::arm_neon_vshiftu: + // Opcode already set above. + break; + case Intrinsic::arm_neon_vshiftls: + case Intrinsic::arm_neon_vshiftlu: + if (Cnt == VT.getVectorElementType().getSizeInBits()) + VShiftOpc = ARMISD::VSHLLi; + else + VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ? + ARMISD::VSHLLs : ARMISD::VSHLLu); + break; + case Intrinsic::arm_neon_vshiftn: + VShiftOpc = ARMISD::VSHRN; break; + case Intrinsic::arm_neon_vrshifts: + VShiftOpc = ARMISD::VRSHRs; break; + case Intrinsic::arm_neon_vrshiftu: + VShiftOpc = ARMISD::VRSHRu; break; + case Intrinsic::arm_neon_vrshiftn: + VShiftOpc = ARMISD::VRSHRN; break; + case Intrinsic::arm_neon_vqshifts: + VShiftOpc = ARMISD::VQSHLs; break; + case Intrinsic::arm_neon_vqshiftu: + VShiftOpc = ARMISD::VQSHLu; break; + case Intrinsic::arm_neon_vqshiftsu: + VShiftOpc = ARMISD::VQSHLsu; break; + case Intrinsic::arm_neon_vqshiftns: + VShiftOpc = ARMISD::VQSHRNs; break; + case Intrinsic::arm_neon_vqshiftnu: + VShiftOpc = ARMISD::VQSHRNu; break; + case Intrinsic::arm_neon_vqshiftnsu: + VShiftOpc = ARMISD::VQSHRNsu; break; + case Intrinsic::arm_neon_vqrshiftns: + VShiftOpc = ARMISD::VQRSHRNs; break; + case Intrinsic::arm_neon_vqrshiftnu: + VShiftOpc = ARMISD::VQRSHRNu; break; + case Intrinsic::arm_neon_vqrshiftnsu: + VShiftOpc = ARMISD::VQRSHRNsu; break; + } + + return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0), + N->getOperand(1), DAG.getConstant(Cnt, MVT::i32)); + } + + case Intrinsic::arm_neon_vshiftins: { + EVT VT = N->getOperand(1).getValueType(); + int64_t Cnt; + unsigned VShiftOpc = 0; + + if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) + VShiftOpc = ARMISD::VSLI; + else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) + VShiftOpc = ARMISD::VSRI; + else { + llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); + } + + return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0), + N->getOperand(1), N->getOperand(2), + DAG.getConstant(Cnt, MVT::i32)); + } + + case Intrinsic::arm_neon_vqrshifts: + case Intrinsic::arm_neon_vqrshiftu: + // No immediate versions of these to check for. + break; + } + + return SDValue(); +} + +/// PerformShiftCombine - Checks for immediate versions of vector shifts and +/// lowers them. As with the vector shift intrinsics, this is done during DAG +/// combining instead of DAG legalizing because the build_vectors for 64-bit +/// vector element shift counts are generally not legal, and it is hard to see +/// their values after they get legalized to loads from a constant pool. +static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + + // Nothing to be done for scalar shifts. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!VT.isVector() || !TLI.isTypeLegal(VT)) + return SDValue(); + + assert(ST->hasNEON() && "unexpected vector shift"); + int64_t Cnt; + + switch (N->getOpcode()) { + default: llvm_unreachable("unexpected shift opcode"); + + case ISD::SHL: + if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) + return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + break; + + case ISD::SRA: + case ISD::SRL: + if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { + unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? + ARMISD::VSHRs : ARMISD::VSHRu); + return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + } + } + return SDValue(); +} + +/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, +/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. +static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue N0 = N->getOperand(0); + + // Check for sign- and zero-extensions of vector extract operations of 8- + // and 16-bit vector elements. NEON supports these directly. They are + // handled during DAG combining because type legalization will promote them + // to 32-bit types and it is messy to recognize the operations after that. + if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + SDValue Vec = N0.getOperand(0); + SDValue Lane = N0.getOperand(1); + EVT VT = N->getValueType(0); + EVT EltVT = N0.getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (VT == MVT::i32 && + (EltVT == MVT::i8 || EltVT == MVT::i16) && + TLI.isTypeLegal(Vec.getValueType()) && + isa<ConstantSDNode>(Lane)) { + + unsigned Opc = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case ISD::SIGN_EXTEND: + Opc = ARMISD::VGETLANEs; + break; + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + Opc = ARMISD::VGETLANEu; + break; + } + return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane); + } + } + + return SDValue(); +} + +/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC +/// to match f32 max/min patterns to use NEON vmax/vmin instructions. +static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + // If the target supports NEON, try to use vmax/vmin instructions for f32 + // selects like "x < y ? x : y". Unless the NoNaNsFPMath option is set, + // be careful about NaNs: NEON's vmax/vmin return NaN if either operand is + // a NaN; only do the transformation when it matches that behavior. + + // For now only do this when using NEON for FP operations; if using VFP, it + // is not obvious that the benefit outweighs the cost of switching to the + // NEON pipeline. + if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() || + N->getValueType(0) != MVT::f32) + return SDValue(); + + SDValue CondLHS = N->getOperand(0); + SDValue CondRHS = N->getOperand(1); + SDValue LHS = N->getOperand(2); + SDValue RHS = N->getOperand(3); + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); + + unsigned Opcode = 0; + bool IsReversed; + if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) { + IsReversed = false; // x CC y ? x : y + } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) { + IsReversed = true ; // x CC y ? y : x + } else { + return SDValue(); + } + + bool IsUnordered; + switch (CC) { + default: break; + case ISD::SETOLT: + case ISD::SETOLE: + case ISD::SETLT: + case ISD::SETLE: + case ISD::SETULT: + case ISD::SETULE: + // If LHS is NaN, an ordered comparison will be false and the result will + // be the RHS, but vmin(NaN, RHS) = NaN. Avoid this by checking that LHS + // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. + IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE); + if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) + break; + // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin + // will return -0, so vmin can only be used for unsafe math or if one of + // the operands is known to be nonzero. + if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) && + !UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN; + break; + + case ISD::SETOGT: + case ISD::SETOGE: + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + // If LHS is NaN, an ordered comparison will be false and the result will + // be the RHS, but vmax(NaN, RHS) = NaN. Avoid this by checking that LHS + // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. + IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE); + if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) + break; + // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax + // will return +0, so vmax can only be used for unsafe math or if one of + // the operands is known to be nonzero. + if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) && + !UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX; + break; + } + + if (!Opcode) + return SDValue(); + return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS); +} + +/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. +SDValue +ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { + SDValue Cmp = N->getOperand(4); + if (Cmp.getOpcode() != ARMISD::CMPZ) + // Only looking at EQ and NE cases. + return SDValue(); + + EVT VT = N->getValueType(0); + DebugLoc dl = N->getDebugLoc(); + SDValue LHS = Cmp.getOperand(0); + SDValue RHS = Cmp.getOperand(1); + SDValue FalseVal = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue ARMcc = N->getOperand(2); + ARMCC::CondCodes CC = + (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); + + // Simplify + // mov r1, r0 + // cmp r1, x + // mov r0, y + // moveq r0, x + // to + // cmp r0, x + // movne r0, y + // + // mov r1, r0 + // cmp r1, x + // mov r0, x + // movne r0, y + // to + // cmp r0, x + // movne r0, y + /// FIXME: Turn this into a target neutral optimization? + SDValue Res; + if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { + Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, + N->getOperand(3), Cmp); + } else if (CC == ARMCC::EQ && TrueVal == RHS) { + SDValue ARMcc; + SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); + Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, + N->getOperand(3), NewCmp); + } + + if (Res.getNode()) { + APInt KnownZero, KnownOne; + APInt Mask = APInt::getAllOnesValue(VT.getScalarType().getSizeInBits()); + DAG.ComputeMaskedBits(SDValue(N,0), Mask, KnownZero, KnownOne); + // Capture demanded bits information that would be otherwise lost. + if (KnownZero == 0xfffffffe) + Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, + DAG.getValueType(MVT::i1)); + else if (KnownZero == 0xffffff00) + Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, + DAG.getValueType(MVT::i8)); + else if (KnownZero == 0xffff0000) + Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, + DAG.getValueType(MVT::i16)); + } + + return Res; +} + +SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + switch (N->getOpcode()) { + default: break; + case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); + case ISD::SUB: return PerformSUBCombine(N, DCI); + case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); + case ISD::OR: return PerformORCombine(N, DCI, Subtarget); + case ISD::AND: return PerformANDCombine(N, DCI); + case ARMISD::BFI: return PerformBFICombine(N, DCI); + case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI); + case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); + case ISD::STORE: return PerformSTORECombine(N, DCI); + case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI); + case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); + case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); + case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget); + case ISD::FDIV: return PerformVDIVCombine(N, DCI, Subtarget); + case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget); + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); + case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); + case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); + case ARMISD::VLD2DUP: + case ARMISD::VLD3DUP: + case ARMISD::VLD4DUP: + return CombineBaseUpdate(N, DCI); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: + return CombineBaseUpdate(N, DCI); + default: break; + } + break; + } + return SDValue(); +} + +bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, + EVT VT) const { + return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); +} + +bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { + if (!Subtarget->allowsUnalignedMem()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + default: + return false; + case MVT::i8: + case MVT::i16: + case MVT::i32: + return true; + // FIXME: VLD1 etc with standard alignment is legal. + } +} + +static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { + if (V < 0) + return false; + + unsigned Scale = 1; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + // Scale == 1; + break; + case MVT::i16: + // Scale == 2; + Scale = 2; + break; + case MVT::i32: + // Scale == 4; + Scale = 4; + break; + } + + if ((V & (Scale - 1)) != 0) + return false; + V /= Scale; + return V == (V & ((1LL << 5) - 1)); +} + +static bool isLegalT2AddressImmediate(int64_t V, EVT VT, + const ARMSubtarget *Subtarget) { + bool isNeg = false; + if (V < 0) { + isNeg = true; + V = - V; + } + + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + // + imm12 or - imm8 + if (isNeg) + return V == (V & ((1LL << 8) - 1)); + return V == (V & ((1LL << 12) - 1)); + case MVT::f32: + case MVT::f64: + // Same as ARM mode. FIXME: NEON? + if (!Subtarget->hasVFP2()) + return false; + if ((V & 3) != 0) + return false; + V >>= 2; + return V == (V & ((1LL << 8) - 1)); + } +} + +/// isLegalAddressImmediate - Return true if the integer value can be used +/// as the offset of the target addressing mode for load / store of the +/// given type. +static bool isLegalAddressImmediate(int64_t V, EVT VT, + const ARMSubtarget *Subtarget) { + if (V == 0) + return true; + + if (!VT.isSimple()) + return false; + + if (Subtarget->isThumb1Only()) + return isLegalT1AddressImmediate(V, VT); + else if (Subtarget->isThumb2()) + return isLegalT2AddressImmediate(V, VT, Subtarget); + + // ARM mode. + if (V < 0) + V = - V; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i32: + // +- imm12 + return V == (V & ((1LL << 12) - 1)); + case MVT::i16: + // +- imm8 + return V == (V & ((1LL << 8) - 1)); + case MVT::f32: + case MVT::f64: + if (!Subtarget->hasVFP2()) // FIXME: NEON? + return false; + if ((V & 3) != 0) + return false; + V >>= 2; + return V == (V & ((1LL << 8) - 1)); + } +} + +bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, + EVT VT) const { + int Scale = AM.Scale; + if (Scale < 0) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + if (Scale == 1) + return true; + // r + r << imm + Scale = Scale & ~1; + return Scale == 2 || Scale == 4 || Scale == 8; + case MVT::i64: + // r + r + if (((unsigned)AM.HasBaseReg + Scale) <= 2) + return true; + return false; + case MVT::isVoid: + // Note, we allow "void" uses (basically, uses that aren't loads or + // stores), because arm allows folding a scale into many arithmetic + // operations. This should be made more precise and revisited later. + + // Allow r << imm, but the imm has to be a multiple of two. + if (Scale & 1) return false; + return isPowerOf2_32(Scale); + } +} + +/// isLegalAddressingMode - Return true if the addressing mode represented +/// by AM is legal for this target, for a load/store of the specified type. +bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const { + EVT VT = getValueType(Ty, true); + if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) + return false; + + // Can never fold addr of global into load/store. + if (AM.BaseGV) + return false; + + switch (AM.Scale) { + case 0: // no scale reg, must be "r+i" or "r", or "i". + break; + case 1: + if (Subtarget->isThumb1Only()) + return false; + // FALL THROUGH. + default: + // ARM doesn't support any R+R*scale+imm addr modes. + if (AM.BaseOffs) + return false; + + if (!VT.isSimple()) + return false; + + if (Subtarget->isThumb2()) + return isLegalT2ScaledAddressingMode(AM, VT); + + int Scale = AM.Scale; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i32: + if (Scale < 0) Scale = -Scale; + if (Scale == 1) + return true; + // r + r << imm + return isPowerOf2_32(Scale & ~1); + case MVT::i16: + case MVT::i64: + // r + r + if (((unsigned)AM.HasBaseReg + Scale) <= 2) + return true; + return false; + + case MVT::isVoid: + // Note, we allow "void" uses (basically, uses that aren't loads or + // stores), because arm allows folding a scale into many arithmetic + // operations. This should be made more precise and revisited later. + + // Allow r << imm, but the imm has to be a multiple of two. + if (Scale & 1) return false; + return isPowerOf2_32(Scale); + } + break; + } + return true; +} + +/// isLegalICmpImmediate - Return true if the specified immediate is legal +/// icmp immediate, that is the target has icmp instructions which can compare +/// a register against the immediate without having to materialize the +/// immediate into a register. +bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + if (!Subtarget->isThumb()) + return ARM_AM::getSOImmVal(Imm) != -1; + if (Subtarget->isThumb2()) + return ARM_AM::getT2SOImmVal(Imm) != -1; + return Imm >= 0 && Imm <= 255; +} + +/// isLegalAddImmediate - Return true if the specified immediate is legal +/// add immediate, that is the target has add instructions which can add +/// a register with the immediate without having to materialize the +/// immediate into a register. +bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { + return ARM_AM::getSOImmVal(Imm) != -1; +} + +static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, + bool isSEXTLoad, SDValue &Base, + SDValue &Offset, bool &isInc, + SelectionDAG &DAG) { + if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) + return false; + + if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { + // AddressingMode 3 + Base = Ptr->getOperand(0); + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC < 0 && RHSC > -256) { + assert(Ptr->getOpcode() == ISD::ADD); + isInc = false; + Offset = DAG.getConstant(-RHSC, RHS->getValueType(0)); + return true; + } + } + isInc = (Ptr->getOpcode() == ISD::ADD); + Offset = Ptr->getOperand(1); + return true; + } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { + // AddressingMode 2 + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC < 0 && RHSC > -0x1000) { + assert(Ptr->getOpcode() == ISD::ADD); + isInc = false; + Offset = DAG.getConstant(-RHSC, RHS->getValueType(0)); + Base = Ptr->getOperand(0); + return true; + } + } + + if (Ptr->getOpcode() == ISD::ADD) { + isInc = true; + ARM_AM::ShiftOpc ShOpcVal= + ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); + if (ShOpcVal != ARM_AM::no_shift) { + Base = Ptr->getOperand(1); + Offset = Ptr->getOperand(0); + } else { + Base = Ptr->getOperand(0); + Offset = Ptr->getOperand(1); + } + return true; + } + + isInc = (Ptr->getOpcode() == ISD::ADD); + Base = Ptr->getOperand(0); + Offset = Ptr->getOperand(1); + return true; + } + + // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. + return false; +} + +static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, + bool isSEXTLoad, SDValue &Base, + SDValue &Offset, bool &isInc, + SelectionDAG &DAG) { + if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) + return false; + + Base = Ptr->getOperand(0); + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (RHSC < 0 && RHSC > -0x100) { // 8 bits. + assert(Ptr->getOpcode() == ISD::ADD); + isInc = false; + Offset = DAG.getConstant(-RHSC, RHS->getValueType(0)); + return true; + } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. + isInc = Ptr->getOpcode() == ISD::ADD; + Offset = DAG.getConstant(RHSC, RHS->getValueType(0)); + return true; + } + } + + return false; +} + +/// getPreIndexedAddressParts - returns true by value, base pointer and +/// offset pointer and addressing mode by reference if the node's address +/// can be legally represented as pre-indexed load / store address. +bool +ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const { + if (Subtarget->isThumb1Only()) + return false; + + EVT VT; + SDValue Ptr; + bool isSEXTLoad = false; + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + Ptr = LD->getBasePtr(); + VT = LD->getMemoryVT(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + Ptr = ST->getBasePtr(); + VT = ST->getMemoryVT(); + } else + return false; + + bool isInc; + bool isLegal = false; + if (Subtarget->isThumb2()) + isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, + Offset, isInc, DAG); + else + isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, + Offset, isInc, DAG); + if (!isLegal) + return false; + + AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC; + return true; +} + +/// getPostIndexedAddressParts - returns true by value, base pointer and +/// offset pointer and addressing mode by reference if this node can be +/// combined with a load / store to form a post-indexed load / store. +bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, + SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const { + if (Subtarget->isThumb1Only()) + return false; + + EVT VT; + SDValue Ptr; + bool isSEXTLoad = false; + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + } else + return false; + + bool isInc; + bool isLegal = false; + if (Subtarget->isThumb2()) + isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + else + isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, + isInc, DAG); + if (!isLegal) + return false; + + if (Ptr != Base) { + // Swap base ptr and offset to catch more post-index load / store when + // it's legal. In Thumb2 mode, offset must be an immediate. + if (Ptr == Offset && Op->getOpcode() == ISD::ADD && + !Subtarget->isThumb2()) + std::swap(Base, Offset); + + // Post-indexed load / store update the base pointer. + if (Ptr != Base) + return false; + } + + AM = isInc ? ISD::POST_INC : ISD::POST_DEC; + return true; +} + +void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, + const APInt &Mask, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); + switch (Op.getOpcode()) { + default: break; + case ARMISD::CMOV: { + // Bits are known zero/one if known on the LHS and RHS. + DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero, KnownOne, Depth+1); + if (KnownZero == 0 && KnownOne == 0) return; + + APInt KnownZeroRHS, KnownOneRHS; + DAG.ComputeMaskedBits(Op.getOperand(1), Mask, + KnownZeroRHS, KnownOneRHS, Depth+1); + KnownZero &= KnownZeroRHS; + KnownOne &= KnownOneRHS; + return; + } + } +} + +//===----------------------------------------------------------------------===// +// ARM Inline Assembly Support +//===----------------------------------------------------------------------===// + +bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { + // Looking for "rev" which is V6+. + if (!Subtarget->hasV6Ops()) + return false; + + InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); + std::string AsmStr = IA->getAsmString(); + SmallVector<StringRef, 4> AsmPieces; + SplitString(AsmStr, AsmPieces, ";\n"); + + switch (AsmPieces.size()) { + default: return false; + case 1: + AsmStr = AsmPieces[0]; + AsmPieces.clear(); + SplitString(AsmStr, AsmPieces, " \t,"); + + // rev $0, $1 + if (AsmPieces.size() == 3 && + AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && + IA->getConstraintString().compare(0, 4, "=l,l") == 0) { + IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (Ty && Ty->getBitWidth() == 32) + return IntrinsicLowering::LowerToByteSwap(CI); + } + break; + } + + return false; +} + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +ARMTargetLowering::ConstraintType +ARMTargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 'l': return C_RegisterClass; + case 'w': return C_RegisterClass; + case 'h': return C_RegisterClass; + case 'x': return C_RegisterClass; + case 't': return C_RegisterClass; + case 'j': return C_Other; // Constant for movw. + // An address with a single base register. Due to the way we + // currently handle addresses it is the same as an 'r' memory constraint. + case 'Q': return C_Memory; + } + } else if (Constraint.size() == 2) { + switch (Constraint[0]) { + default: break; + // All 'U+' constraints are addresses. + case 'U': return C_Memory; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +ARMTargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (CallOperandVal == NULL) + return CW_Default; + Type *type = CallOperandVal->getType(); + // Look at the constraint type. + switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + case 'l': + if (type->isIntegerTy()) { + if (Subtarget->isThumb()) + weight = CW_SpecificReg; + else + weight = CW_Register; + } + break; + case 'w': + if (type->isFloatingPointTy()) + weight = CW_Register; + break; + } + return weight; +} + +typedef std::pair<unsigned, const TargetRegisterClass*> RCPair; +RCPair +ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const { + if (Constraint.size() == 1) { + // GCC ARM Constraint Letters + switch (Constraint[0]) { + case 'l': // Low regs or general regs. + if (Subtarget->isThumb()) + return RCPair(0U, ARM::tGPRRegisterClass); + else + return RCPair(0U, ARM::GPRRegisterClass); + case 'h': // High regs or no regs. + if (Subtarget->isThumb()) + return RCPair(0U, ARM::hGPRRegisterClass); + break; + case 'r': + return RCPair(0U, ARM::GPRRegisterClass); + case 'w': + if (VT == MVT::f32) + return RCPair(0U, ARM::SPRRegisterClass); + if (VT.getSizeInBits() == 64) + return RCPair(0U, ARM::DPRRegisterClass); + if (VT.getSizeInBits() == 128) + return RCPair(0U, ARM::QPRRegisterClass); + break; + case 'x': + if (VT == MVT::f32) + return RCPair(0U, ARM::SPR_8RegisterClass); + if (VT.getSizeInBits() == 64) + return RCPair(0U, ARM::DPR_8RegisterClass); + if (VT.getSizeInBits() == 128) + return RCPair(0U, ARM::QPR_8RegisterClass); + break; + case 't': + if (VT == MVT::f32) + return RCPair(0U, ARM::SPRRegisterClass); + break; + } + } + if (StringRef("{cc}").equals_lower(Constraint)) + return std::make_pair(unsigned(ARM::CPSR), ARM::CCRRegisterClass); + + return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); +} + +/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops +/// vector. If it is invalid, don't add anything to Ops. +void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue>&Ops, + SelectionDAG &DAG) const { + SDValue Result(0, 0); + + // Currently only support length 1 constraints. + if (Constraint.length() != 1) return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: break; + case 'j': + case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); + if (!C) + return; + + int64_t CVal64 = C->getSExtValue(); + int CVal = (int) CVal64; + // None of these constraints allow values larger than 32 bits. Check + // that the value fits in an int. + if (CVal != CVal64) + return; + + switch (ConstraintLetter) { + case 'j': + // Constant suitable for movw, must be between 0 and + // 65535. + if (Subtarget->hasV6T2Ops()) + if (CVal >= 0 && CVal <= 65535) + break; + return; + case 'I': + if (Subtarget->isThumb1Only()) { + // This must be a constant between 0 and 255, for ADD + // immediates. + if (CVal >= 0 && CVal <= 255) + break; + } else if (Subtarget->isThumb2()) { + // A constant that can be used as an immediate value in a + // data-processing instruction. + if (ARM_AM::getT2SOImmVal(CVal) != -1) + break; + } else { + // A constant that can be used as an immediate value in a + // data-processing instruction. + if (ARM_AM::getSOImmVal(CVal) != -1) + break; + } + return; + + case 'J': + if (Subtarget->isThumb()) { // FIXME thumb2 + // This must be a constant between -255 and -1, for negated ADD + // immediates. This can be used in GCC with an "n" modifier that + // prints the negated value, for use with SUB instructions. It is + // not useful otherwise but is implemented for compatibility. + if (CVal >= -255 && CVal <= -1) + break; + } else { + // This must be a constant between -4095 and 4095. It is not clear + // what this constraint is intended for. Implemented for + // compatibility with GCC. + if (CVal >= -4095 && CVal <= 4095) + break; + } + return; + + case 'K': + if (Subtarget->isThumb1Only()) { + // A 32-bit value where only one byte has a nonzero value. Exclude + // zero to match GCC. This constraint is used by GCC internally for + // constants that can be loaded with a move/shift combination. + // It is not useful otherwise but is implemented for compatibility. + if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) + break; + } else if (Subtarget->isThumb2()) { + // A constant whose bitwise inverse can be used as an immediate + // value in a data-processing instruction. This can be used in GCC + // with a "B" modifier that prints the inverted value, for use with + // BIC and MVN instructions. It is not useful otherwise but is + // implemented for compatibility. + if (ARM_AM::getT2SOImmVal(~CVal) != -1) + break; + } else { + // A constant whose bitwise inverse can be used as an immediate + // value in a data-processing instruction. This can be used in GCC + // with a "B" modifier that prints the inverted value, for use with + // BIC and MVN instructions. It is not useful otherwise but is + // implemented for compatibility. + if (ARM_AM::getSOImmVal(~CVal) != -1) + break; + } + return; + + case 'L': + if (Subtarget->isThumb1Only()) { + // This must be a constant between -7 and 7, + // for 3-operand ADD/SUB immediate instructions. + if (CVal >= -7 && CVal < 7) + break; + } else if (Subtarget->isThumb2()) { + // A constant whose negation can be used as an immediate value in a + // data-processing instruction. This can be used in GCC with an "n" + // modifier that prints the negated value, for use with SUB + // instructions. It is not useful otherwise but is implemented for + // compatibility. + if (ARM_AM::getT2SOImmVal(-CVal) != -1) + break; + } else { + // A constant whose negation can be used as an immediate value in a + // data-processing instruction. This can be used in GCC with an "n" + // modifier that prints the negated value, for use with SUB + // instructions. It is not useful otherwise but is implemented for + // compatibility. + if (ARM_AM::getSOImmVal(-CVal) != -1) + break; + } + return; + + case 'M': + if (Subtarget->isThumb()) { // FIXME thumb2 + // This must be a multiple of 4 between 0 and 1020, for + // ADD sp + immediate. + if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) + break; + } else { + // A power of two or a constant between 0 and 32. This is used in + // GCC for the shift amount on shifted register operands, but it is + // useful in general for any shift amounts. + if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) + break; + } + return; + + case 'N': + if (Subtarget->isThumb()) { // FIXME thumb2 + // This must be a constant between 0 and 31, for shift amounts. + if (CVal >= 0 && CVal <= 31) + break; + } + return; + + case 'O': + if (Subtarget->isThumb()) { // FIXME thumb2 + // This must be a multiple of 4 between -508 and 508, for + // ADD/SUB sp = sp + immediate. + if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) + break; + } + return; + } + Result = DAG.getTargetConstant(CVal, Op.getValueType()); + break; + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +bool +ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + // The ARM target isn't yet aware of offsets. + return false; +} + +bool ARM::isBitFieldInvertedMask(unsigned v) { + if (v == 0xffffffff) + return 0; + // there can be 1's on either or both "outsides", all the "inside" + // bits must be 0's + unsigned int lsb = 0, msb = 31; + while (v & (1 << msb)) --msb; + while (v & (1 << lsb)) ++lsb; + for (unsigned int i = lsb; i <= msb; ++i) { + if (v & (1 << i)) + return 0; + } + return 1; +} + +/// isFPImmLegal - Returns true if the target can instruction select the +/// specified FP immediate natively. If false, the legalizer will +/// materialize the FP immediate as a load from a constant pool. +bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + if (!Subtarget->hasVFP3()) + return false; + if (VT == MVT::f32) + return ARM_AM::getFP32Imm(Imm) != -1; + if (VT == MVT::f64) + return ARM_AM::getFP64Imm(Imm) != -1; + return false; +} + +/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as +/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment +/// specified in the intrinsic calls. +bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + switch (Intrinsic) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + uint64_t NumElts = getTargetData()->getTypeAllocSize(I.getType()) / 8; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); + Info.vol = false; // volatile loads with NEON intrinsics not supported + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += getTargetData()->getTypeAllocSize(ArgTy) / 8; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); + Info.vol = false; // volatile stores with NEON intrinsics not supported + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::arm_strexd: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i64; + Info.ptrVal = I.getArgOperand(2); + Info.offset = 0; + Info.align = 8; + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::arm_ldrexd: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i64; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = 8; + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + default: + break; + } + + return false; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h new file mode 100644 index 0000000..5da9b27 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -0,0 +1,540 @@ +//===-- ARMISelLowering.h - ARM DAG Lowering Interface ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that ARM uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMISELLOWERING_H +#define ARMISELLOWERING_H + +#include "ARMSubtarget.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include <vector> + +namespace llvm { + class ARMConstantPoolValue; + + namespace ARMISD { + // ARM Specific DAG Nodes + enum NodeType { + // Start the numbering where the builtin ops and target ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + Wrapper, // Wrapper - A wrapper node for TargetConstantPool, + // TargetExternalSymbol, and TargetGlobalAddress. + WrapperDYN, // WrapperDYN - A wrapper node for TargetGlobalAddress in + // DYN mode. + WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in + // PIC mode. + WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable + + CALL, // Function call. + CALL_PRED, // Function call that's predicable. + CALL_NOLINK, // Function call with branch not branch-and-link. + tCALL, // Thumb function call. + BRCOND, // Conditional branch. + BR_JT, // Jumptable branch. + BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). + RET_FLAG, // Return with a flag operand. + + PIC_ADD, // Add with a PC operand and a PIC label. + + CMP, // ARM compare instructions. + CMPZ, // ARM compare that sets only Z flag. + CMPFP, // ARM VFP compare instruction, sets FPSCR. + CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR. + FMSTAT, // ARM fmstat instruction. + CMOV, // ARM conditional move instructions. + + BCC_i64, + + RBIT, // ARM bitreverse instruction + + FTOSI, // FP to sint within a FP register. + FTOUI, // FP to uint within a FP register. + SITOF, // sint to FP within a FP register. + UITOF, // uint to FP within a FP register. + + SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. + SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out. + RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag. + + ADDC, // Add with carry + ADDE, // Add using carry + SUBC, // Sub with carry + SUBE, // Sub using carry + + VMOVRRD, // double to two gprs. + VMOVDRR, // Two gprs to double. + + EH_SJLJ_SETJMP, // SjLj exception handling setjmp. + EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + EH_SJLJ_DISPATCHSETUP, // SjLj exception handling dispatch setup. + + TC_RETURN, // Tail call return pseudo. + + THREAD_POINTER, + + DYN_ALLOC, // Dynamic allocation on the stack. + + MEMBARRIER, // Memory barrier (DMB) + MEMBARRIER_MCR, // Memory barrier (MCR) + + PRELOAD, // Preload + + VCEQ, // Vector compare equal. + VCEQZ, // Vector compare equal to zero. + VCGE, // Vector compare greater than or equal. + VCGEZ, // Vector compare greater than or equal to zero. + VCLEZ, // Vector compare less than or equal to zero. + VCGEU, // Vector compare unsigned greater than or equal. + VCGT, // Vector compare greater than. + VCGTZ, // Vector compare greater than zero. + VCLTZ, // Vector compare less than zero. + VCGTU, // Vector compare unsigned greater than. + VTST, // Vector test bits. + + // Vector shift by immediate: + VSHL, // ...left + VSHRs, // ...right (signed) + VSHRu, // ...right (unsigned) + VSHLLs, // ...left long (signed) + VSHLLu, // ...left long (unsigned) + VSHLLi, // ...left long (with maximum shift count) + VSHRN, // ...right narrow + + // Vector rounding shift by immediate: + VRSHRs, // ...right (signed) + VRSHRu, // ...right (unsigned) + VRSHRN, // ...right narrow + + // Vector saturating shift by immediate: + VQSHLs, // ...left (signed) + VQSHLu, // ...left (unsigned) + VQSHLsu, // ...left (signed to unsigned) + VQSHRNs, // ...right narrow (signed) + VQSHRNu, // ...right narrow (unsigned) + VQSHRNsu, // ...right narrow (signed to unsigned) + + // Vector saturating rounding shift by immediate: + VQRSHRNs, // ...right narrow (signed) + VQRSHRNu, // ...right narrow (unsigned) + VQRSHRNsu, // ...right narrow (signed to unsigned) + + // Vector shift and insert: + VSLI, // ...left + VSRI, // ...right + + // Vector get lane (VMOV scalar to ARM core register) + // (These are used for 8- and 16-bit element types only.) + VGETLANEu, // zero-extend vector extract element + VGETLANEs, // sign-extend vector extract element + + // Vector move immediate and move negated immediate: + VMOVIMM, + VMVNIMM, + + // Vector duplicate: + VDUP, + VDUPLANE, + + // Vector shuffles: + VEXT, // extract + VREV64, // reverse elements within 64-bit doublewords + VREV32, // reverse elements within 32-bit words + VREV16, // reverse elements within 16-bit halfwords + VZIP, // zip (interleave) + VUZP, // unzip (deinterleave) + VTRN, // transpose + VTBL1, // 1-register shuffle with mask + VTBL2, // 2-register shuffle with mask + + // Vector multiply long: + VMULLs, // ...signed + VMULLu, // ...unsigned + + // Operands of the standard BUILD_VECTOR node are not legalized, which + // is fine if BUILD_VECTORs are always lowered to shuffles or other + // operations, but for ARM some BUILD_VECTORs are legal as-is and their + // operands need to be legalized. Define an ARM-specific version of + // BUILD_VECTOR for this purpose. + BUILD_VECTOR, + + // Floating-point max and min: + FMAX, + FMIN, + + // Bit-field insert + BFI, + + // Vector OR with immediate + VORRIMM, + // Vector AND with NOT of immediate + VBICIMM, + + // Vector bitwise select + VBSL, + + // Vector load N-element structure to all lanes: + VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, + VLD3DUP, + VLD4DUP, + + // NEON loads with post-increment base updates: + VLD1_UPD, + VLD2_UPD, + VLD3_UPD, + VLD4_UPD, + VLD2LN_UPD, + VLD3LN_UPD, + VLD4LN_UPD, + VLD2DUP_UPD, + VLD3DUP_UPD, + VLD4DUP_UPD, + + // NEON stores with post-increment base updates: + VST1_UPD, + VST2_UPD, + VST3_UPD, + VST4_UPD, + VST2LN_UPD, + VST3LN_UPD, + VST4LN_UPD, + + // 64-bit atomic ops (value split into two registers) + ATOMADD64_DAG, + ATOMSUB64_DAG, + ATOMOR64_DAG, + ATOMXOR64_DAG, + ATOMAND64_DAG, + ATOMNAND64_DAG, + ATOMSWAP64_DAG, + ATOMCMPXCHG64_DAG + }; + } + + /// Define some predicates that are used for node matching. + namespace ARM { + bool isBitFieldInvertedMask(unsigned v); + } + + //===--------------------------------------------------------------------===// + // ARMTargetLowering - ARM Implementation of the TargetLowering interface + + class ARMTargetLowering : public TargetLowering { + public: + explicit ARMTargetLowering(TargetMachine &TM); + + virtual unsigned getJumpTableEncoding(void) const; + + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + + /// ReplaceNodeResults - Replace the results of node with an illegal result + /// type with new values built out of custom code. + /// + virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const; + + virtual const char *getTargetNodeName(unsigned Opcode) const; + + /// getSetCCResultType - Return the value type to use for ISD::SETCC. + virtual EVT getSetCCResultType(EVT VT) const; + + virtual MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + virtual void + AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const; + + SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const; + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const; + + /// allowsUnalignedMemoryAccesses - Returns true if the target allows + /// unaligned memory accesses. of the specified type. + /// FIXME: Add getOptimalMemOpType to implement memcpy with NEON? + virtual bool allowsUnalignedMemoryAccesses(EVT VT) const; + + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. + virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const; + bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const; + + /// isLegalICmpImmediate - Return true if the specified immediate is legal + /// icmp immediate, that is the target has icmp instructions which can + /// compare a register against the immediate without having to materialize + /// the immediate into a register. + virtual bool isLegalICmpImmediate(int64_t Imm) const; + + /// isLegalAddImmediate - Return true if the specified immediate is legal + /// add immediate, that is the target has add instructions which can + /// add a register and the immediate without having to materialize + /// the immediate into a register. + virtual bool isLegalAddImmediate(int64_t Imm) const; + + /// getPreIndexedAddressParts - returns true by value, base pointer and + /// offset pointer and addressing mode by reference if the node's address + /// can be legally represented as pre-indexed load / store address. + virtual bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const; + + /// getPostIndexedAddressParts - returns true by value, base pointer and + /// offset pointer and addressing mode by reference if this node can be + /// combined with a load / store to form a post-indexed load / store. + virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, + SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const; + + virtual void computeMaskedBitsForTargetNode(const SDValue Op, + const APInt &Mask, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const; + + + virtual bool ExpandInlineAsm(CallInst *CI) const; + + ConstraintType getConstraintType(const std::string &Constraint) const; + + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const; + + std::pair<unsigned, const TargetRegisterClass*> + getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const; + + /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops + /// vector. If it is invalid, don't add anything to Ops. If hasMemory is + /// true it means one of the asm constraint of the inline asm instruction + /// being processed is 'm'. + virtual void LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const; + + const ARMSubtarget* getSubtarget() const { + return Subtarget; + } + + /// getRegClassFor - Return the register class that should be used for the + /// specified value type. + virtual TargetRegisterClass *getRegClassFor(EVT VT) const; + + /// getMaximalGlobalOffset - Returns the maximal possible offset which can + /// be used for loads / stores from the global. + virtual unsigned getMaximalGlobalOffset() const; + + /// createFastISel - This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. + virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const; + + Sched::Preference getSchedulingPreference(SDNode *N) const; + + bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; + + /// isFPImmLegal - Returns true if the target can instruction select the + /// specified FP immediate natively. If false, the legalizer will + /// materialize the FP immediate as a load from a constant pool. + virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + + virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const; + protected: + std::pair<const TargetRegisterClass*, uint8_t> + findRepresentativeClass(EVT VT) const; + + private: + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. + const ARMSubtarget *Subtarget; + + const TargetRegisterInfo *RegInfo; + + const InstrItineraryData *Itins; + + /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created. + /// + unsigned ARMPCLabelIndex; + + void addTypeForNEON(EVT VT, EVT PromotedLdStVT, EVT PromotedBitwiseVT); + void addDRTypeForNEON(EVT VT); + void addQRTypeForNEON(EVT VT); + + typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector; + void PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, + SDValue Chain, SDValue &Arg, + RegsToPassVector &RegsToPass, + CCValAssign &VA, CCValAssign &NextVA, + SDValue &StackPtr, + SmallVector<SDValue, 8> &MemOpChains, + ISD::ArgFlagsTy Flags) const; + SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, + SDValue &Root, SelectionDAG &DAG, + DebugLoc dl) const; + + CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, + bool isVarArg) const; + SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const; + SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, + SelectionDAG &DAG) const; + SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, + SelectionDAG &DAG) const; + SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const; + + SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + virtual SDValue + LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc dl, SDValue &Chain, unsigned ArgOffset) + const; + + void computeRegArea(CCState &CCInfo, MachineFunction &MF, + unsigned &VARegSize, unsigned &VARegSaveSize) const; + + virtual SDValue + LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool &isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + /// HandleByVal - Target-specific cleanup for ByVal support. + virtual void HandleByVal(CCState *, unsigned &) const; + + /// IsEligibleForTailCallOptimization - Check whether the call is eligible + /// for tail call optimization. Targets which want to do tail call + /// optimization should implement this function. + bool IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const; + virtual SDValue + LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const; + + virtual bool isUsedByReturnOnly(SDNode *N) const; + + virtual bool mayBeEmittedAsTailCall(CallInst *CI) const; + + SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDValue &ARMcc, SelectionDAG &DAG, DebugLoc dl) const; + SDValue getVFPCmp(SDValue LHS, SDValue RHS, + SelectionDAG &DAG, DebugLoc dl) const; + SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const; + + SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const; + + MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size) const; + MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size, + unsigned BinOpcode) const; + MachineBasicBlock *EmitAtomicBinary64(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Op1, + unsigned Op2, + bool NeedsCarry = false, + bool IsCmpxchg = false) const; + MachineBasicBlock * EmitAtomicBinaryMinMax(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size, + bool signExtend, + ARMCC::CondCodes Cond) const; + + void EmitBasePointerRecalculation(MachineInstr *MI, MachineBasicBlock *MBB, + MachineBasicBlock *DispatchBB) const; + + void SetupEntryBlockForSjLj(MachineInstr *MI, + MachineBasicBlock *MBB, + MachineBasicBlock *DispatchBB, int FI) const; + + MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const; + }; + + enum NEONModImmType { + VMOVModImm, + VMVNModImm, + OtherModImm + }; + + + namespace ARM { + FastISel *createFastISel(FunctionLoweringInfo &funcInfo); + } +} + +#endif // ARMISELLOWERING_H diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td new file mode 100644 index 0000000..7cbc911 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -0,0 +1,1965 @@ +//===- ARMInstrFormats.td - ARM Instruction Formats ----------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// ARM Instruction Format Definitions. +// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +class Format<bits<6> val> { + bits<6> Value = val; +} + +def Pseudo : Format<0>; +def MulFrm : Format<1>; +def BrFrm : Format<2>; +def BrMiscFrm : Format<3>; + +def DPFrm : Format<4>; +def DPSoRegRegFrm : Format<5>; + +def LdFrm : Format<6>; +def StFrm : Format<7>; +def LdMiscFrm : Format<8>; +def StMiscFrm : Format<9>; +def LdStMulFrm : Format<10>; + +def LdStExFrm : Format<11>; + +def ArithMiscFrm : Format<12>; +def SatFrm : Format<13>; +def ExtFrm : Format<14>; + +def VFPUnaryFrm : Format<15>; +def VFPBinaryFrm : Format<16>; +def VFPConv1Frm : Format<17>; +def VFPConv2Frm : Format<18>; +def VFPConv3Frm : Format<19>; +def VFPConv4Frm : Format<20>; +def VFPConv5Frm : Format<21>; +def VFPLdStFrm : Format<22>; +def VFPLdStMulFrm : Format<23>; +def VFPMiscFrm : Format<24>; + +def ThumbFrm : Format<25>; +def MiscFrm : Format<26>; + +def NGetLnFrm : Format<27>; +def NSetLnFrm : Format<28>; +def NDupFrm : Format<29>; +def NLdStFrm : Format<30>; +def N1RegModImmFrm: Format<31>; +def N2RegFrm : Format<32>; +def NVCVTFrm : Format<33>; +def NVDupLnFrm : Format<34>; +def N2RegVShLFrm : Format<35>; +def N2RegVShRFrm : Format<36>; +def N3RegFrm : Format<37>; +def N3RegVShFrm : Format<38>; +def NVExtFrm : Format<39>; +def NVMulSLFrm : Format<40>; +def NVTBLFrm : Format<41>; +def DPSoRegImmFrm : Format<42>; + +// Misc flags. + +// The instruction has an Rn register operand. +// UnaryDP - Indicates this is a unary data processing instruction, i.e. +// it doesn't have a Rn operand. +class UnaryDP { bit isUnaryDataProc = 1; } + +// Xform16Bit - Indicates this Thumb2 instruction may be transformed into +// a 16-bit Thumb instruction if certain conditions are met. +class Xform16Bit { bit canXformTo16Bit = 1; } + +//===----------------------------------------------------------------------===// +// ARM Instruction flags. These need to match ARMBaseInstrInfo.h. +// + +// FIXME: Once the JIT is MC-ized, these can go away. +// Addressing mode. +class AddrMode<bits<5> val> { + bits<5> Value = val; +} +def AddrModeNone : AddrMode<0>; +def AddrMode1 : AddrMode<1>; +def AddrMode2 : AddrMode<2>; +def AddrMode3 : AddrMode<3>; +def AddrMode4 : AddrMode<4>; +def AddrMode5 : AddrMode<5>; +def AddrMode6 : AddrMode<6>; +def AddrModeT1_1 : AddrMode<7>; +def AddrModeT1_2 : AddrMode<8>; +def AddrModeT1_4 : AddrMode<9>; +def AddrModeT1_s : AddrMode<10>; +def AddrModeT2_i12 : AddrMode<11>; +def AddrModeT2_i8 : AddrMode<12>; +def AddrModeT2_so : AddrMode<13>; +def AddrModeT2_pc : AddrMode<14>; +def AddrModeT2_i8s4 : AddrMode<15>; +def AddrMode_i12 : AddrMode<16>; + +// Load / store index mode. +class IndexMode<bits<2> val> { + bits<2> Value = val; +} +def IndexModeNone : IndexMode<0>; +def IndexModePre : IndexMode<1>; +def IndexModePost : IndexMode<2>; +def IndexModeUpd : IndexMode<3>; + +// Instruction execution domain. +class Domain<bits<3> val> { + bits<3> Value = val; +} +def GenericDomain : Domain<0>; +def VFPDomain : Domain<1>; // Instructions in VFP domain only +def NeonDomain : Domain<2>; // Instructions in Neon domain only +def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains +def VFPNeonA8Domain : Domain<5>; // Instructions in VFP & Neon under A8 + +//===----------------------------------------------------------------------===// +// ARM special operands. +// + +// ARM imod and iflag operands, used only by the CPS instruction. +def imod_op : Operand<i32> { + let PrintMethod = "printCPSIMod"; +} + +def ProcIFlagsOperand : AsmOperandClass { + let Name = "ProcIFlags"; + let ParserMethod = "parseProcIFlagsOperand"; +} +def iflags_op : Operand<i32> { + let PrintMethod = "printCPSIFlag"; + let ParserMatchClass = ProcIFlagsOperand; +} + +// ARM Predicate operand. Default to 14 = always (AL). Second part is CC +// register whose default is 0 (no register). +def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; } +def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm), + (ops (i32 14), (i32 zero_reg))> { + let PrintMethod = "printPredicateOperand"; + let ParserMatchClass = CondCodeOperand; + let DecoderMethod = "DecodePredicateOperand"; +} + +// Conditional code result for instructions whose 's' bit is set, e.g. subs. +def CCOutOperand : AsmOperandClass { let Name = "CCOut"; } +def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> { + let EncoderMethod = "getCCOutOpValue"; + let PrintMethod = "printSBitModifierOperand"; + let ParserMatchClass = CCOutOperand; + let DecoderMethod = "DecodeCCOutOperand"; +} + +// Same as cc_out except it defaults to setting CPSR. +def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> { + let EncoderMethod = "getCCOutOpValue"; + let PrintMethod = "printSBitModifierOperand"; + let ParserMatchClass = CCOutOperand; + let DecoderMethod = "DecodeCCOutOperand"; +} + +// ARM special operands for disassembly only. +// +def SetEndAsmOperand : AsmOperandClass { + let Name = "SetEndImm"; + let ParserMethod = "parseSetEndImm"; +} +def setend_op : Operand<i32> { + let PrintMethod = "printSetendOperand"; + let ParserMatchClass = SetEndAsmOperand; +} + +def MSRMaskOperand : AsmOperandClass { + let Name = "MSRMask"; + let ParserMethod = "parseMSRMaskOperand"; +} +def msr_mask : Operand<i32> { + let PrintMethod = "printMSRMaskOperand"; + let DecoderMethod = "DecodeMSRMask"; + let ParserMatchClass = MSRMaskOperand; +} + +// Shift Right Immediate - A shift right immediate is encoded differently from +// other shift immediates. The imm6 field is encoded like so: +// +// Offset Encoding +// 8 imm6<5:3> = '001', 8 - <imm> is encoded in imm6<2:0> +// 16 imm6<5:4> = '01', 16 - <imm> is encoded in imm6<3:0> +// 32 imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0> +// 64 64 - <imm> is encoded in imm6<5:0> +def shr_imm8 : Operand<i32> { + let EncoderMethod = "getShiftRight8Imm"; + let DecoderMethod = "DecodeShiftRight8Imm"; +} +def shr_imm16 : Operand<i32> { + let EncoderMethod = "getShiftRight16Imm"; + let DecoderMethod = "DecodeShiftRight16Imm"; +} +def shr_imm32 : Operand<i32> { + let EncoderMethod = "getShiftRight32Imm"; + let DecoderMethod = "DecodeShiftRight32Imm"; +} +def shr_imm64 : Operand<i32> { + let EncoderMethod = "getShiftRight64Imm"; + let DecoderMethod = "DecodeShiftRight64Imm"; +} + +//===----------------------------------------------------------------------===// +// ARM Assembler alias templates. +// +class ARMInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[IsARM]>; +class tInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[IsThumb]>; +class t2InstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[IsThumb2]>; +class VFP2InstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2]>; +class VFP3InstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, Requires<[HasVFP3]>; + +//===----------------------------------------------------------------------===// +// ARM Instruction templates. +// + + +class InstTemplate<AddrMode am, int sz, IndexMode im, + Format f, Domain d, string cstr, InstrItinClass itin> + : Instruction { + let Namespace = "ARM"; + + AddrMode AM = am; + int Size = sz; + IndexMode IM = im; + bits<2> IndexModeBits = IM.Value; + Format F = f; + bits<6> Form = F.Value; + Domain D = d; + bit isUnaryDataProc = 0; + bit canXformTo16Bit = 0; + // The instruction is a 16-bit flag setting Thumb instruction. Used + // by the parser to determine whether to require the 'S' suffix on the + // mnemonic (when not in an IT block) or preclude it (when in an IT block). + bit thumbArithFlagSetting = 0; + + // If this is a pseudo instruction, mark it isCodeGenOnly. + let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo"); + + // The layout of TSFlags should be kept in sync with ARMBaseInfo.h. + let TSFlags{4-0} = AM.Value; + let TSFlags{6-5} = IndexModeBits; + let TSFlags{12-7} = Form; + let TSFlags{13} = isUnaryDataProc; + let TSFlags{14} = canXformTo16Bit; + let TSFlags{17-15} = D.Value; + let TSFlags{18} = thumbArithFlagSetting; + + let Constraints = cstr; + let Itinerary = itin; +} + +class Encoding { + field bits<32> Inst; +} + +class InstARM<AddrMode am, int sz, IndexMode im, + Format f, Domain d, string cstr, InstrItinClass itin> + : InstTemplate<am, sz, im, f, d, cstr, itin>, Encoding { + let DecoderNamespace = "ARM"; +} + +// This Encoding-less class is used by Thumb1 to specify the encoding bits later +// on by adding flavors to specific instructions. +class InstThumb<AddrMode am, int sz, IndexMode im, + Format f, Domain d, string cstr, InstrItinClass itin> + : InstTemplate<am, sz, im, f, d, cstr, itin> { + let DecoderNamespace = "Thumb"; +} + +class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern> + : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, + GenericDomain, "", itin> { + let OutOperandList = oops; + let InOperandList = iops; + let Pattern = pattern; + let isCodeGenOnly = 1; + let isPseudo = 1; +} + +// PseudoInst that's ARM-mode only. +class ARMPseudoInst<dag oops, dag iops, int sz, InstrItinClass itin, + list<dag> pattern> + : PseudoInst<oops, iops, itin, pattern> { + let Size = sz; + list<Predicate> Predicates = [IsARM]; +} + +// PseudoInst that's Thumb-mode only. +class tPseudoInst<dag oops, dag iops, int sz, InstrItinClass itin, + list<dag> pattern> + : PseudoInst<oops, iops, itin, pattern> { + let Size = sz; + list<Predicate> Predicates = [IsThumb]; +} + +// PseudoInst that's Thumb2-mode only. +class t2PseudoInst<dag oops, dag iops, int sz, InstrItinClass itin, + list<dag> pattern> + : PseudoInst<oops, iops, itin, pattern> { + let Size = sz; + list<Predicate> Predicates = [IsThumb2]; +} + +class ARMPseudoExpand<dag oops, dag iops, int sz, + InstrItinClass itin, list<dag> pattern, + dag Result> + : ARMPseudoInst<oops, iops, sz, itin, pattern>, + PseudoInstExpansion<Result>; + +class tPseudoExpand<dag oops, dag iops, int sz, + InstrItinClass itin, list<dag> pattern, + dag Result> + : tPseudoInst<oops, iops, sz, itin, pattern>, + PseudoInstExpansion<Result>; + +class t2PseudoExpand<dag oops, dag iops, int sz, + InstrItinClass itin, list<dag> pattern, + dag Result> + : t2PseudoInst<oops, iops, sz, itin, pattern>, + PseudoInstExpansion<Result>; + +// Almost all ARM instructions are predicable. +class I<dag oops, dag iops, AddrMode am, int sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + bits<4> p; + let Inst{31-28} = p; + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsARM]; +} + +// A few are not predicable +class InoP<dag oops, dag iops, AddrMode am, int sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = !strconcat(opc, asm); + let Pattern = pattern; + let isPredicable = 0; + list<Predicate> Predicates = [IsARM]; +} + +// Same as I except it can optionally modify CPSR. Note it's modeled as an input +// operand since by default it's a zero register. It will become an implicit def +// once it's "flipped". +class sI<dag oops, dag iops, AddrMode am, int sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + bits<4> p; // Predicate operand + bits<1> s; // condition-code set flag ('1' if the insn should set the flags) + let Inst{31-28} = p; + let Inst{20} = s; + + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p, cc_out:$s)); + let AsmString = !strconcat(opc, "${s}${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsARM]; +} + +// Special cases +class XI<dag oops, dag iops, AddrMode am, int sz, + IndexMode im, Format f, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsARM]; +} + +class AI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin, + opc, asm, "", pattern>; +class AsI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : sI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin, + opc, asm, "", pattern>; +class AXI<dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin, + asm, "", pattern>; +class AInoP<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : InoP<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin, + opc, asm, "", pattern>; + +// Ctrl flow instructions +class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, BrFrm, itin, + opc, asm, "", pattern> { + let Inst{27-24} = opcod; +} +class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrModeNone, 4, IndexModeNone, BrFrm, itin, + asm, "", pattern> { + let Inst{27-24} = opcod; +} + +// BR_JT instructions +class JTI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrModeNone, 0, IndexModeNone, BrMiscFrm, itin, + asm, "", pattern>; + +// Atomic load/store instructions +class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin, + opc, asm, "", pattern> { + bits<4> Rt; + bits<4> addr; + let Inst{27-23} = 0b00011; + let Inst{22-21} = opcod; + let Inst{20} = 1; + let Inst{19-16} = addr; + let Inst{15-12} = Rt; + let Inst{11-0} = 0b111110011111; +} +class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin, + opc, asm, "", pattern> { + bits<4> Rd; + bits<4> Rt; + bits<4> addr; + let Inst{27-23} = 0b00011; + let Inst{22-21} = opcod; + let Inst{20} = 0; + let Inst{19-16} = addr; + let Inst{15-12} = Rd; + let Inst{11-4} = 0b11111001; + let Inst{3-0} = Rt; +} +class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern> + : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, $addr", pattern> { + bits<4> Rt; + bits<4> Rt2; + bits<4> addr; + let Inst{27-23} = 0b00010; + let Inst{22} = b; + let Inst{21-20} = 0b00; + let Inst{19-16} = addr; + let Inst{15-12} = Rt; + let Inst{11-4} = 0b00001001; + let Inst{3-0} = Rt2; +} + +// addrmode1 instructions +class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode1, 4, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{24-21} = opcod; + let Inst{27-26} = 0b00; +} +class AsI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : sI<oops, iops, AddrMode1, 4, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let Inst{24-21} = opcod; + let Inst{27-26} = 0b00; +} +class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, + string asm, list<dag> pattern> + : XI<oops, iops, AddrMode1, 4, IndexModeNone, f, itin, + asm, "", pattern> { + let Inst{24-21} = opcod; + let Inst{27-26} = 0b00; +} + +// loads + +// LDR/LDRB/STR/STRB/... +class AI2ldst<bits<3> op, bit isLd, bit isByte, dag oops, dag iops, AddrMode am, + Format f, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : I<oops, iops, am, 4, IndexModeNone, f, itin, opc, asm, + "", pattern> { + let Inst{27-25} = op; + let Inst{24} = 1; // 24 == P + // 23 == U + let Inst{22} = isByte; + let Inst{21} = 0; // 21 == W + let Inst{20} = isLd; +} +// Indexed load/stores +class AI2ldstidx<bit isLd, bit isByte, bit isPre, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode2, 4, im, f, itin, + opc, asm, cstr, pattern> { + bits<4> Rt; + let Inst{27-26} = 0b01; + let Inst{24} = isPre; // P bit + let Inst{22} = isByte; // B bit + let Inst{21} = isPre; // W bit + let Inst{20} = isLd; // L bit + let Inst{15-12} = Rt; +} +class AI2stridx_reg<bit isByte, bit isPre, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr, + pattern> { + // AM2 store w/ two operands: (GPR, am2offset) + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> Rn; + let Inst{25} = 1; + let Inst{23} = offset{12}; + let Inst{19-16} = Rn; + let Inst{11-5} = offset{11-5}; + let Inst{4} = 0; + let Inst{3-0} = offset{3-0}; +} + +class AI2stridx_imm<bit isByte, bit isPre, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr, + pattern> { + // AM2 store w/ two operands: (GPR, am2offset) + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> Rn; + let Inst{25} = 0; + let Inst{23} = offset{12}; + let Inst{19-16} = Rn; + let Inst{11-0} = offset{11-0}; +} + + +// FIXME: Merge with the above class when addrmode2 gets used for STR, STRB +// but for now use this class for STRT and STRBT. +class AI2stridxT<bit isByte, bit isPre, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr, + pattern> { + // AM2 store w/ two operands: (GPR, am2offset) + // {17-14} Rn + // {13} 1 == Rm, 0 == imm12 + // {12} isAdd + // {11-0} imm12/Rm + bits<18> addr; + let Inst{25} = addr{13}; + let Inst{23} = addr{12}; + let Inst{19-16} = addr{17-14}; + let Inst{11-0} = addr{11-0}; +} + +// addrmode3 instructions +class AI3ld<bits<4> op, bit op20, dag oops, dag iops, Format f, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, 4, IndexModeNone, f, itin, + opc, asm, "", pattern> { + bits<14> addr; + bits<4> Rt; + let Inst{27-25} = 0b000; + let Inst{24} = 1; // P bit + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{21} = 0; // W bit + let Inst{20} = op20; // L bit + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Rt; // Rt + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{7-4} = op; + let Inst{3-0} = addr{3-0}; // imm3_0/Rm + + let DecoderMethod = "DecodeAddrMode3Instruction"; +} + +class AI3ldstidx<bits<4> op, bit op20, bit isPre, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, 4, im, f, itin, + opc, asm, cstr, pattern> { + bits<4> Rt; + let Inst{27-25} = 0b000; + let Inst{24} = isPre; // P bit + let Inst{21} = isPre; // W bit + let Inst{20} = op20; // L bit + let Inst{15-12} = Rt; // Rt + let Inst{7-4} = op; +} + +// FIXME: Merge with the above class when addrmode2 gets used for LDR, LDRB +// but for now use this class for LDRSBT, LDRHT, LDSHT. +class AI3ldstidxT<bits<4> op, bit isLoad, dag oops, dag iops, + IndexMode im, Format f, InstrItinClass itin, string opc, + string asm, string cstr, list<dag> pattern> + : I<oops, iops, AddrMode3, 4, im, f, itin, opc, asm, cstr, pattern> { + // {13} 1 == imm8, 0 == Rm + // {12-9} Rn + // {8} isAdd + // {7-4} imm7_4/zero + // {3-0} imm3_0/Rm + bits<4> addr; + bits<4> Rt; + let Inst{27-25} = 0b000; + let Inst{24} = 0; // P bit + let Inst{21} = 1; + let Inst{20} = isLoad; // L bit + let Inst{19-16} = addr; // Rn + let Inst{15-12} = Rt; // Rt + let Inst{7-4} = op; +} + +// stores +class AI3str<bits<4> op, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrMode3, 4, IndexModeNone, f, itin, + opc, asm, "", pattern> { + bits<14> addr; + bits<4> Rt; + let Inst{27-25} = 0b000; + let Inst{24} = 1; // P bit + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{21} = 0; // W bit + let Inst{20} = 0; // L bit + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Rt; // Rt + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{7-4} = op; + let Inst{3-0} = addr{3-0}; // imm3_0/Rm + let DecoderMethod = "DecodeAddrMode3Instruction"; +} + +// addrmode4 instructions +class AXI4<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : XI<oops, iops, AddrMode4, 4, im, f, itin, asm, cstr, pattern> { + bits<4> p; + bits<16> regs; + bits<4> Rn; + let Inst{31-28} = p; + let Inst{27-25} = 0b100; + let Inst{22} = 0; // S bit + let Inst{19-16} = Rn; + let Inst{15-0} = regs; +} + +// Unsigned multiply, multiply-accumulate instructions. +class AMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin, + opc, asm, "", pattern> { + let Inst{7-4} = 0b1001; + let Inst{20} = 0; // S bit + let Inst{27-21} = opcod; +} +class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : sI<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin, + opc, asm, "", pattern> { + let Inst{7-4} = 0b1001; + let Inst{27-21} = opcod; +} + +// Most significant word multiply +class AMul2I<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin, + opc, asm, "", pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{7-4} = opc7_4; + let Inst{20} = 1; + let Inst{27-21} = opcod; + let Inst{19-16} = Rd; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} +// MSW multiple w/ Ra operand +class AMul2Ia<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AMul2I<opcod, opc7_4, oops, iops, itin, opc, asm, pattern> { + bits<4> Ra; + let Inst{15-12} = Ra; +} + +// SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y> +class AMulxyIbase<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin, + opc, asm, "", pattern> { + bits<4> Rn; + bits<4> Rm; + let Inst{4} = 0; + let Inst{7} = 1; + let Inst{20} = 0; + let Inst{27-21} = opcod; + let Inst{6-5} = bit6_5; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} +class AMulxyI<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + let Inst{19-16} = Rd; +} + +// AMulxyI with Ra operand +class AMulxyIa<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AMulxyI<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> { + bits<4> Ra; + let Inst{15-12} = Ra; +} +// SMLAL* +class AMulxyI64<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> { + bits<4> RdLo; + bits<4> RdHi; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; +} + +// Extend instructions. +class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, ExtFrm, itin, + opc, asm, "", pattern> { + // All AExtI instructions have Rd and Rm register operands. + bits<4> Rd; + bits<4> Rm; + let Inst{15-12} = Rd; + let Inst{3-0} = Rm; + let Inst{7-4} = 0b0111; + let Inst{9-8} = 0b00; + let Inst{27-20} = opcod; +} + +// Misc Arithmetic instructions. +class AMiscA1I<bits<8> opcod, bits<4> opc7_4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin, + opc, asm, "", pattern> { + bits<4> Rd; + bits<4> Rm; + let Inst{27-20} = opcod; + let Inst{19-16} = 0b1111; + let Inst{15-12} = Rd; + let Inst{11-8} = 0b1111; + let Inst{7-4} = opc7_4; + let Inst{3-0} = Rm; +} + +// PKH instructions +def PKHLSLAsmOperand : AsmOperandClass { + let Name = "PKHLSLImm"; + let ParserMethod = "parsePKHLSLImm"; +} +def pkh_lsl_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 32; }]>{ + let PrintMethod = "printPKHLSLShiftImm"; + let ParserMatchClass = PKHLSLAsmOperand; +} +def PKHASRAsmOperand : AsmOperandClass { + let Name = "PKHASRImm"; + let ParserMethod = "parsePKHASRImm"; +} +def pkh_asr_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]>{ + let PrintMethod = "printPKHASRShiftImm"; + let ParserMatchClass = PKHASRAsmOperand; +} + +class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin, + opc, asm, "", pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + bits<5> sh; + let Inst{27-20} = opcod; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-7} = sh; + let Inst{6} = tb; + let Inst{5-4} = 0b01; + let Inst{3-0} = Rm; +} + +//===----------------------------------------------------------------------===// + +// ARMPat - Same as Pat<>, but requires that the compiler be in ARM mode. +class ARMPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM]; +} +class ARMV5TPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM, HasV5T]; +} +class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM, HasV5TE]; +} +class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM, HasV6]; +} + +//===----------------------------------------------------------------------===// +// Thumb Instruction Format Definitions. +// + +class ThumbI<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, string asm, string cstr, list<dag> pattern> + : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb]; +} + +// TI - Thumb instruction. +class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, 2, itin, asm, "", pattern>; + +// Two-address instructions +class TIt<dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, 2, itin, asm, "$lhs = $dst", + pattern>; + +// tBL, tBX 32-bit instructions +class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3, + dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>, + Encoding { + let Inst{31-27} = opcod1; + let Inst{15-14} = opcod2; + let Inst{12} = opcod3; +} + +// BR_JT instructions +class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>; + +// Thumb1 only +class Thumb1I<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, string asm, string cstr, list<dag> pattern> + : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; +} + +class T1I<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, 2, itin, asm, "", pattern>; +class T1Ix2<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>; + +// Two-address instructions +class T1It<dag oops, dag iops, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : Thumb1I<oops, iops, AddrModeNone, 2, itin, + asm, cstr, pattern>; + +// Thumb1 instruction that can either be predicated or set CPSR. +class Thumb1sI<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = !con(oops, (outs s_cc_out:$s)); + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${s}${p}", asm); + let Pattern = pattern; + let thumbArithFlagSetting = 1; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; + let DecoderNamespace = "ThumbSBit"; +} + +class T1sI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1sI<oops, iops, AddrModeNone, 2, itin, opc, asm, "", pattern>; + +// Two-address instructions +class T1sIt<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1sI<oops, iops, AddrModeNone, 2, itin, opc, asm, + "$Rn = $Rdn", pattern>; + +// Thumb1 instruction that can be predicated. +class Thumb1pI<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; +} + +class T1pI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeNone, 2, itin, opc, asm, "", pattern>; + +// Two-address instructions +class T1pIt<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeNone, 2, itin, opc, asm, + "$Rn = $Rdn", pattern>; + +class T1pIs<dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : Thumb1pI<oops, iops, AddrModeT1_s, 2, itin, opc, asm, "", pattern>; + +class Encoding16 : Encoding { + let Inst{31-16} = 0x0000; +} + +// A6.2 16-bit Thumb instruction encoding +class T1Encoding<bits<6> opcode> : Encoding16 { + let Inst{15-10} = opcode; +} + +// A6.2.1 Shift (immediate), add, subtract, move, and compare encoding. +class T1General<bits<5> opcode> : Encoding16 { + let Inst{15-14} = 0b00; + let Inst{13-9} = opcode; +} + +// A6.2.2 Data-processing encoding. +class T1DataProcessing<bits<4> opcode> : Encoding16 { + let Inst{15-10} = 0b010000; + let Inst{9-6} = opcode; +} + +// A6.2.3 Special data instructions and branch and exchange encoding. +class T1Special<bits<4> opcode> : Encoding16 { + let Inst{15-10} = 0b010001; + let Inst{9-6} = opcode; +} + +// A6.2.4 Load/store single data item encoding. +class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 { + let Inst{15-12} = opA; + let Inst{11-9} = opB; +} +class T1LdStSP<bits<3> opB> : T1LoadStore<0b1001, opB>; // SP relative + +class T1BranchCond<bits<4> opcode> : Encoding16 { + let Inst{15-12} = opcode; +} + +// Helper classes to encode Thumb1 loads and stores. For immediates, the +// following bits are used for "opA" (see A6.2.4): +// +// 0b0110 => Immediate, 4 bytes +// 0b1000 => Immediate, 2 bytes +// 0b0111 => Immediate, 1 byte +class T1pILdStEncode<bits<3> opcode, dag oops, dag iops, AddrMode am, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : Thumb1pI<oops, iops, am, 2, itin, opc, asm, "", pattern>, + T1LoadStore<0b0101, opcode> { + bits<3> Rt; + bits<8> addr; + let Inst{8-6} = addr{5-3}; // Rm + let Inst{5-3} = addr{2-0}; // Rn + let Inst{2-0} = Rt; +} +class T1pILdStEncodeImm<bits<4> opA, bit opB, dag oops, dag iops, AddrMode am, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : Thumb1pI<oops, iops, am, 2, itin, opc, asm, "", pattern>, + T1LoadStore<opA, {opB,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-6} = addr{7-3}; // imm5 + let Inst{5-3} = addr{2-0}; // Rn + let Inst{2-0} = Rt; +} + +// A6.2.5 Miscellaneous 16-bit instructions encoding. +class T1Misc<bits<7> opcode> : Encoding16 { + let Inst{15-12} = 0b1011; + let Inst{11-5} = opcode; +} + +// Thumb2I - Thumb2 instruction. Almost all Thumb2 instructions are predicable. +class Thumb2I<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; + let DecoderNamespace = "Thumb2"; +} + +// Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as an +// input operand since by default it's a zero register. It will become an +// implicit def once it's "flipped". +// +// FIXME: This uses unified syntax so {s} comes before {p}. We should make it +// more consistent. +class Thumb2sI<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + bits<1> s; // condition-code set flag ('1' if the insn should set the flags) + let Inst{20} = s; + + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p, cc_out:$s)); + let AsmString = !strconcat(opc, "${s}${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; + let DecoderNamespace = "Thumb2"; +} + +// Special cases +class Thumb2XI<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; + let DecoderNamespace = "Thumb2"; +} + +class ThumbXI<dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; + let DecoderNamespace = "Thumb"; +} + +class T2I<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>; +class T2Ii12<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i12, 4, itin, opc, asm, "",pattern>; +class T2Ii8<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8, 4, itin, opc, asm, "", pattern>; +class T2Iso<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_so, 4, itin, opc, asm, "", pattern>; +class T2Ipc<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_pc, 4, itin, opc, asm, "", pattern>; +class T2Ii8s4<bit P, bit W, bit isLoad, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr, + pattern> { + bits<4> Rt; + bits<4> Rt2; + bits<13> addr; + let Inst{31-25} = 0b1110100; + let Inst{24} = P; + let Inst{23} = addr{8}; + let Inst{22} = 1; + let Inst{21} = W; + let Inst{20} = isLoad; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = Rt{3-0}; + let Inst{11-8} = Rt2{3-0}; + let Inst{7-0} = addr{7-0}; +} +class T2Ii8s4post<bit P, bit W, bit isLoad, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern> + : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr, + pattern> { + bits<4> Rt; + bits<4> Rt2; + bits<4> addr; + bits<9> imm; + let Inst{31-25} = 0b1110100; + let Inst{24} = P; + let Inst{23} = imm{8}; + let Inst{22} = 1; + let Inst{21} = W; + let Inst{20} = isLoad; + let Inst{19-16} = addr; + let Inst{15-12} = Rt{3-0}; + let Inst{11-8} = Rt2{3-0}; + let Inst{7-0} = imm{7-0}; +} + +class T2sI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : Thumb2sI<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>; + +class T2XI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>; +class T2JTI<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : Thumb2XI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>; + +// Move to/from coprocessor instructions +class T2Cop<bits<4> opc, dag oops, dag iops, string asm, list<dag> pattern> + : T2XI <oops, iops, NoItinerary, asm, pattern>, Requires<[IsThumb2]> { + let Inst{31-28} = opc; +} + +// Two-address instructions +class T2XIt<dag oops, dag iops, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, cstr, pattern>; + +// T2Ipreldst - Thumb2 pre-indexed load / store instructions. +class T2Ipreldst<bit signed, bits<2> opcod, bit load, bit pre, + dag oops, dag iops, + AddrMode am, IndexMode im, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; + let DecoderNamespace = "Thumb2"; + + bits<4> Rt; + bits<13> addr; + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = opcod; + let Inst{20} = load; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = Rt{3-0}; + let Inst{11} = 1; + // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed + let Inst{10} = pre; // The P bit. + let Inst{9} = addr{8}; // Sign bit + let Inst{8} = 1; // The W bit. + let Inst{7-0} = addr{7-0}; + + let DecoderMethod = "DecodeT2LdStPre"; +} + +// T2Ipostldst - Thumb2 post-indexed load / store instructions. +class T2Ipostldst<bit signed, bits<2> opcod, bit load, bit pre, + dag oops, dag iops, + AddrMode am, IndexMode im, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", asm); + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb2]; + let DecoderNamespace = "Thumb2"; + + bits<4> Rt; + bits<4> Rn; + bits<9> offset; + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = opcod; + let Inst{20} = load; + let Inst{19-16} = Rn; + let Inst{15-12} = Rt{3-0}; + let Inst{11} = 1; + // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed + let Inst{10} = pre; // The P bit. + let Inst{9} = offset{8}; // Sign bit + let Inst{8} = 1; // The W bit. + let Inst{7-0} = offset{7-0}; + + let DecoderMethod = "DecodeT2LdStPre"; +} + +// Tv5Pat - Same as Pat<>, but requires V5T Thumb mode. +class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb, IsThumb1Only, HasV5T]; +} + +// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode. +class T1Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; +} + +// T2v6Pat - Same as Pat<>, but requires V6T2 Thumb2 mode. +class T2v6Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb2, HasV6T2]; +} + +// T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode. +class T2Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb2]; +} + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM VFP Instruction templates. +// + +// Almost all VFP instructions are predicable. +class VFPI<dag oops, dag iops, AddrMode am, int sz, + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, im, f, VFPDomain, cstr, itin> { + bits<4> p; + let Inst{31-28} = p; + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", asm); + let Pattern = pattern; + let PostEncoderMethod = "VFPThumb2PostEncoder"; + let DecoderNamespace = "VFP"; + list<Predicate> Predicates = [HasVFP2]; +} + +// Special cases +class VFPXI<dag oops, dag iops, AddrMode am, int sz, + IndexMode im, Format f, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : InstARM<am, sz, im, f, VFPDomain, cstr, itin> { + bits<4> p; + let Inst{31-28} = p; + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + let PostEncoderMethod = "VFPThumb2PostEncoder"; + let DecoderNamespace = "VFP"; + list<Predicate> Predicates = [HasVFP2]; +} + +class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin, + opc, asm, "", pattern> { + let PostEncoderMethod = "VFPThumb2PostEncoder"; +} + +// ARM VFP addrmode5 loads and stores +class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, + InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPI<oops, iops, AddrMode5, 4, IndexModeNone, + VFPLdStFrm, itin, opc, asm, "", pattern> { + // Instruction operands. + bits<5> Dd; + bits<13> addr; + + // Encode instruction operands. + let Inst{23} = addr{8}; // U (add = (U == '1')) + let Inst{22} = Dd{4}; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Dd{3-0}; + let Inst{7-0} = addr{7-0}; // imm8 + + // TODO: Mark the instructions with the appropriate subtarget info. + let Inst{27-24} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision + + // Loads & stores operate on both NEON and VFP pipelines. + let D = VFPNeonDomain; +} + +class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, + InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPI<oops, iops, AddrMode5, 4, IndexModeNone, + VFPLdStFrm, itin, opc, asm, "", pattern> { + // Instruction operands. + bits<5> Sd; + bits<13> addr; + + // Encode instruction operands. + let Inst{23} = addr{8}; // U (add = (U == '1')) + let Inst{22} = Sd{0}; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Sd{4-1}; + let Inst{7-0} = addr{7-0}; // imm8 + + // TODO: Mark the instructions with the appropriate subtarget info. + let Inst{27-24} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision + + // Loads & stores operate on both NEON and VFP pipelines. + let D = VFPNeonDomain; +} + +// VFP Load / store multiple pseudo instructions. +class PseudoVFPLdStM<dag oops, dag iops, InstrItinClass itin, string cstr, + list<dag> pattern> + : InstARM<AddrMode4, 4, IndexModeNone, Pseudo, VFPNeonDomain, + cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let Pattern = pattern; + list<Predicate> Predicates = [HasVFP2]; +} + +// Load / store multiple +class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : VFPXI<oops, iops, AddrMode4, 4, im, + VFPLdStMulFrm, itin, asm, cstr, pattern> { + // Instruction operands. + bits<4> Rn; + bits<13> regs; + + // Encode instruction operands. + let Inst{19-16} = Rn; + let Inst{22} = regs{12}; + let Inst{15-12} = regs{11-8}; + let Inst{7-0} = regs{7-0}; + + // TODO: Mark the instructions with the appropriate subtarget info. + let Inst{27-25} = 0b110; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision +} + +class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin, + string asm, string cstr, list<dag> pattern> + : VFPXI<oops, iops, AddrMode4, 4, im, + VFPLdStMulFrm, itin, asm, cstr, pattern> { + // Instruction operands. + bits<4> Rn; + bits<13> regs; + + // Encode instruction operands. + let Inst{19-16} = Rn; + let Inst{22} = regs{8}; + let Inst{15-12} = regs{12-9}; + let Inst{7-0} = regs{7-0}; + + // TODO: Mark the instructions with the appropriate subtarget info. + let Inst{27-25} = 0b110; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision +} + +// Double precision, unary +class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, + string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> { + // Instruction operands. + bits<5> Dd; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision + let Inst{7-6} = opcod4; + let Inst{4} = opcod5; +} + +// Double precision, binary +class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { + // Instruction operands. + bits<5> Dd; + bits<5> Dn; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{19-16} = Dn{3-0}; + let Inst{7} = Dn{4}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision + let Inst{6} = op6; + let Inst{4} = op4; +} + +// Single precision, unary +class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, + string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision + let Inst{7-6} = opcod4; + let Inst{4} = opcod5; +} + +// Single precision unary, if no NEON. Same as ASuI except not available if +// NEON is enabled. +class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, + bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, + string asm, list<dag> pattern> + : ASuI<opcod1, opcod2, opcod3, opcod4, opcod5, oops, iops, itin, opc, asm, + pattern> { + list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; +} + +// Single precision, binary +class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sn; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision + let Inst{6} = op6; + let Inst{4} = op4; +} + +// Single precision binary, if no NEON. Same as ASbI except not available if +// NEON is enabled. +class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> { + list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; + + // Instruction operands. + bits<5> Sd; + bits<5> Sn; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; +} + +// VFP conversion instructions +class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, + dag oops, dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : VFPAI<oops, iops, VFPConv1Frm, itin, opc, asm, pattern> { + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{19-16} = opcod3; + let Inst{11-8} = opcod4; + let Inst{6} = 1; + let Inst{4} = 0; +} + +// VFP conversion between floating-point and fixed-point +class AVConv1XI<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, + dag oops, dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1I<op1, op2, op3, op4, oops, iops, itin, opc, asm, pattern> { + // size (fixed-point number): sx == 0 ? 16 : 32 + let Inst{7} = op5; // sx +} + +// VFP conversion instructions, if no NEON +class AVConv1In<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; +} + +class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f, + InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : VFPAI<oops, iops, f, itin, opc, asm, pattern> { + let Inst{27-20} = opcod1; + let Inst{11-8} = opcod2; + let Inst{4} = 1; +} + +class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>; + +class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>; + +class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, itin, opc, asm, pattern>; + +class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, itin, opc, asm, pattern>; + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM NEON Instruction templates. +// + +class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, + InstrItinClass itin, string opc, string dt, string asm, string cstr, + list<dag> pattern> + : InstARM<am, 4, im, f, NeonDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; + let DecoderNamespace = "NEON"; +} + +// Same as NeonI except it does not have a "data type" specifier. +class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, 4, im, f, NeonDomain, cstr, itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", "\t", asm); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; + let DecoderNamespace = "NEON"; +} + +class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NeonI<oops, iops, AddrMode6, IndexModeNone, NLdStFrm, itin, opc, dt, asm, + cstr, pattern> { + let Inst{31-24} = 0b11110100; + let Inst{23} = op23; + let Inst{21-20} = op21_20; + let Inst{11-8} = op11_8; + let Inst{7-4} = op7_4; + + let PostEncoderMethod = "NEONThumb2LoadStorePostEncoder"; + let DecoderNamespace = "NEONLoadStore"; + + bits<5> Vd; + bits<6> Rn; + bits<4> Rm; + + let Inst{22} = Vd{4}; + let Inst{15-12} = Vd{3-0}; + let Inst{19-16} = Rn{3-0}; + let Inst{3-0} = Rm{3-0}; +} + +class NLdStLn<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NLdSt<op23, op21_20, op11_8, op7_4, oops, iops, itin, opc, + dt, asm, cstr, pattern> { + bits<3> lane; +} + +class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr> + : InstARM<AddrMode6, 4, IndexModeNone, Pseudo, NeonDomain, cstr, + itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + list<Predicate> Predicates = [HasNEON]; +} + +class PseudoNeonI<dag oops, dag iops, InstrItinClass itin, string cstr, + list<dag> pattern> + : InstARM<AddrModeNone, 4, IndexModeNone, Pseudo, NeonDomain, cstr, + itin> { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; +} + +class NDataI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr, + pattern> { + let Inst{31-25} = 0b1111001; + let PostEncoderMethod = "NEONThumb2DataIPostEncoder"; + let DecoderNamespace = "NEONData"; +} + +class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NeonXI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, asm, + cstr, pattern> { + let Inst{31-25} = 0b1111001; + let PostEncoderMethod = "NEONThumb2DataIPostEncoder"; + let DecoderNamespace = "NEONData"; +} + +// NEON "one register and a modified immediate" format. +class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, + bit op5, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, + list<dag> pattern> + : NDataI<oops, iops, N1RegModImmFrm, itin, opc, dt, asm, cstr, pattern> { + let Inst{23} = op23; + let Inst{21-19} = op21_19; + let Inst{11-8} = op11_8; + let Inst{7} = op7; + let Inst{6} = op6; + let Inst{5} = op5; + let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<13> SIMM; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{24} = SIMM{7}; + let Inst{18-16} = SIMM{6-4}; + let Inst{3-0} = SIMM{3-0}; + let DecoderMethod = "DecodeNEONModImmInstruction"; +} + +// NEON 2 vector register format. +class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, N2RegFrm, itin, opc, dt, asm, cstr, pattern> { + let Inst{24-23} = op24_23; + let Inst{21-20} = op21_20; + let Inst{19-18} = op19_18; + let Inst{17-16} = op17_16; + let Inst{11-7} = op11_7; + let Inst{6} = op6; + let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; +} + +// Same as N2V except it doesn't have a datatype suffix. +class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NDataXI<oops, iops, N2RegFrm, itin, opc, asm, cstr, pattern> { + let Inst{24-23} = op24_23; + let Inst{21-20} = op21_20; + let Inst{19-18} = op19_18; + let Inst{17-16} = op17_16; + let Inst{11-7} = op11_7; + let Inst{6} = op6; + let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; +} + +// NEON 2 vector register with immediate. +class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + let Inst{24} = op24; + let Inst{23} = op23; + let Inst{11-8} = op11_8; + let Inst{7} = op7; + let Inst{6} = op6; + let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vm; + bits<6> SIMM; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; + let Inst{21-16} = SIMM{5-0}; +} + +// NEON 3 vector register format. + +class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, + list<dag> pattern> + : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + let Inst{24} = op24; + let Inst{23} = op23; + let Inst{21-20} = op21_20; + let Inst{11-8} = op11_8; + let Inst{6} = op6; + let Inst{4} = op4; +} + +class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, + oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; +} + +class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, + list<dag> pattern> + : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, + oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + bit lane; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = lane; +} + +class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, + list<dag> pattern> + : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, + oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + bits<2> lane; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{2-0} = Vm{2-0}; + let Inst{5} = lane{1}; + let Inst{3} = lane{0}; +} + +// Same as N3V except it doesn't have a data type suffix. +class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NDataXI<oops, iops, f, itin, opc, asm, cstr, pattern> { + let Inst{24} = op24; + let Inst{23} = op23; + let Inst{21-20} = op21_20; + let Inst{11-8} = op11_8; + let Inst{6} = op6; + let Inst{4} = op4; + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; +} + +// NEON VMOVs between scalar and core registers. +class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, list<dag> pattern> + : InstARM<AddrModeNone, 4, IndexModeNone, f, NeonDomain, + "", itin> { + let Inst{27-20} = opcod1; + let Inst{11-8} = opcod2; + let Inst{6-5} = opcod3; + let Inst{4} = 1; + // A8.6.303, A8.6.328, A8.6.329 + let Inst{3-0} = 0b0000; + + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm); + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; + + let PostEncoderMethod = "NEONThumb2DupPostEncoder"; + let DecoderNamespace = "NEONDup"; + + bits<5> V; + bits<4> R; + bits<4> p; + bits<4> lane; + + let Inst{31-28} = p{3-0}; + let Inst{7} = V{4}; + let Inst{19-16} = V{3-0}; + let Inst{15-12} = R{3-0}; +} +class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NGetLnFrm, itin, + opc, dt, asm, pattern>; +class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NSetLnFrm, itin, + opc, dt, asm, pattern>; +class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, InstrItinClass itin, + string opc, string dt, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NDupFrm, itin, + opc, dt, asm, pattern>; + +// Vector Duplicate Lane (from scalar to all elements) +class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops, + InstrItinClass itin, string opc, string dt, string asm, + list<dag> pattern> + : NDataI<oops, iops, NVDupLnFrm, itin, opc, dt, asm, "", pattern> { + let Inst{24-23} = 0b11; + let Inst{21-20} = 0b11; + let Inst{19-16} = op19_16; + let Inst{11-7} = 0b11000; + let Inst{6} = op6; + let Inst{4} = 0; + + bits<5> Vd; + bits<5> Vm; + + let Inst{22} = Vd{4}; + let Inst{15-12} = Vd{3-0}; + let Inst{5} = Vm{4}; + let Inst{3-0} = Vm{3-0}; +} + +// NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON +// for single-precision FP. +class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [HasNEON,UseNEONForFP]; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp new file mode 100644 index 0000000..48da03f --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -0,0 +1,68 @@ +//===- ARMInstrInfo.cpp - ARM Instruction Information -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARMInstrInfo.h" +#include "ARM.h" +#include "ARMMachineFunctionInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/MC/MCAsmInfo.h" +using namespace llvm; + +ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI), RI(*this, STI) { +} + +unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { + switch (Opc) { + default: break; + case ARM::LDR_PRE_IMM: + case ARM::LDR_PRE_REG: + case ARM::LDR_POST_IMM: + case ARM::LDR_POST_REG: + return ARM::LDRi12; + case ARM::LDRH_PRE: + case ARM::LDRH_POST: + return ARM::LDRH; + case ARM::LDRB_PRE_IMM: + case ARM::LDRB_PRE_REG: + case ARM::LDRB_POST_IMM: + case ARM::LDRB_POST_REG: + return ARM::LDRBi12; + case ARM::LDRSH_PRE: + case ARM::LDRSH_POST: + return ARM::LDRSH; + case ARM::LDRSB_PRE: + case ARM::LDRSB_POST: + return ARM::LDRSB; + case ARM::STR_PRE_IMM: + case ARM::STR_PRE_REG: + case ARM::STR_POST_IMM: + case ARM::STR_POST_REG: + return ARM::STRi12; + case ARM::STRH_PRE: + case ARM::STRH_POST: + return ARM::STRH; + case ARM::STRB_PRE_IMM: + case ARM::STRB_PRE_REG: + case ARM::STRB_POST_IMM: + case ARM::STRB_POST_REG: + return ARM::STRBi12; + } + + return 0; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h new file mode 100644 index 0000000..f2c7bdc --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -0,0 +1,44 @@ +//===- ARMInstrInfo.h - ARM Instruction Information -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMINSTRUCTIONINFO_H +#define ARMINSTRUCTIONINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "ARMBaseInstrInfo.h" +#include "ARMRegisterInfo.h" +#include "ARMSubtarget.h" +#include "ARM.h" + +namespace llvm { + class ARMSubtarget; + +class ARMInstrInfo : public ARMBaseInstrInfo { + ARMRegisterInfo RI; +public: + explicit ARMInstrInfo(const ARMSubtarget &STI); + + // Return the non-pre/post incrementing version of 'Opc'. Return 0 + // if there is not such an opcode. + unsigned getUnindexedOpcode(unsigned Opc) const; + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + const ARMRegisterInfo &getRegisterInfo() const { return RI; } +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td new file mode 100644 index 0000000..2cf0f09 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -0,0 +1,5004 @@ +//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the ARM instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM specific DAG Nodes. +// + +// Type profiles. +def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; + +def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>; + +def SDT_ARMcall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; + +def SDT_ARMCMov : SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisVT<3, i32>]>; + +def SDT_ARMBrcond : SDTypeProfile<0, 2, + [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>; + +def SDT_ARMBrJT : SDTypeProfile<0, 3, + [SDTCisPtrTy<0>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; + +def SDT_ARMBr2JT : SDTypeProfile<0, 4, + [SDTCisPtrTy<0>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; + +def SDT_ARMBCC_i64 : SDTypeProfile<0, 6, + [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, SDTCisVT<2, i32>, + SDTCisVT<3, i32>, SDTCisVT<4, i32>, + SDTCisVT<5, OtherVT>]>; + +def SDT_ARMAnd : SDTypeProfile<1, 2, + [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; + +def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; + +def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, + SDTCisPtrTy<1>, SDTCisVT<2, i32>]>; + +def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>; +def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>, + SDTCisInt<2>]>; +def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>; + +def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, + SDTCisInt<1>]>; + +def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + +def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; + +def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, CPSR = op LHS, RHS, CPSR +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; +// Node definitions. +def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; +def ARMWrapperDYN : SDNode<"ARMISD::WrapperDYN", SDTIntUnaryOp>; +def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>; +def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>; + +def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def ARMcall_pred : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + +def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; + +def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, + [SDNPInGlue]>; + +def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; + +def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT, + [SDNPHasChain]>; +def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT, + [SDNPHasChain]>; + +def ARMBcci64 : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64, + [SDNPHasChain]>; + +def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp, + [SDNPOutGlue]>; + +def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp, + [SDNPOutGlue, SDNPCommutative]>; + +def ARMpic_add : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>; + +def ARMsrl_flag : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>; +def ARMsra_flag : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>; +def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>; + +def ARMaddc : SDNode<"ARMISD::ADDC", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def ARMsubc : SDNode<"ARMISD::SUBC", SDTBinaryArithWithFlags>; +def ARMadde : SDNode<"ARMISD::ADDE", SDTBinaryArithWithFlagsInOut>; +def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>; + +def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>; +def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", + SDT_ARMEH_SJLJ_Setjmp, [SDNPHasChain]>; +def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP", + SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain]>; +def ARMeh_sjlj_dispatchsetup: SDNode<"ARMISD::EH_SJLJ_DISPATCHSETUP", + SDT_ARMEH_SJLJ_DispatchSetup, [SDNPHasChain]>; + + +def ARMMemBarrier : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER, + [SDNPHasChain]>; +def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER, + [SDNPHasChain]>; +def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>; + +def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; + +def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + + +def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; + +//===----------------------------------------------------------------------===// +// ARM Instruction Predicate Definitions. +// +def HasV4T : Predicate<"Subtarget->hasV4TOps()">, + AssemblerPredicate<"HasV4TOps">; +def NoV4T : Predicate<"!Subtarget->hasV4TOps()">; +def HasV5T : Predicate<"Subtarget->hasV5TOps()">; +def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">, + AssemblerPredicate<"HasV5TEOps">; +def HasV6 : Predicate<"Subtarget->hasV6Ops()">, + AssemblerPredicate<"HasV6Ops">; +def NoV6 : Predicate<"!Subtarget->hasV6Ops()">; +def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">, + AssemblerPredicate<"HasV6T2Ops">; +def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; +def HasV7 : Predicate<"Subtarget->hasV7Ops()">, + AssemblerPredicate<"HasV7Ops">; +def NoVFP : Predicate<"!Subtarget->hasVFP2()">; +def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, + AssemblerPredicate<"FeatureVFP2">; +def HasVFP3 : Predicate<"Subtarget->hasVFP3()">, + AssemblerPredicate<"FeatureVFP3">; +def HasNEON : Predicate<"Subtarget->hasNEON()">, + AssemblerPredicate<"FeatureNEON">; +def HasFP16 : Predicate<"Subtarget->hasFP16()">, + AssemblerPredicate<"FeatureFP16">; +def HasDivide : Predicate<"Subtarget->hasDivide()">, + AssemblerPredicate<"FeatureHWDiv">; +def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">, + AssemblerPredicate<"FeatureT2XtPk">; +def HasThumb2DSP : Predicate<"Subtarget->hasThumb2DSP()">, + AssemblerPredicate<"FeatureDSPThumb2">; +def HasDB : Predicate<"Subtarget->hasDataBarrier()">, + AssemblerPredicate<"FeatureDB">; +def HasMP : Predicate<"Subtarget->hasMPExtension()">, + AssemblerPredicate<"FeatureMP">; +def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; +def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">; +def IsThumb : Predicate<"Subtarget->isThumb()">, + AssemblerPredicate<"ModeThumb">; +def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">; +def IsThumb2 : Predicate<"Subtarget->isThumb2()">, + AssemblerPredicate<"ModeThumb,FeatureThumb2">; +def IsMClass : Predicate<"Subtarget->isMClass()">, + AssemblerPredicate<"FeatureMClass">; +def IsARClass : Predicate<"!Subtarget->isMClass()">, + AssemblerPredicate<"!FeatureMClass">; +def IsARM : Predicate<"!Subtarget->isThumb()">, + AssemblerPredicate<"!ModeThumb">; +def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; +def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">; +def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">, + AssemblerPredicate<"ModeNaCl">; + +// FIXME: Eventually this will be just "hasV6T2Ops". +def UseMovt : Predicate<"Subtarget->useMovt()">; +def DontUseMovt : Predicate<"!Subtarget->useMovt()">; +def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">; + +//===----------------------------------------------------------------------===// +// ARM Flag Definitions. + +class RegConstraint<string C> { + string Constraints = C; +} + +//===----------------------------------------------------------------------===// +// ARM specific transformation functions and pattern fragments. +// + +// so_imm_neg_XFORM - Return a so_imm value packed into the format described for +// so_imm_neg def below. +def so_imm_neg_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32); +}]>; + +// so_imm_not_XFORM - Return a so_imm value packed into the format described for +// so_imm_not def below. +def so_imm_not_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32); +}]>; + +/// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15]. +def imm1_15 : ImmLeaf<i32, [{ + return (int32_t)Imm >= 1 && (int32_t)Imm < 16; +}]>; + +/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31]. +def imm16_31 : ImmLeaf<i32, [{ + return (int32_t)Imm >= 16 && (int32_t)Imm < 32; +}]>; + +def so_imm_neg : + PatLeaf<(imm), [{ + return ARM_AM::getSOImmVal(-(uint32_t)N->getZExtValue()) != -1; + }], so_imm_neg_XFORM>; + +def so_imm_not : + PatLeaf<(imm), [{ + return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1; + }], so_imm_not_XFORM>; + +// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits. +def sext_16_node : PatLeaf<(i32 GPR:$a), [{ + return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17; +}]>; + +/// Split a 32-bit immediate into two 16 bit parts. +def hi16 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32); +}]>; + +def lo16AllZero : PatLeaf<(i32 imm), [{ + // Returns true if all low 16-bits are 0. + return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0; +}], hi16>; + +/// imm0_65535 - An immediate is in the range [0.65535]. +def Imm0_65535AsmOperand: AsmOperandClass { let Name = "Imm0_65535"; } +def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 65536; +}]> { + let ParserMatchClass = Imm0_65535AsmOperand; +} + +class BinOpWithFlagFrag<dag res> : + PatFrag<(ops node:$LHS, node:$RHS, node:$FLAG), res>; +class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; +class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>; + +// An 'and' node with a single use. +def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; + +// An 'xor' node with a single use. +def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; + +// An 'fmul' node with a single use. +def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{ + return N->hasOneUse(); +}]>; + +// An 'fadd' node which checks for single non-hazardous use. +def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{ + return hasNoVMLxHazardUse(N); +}]>; + +// An 'fsub' node which checks for single non-hazardous use. +def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{ + return hasNoVMLxHazardUse(N); +}]>; + +//===----------------------------------------------------------------------===// +// Operand Definitions. +// + +// Branch target. +// FIXME: rename brtarget to t2_brtarget +def brtarget : Operand<OtherVT> { + let EncoderMethod = "getBranchTargetOpValue"; + let OperandType = "OPERAND_PCREL"; + let DecoderMethod = "DecodeT2BROperand"; +} + +// FIXME: get rid of this one? +def uncondbrtarget : Operand<OtherVT> { + let EncoderMethod = "getUnconditionalBranchTargetOpValue"; + let OperandType = "OPERAND_PCREL"; +} + +// Branch target for ARM. Handles conditional/unconditional +def br_target : Operand<OtherVT> { + let EncoderMethod = "getARMBranchTargetOpValue"; + let OperandType = "OPERAND_PCREL"; +} + +// Call target. +// FIXME: rename bltarget to t2_bl_target? +def bltarget : Operand<i32> { + // Encoded the same as branch targets. + let EncoderMethod = "getBranchTargetOpValue"; + let OperandType = "OPERAND_PCREL"; +} + +// Call target for ARM. Handles conditional/unconditional +// FIXME: rename bl_target to t2_bltarget? +def bl_target : Operand<i32> { + // Encoded the same as branch targets. + let EncoderMethod = "getARMBranchTargetOpValue"; + let OperandType = "OPERAND_PCREL"; +} + +def blx_target : Operand<i32> { + // Encoded the same as branch targets. + let EncoderMethod = "getARMBLXTargetOpValue"; + let OperandType = "OPERAND_PCREL"; +} + +// A list of registers separated by comma. Used by load/store multiple. +def RegListAsmOperand : AsmOperandClass { let Name = "RegList"; } +def reglist : Operand<i32> { + let EncoderMethod = "getRegisterListOpValue"; + let ParserMatchClass = RegListAsmOperand; + let PrintMethod = "printRegisterList"; + let DecoderMethod = "DecodeRegListOperand"; +} + +def DPRRegListAsmOperand : AsmOperandClass { let Name = "DPRRegList"; } +def dpr_reglist : Operand<i32> { + let EncoderMethod = "getRegisterListOpValue"; + let ParserMatchClass = DPRRegListAsmOperand; + let PrintMethod = "printRegisterList"; + let DecoderMethod = "DecodeDPRRegListOperand"; +} + +def SPRRegListAsmOperand : AsmOperandClass { let Name = "SPRRegList"; } +def spr_reglist : Operand<i32> { + let EncoderMethod = "getRegisterListOpValue"; + let ParserMatchClass = SPRRegListAsmOperand; + let PrintMethod = "printRegisterList"; + let DecoderMethod = "DecodeSPRRegListOperand"; +} + +// An operand for the CONSTPOOL_ENTRY pseudo-instruction. +def cpinst_operand : Operand<i32> { + let PrintMethod = "printCPInstOperand"; +} + +// Local PC labels. +def pclabel : Operand<i32> { + let PrintMethod = "printPCLabel"; +} + +// ADR instruction labels. +def adrlabel : Operand<i32> { + let EncoderMethod = "getAdrLabelOpValue"; +} + +def neon_vcvt_imm32 : Operand<i32> { + let EncoderMethod = "getNEONVcvtImm32OpValue"; + let DecoderMethod = "DecodeVCVTImmOperand"; +} + +// rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24. +def rot_imm_XFORM: SDNodeXForm<imm, [{ + switch (N->getZExtValue()){ + default: assert(0); + case 0: return CurDAG->getTargetConstant(0, MVT::i32); + case 8: return CurDAG->getTargetConstant(1, MVT::i32); + case 16: return CurDAG->getTargetConstant(2, MVT::i32); + case 24: return CurDAG->getTargetConstant(3, MVT::i32); + } +}]>; +def RotImmAsmOperand : AsmOperandClass { + let Name = "RotImm"; + let ParserMethod = "parseRotImm"; +} +def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{ + int32_t v = N->getZExtValue(); + return v == 8 || v == 16 || v == 24; }], + rot_imm_XFORM> { + let PrintMethod = "printRotImmOperand"; + let ParserMatchClass = RotImmAsmOperand; +} + +// shift_imm: An integer that encodes a shift amount and the type of shift +// (asr or lsl). The 6-bit immediate encodes as: +// {5} 0 ==> lsl +// 1 asr +// {4-0} imm5 shift amount. +// asr #32 encoded as imm5 == 0. +def ShifterImmAsmOperand : AsmOperandClass { + let Name = "ShifterImm"; + let ParserMethod = "parseShifterImm"; +} +def shift_imm : Operand<i32> { + let PrintMethod = "printShiftImmOperand"; + let ParserMatchClass = ShifterImmAsmOperand; +} + +// shifter_operand operands: so_reg_reg, so_reg_imm, and so_imm. +def ShiftedRegAsmOperand : AsmOperandClass { let Name = "RegShiftedReg"; } +def so_reg_reg : Operand<i32>, // reg reg imm + ComplexPattern<i32, 3, "SelectRegShifterOperand", + [shl, srl, sra, rotr]> { + let EncoderMethod = "getSORegRegOpValue"; + let PrintMethod = "printSORegRegOperand"; + let DecoderMethod = "DecodeSORegRegOperand"; + let ParserMatchClass = ShiftedRegAsmOperand; + let MIOperandInfo = (ops GPRnopc, GPRnopc, i32imm); +} + +def ShiftedImmAsmOperand : AsmOperandClass { let Name = "RegShiftedImm"; } +def so_reg_imm : Operand<i32>, // reg imm + ComplexPattern<i32, 2, "SelectImmShifterOperand", + [shl, srl, sra, rotr]> { + let EncoderMethod = "getSORegImmOpValue"; + let PrintMethod = "printSORegImmOperand"; + let DecoderMethod = "DecodeSORegImmOperand"; + let ParserMatchClass = ShiftedImmAsmOperand; + let MIOperandInfo = (ops GPR, i32imm); +} + +// FIXME: Does this need to be distinct from so_reg? +def shift_so_reg_reg : Operand<i32>, // reg reg imm + ComplexPattern<i32, 3, "SelectShiftRegShifterOperand", + [shl,srl,sra,rotr]> { + let EncoderMethod = "getSORegRegOpValue"; + let PrintMethod = "printSORegRegOperand"; + let DecoderMethod = "DecodeSORegRegOperand"; + let MIOperandInfo = (ops GPR, GPR, i32imm); +} + +// FIXME: Does this need to be distinct from so_reg? +def shift_so_reg_imm : Operand<i32>, // reg reg imm + ComplexPattern<i32, 2, "SelectShiftImmShifterOperand", + [shl,srl,sra,rotr]> { + let EncoderMethod = "getSORegImmOpValue"; + let PrintMethod = "printSORegImmOperand"; + let DecoderMethod = "DecodeSORegImmOperand"; + let MIOperandInfo = (ops GPR, i32imm); +} + + +// so_imm - Match a 32-bit shifter_operand immediate operand, which is an +// 8-bit immediate rotated by an arbitrary number of bits. +def SOImmAsmOperand: AsmOperandClass { let Name = "ARMSOImm"; } +def so_imm : Operand<i32>, ImmLeaf<i32, [{ + return ARM_AM::getSOImmVal(Imm) != -1; + }]> { + let EncoderMethod = "getSOImmOpValue"; + let ParserMatchClass = SOImmAsmOperand; + let DecoderMethod = "DecodeSOImmOperand"; +} + +// Break so_imm's up into two pieces. This handles immediates with up to 16 +// bits set in them. This uses so_imm2part to match and so_imm2part_[12] to +// get the first/second pieces. +def so_imm2part : PatLeaf<(imm), [{ + return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue()); +}]>; + +/// arm_i32imm - True for +V6T2, or true only if so_imm2part is true. +/// +def arm_i32imm : PatLeaf<(imm), [{ + if (Subtarget->hasV6T2Ops()) + return true; + return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue()); +}]>; + +/// imm0_7 predicate - Immediate in the range [0,7]. +def Imm0_7AsmOperand: AsmOperandClass { let Name = "Imm0_7"; } +def imm0_7 : Operand<i32>, ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 8; +}]> { + let ParserMatchClass = Imm0_7AsmOperand; +} + +/// imm0_15 predicate - Immediate in the range [0,15]. +def Imm0_15AsmOperand: AsmOperandClass { let Name = "Imm0_15"; } +def imm0_15 : Operand<i32>, ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 16; +}]> { + let ParserMatchClass = Imm0_15AsmOperand; +} + +/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31]. +def Imm0_31AsmOperand: AsmOperandClass { let Name = "Imm0_31"; } +def imm0_31 : Operand<i32>, ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 32; +}]> { + let ParserMatchClass = Imm0_31AsmOperand; +} + +/// imm0_255 predicate - Immediate in the range [0,255]. +def Imm0_255AsmOperand : AsmOperandClass { let Name = "Imm0_255"; } +def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> { + let ParserMatchClass = Imm0_255AsmOperand; +} + +// imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference +// a relocatable expression. +// +// FIXME: This really needs a Thumb version separate from the ARM version. +// While the range is the same, and can thus use the same match class, +// the encoding is different so it should have a different encoder method. +def Imm0_65535ExprAsmOperand: AsmOperandClass { let Name = "Imm0_65535Expr"; } +def imm0_65535_expr : Operand<i32> { + let EncoderMethod = "getHiLo16ImmOpValue"; + let ParserMatchClass = Imm0_65535ExprAsmOperand; +} + +/// imm24b - True if the 32-bit immediate is encodable in 24 bits. +def Imm24bitAsmOperand: AsmOperandClass { let Name = "Imm24bit"; } +def imm24b : Operand<i32>, ImmLeaf<i32, [{ + return Imm >= 0 && Imm <= 0xffffff; +}]> { + let ParserMatchClass = Imm24bitAsmOperand; +} + + +/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield +/// e.g., 0xf000ffff +def BitfieldAsmOperand : AsmOperandClass { + let Name = "Bitfield"; + let ParserMethod = "parseBitfield"; +} +def bf_inv_mask_imm : Operand<i32>, + PatLeaf<(imm), [{ + return ARM::isBitFieldInvertedMask(N->getZExtValue()); +}] > { + let EncoderMethod = "getBitfieldInvertedMaskOpValue"; + let PrintMethod = "printBitfieldInvMaskImmOperand"; + let DecoderMethod = "DecodeBitfieldMaskOperand"; + let ParserMatchClass = BitfieldAsmOperand; +} + +def imm1_32_XFORM: SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, MVT::i32); +}]>; +def Imm1_32AsmOperand: AsmOperandClass { let Name = "Imm1_32"; } +def imm1_32 : Operand<i32>, PatLeaf<(imm), [{ + uint64_t Imm = N->getZExtValue(); + return Imm > 0 && Imm <= 32; + }], + imm1_32_XFORM> { + let PrintMethod = "printImmPlusOneOperand"; + let ParserMatchClass = Imm1_32AsmOperand; +} + +def imm1_16_XFORM: SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, MVT::i32); +}]>; +def Imm1_16AsmOperand: AsmOperandClass { let Name = "Imm1_16"; } +def imm1_16 : Operand<i32>, PatLeaf<(imm), [{ return Imm > 0 && Imm <= 16; }], + imm1_16_XFORM> { + let PrintMethod = "printImmPlusOneOperand"; + let ParserMatchClass = Imm1_16AsmOperand; +} + +// Define ARM specific addressing modes. +// addrmode_imm12 := reg +/- imm12 +// +def MemImm12OffsetAsmOperand : AsmOperandClass { let Name = "MemImm12Offset"; } +def addrmode_imm12 : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrModeImm12", []> { + // 12-bit immediate operand. Note that instructions using this encode + // #0 and #-0 differently. We flag #-0 as the magic value INT32_MIN. All other + // immediate values are as normal. + + let EncoderMethod = "getAddrModeImm12OpValue"; + let PrintMethod = "printAddrModeImm12Operand"; + let DecoderMethod = "DecodeAddrModeImm12Operand"; + let ParserMatchClass = MemImm12OffsetAsmOperand; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} +// ldst_so_reg := reg +/- reg shop imm +// +def MemRegOffsetAsmOperand : AsmOperandClass { let Name = "MemRegOffset"; } +def ldst_so_reg : Operand<i32>, + ComplexPattern<i32, 3, "SelectLdStSOReg", []> { + let EncoderMethod = "getLdStSORegOpValue"; + // FIXME: Simplify the printer + let PrintMethod = "printAddrMode2Operand"; + let DecoderMethod = "DecodeSORegMemOperand"; + let ParserMatchClass = MemRegOffsetAsmOperand; + let MIOperandInfo = (ops GPR:$base, GPRnopc:$offsreg, i32imm:$shift); +} + +// postidx_imm8 := +/- [0,255] +// +// 9 bit value: +// {8} 1 is imm8 is non-negative. 0 otherwise. +// {7-0} [0,255] imm8 value. +def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; } +def postidx_imm8 : Operand<i32> { + let PrintMethod = "printPostIdxImm8Operand"; + let ParserMatchClass = PostIdxImm8AsmOperand; + let MIOperandInfo = (ops i32imm); +} + +// postidx_imm8s4 := +/- [0,1020] +// +// 9 bit value: +// {8} 1 is imm8 is non-negative. 0 otherwise. +// {7-0} [0,255] imm8 value, scaled by 4. +def PostIdxImm8s4AsmOperand : AsmOperandClass { let Name = "PostIdxImm8s4"; } +def postidx_imm8s4 : Operand<i32> { + let PrintMethod = "printPostIdxImm8s4Operand"; + let ParserMatchClass = PostIdxImm8s4AsmOperand; + let MIOperandInfo = (ops i32imm); +} + + +// postidx_reg := +/- reg +// +def PostIdxRegAsmOperand : AsmOperandClass { + let Name = "PostIdxReg"; + let ParserMethod = "parsePostIdxReg"; +} +def postidx_reg : Operand<i32> { + let EncoderMethod = "getPostIdxRegOpValue"; + let DecoderMethod = "DecodePostIdxReg"; + let PrintMethod = "printPostIdxRegOperand"; + let ParserMatchClass = PostIdxRegAsmOperand; + let MIOperandInfo = (ops GPR, i32imm); +} + + +// addrmode2 := reg +/- imm12 +// := reg +/- reg shop imm +// +// FIXME: addrmode2 should be refactored the rest of the way to always +// use explicit imm vs. reg versions above (addrmode_imm12 and ldst_so_reg). +def AddrMode2AsmOperand : AsmOperandClass { let Name = "AddrMode2"; } +def addrmode2 : Operand<i32>, + ComplexPattern<i32, 3, "SelectAddrMode2", []> { + let EncoderMethod = "getAddrMode2OpValue"; + let PrintMethod = "printAddrMode2Operand"; + let ParserMatchClass = AddrMode2AsmOperand; + let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); +} + +def PostIdxRegShiftedAsmOperand : AsmOperandClass { + let Name = "PostIdxRegShifted"; + let ParserMethod = "parsePostIdxReg"; +} +def am2offset_reg : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode2OffsetReg", + [], [SDNPWantRoot]> { + let EncoderMethod = "getAddrMode2OffsetOpValue"; + let PrintMethod = "printAddrMode2OffsetOperand"; + // When using this for assembly, it's always as a post-index offset. + let ParserMatchClass = PostIdxRegShiftedAsmOperand; + let MIOperandInfo = (ops GPR, i32imm); +} + +// FIXME: am2offset_imm should only need the immediate, not the GPR. Having +// the GPR is purely vestigal at this point. +def AM2OffsetImmAsmOperand : AsmOperandClass { let Name = "AM2OffsetImm"; } +def am2offset_imm : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode2OffsetImm", + [], [SDNPWantRoot]> { + let EncoderMethod = "getAddrMode2OffsetOpValue"; + let PrintMethod = "printAddrMode2OffsetOperand"; + let ParserMatchClass = AM2OffsetImmAsmOperand; + let MIOperandInfo = (ops GPR, i32imm); +} + + +// addrmode3 := reg +/- reg +// addrmode3 := reg +/- imm8 +// +// FIXME: split into imm vs. reg versions. +def AddrMode3AsmOperand : AsmOperandClass { let Name = "AddrMode3"; } +def addrmode3 : Operand<i32>, + ComplexPattern<i32, 3, "SelectAddrMode3", []> { + let EncoderMethod = "getAddrMode3OpValue"; + let PrintMethod = "printAddrMode3Operand"; + let ParserMatchClass = AddrMode3AsmOperand; + let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); +} + +// FIXME: split into imm vs. reg versions. +// FIXME: parser method to handle +/- register. +def AM3OffsetAsmOperand : AsmOperandClass { + let Name = "AM3Offset"; + let ParserMethod = "parseAM3Offset"; +} +def am3offset : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode3Offset", + [], [SDNPWantRoot]> { + let EncoderMethod = "getAddrMode3OffsetOpValue"; + let PrintMethod = "printAddrMode3OffsetOperand"; + let ParserMatchClass = AM3OffsetAsmOperand; + let MIOperandInfo = (ops GPR, i32imm); +} + +// ldstm_mode := {ia, ib, da, db} +// +def ldstm_mode : OptionalDefOperand<OtherVT, (ops i32), (ops (i32 1))> { + let EncoderMethod = "getLdStmModeOpValue"; + let PrintMethod = "printLdStmModeOperand"; +} + +// addrmode5 := reg +/- imm8*4 +// +def AddrMode5AsmOperand : AsmOperandClass { let Name = "AddrMode5"; } +def addrmode5 : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode5", []> { + let PrintMethod = "printAddrMode5Operand"; + let EncoderMethod = "getAddrMode5OpValue"; + let DecoderMethod = "DecodeAddrMode5Operand"; + let ParserMatchClass = AddrMode5AsmOperand; + let MIOperandInfo = (ops GPR:$base, i32imm); +} + +// addrmode6 := reg with optional alignment +// +def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; } +def addrmode6 : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{ + let PrintMethod = "printAddrMode6Operand"; + let MIOperandInfo = (ops GPR:$addr, i32imm:$align); + let EncoderMethod = "getAddrMode6AddressOpValue"; + let DecoderMethod = "DecodeAddrMode6Operand"; + let ParserMatchClass = AddrMode6AsmOperand; +} + +def am6offset : Operand<i32>, + ComplexPattern<i32, 1, "SelectAddrMode6Offset", + [], [SDNPWantRoot]> { + let PrintMethod = "printAddrMode6OffsetOperand"; + let MIOperandInfo = (ops GPR); + let EncoderMethod = "getAddrMode6OffsetOpValue"; + let DecoderMethod = "DecodeGPRRegisterClass"; +} + +// Special version of addrmode6 to handle alignment encoding for VST1/VLD1 +// (single element from one lane) for size 32. +def addrmode6oneL32 : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{ + let PrintMethod = "printAddrMode6Operand"; + let MIOperandInfo = (ops GPR:$addr, i32imm); + let EncoderMethod = "getAddrMode6OneLane32AddressOpValue"; +} + +// Special version of addrmode6 to handle alignment encoding for VLD-dup +// instructions, specifically VLD4-dup. +def addrmode6dup : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{ + let PrintMethod = "printAddrMode6Operand"; + let MIOperandInfo = (ops GPR:$addr, i32imm); + let EncoderMethod = "getAddrMode6DupAddressOpValue"; +} + +// addrmodepc := pc + reg +// +def addrmodepc : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrModePC", []> { + let PrintMethod = "printAddrModePCOperand"; + let MIOperandInfo = (ops GPR, i32imm); +} + +// addr_offset_none := reg +// +def MemNoOffsetAsmOperand : AsmOperandClass { let Name = "MemNoOffset"; } +def addr_offset_none : Operand<i32>, + ComplexPattern<i32, 1, "SelectAddrOffsetNone", []> { + let PrintMethod = "printAddrMode7Operand"; + let DecoderMethod = "DecodeAddrMode7Operand"; + let ParserMatchClass = MemNoOffsetAsmOperand; + let MIOperandInfo = (ops GPR:$base); +} + +def nohash_imm : Operand<i32> { + let PrintMethod = "printNoHashImmediate"; +} + +def CoprocNumAsmOperand : AsmOperandClass { + let Name = "CoprocNum"; + let ParserMethod = "parseCoprocNumOperand"; +} +def p_imm : Operand<i32> { + let PrintMethod = "printPImmediate"; + let ParserMatchClass = CoprocNumAsmOperand; + let DecoderMethod = "DecodeCoprocessor"; +} + +def CoprocRegAsmOperand : AsmOperandClass { + let Name = "CoprocReg"; + let ParserMethod = "parseCoprocRegOperand"; +} +def c_imm : Operand<i32> { + let PrintMethod = "printCImmediate"; + let ParserMatchClass = CoprocRegAsmOperand; +} +def CoprocOptionAsmOperand : AsmOperandClass { + let Name = "CoprocOption"; + let ParserMethod = "parseCoprocOptionOperand"; +} +def coproc_option_imm : Operand<i32> { + let PrintMethod = "printCoprocOptionImm"; + let ParserMatchClass = CoprocOptionAsmOperand; +} + +//===----------------------------------------------------------------------===// + +include "ARMInstrFormats.td" + +//===----------------------------------------------------------------------===// +// Multiclass helpers... +// + +/// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a +/// binop that produces a value. +multiclass AsI1_bin_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, string baseOpc, bit Commutable = 0> { + // The register-immediate version is re-materializable. This is useful + // in particular for taking the address of a local. + let isReMaterializable = 1 in { + def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, + iii, opc, "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-0} = imm; + } + } + def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, + iir, opc, "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{25} = 0; + let isCommutable = Commutable; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-4} = 0b00000000; + let Inst{3-0} = Rm; + } + + def rsi : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, + iis, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_imm:$shift))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; + } + + def rsr : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, + iis, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_reg:$shift))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; + } + + // Assembly aliases for optional destination operand when it's the same + // as the source operand. + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn, + so_imm:$imm, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"), + (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn, + GPR:$Rm, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), + (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn, + so_reg_imm:$shift, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), + (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn, + so_reg_reg:$shift, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + +} + +/// AsI1_rbin_irs - Same as AsI1_bin_irs except the order of operands are +/// reversed. The 'rr' form is only defined for the disassembler; for codegen +/// it is equivalent to the AsI1_bin_irs counterpart. +multiclass AsI1_rbin_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, string baseOpc, bit Commutable = 0> { + // The register-immediate version is re-materializable. This is useful + // in particular for taking the address of a local. + let isReMaterializable = 1 in { + def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, + iii, opc, "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, (opnode so_imm:$imm, GPR:$Rn))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-0} = imm; + } + } + def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, + iir, opc, "\t$Rd, $Rn, $Rm", + [/* pattern left blank */]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + } + + def rsi : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, + iis, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (opnode so_reg_imm:$shift, GPR:$Rn))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; + } + + def rsr : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, + iis, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, (opnode so_reg_reg:$shift, GPR:$Rn))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; + } + + // Assembly aliases for optional destination operand when it's the same + // as the source operand. + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn, + so_imm:$imm, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"), + (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn, + GPR:$Rm, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), + (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn, + so_reg_imm:$shift, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), + (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn, + so_reg_reg:$shift, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + +} + +/// AsI1_rbin_s_is - Same as AsI1_rbin_s_is except it sets 's' bit by default. +/// +/// These opcodes will be converted to the real non-S opcodes by +/// AdjustInstrPostInstrSelection after giving then an optional CPSR operand. +let hasPostISelHook = 1, isCodeGenOnly = 1, isPseudo = 1, Defs = [CPSR] in { +multiclass AsI1_rbin_s_is<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0> { + def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, + iii, opc, "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn))]>; + + def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, + iir, opc, "\t$Rd, $Rn, $Rm", + [/* pattern left blank */]>; + + def rsi : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, + iis, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn))]>; + + def rsr : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, + iis, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; + } +} +} + +/// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default. +/// +/// These opcodes will be converted to the real non-S opcodes by +/// AdjustInstrPostInstrSelection after giving then an optional CPSR operand. +let hasPostISelHook = 1, isCodeGenOnly = 1, isPseudo = 1, Defs = [CPSR] in { +multiclass AsI1_bin_s_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0> { + def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, + iii, opc, "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm))]>; + def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, + iir, opc, "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm))]>; + def rsi : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, + iis, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift))]>; + + def rsr : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, + iis, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_reg:$shift))]>; +} +} + +/// AI1_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test +/// patterns. Similar to AsI1_bin_irs except the instruction does not produce +/// a explicit result, only implicitly set CPSR. +let isCompare = 1, Defs = [CPSR] in { +multiclass AI1_cmp_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0> { + def ri : AI1<opcod, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, iii, + opc, "\t$Rn, $imm", + [(opnode GPR:$Rn, so_imm:$imm)]> { + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-0} = imm; + } + def rr : AI1<opcod, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir, + opc, "\t$Rn, $Rm", + [(opnode GPR:$Rn, GPR:$Rm)]> { + bits<4> Rn; + bits<4> Rm; + let isCommutable = Commutable; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-4} = 0b00000000; + let Inst{3-0} = Rm; + } + def rsi : AI1<opcod, (outs), + (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis, + opc, "\t$Rn, $shift", + [(opnode GPR:$Rn, so_reg_imm:$shift)]> { + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; + } + def rsr : AI1<opcod, (outs), + (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis, + opc, "\t$Rn, $shift", + [(opnode GPR:$Rn, so_reg_reg:$shift)]> { + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b0000; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; + } + +} +} + +/// AI_ext_rrot - A unary operation with two forms: one whose operand is a +/// register and one whose operand is a register rotated by 8/16/24. +/// FIXME: Remove the 'r' variant. Its rot_imm is zero. +class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode> + : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot), + IIC_iEXTr, opc, "\t$Rd, $Rm$rot", + [(set GPRnopc:$Rd, (opnode (rotr GPRnopc:$Rm, rot_imm:$rot)))]>, + Requires<[IsARM, HasV6]> { + bits<4> Rd; + bits<4> Rm; + bits<2> rot; + let Inst{19-16} = 0b1111; + let Inst{15-12} = Rd; + let Inst{11-10} = rot; + let Inst{3-0} = Rm; +} + +class AI_ext_rrot_np<bits<8> opcod, string opc> + : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot), + IIC_iEXTr, opc, "\t$Rd, $Rm$rot", []>, + Requires<[IsARM, HasV6]> { + bits<2> rot; + let Inst{19-16} = 0b1111; + let Inst{11-10} = rot; +} + +/// AI_exta_rrot - A binary operation with two forms: one whose operand is a +/// register and one whose operand is a register rotated by 8/16/24. +class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode> + : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot), + IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", + [(set GPRnopc:$Rd, (opnode GPR:$Rn, + (rotr GPRnopc:$Rm, rot_imm:$rot)))]>, + Requires<[IsARM, HasV6]> { + bits<4> Rd; + bits<4> Rm; + bits<4> Rn; + bits<2> rot; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-10} = rot; + let Inst{9-4} = 0b000111; + let Inst{3-0} = Rm; +} + +class AI_exta_rrot_np<bits<8> opcod, string opc> + : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot), + IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", []>, + Requires<[IsARM, HasV6]> { + bits<4> Rn; + bits<2> rot; + let Inst{19-16} = Rn; + let Inst{11-10} = rot; +} + +/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube. +multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, + string baseOpc, bit Commutable = 0> { + let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in { + def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm, CPSR))]>, + Requires<[IsARM]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + let Inst{11-0} = imm; + } + def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm, CPSR))]>, + Requires<[IsARM]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let isCommutable = Commutable; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + } + def rsi : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_imm:$shift), + DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift, CPSR))]>, + Requires<[IsARM]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; + } + def rsr : AsI1<opcod, (outs GPR:$Rd), + (ins GPR:$Rn, so_reg_reg:$shift), + DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_reg:$shift, CPSR))]>, + Requires<[IsARM]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; + } + } + + // Assembly aliases for optional destination operand when it's the same + // as the source operand. + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn, + so_imm:$imm, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"), + (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn, + GPR:$Rm, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), + (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn, + so_reg_imm:$shift, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), + (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn, + so_reg_reg:$shift, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; +} + +/// AI1_rsc_irs - Define instructions and patterns for rsc +multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode, + string baseOpc> { + let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in { + def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm", + [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn, CPSR))]>, + Requires<[IsARM]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{25} = 1; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + let Inst{11-0} = imm; + } + def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm", + [/* pattern left blank */]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + } + def rsi : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift), + DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn, CPSR))]>, + Requires<[IsARM]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; + } + def rsr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift), + DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", + [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn, CPSR))]>, + Requires<[IsARM]> { + bits<4> Rd; + bits<4> Rn; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; + } + } + + // Assembly aliases for optional destination operand when it's the same + // as the source operand. + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn, + so_imm:$imm, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"), + (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn, + GPR:$Rm, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), + (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn, + so_reg_imm:$shift, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; + def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), + (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn, + so_reg_reg:$shift, pred:$p, + cc_out:$s)>, + Requires<[IsARM]>; +} + +let canFoldAsLoad = 1, isReMaterializable = 1 in { +multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii, + InstrItinClass iir, PatFrag opnode> { + // Note: We use the complex addrmode_imm12 rather than just an input + // GPR and a constrained immediate so that we can use this to match + // frame index references and avoid matching constant pool references. + def i12: AI2ldst<0b010, 1, isByte, (outs GPR:$Rt), (ins addrmode_imm12:$addr), + AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr", + [(set GPR:$Rt, (opnode addrmode_imm12:$addr))]> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 + } + def rs : AI2ldst<0b011, 1, isByte, (outs GPR:$Rt), (ins ldst_so_reg:$shift), + AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift", + [(set GPR:$Rt, (opnode ldst_so_reg:$shift))]> { + bits<4> Rt; + bits<17> shift; + let shift{4} = 0; // Inst{4} = 0 + let Inst{23} = shift{12}; // U (add = ('U' == 1)) + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = shift{11-0}; + } +} +} + +let canFoldAsLoad = 1, isReMaterializable = 1 in { +multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii, + InstrItinClass iir, PatFrag opnode> { + // Note: We use the complex addrmode_imm12 rather than just an input + // GPR and a constrained immediate so that we can use this to match + // frame index references and avoid matching constant pool references. + def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt), (ins addrmode_imm12:$addr), + AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr", + [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 + } + def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt), (ins ldst_so_reg:$shift), + AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift", + [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> { + bits<4> Rt; + bits<17> shift; + let shift{4} = 0; // Inst{4} = 0 + let Inst{23} = shift{12}; // U (add = ('U' == 1)) + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = shift{11-0}; + } +} +} + + +multiclass AI_str1<bit isByte, string opc, InstrItinClass iii, + InstrItinClass iir, PatFrag opnode> { + // Note: We use the complex addrmode_imm12 rather than just an input + // GPR and a constrained immediate so that we can use this to match + // frame index references and avoid matching constant pool references. + def i12 : AI2ldst<0b010, 0, isByte, (outs), + (ins GPR:$Rt, addrmode_imm12:$addr), + AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr", + [(opnode GPR:$Rt, addrmode_imm12:$addr)]> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 + } + def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPR:$Rt, ldst_so_reg:$shift), + AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift", + [(opnode GPR:$Rt, ldst_so_reg:$shift)]> { + bits<4> Rt; + bits<17> shift; + let shift{4} = 0; // Inst{4} = 0 + let Inst{23} = shift{12}; // U (add = ('U' == 1)) + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = shift{11-0}; + } +} + +multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii, + InstrItinClass iir, PatFrag opnode> { + // Note: We use the complex addrmode_imm12 rather than just an input + // GPR and a constrained immediate so that we can use this to match + // frame index references and avoid matching constant pool references. + def i12 : AI2ldst<0b010, 0, isByte, (outs), + (ins GPRnopc:$Rt, addrmode_imm12:$addr), + AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr", + [(opnode GPRnopc:$Rt, addrmode_imm12:$addr)]> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 + } + def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPRnopc:$Rt, ldst_so_reg:$shift), + AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift", + [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> { + bits<4> Rt; + bits<17> shift; + let shift{4} = 0; // Inst{4} = 0 + let Inst{23} = shift{12}; // U (add = ('U' == 1)) + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = shift{11-0}; + } +} + + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in +/// the function. The first operand is the ID# for this instruction, the second +/// is the index into the MachineConstantPool that this is, the third is the +/// size in bytes of this constant pool entry. +let neverHasSideEffects = 1, isNotDuplicable = 1 in +def CONSTPOOL_ENTRY : +PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, + i32imm:$size), NoItinerary, []>; + +// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE +// from removing one half of the matched pairs. That breaks PEI, which assumes +// these will always be in pairs, and asserts if it finds otherwise. Better way? +let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { +def ADJCALLSTACKUP : +PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary, + [(ARMcallseq_end timm:$amt1, timm:$amt2)]>; + +def ADJCALLSTACKDOWN : +PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary, + [(ARMcallseq_start timm:$amt)]>; +} + +// Atomic pseudo-insts which will be lowered to ldrexd/strexd loops. +// (These psuedos use a hand-written selection code). +let usesCustomInserter = 1, Defs = [CPSR], mayLoad = 1, mayStore = 1 in { +def ATOMOR6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2), + NoItinerary, []>; +def ATOMXOR6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2), + NoItinerary, []>; +def ATOMADD6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2), + NoItinerary, []>; +def ATOMSUB6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2), + NoItinerary, []>; +def ATOMNAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2), + NoItinerary, []>; +def ATOMAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2), + NoItinerary, []>; +def ATOMSWAP6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$src1, GPR:$src2), + NoItinerary, []>; +def ATOMCMPXCHG6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), + (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2, + GPR:$set1, GPR:$set2), + NoItinerary, []>; +} + +def NOP : AI<(outs), (ins), MiscFrm, NoItinerary, "nop", "", []>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-16} = 0b001100100000; + let Inst{15-8} = 0b11110000; + let Inst{7-0} = 0b00000000; +} + +def YIELD : AI<(outs), (ins), MiscFrm, NoItinerary, "yield", "", []>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-16} = 0b001100100000; + let Inst{15-8} = 0b11110000; + let Inst{7-0} = 0b00000001; +} + +def WFE : AI<(outs), (ins), MiscFrm, NoItinerary, "wfe", "", []>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-16} = 0b001100100000; + let Inst{15-8} = 0b11110000; + let Inst{7-0} = 0b00000010; +} + +def WFI : AI<(outs), (ins), MiscFrm, NoItinerary, "wfi", "", []>, + Requires<[IsARM, HasV6T2]> { + let Inst{27-16} = 0b001100100000; + let Inst{15-8} = 0b11110000; + let Inst{7-0} = 0b00000011; +} + +def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel", + "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; + let Inst{19-16} = Rn; + let Inst{27-20} = 0b01101000; + let Inst{7-4} = 0b1011; + let Inst{11-8} = 0b1111; +} + +def SEV : AI<(outs), (ins), MiscFrm, NoItinerary, "sev", "", + []>, Requires<[IsARM, HasV6T2]> { + let Inst{27-16} = 0b001100100000; + let Inst{15-8} = 0b11110000; + let Inst{7-0} = 0b00000100; +} + +// The i32imm operand $val can be used by a debugger to store more information +// about the breakpoint. +def BKPT : AI<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary, + "bkpt", "\t$val", []>, Requires<[IsARM]> { + bits<16> val; + let Inst{3-0} = val{3-0}; + let Inst{19-8} = val{15-4}; + let Inst{27-20} = 0b00010010; + let Inst{7-4} = 0b0111; +} + +// Change Processor State +// FIXME: We should use InstAlias to handle the optional operands. +class CPS<dag iops, string asm_ops> + : AXI<(outs), iops, MiscFrm, NoItinerary, !strconcat("cps", asm_ops), + []>, Requires<[IsARM]> { + bits<2> imod; + bits<3> iflags; + bits<5> mode; + bit M; + + let Inst{31-28} = 0b1111; + let Inst{27-20} = 0b00010000; + let Inst{19-18} = imod; + let Inst{17} = M; // Enabled if mode is set; + let Inst{16} = 0; + let Inst{8-6} = iflags; + let Inst{5} = 0; + let Inst{4-0} = mode; +} + +let DecoderMethod = "DecodeCPSInstruction" in { +let M = 1 in + def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, imm0_31:$mode), + "$imod\t$iflags, $mode">; +let mode = 0, M = 0 in + def CPS2p : CPS<(ins imod_op:$imod, iflags_op:$iflags), "$imod\t$iflags">; + +let imod = 0, iflags = 0, M = 1 in + def CPS1p : CPS<(ins imm0_31:$mode), "\t$mode">; +} + +// Preload signals the memory system of possible future data/instruction access. +multiclass APreLoad<bits<1> read, bits<1> data, string opc> { + + def i12 : AXI<(outs), (ins addrmode_imm12:$addr), MiscFrm, IIC_Preload, + !strconcat(opc, "\t$addr"), + [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]> { + bits<4> Rt; + bits<17> addr; + let Inst{31-26} = 0b111101; + let Inst{25} = 0; // 0 for immediate form + let Inst{24} = data; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{22} = read; + let Inst{21-20} = 0b01; + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = 0b1111; + let Inst{11-0} = addr{11-0}; // imm12 + } + + def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload, + !strconcat(opc, "\t$shift"), + [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]> { + bits<17> shift; + let Inst{31-26} = 0b111101; + let Inst{25} = 1; // 1 for register form + let Inst{24} = data; + let Inst{23} = shift{12}; // U (add = ('U' == 1)) + let Inst{22} = read; + let Inst{21-20} = 0b01; + let Inst{19-16} = shift{16-13}; // Rn + let Inst{15-12} = 0b1111; + let Inst{11-0} = shift{11-0}; + let Inst{4} = 0; + } +} + +defm PLD : APreLoad<1, 1, "pld">, Requires<[IsARM]>; +defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>; +defm PLI : APreLoad<1, 0, "pli">, Requires<[IsARM,HasV7]>; + +def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary, + "setend\t$end", []>, Requires<[IsARM]> { + bits<1> end; + let Inst{31-10} = 0b1111000100000001000000; + let Inst{9} = end; + let Inst{8-0} = 0; +} + +def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt", + []>, Requires<[IsARM, HasV7]> { + bits<4> opt; + let Inst{27-4} = 0b001100100000111100001111; + let Inst{3-0} = opt; +} + +// A5.4 Permanently UNDEFINED instructions. +let isBarrier = 1, isTerminator = 1 in +def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, + "trap", [(trap)]>, + Requires<[IsARM]> { + let Inst = 0xe7ffdefe; +} + +// Address computation and loads and stores in PIC mode. +let isNotDuplicable = 1 in { +def PICADD : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p), + 4, IIC_iALUr, + [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>; + +let AddedComplexity = 10 in { +def PICLDR : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p), + 4, IIC_iLoad_r, + [(set GPR:$dst, (load addrmodepc:$addr))]>; + +def PICLDRH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), + 4, IIC_iLoad_bh_r, + [(set GPR:$Rt, (zextloadi16 addrmodepc:$addr))]>; + +def PICLDRB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), + 4, IIC_iLoad_bh_r, + [(set GPR:$Rt, (zextloadi8 addrmodepc:$addr))]>; + +def PICLDRSH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), + 4, IIC_iLoad_bh_r, + [(set GPR:$Rt, (sextloadi16 addrmodepc:$addr))]>; + +def PICLDRSB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p), + 4, IIC_iLoad_bh_r, + [(set GPR:$Rt, (sextloadi8 addrmodepc:$addr))]>; +} +let AddedComplexity = 10 in { +def PICSTR : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), + 4, IIC_iStore_r, [(store GPR:$src, addrmodepc:$addr)]>; + +def PICSTRH : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), + 4, IIC_iStore_bh_r, [(truncstorei16 GPR:$src, + addrmodepc:$addr)]>; + +def PICSTRB : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p), + 4, IIC_iStore_bh_r, [(truncstorei8 GPR:$src, addrmodepc:$addr)]>; +} +} // isNotDuplicable = 1 + + +// LEApcrel - Load a pc-relative address into a register without offending the +// assembler. +let neverHasSideEffects = 1, isReMaterializable = 1 in +// The 'adr' mnemonic encodes differently if the label is before or after +// the instruction. The {24-21} opcode bits are set by the fixup, as we don't +// know until then which form of the instruction will be used. +def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label), + MiscFrm, IIC_iALUi, "adr", "\t$Rd, $label", []> { + bits<4> Rd; + bits<14> label; + let Inst{27-25} = 0b001; + let Inst{24} = 0; + let Inst{23-22} = label{13-12}; + let Inst{21} = 0; + let Inst{20} = 0; + let Inst{19-16} = 0b1111; + let Inst{15-12} = Rd; + let Inst{11-0} = label{11-0}; +} +def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p), + 4, IIC_iALUi, []>; + +def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd), + (ins i32imm:$label, nohash_imm:$id, pred:$p), + 4, IIC_iALUi, []>; + +//===----------------------------------------------------------------------===// +// Control Flow Instructions. +// + +let isReturn = 1, isTerminator = 1, isBarrier = 1 in { + // ARMV4T and above + def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br, + "bx", "\tlr", [(ARMretflag)]>, + Requires<[IsARM, HasV4T]> { + let Inst{27-0} = 0b0001001011111111111100011110; + } + + // ARMV4 only + def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br, + "mov", "\tpc, lr", [(ARMretflag)]>, + Requires<[IsARM, NoV4T]> { + let Inst{27-0} = 0b0001101000001111000000001110; + } +} + +// Indirect branches +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + // ARMV4T and above + def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst", + [(brind GPR:$dst)]>, + Requires<[IsARM, HasV4T]> { + bits<4> dst; + let Inst{31-4} = 0b1110000100101111111111110001; + let Inst{3-0} = dst; + } + + def BX_pred : AI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, + "bx", "\t$dst", [/* pattern left blank */]>, + Requires<[IsARM, HasV4T]> { + bits<4> dst; + let Inst{27-4} = 0b000100101111111111110001; + let Inst{3-0} = dst; + } +} + +// All calls clobber the non-callee saved registers. SP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. +let isCall = 1, + // On non-Darwin platforms R9 is callee-saved. + // FIXME: Do we really need a non-predicated version? If so, it should + // at least be a pseudo instruction expanding to the predicated version + // at MC lowering time. + Defs = [R0, R1, R2, R3, R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR], + Uses = [SP] in { + def BL : ABXI<0b1011, (outs), (ins bl_target:$func, variable_ops), + IIC_Br, "bl\t$func", + [(ARMcall tglobaladdr:$func)]>, + Requires<[IsARM, IsNotDarwin]> { + let Inst{31-28} = 0b1110; + bits<24> func; + let Inst{23-0} = func; + let DecoderMethod = "DecodeBranchImmInstruction"; + } + + def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func, variable_ops), + IIC_Br, "bl", "\t$func", + [(ARMcall_pred tglobaladdr:$func)]>, + Requires<[IsARM, IsNotDarwin]> { + bits<24> func; + let Inst{23-0} = func; + let DecoderMethod = "DecodeBranchImmInstruction"; + } + + // ARMv5T and above + def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, + IIC_Br, "blx\t$func", + [(ARMcall GPR:$func)]>, + Requires<[IsARM, HasV5T, IsNotDarwin]> { + bits<4> func; + let Inst{31-4} = 0b1110000100101111111111110011; + let Inst{3-0} = func; + } + + def BLX_pred : AI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, + IIC_Br, "blx", "\t$func", + [(ARMcall_pred GPR:$func)]>, + Requires<[IsARM, HasV5T, IsNotDarwin]> { + bits<4> func; + let Inst{27-4} = 0b000100101111111111110011; + let Inst{3-0} = func; + } + + // ARMv4T + // Note: Restrict $func to the tGPR regclass to prevent it being in LR. + def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), + 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, HasV4T, IsNotDarwin]>; + + // ARMv4 + def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), + 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, NoV4T, IsNotDarwin]>; +} + +let isCall = 1, + // On Darwin R9 is call-clobbered. + // R7 is marked as a use to prevent frame-pointer assignments from being + // moved above / below calls. + Defs = [R0, R1, R2, R3, R9, R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR], + Uses = [R7, SP] in { + def BLr9 : ARMPseudoExpand<(outs), (ins bl_target:$func, variable_ops), + 4, IIC_Br, + [(ARMcall tglobaladdr:$func)], (BL bl_target:$func)>, + Requires<[IsARM, IsDarwin]>; + + def BLr9_pred : ARMPseudoExpand<(outs), + (ins bl_target:$func, pred:$p, variable_ops), + 4, IIC_Br, + [(ARMcall_pred tglobaladdr:$func)], + (BL_pred bl_target:$func, pred:$p)>, + Requires<[IsARM, IsDarwin]>; + + // ARMv5T and above + def BLXr9 : ARMPseudoExpand<(outs), (ins GPR:$func, variable_ops), + 4, IIC_Br, + [(ARMcall GPR:$func)], + (BLX GPR:$func)>, + Requires<[IsARM, HasV5T, IsDarwin]>; + + def BLXr9_pred: ARMPseudoExpand<(outs), (ins GPR:$func, pred:$p,variable_ops), + 4, IIC_Br, + [(ARMcall_pred GPR:$func)], + (BLX_pred GPR:$func, pred:$p)>, + Requires<[IsARM, HasV5T, IsDarwin]>; + + // ARMv4T + // Note: Restrict $func to the tGPR regclass to prevent it being in LR. + def BXr9_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), + 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, HasV4T, IsDarwin]>; + + // ARMv4 + def BMOVPCRXr9_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), + 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsARM, NoV4T, IsDarwin]>; +} + +let isBranch = 1, isTerminator = 1 in { + // FIXME: should be able to write a pattern for ARMBrcond, but can't use + // a two-value operand where a dag node expects two operands. :( + def Bcc : ABI<0b1010, (outs), (ins br_target:$target), + IIC_Br, "b", "\t$target", + [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]> { + bits<24> target; + let Inst{23-0} = target; + let DecoderMethod = "DecodeBranchImmInstruction"; + } + + let isBarrier = 1 in { + // B is "predicable" since it's just a Bcc with an 'always' condition. + let isPredicable = 1 in + // FIXME: We shouldn't need this pseudo at all. Just using Bcc directly + // should be sufficient. + // FIXME: Is B really a Barrier? That doesn't seem right. + def B : ARMPseudoExpand<(outs), (ins br_target:$target), 4, IIC_Br, + [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>; + + let isNotDuplicable = 1, isIndirectBranch = 1 in { + def BR_JTr : ARMPseudoInst<(outs), + (ins GPR:$target, i32imm:$jt, i32imm:$id), + 0, IIC_Br, + [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>; + // FIXME: This shouldn't use the generic "addrmode2," but rather be split + // into i12 and rs suffixed versions. + def BR_JTm : ARMPseudoInst<(outs), + (ins addrmode2:$target, i32imm:$jt, i32imm:$id), + 0, IIC_Br, + [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt, + imm:$id)]>; + def BR_JTadd : ARMPseudoInst<(outs), + (ins GPR:$target, GPR:$idx, i32imm:$jt, i32imm:$id), + 0, IIC_Br, + [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt, + imm:$id)]>; + } // isNotDuplicable = 1, isIndirectBranch = 1 + } // isBarrier = 1 + +} + +// BLX (immediate) +def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary, + "blx\t$target", []>, + Requires<[IsARM, HasV5T]> { + let Inst{31-25} = 0b1111101; + bits<25> target; + let Inst{23-0} = target{24-1}; + let Inst{24} = target{0}; +} + +// Branch and Exchange Jazelle +def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", + [/* pattern left blank */]> { + bits<4> func; + let Inst{23-20} = 0b0010; + let Inst{19-8} = 0xfff; + let Inst{7-4} = 0b0010; + let Inst{3-0} = func; +} + +// Tail calls. + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { + // Darwin versions. + let Defs = [R0, R1, R2, R3, R9, R12, QQQQ0, QQQQ2, QQQQ3, PC], + Uses = [SP] in { + def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, variable_ops), + IIC_Br, []>, Requires<[IsDarwin]>; + + def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops), + IIC_Br, []>, Requires<[IsDarwin]>; + + def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst, variable_ops), + 4, IIC_Br, [], + (Bcc br_target:$dst, (ops 14, zero_reg))>, + Requires<[IsARM, IsDarwin]>; + + def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops), + 4, IIC_Br, [], + (BX GPR:$dst)>, + Requires<[IsARM, IsDarwin]>; + + } + + // Non-Darwin versions (the difference is R9). + let Defs = [R0, R1, R2, R3, R12, QQQQ0, QQQQ2, QQQQ3, PC], + Uses = [SP] in { + def TCRETURNdiND : PseudoInst<(outs), (ins i32imm:$dst, variable_ops), + IIC_Br, []>, Requires<[IsNotDarwin]>; + + def TCRETURNriND : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops), + IIC_Br, []>, Requires<[IsNotDarwin]>; + + def TAILJMPdND : ARMPseudoExpand<(outs), (ins brtarget:$dst, variable_ops), + 4, IIC_Br, [], + (Bcc br_target:$dst, (ops 14, zero_reg))>, + Requires<[IsARM, IsNotDarwin]>; + + def TAILJMPrND : ARMPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops), + 4, IIC_Br, [], + (BX GPR:$dst)>, + Requires<[IsARM, IsNotDarwin]>; + } +} + +// Secure Monitor Call is a system instruction. +def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", + []> { + bits<4> opt; + let Inst{23-4} = 0b01100000000000000111; + let Inst{3-0} = opt; +} + +// Supervisor Call (Software Interrupt) +let isCall = 1, Uses = [SP] in { +def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []> { + bits<24> svc; + let Inst{23-0} = svc; +} +} + +// Store Return State +class SRSI<bit wb, string asm> + : XI<(outs), (ins imm0_31:$mode), AddrModeNone, 4, IndexModeNone, BrFrm, + NoItinerary, asm, "", []> { + bits<5> mode; + let Inst{31-28} = 0b1111; + let Inst{27-25} = 0b100; + let Inst{22} = 1; + let Inst{21} = wb; + let Inst{20} = 0; + let Inst{19-16} = 0b1101; // SP + let Inst{15-5} = 0b00000101000; + let Inst{4-0} = mode; +} + +def SRSDA : SRSI<0, "srsda\tsp, $mode"> { + let Inst{24-23} = 0; +} +def SRSDA_UPD : SRSI<1, "srsda\tsp!, $mode"> { + let Inst{24-23} = 0; +} +def SRSDB : SRSI<0, "srsdb\tsp, $mode"> { + let Inst{24-23} = 0b10; +} +def SRSDB_UPD : SRSI<1, "srsdb\tsp!, $mode"> { + let Inst{24-23} = 0b10; +} +def SRSIA : SRSI<0, "srsia\tsp, $mode"> { + let Inst{24-23} = 0b01; +} +def SRSIA_UPD : SRSI<1, "srsia\tsp!, $mode"> { + let Inst{24-23} = 0b01; +} +def SRSIB : SRSI<0, "srsib\tsp, $mode"> { + let Inst{24-23} = 0b11; +} +def SRSIB_UPD : SRSI<1, "srsib\tsp!, $mode"> { + let Inst{24-23} = 0b11; +} + +// Return From Exception +class RFEI<bit wb, string asm> + : XI<(outs), (ins GPR:$Rn), AddrModeNone, 4, IndexModeNone, BrFrm, + NoItinerary, asm, "", []> { + bits<4> Rn; + let Inst{31-28} = 0b1111; + let Inst{27-25} = 0b100; + let Inst{22} = 0; + let Inst{21} = wb; + let Inst{20} = 1; + let Inst{19-16} = Rn; + let Inst{15-0} = 0xa00; +} + +def RFEDA : RFEI<0, "rfeda\t$Rn"> { + let Inst{24-23} = 0; +} +def RFEDA_UPD : RFEI<1, "rfeda\t$Rn!"> { + let Inst{24-23} = 0; +} +def RFEDB : RFEI<0, "rfedb\t$Rn"> { + let Inst{24-23} = 0b10; +} +def RFEDB_UPD : RFEI<1, "rfedb\t$Rn!"> { + let Inst{24-23} = 0b10; +} +def RFEIA : RFEI<0, "rfeia\t$Rn"> { + let Inst{24-23} = 0b01; +} +def RFEIA_UPD : RFEI<1, "rfeia\t$Rn!"> { + let Inst{24-23} = 0b01; +} +def RFEIB : RFEI<0, "rfeib\t$Rn"> { + let Inst{24-23} = 0b11; +} +def RFEIB_UPD : RFEI<1, "rfeib\t$Rn!"> { + let Inst{24-23} = 0b11; +} + +//===----------------------------------------------------------------------===// +// Load / store Instructions. +// + +// Load + + +defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si, + UnOpFrag<(load node:$Src)>>; +defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si, + UnOpFrag<(zextloadi8 node:$Src)>>; +defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si, + BinOpFrag<(store node:$LHS, node:$RHS)>>; +defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si, + BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; + +// Special LDR for loads from non-pc-relative constpools. +let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1, + isReMaterializable = 1, isCodeGenOnly = 1 in +def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr), + AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr", + []> { + bits<4> Rt; + bits<17> addr; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = 0b1111; + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm12 +} + +// Loads with zero extension +def LDRH : AI3ld<0b1011, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoad_bh_r, "ldrh", "\t$Rt, $addr", + [(set GPR:$Rt, (zextloadi16 addrmode3:$addr))]>; + +// Loads with sign extension +def LDRSH : AI3ld<0b1111, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoad_bh_r, "ldrsh", "\t$Rt, $addr", + [(set GPR:$Rt, (sextloadi16 addrmode3:$addr))]>; + +def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr", + [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { +// Load doubleword +def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2), + (ins addrmode3:$addr), LdMiscFrm, + IIC_iLoad_d_r, "ldrd", "\t$Rd, $dst2, $addr", + []>, Requires<[IsARM, HasV5TE]>; +} + +// Indexed loads +multiclass AI2_ldridx<bit isByte, string opc, InstrItinClass itin> { + def _PRE_IMM : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addrmode_imm12:$addr), IndexModePre, LdFrm, itin, + opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { + bits<17> addr; + let Inst{25} = 0; + let Inst{23} = addr{12}; + let Inst{19-16} = addr{16-13}; + let Inst{11-0} = addr{11-0}; + let DecoderMethod = "DecodeLDRPreImm"; + let AsmMatchConverter = "cvtLdWriteBackRegAddrModeImm12"; + } + + def _PRE_REG : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins ldst_so_reg:$addr), IndexModePre, LdFrm, itin, + opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { + bits<17> addr; + let Inst{25} = 1; + let Inst{23} = addr{12}; + let Inst{19-16} = addr{16-13}; + let Inst{11-0} = addr{11-0}; + let Inst{4} = 0; + let DecoderMethod = "DecodeLDRPreReg"; + let AsmMatchConverter = "cvtLdWriteBackRegAddrMode2"; + } + + def _POST_REG : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am2offset_reg:$offset), + IndexModePost, LdFrm, itin, + opc, "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 1; + let Inst{23} = offset{12}; + let Inst{19-16} = addr; + let Inst{11-0} = offset{11-0}; + + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; + } + + def _POST_IMM : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am2offset_imm:$offset), + IndexModePost, LdFrm, itin, + opc, "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 0; + let Inst{23} = offset{12}; + let Inst{19-16} = addr; + let Inst{11-0} = offset{11-0}; + + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; + } + +} + +let mayLoad = 1, neverHasSideEffects = 1 in { +defm LDR : AI2_ldridx<0, "ldr", IIC_iLoad_ru>; +defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_ru>; +} + +multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> { + def _PRE : AI3ldstidx<op, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addrmode3:$addr), IndexModePre, + LdMiscFrm, itin, + opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { + bits<14> addr; + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr{12-9}; // Rn + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{3-0} = addr{3-0}; // imm3_0/Rm + let AsmMatchConverter = "cvtLdWriteBackRegAddrMode3"; + let DecoderMethod = "DecodeAddrMode3Instruction"; + } + def _POST : AI3ldstidx<op, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am3offset:$offset), + IndexModePost, LdMiscFrm, itin, + opc, "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", + []> { + bits<10> offset; + bits<4> addr; + let Inst{23} = offset{8}; // U bit + let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr; + let Inst{11-8} = offset{7-4}; // imm7_4/zero + let Inst{3-0} = offset{3-0}; // imm3_0/Rm + let DecoderMethod = "DecodeAddrMode3Instruction"; + } +} + +let mayLoad = 1, neverHasSideEffects = 1 in { +defm LDRH : AI3_ldridx<0b1011, "ldrh", IIC_iLoad_bh_ru>; +defm LDRSH : AI3_ldridx<0b1111, "ldrsh", IIC_iLoad_bh_ru>; +defm LDRSB : AI3_ldridx<0b1101, "ldrsb", IIC_iLoad_bh_ru>; +let hasExtraDefRegAllocReq = 1 in { +def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb), + (ins addrmode3:$addr), IndexModePre, + LdMiscFrm, IIC_iLoad_d_ru, + "ldrd", "\t$Rt, $Rt2, $addr!", + "$addr.base = $Rn_wb", []> { + bits<14> addr; + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr{12-9}; // Rn + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{3-0} = addr{3-0}; // imm3_0/Rm + let DecoderMethod = "DecodeAddrMode3Instruction"; + let AsmMatchConverter = "cvtLdrdPre"; +} +def LDRD_POST: AI3ldstidx<0b1101, 0, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am3offset:$offset), + IndexModePost, LdMiscFrm, IIC_iLoad_d_ru, + "ldrd", "\t$Rt, $Rt2, $addr, $offset", + "$addr.base = $Rn_wb", []> { + bits<10> offset; + bits<4> addr; + let Inst{23} = offset{8}; // U bit + let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr; + let Inst{11-8} = offset{7-4}; // imm7_4/zero + let Inst{3-0} = offset{3-0}; // imm3_0/Rm + let DecoderMethod = "DecodeAddrMode3Instruction"; +} +} // hasExtraDefRegAllocReq = 1 +} // mayLoad = 1, neverHasSideEffects = 1 + +// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT. +let mayLoad = 1, neverHasSideEffects = 1 in { +def LDRT_POST_REG : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am2offset_reg:$offset), + IndexModePost, LdFrm, IIC_iLoad_ru, + "ldrt", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 1; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-5} = offset{11-5}; + let Inst{4} = 0; + let Inst{3-0} = offset{3-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +def LDRT_POST_IMM : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am2offset_imm:$offset), + IndexModePost, LdFrm, IIC_iLoad_ru, + "ldrt", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 0; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-0} = offset{11-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +def LDRBT_POST_REG : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am2offset_reg:$offset), + IndexModePost, LdFrm, IIC_iLoad_bh_ru, + "ldrbt", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 1; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-5} = offset{11-5}; + let Inst{4} = 0; + let Inst{3-0} = offset{3-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +def LDRBT_POST_IMM : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$addr, am2offset_imm:$offset), + IndexModePost, LdFrm, IIC_iLoad_bh_ru, + "ldrbt", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 0; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-0} = offset{11-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +multiclass AI3ldrT<bits<4> op, string opc> { + def i : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb), + (ins addr_offset_none:$addr, postidx_imm8:$offset), + IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc, + "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> { + bits<9> offset; + let Inst{23} = offset{8}; + let Inst{22} = 1; + let Inst{11-8} = offset{7-4}; + let Inst{3-0} = offset{3-0}; + let AsmMatchConverter = "cvtLdExtTWriteBackImm"; + } + def r : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb), + (ins addr_offset_none:$addr, postidx_reg:$Rm), + IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc, + "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> { + bits<5> Rm; + let Inst{23} = Rm{4}; + let Inst{22} = 0; + let Inst{11-8} = 0; + let Inst{3-0} = Rm{3-0}; + let AsmMatchConverter = "cvtLdExtTWriteBackReg"; + } +} + +defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">; +defm LDRHT : AI3ldrT<0b1011, "ldrht">; +defm LDRSHT : AI3ldrT<0b1111, "ldrsht">; +} + +// Store + +// Stores with truncate +def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm, + IIC_iStore_bh_r, "strh", "\t$Rt, $addr", + [(truncstorei16 GPR:$Rt, addrmode3:$addr)]>; + +// Store doubleword +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in +def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$src2, addrmode3:$addr), + StMiscFrm, IIC_iStore_d_r, + "strd", "\t$Rt, $src2, $addr", []>, + Requires<[IsARM, HasV5TE]> { + let Inst{21} = 0; +} + +// Indexed stores +multiclass AI2_stridx<bit isByte, string opc, InstrItinClass itin> { + def _PRE_IMM : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addrmode_imm12:$addr), IndexModePre, + StFrm, itin, + opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { + bits<17> addr; + let Inst{25} = 0; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = addr{16-13}; // Rn + let Inst{11-0} = addr{11-0}; // imm12 + let AsmMatchConverter = "cvtStWriteBackRegAddrModeImm12"; + let DecoderMethod = "DecodeSTRPreImm"; + } + + def _PRE_REG : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb), + (ins GPR:$Rt, ldst_so_reg:$addr), + IndexModePre, StFrm, itin, + opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { + bits<17> addr; + let Inst{25} = 1; + let Inst{23} = addr{12}; // U (add = ('U' == 1)) + let Inst{19-16} = addr{16-13}; // Rn + let Inst{11-0} = addr{11-0}; + let Inst{4} = 0; // Inst{4} = 0 + let AsmMatchConverter = "cvtStWriteBackRegAddrMode2"; + let DecoderMethod = "DecodeSTRPreReg"; + } + def _POST_REG : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset), + IndexModePost, StFrm, itin, + opc, "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 1; + let Inst{23} = offset{12}; + let Inst{19-16} = addr; + let Inst{11-0} = offset{11-0}; + + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; + } + + def _POST_IMM : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset), + IndexModePost, StFrm, itin, + opc, "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 0; + let Inst{23} = offset{12}; + let Inst{19-16} = addr; + let Inst{11-0} = offset{11-0}; + + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; + } +} + +let mayStore = 1, neverHasSideEffects = 1 in { +defm STR : AI2_stridx<0, "str", IIC_iStore_ru>; +defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_ru>; +} + +def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr, + am2offset_reg:$offset), + (STR_POST_REG GPR:$Rt, addr_offset_none:$addr, + am2offset_reg:$offset)>; +def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr, + am2offset_imm:$offset), + (STR_POST_IMM GPR:$Rt, addr_offset_none:$addr, + am2offset_imm:$offset)>; +def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr, + am2offset_reg:$offset), + (STRB_POST_REG GPR:$Rt, addr_offset_none:$addr, + am2offset_reg:$offset)>; +def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr, + am2offset_imm:$offset), + (STRB_POST_IMM GPR:$Rt, addr_offset_none:$addr, + am2offset_imm:$offset)>; + +// Pseudo-instructions for pattern matching the pre-indexed stores. We can't +// put the patterns on the instruction definitions directly as ISel wants +// the address base and offset to be separate operands, not a single +// complex operand like we represent the instructions themselves. The +// pseudos map between the two. +let usesCustomInserter = 1, + Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in { +def STRi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPR:$Rn_wb, + (pre_store GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>; +def STRr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPR:$Rn_wb, + (pre_store GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>; +def STRBi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPR:$Rn_wb, + (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>; +def STRBr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPR:$Rn_wb, + (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>; +def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rn, am3offset:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPR:$Rn_wb, + (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>; +} + + + +def STRH_PRE : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addrmode3:$addr), IndexModePre, + StMiscFrm, IIC_iStore_bh_ru, + "strh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> { + bits<14> addr; + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr{12-9}; // Rn + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{3-0} = addr{3-0}; // imm3_0/Rm + let AsmMatchConverter = "cvtStWriteBackRegAddrMode3"; + let DecoderMethod = "DecodeAddrMode3Instruction"; +} + +def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addr_offset_none:$addr, am3offset:$offset), + IndexModePost, StMiscFrm, IIC_iStore_bh_ru, + "strh", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", + [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt, + addr_offset_none:$addr, + am3offset:$offset))]> { + bits<10> offset; + bits<4> addr; + let Inst{23} = offset{8}; // U bit + let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr; + let Inst{11-8} = offset{7-4}; // imm7_4/zero + let Inst{3-0} = offset{3-0}; // imm3_0/Rm + let DecoderMethod = "DecodeAddrMode3Instruction"; +} + +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { +def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rt2, addrmode3:$addr), + IndexModePre, StMiscFrm, IIC_iStore_d_ru, + "strd", "\t$Rt, $Rt2, $addr!", + "$addr.base = $Rn_wb", []> { + bits<14> addr; + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr{12-9}; // Rn + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{3-0} = addr{3-0}; // imm3_0/Rm + let DecoderMethod = "DecodeAddrMode3Instruction"; + let AsmMatchConverter = "cvtStrdPre"; +} + +def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr, + am3offset:$offset), + IndexModePost, StMiscFrm, IIC_iStore_d_ru, + "strd", "\t$Rt, $Rt2, $addr, $offset", + "$addr.base = $Rn_wb", []> { + bits<10> offset; + bits<4> addr; + let Inst{23} = offset{8}; // U bit + let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr; + let Inst{11-8} = offset{7-4}; // imm7_4/zero + let Inst{3-0} = offset{3-0}; // imm3_0/Rm + let DecoderMethod = "DecodeAddrMode3Instruction"; +} +} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 + +// STRT, STRBT, and STRHT + +def STRBT_POST_REG : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset), + IndexModePost, StFrm, IIC_iStore_bh_ru, + "strbt", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 1; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-5} = offset{11-5}; + let Inst{4} = 0; + let Inst{3-0} = offset{3-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +def STRBT_POST_IMM : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset), + IndexModePost, StFrm, IIC_iStore_bh_ru, + "strbt", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 0; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-0} = offset{11-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +let mayStore = 1, neverHasSideEffects = 1 in { +def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset), + IndexModePost, StFrm, IIC_iStore_ru, + "strt", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 1; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-5} = offset{11-5}; + let Inst{4} = 0; + let Inst{3-0} = offset{3-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} + +def STRT_POST_IMM : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb), + (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset), + IndexModePost, StFrm, IIC_iStore_ru, + "strt", "\t$Rt, $addr, $offset", + "$addr.base = $Rn_wb", []> { + // {12} isAdd + // {11-0} imm12/Rm + bits<14> offset; + bits<4> addr; + let Inst{25} = 0; + let Inst{23} = offset{12}; + let Inst{21} = 1; // overwrite + let Inst{19-16} = addr; + let Inst{11-0} = offset{11-0}; + let DecoderMethod = "DecodeAddrMode2IdxInstruction"; +} +} + + +multiclass AI3strT<bits<4> op, string opc> { + def i : AI3ldstidxT<op, 0, (outs GPR:$base_wb), + (ins GPR:$Rt, addr_offset_none:$addr, postidx_imm8:$offset), + IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc, + "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> { + bits<9> offset; + let Inst{23} = offset{8}; + let Inst{22} = 1; + let Inst{11-8} = offset{7-4}; + let Inst{3-0} = offset{3-0}; + let AsmMatchConverter = "cvtStExtTWriteBackImm"; + } + def r : AI3ldstidxT<op, 0, (outs GPR:$base_wb), + (ins GPR:$Rt, addr_offset_none:$addr, postidx_reg:$Rm), + IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc, + "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> { + bits<5> Rm; + let Inst{23} = Rm{4}; + let Inst{22} = 0; + let Inst{11-8} = 0; + let Inst{3-0} = Rm{3-0}; + let AsmMatchConverter = "cvtStExtTWriteBackReg"; + } +} + + +defm STRHT : AI3strT<0b1011, "strht">; + + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// + +multiclass arm_ldst_mult<string asm, bit L_bit, Format f, + InstrItinClass itin, InstrItinClass itin_upd> { + // IA is the default, so no need for an explicit suffix on the + // mnemonic here. Without it is the cannonical spelling. + def IA : + AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeNone, f, itin, + !strconcat(asm, "${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + } + def IA_UPD : + AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeUpd, f, itin_upd, + !strconcat(asm, "${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + + let DecoderMethod = "DecodeMemMultipleWritebackInstruction"; + } + def DA : + AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeNone, f, itin, + !strconcat(asm, "da${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b00; // Decrement After + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + } + def DA_UPD : + AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeUpd, f, itin_upd, + !strconcat(asm, "da${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b00; // Decrement After + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + + let DecoderMethod = "DecodeMemMultipleWritebackInstruction"; + } + def DB : + AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeNone, f, itin, + !strconcat(asm, "db${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b10; // Decrement Before + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + } + def DB_UPD : + AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeUpd, f, itin_upd, + !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b10; // Decrement Before + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + + let DecoderMethod = "DecodeMemMultipleWritebackInstruction"; + } + def IB : + AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeNone, f, itin, + !strconcat(asm, "ib${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b11; // Increment Before + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + } + def IB_UPD : + AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IndexModeUpd, f, itin_upd, + !strconcat(asm, "ib${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b11; // Increment Before + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + + let DecoderMethod = "DecodeMemMultipleWritebackInstruction"; + } +} + +let neverHasSideEffects = 1 in { + +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +defm LDM : arm_ldst_mult<"ldm", 1, LdStMulFrm, IIC_iLoad_m, IIC_iLoad_mu>; + +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +defm STM : arm_ldst_mult<"stm", 0, LdStMulFrm, IIC_iStore_m, IIC_iStore_mu>; + +} // neverHasSideEffects + +// FIXME: remove when we have a way to marking a MI with these properties. +// FIXME: Should pc be an implicit operand like PICADD, etc? +let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, + hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in +def LDMIA_RET : ARMPseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, + reglist:$regs, variable_ops), + 4, IIC_iLoad_mBr, [], + (LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>, + RegConstraint<"$Rn = $wb">; + +//===----------------------------------------------------------------------===// +// Move Instructions. +// + +let neverHasSideEffects = 1 in +def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr, + "mov", "\t$Rd, $Rm", []>, UnaryDP { + bits<4> Rd; + bits<4> Rm; + + let Inst{19-16} = 0b0000; + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; +} + +def : ARMInstAlias<"movs${p} $Rd, $Rm", + (MOVr GPR:$Rd, GPR:$Rm, pred:$p, CPSR)>; + +// A version for the smaller set of tail call registers. +let neverHasSideEffects = 1 in +def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm, + IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP { + bits<4> Rd; + bits<4> Rm; + + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; + let Inst{3-0} = Rm; + let Inst{15-12} = Rd; +} + +def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src), + DPSoRegRegFrm, IIC_iMOVsr, + "mov", "\t$Rd, $src", + [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP { + bits<4> Rd; + bits<12> src; + let Inst{15-12} = Rd; + let Inst{19-16} = 0b0000; + let Inst{11-8} = src{11-8}; + let Inst{7} = 0; + let Inst{6-5} = src{6-5}; + let Inst{4} = 1; + let Inst{3-0} = src{3-0}; + let Inst{25} = 0; +} + +def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src), + DPSoRegImmFrm, IIC_iMOVsr, + "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg_imm:$src)]>, + UnaryDP { + bits<4> Rd; + bits<12> src; + let Inst{15-12} = Rd; + let Inst{19-16} = 0b0000; + let Inst{11-5} = src{11-5}; + let Inst{4} = 0; + let Inst{3-0} = src{3-0}; + let Inst{25} = 0; +} + +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in +def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, IIC_iMOVi, + "mov", "\t$Rd, $imm", [(set GPR:$Rd, so_imm:$imm)]>, UnaryDP { + bits<4> Rd; + bits<12> imm; + let Inst{25} = 1; + let Inst{15-12} = Rd; + let Inst{19-16} = 0b0000; + let Inst{11-0} = imm; +} + +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in +def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm), + DPFrm, IIC_iMOVi, + "movw", "\t$Rd, $imm", + [(set GPR:$Rd, imm0_65535:$imm)]>, + Requires<[IsARM, HasV6T2]>, UnaryDP { + bits<4> Rd; + bits<16> imm; + let Inst{15-12} = Rd; + let Inst{11-0} = imm{11-0}; + let Inst{19-16} = imm{15-12}; + let Inst{20} = 0; + let Inst{25} = 1; + let DecoderMethod = "DecodeArmMOVTWInstruction"; +} + +def : InstAlias<"mov${p} $Rd, $imm", + (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p)>, + Requires<[IsARM]>; + +def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), + (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>; + +let Constraints = "$src = $Rd" in { +def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd), + (ins GPR:$src, imm0_65535_expr:$imm), + DPFrm, IIC_iMOVi, + "movt", "\t$Rd, $imm", + [(set GPRnopc:$Rd, + (or (and GPR:$src, 0xffff), + lo16AllZero:$imm))]>, UnaryDP, + Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<16> imm; + let Inst{15-12} = Rd; + let Inst{11-0} = imm{11-0}; + let Inst{19-16} = imm{15-12}; + let Inst{20} = 0; + let Inst{25} = 1; + let DecoderMethod = "DecodeArmMOVTWInstruction"; +} + +def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), + (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>; + +} // Constraints + +def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>, + Requires<[IsARM, HasV6T2]>; + +let Uses = [CPSR] in +def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi, + [(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP, + Requires<[IsARM]>; + +// These aren't really mov instructions, but we have to define them this way +// due to flag operands. + +let Defs = [CPSR] in { +def MOVsrl_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, + [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP, + Requires<[IsARM]>; +def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi, + [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP, + Requires<[IsARM]>; +} + +//===----------------------------------------------------------------------===// +// Extend Instructions. +// + +// Sign extenders + +def SXTB : AI_ext_rrot<0b01101010, + "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>; +def SXTH : AI_ext_rrot<0b01101011, + "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>; + +def SXTAB : AI_exta_rrot<0b01101010, + "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; +def SXTAH : AI_exta_rrot<0b01101011, + "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>; + +def SXTB16 : AI_ext_rrot_np<0b01101000, "sxtb16">; + +def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">; + +// Zero extenders + +let AddedComplexity = 16 in { +def UXTB : AI_ext_rrot<0b01101110, + "uxtb" , UnOpFrag<(and node:$Src, 0x000000FF)>>; +def UXTH : AI_ext_rrot<0b01101111, + "uxth" , UnOpFrag<(and node:$Src, 0x0000FFFF)>>; +def UXTB16 : AI_ext_rrot<0b01101100, + "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; + +// FIXME: This pattern incorrectly assumes the shl operator is a rotate. +// The transformation should probably be done as a combiner action +// instead so we can include a check for masking back in the upper +// eight bits of the source into the lower eight bits of the result. +//def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF), +// (UXTB16r_rot GPR:$Src, 3)>; +def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF), + (UXTB16 GPR:$Src, 1)>; + +def UXTAB : AI_exta_rrot<0b01101110, "uxtab", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; +def UXTAH : AI_exta_rrot<0b01101111, "uxtah", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>; +} + +// This isn't safe in general, the add is two 16-bit units, not a 32-bit add. +def UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">; + + +def SBFX : I<(outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width), + AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, + "sbfx", "\t$Rd, $Rn, $lsb, $width", "", []>, + Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<4> Rn; + bits<5> lsb; + bits<5> width; + let Inst{27-21} = 0b0111101; + let Inst{6-4} = 0b101; + let Inst{20-16} = width; + let Inst{15-12} = Rd; + let Inst{11-7} = lsb; + let Inst{3-0} = Rn; +} + +def UBFX : I<(outs GPR:$Rd), + (ins GPR:$Rn, imm0_31:$lsb, imm1_32:$width), + AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, + "ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>, + Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<4> Rn; + bits<5> lsb; + bits<5> width; + let Inst{27-21} = 0b0111111; + let Inst{6-4} = 0b101; + let Inst{20-16} = width; + let Inst{15-12} = Rd; + let Inst{11-7} = lsb; + let Inst{3-0} = Rn; +} + +//===----------------------------------------------------------------------===// +// Arithmetic Instructions. +// + +defm ADD : AsI1_bin_irs<0b0100, "add", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, + BinOpFrag<(add node:$LHS, node:$RHS)>, "ADD", 1>; +defm SUB : AsI1_bin_irs<0b0010, "sub", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, + BinOpFrag<(sub node:$LHS, node:$RHS)>, "SUB">; + +// ADD and SUB with 's' bit set. +// +// Currently, t2ADDS/t2SUBS are pseudo opcodes that exist only in the +// selection DAG. They are "lowered" to real t2ADD/t2SUB opcodes by +// AdjustInstrPostInstrSelection where we determine whether or not to +// set the "s" bit based on CPSR liveness. +// +// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen +// support for an optional CPSR definition that corresponds to the DAG +// node's second value. We can then eliminate the implicit def of CPSR. +defm ADDS : AsI1_bin_s_irs<0b0100, "add", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, + BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>; +defm SUBS : AsI1_bin_s_irs<0b0010, "sub", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, + BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>; + +defm ADC : AI1_adde_sube_irs<0b0101, "adc", + BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, + "ADC", 1>; +defm SBC : AI1_adde_sube_irs<0b0110, "sbc", + BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>, + "SBC">; + +defm RSB : AsI1_rbin_irs <0b0011, "rsb", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, + BinOpFrag<(sub node:$LHS, node:$RHS)>, "RSB">; + +// FIXME: Eliminate them if we can write def : Pat patterns which defines +// CPSR and the implicit def of CPSR is not needed. +defm RSBS : AsI1_rbin_s_is<0b0011, "rsb", + IIC_iALUi, IIC_iALUr, IIC_iALUsr, + BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>; + +defm RSC : AI1_rsc_irs<0b0111, "rsc", + BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>, + "RSC">; + +// (sub X, imm) gets canonicalized to (add X, -imm). Match this form. +// The assume-no-carry-in form uses the negation of the input since add/sub +// assume opposite meanings of the carry flag (i.e., carry == !borrow). +// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory +// details. +def : ARMPat<(add GPR:$src, so_imm_neg:$imm), + (SUBri GPR:$src, so_imm_neg:$imm)>; +def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm), + (SUBSri GPR:$src, so_imm_neg:$imm)>; + +// The with-carry-in form matches bitwise not instead of the negation. +// Effectively, the inverse interpretation of the carry flag already accounts +// for part of the negation. +def : ARMPat<(ARMadde GPR:$src, so_imm_not:$imm, CPSR), + (SBCri GPR:$src, so_imm_not:$imm)>; + +// Note: These are implemented in C++ code, because they have to generate +// ADD/SUBrs instructions, which use a complex pattern that a xform function +// cannot produce. +// (mul X, 2^n+1) -> (add (X << n), X) +// (mul X, 2^n-1) -> (rsb X, (X << n)) + +// ARM Arithmetic Instruction +// GPR:$dst = GPR:$a op GPR:$b +class AAI<bits<8> op27_20, bits<8> op11_4, string opc, + list<dag> pattern = [], + dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm), + string asm = "\t$Rd, $Rn, $Rm"> + : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern> { + bits<4> Rn; + bits<4> Rd; + bits<4> Rm; + let Inst{27-20} = op27_20; + let Inst{11-4} = op11_4; + let Inst{19-16} = Rn; + let Inst{15-12} = Rd; + let Inst{3-0} = Rm; +} + +// Saturating add/subtract + +def QADD : AAI<0b00010000, 0b00000101, "qadd", + [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))], + (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">; +def QSUB : AAI<0b00010010, 0b00000101, "qsub", + [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))], + (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">; +def QDADD : AAI<0b00010100, 0b00000101, "qdadd", [], + (ins GPRnopc:$Rm, GPRnopc:$Rn), + "\t$Rd, $Rm, $Rn">; +def QDSUB : AAI<0b00010110, 0b00000101, "qdsub", [], + (ins GPRnopc:$Rm, GPRnopc:$Rn), + "\t$Rd, $Rm, $Rn">; + +def QADD16 : AAI<0b01100010, 0b11110001, "qadd16">; +def QADD8 : AAI<0b01100010, 0b11111001, "qadd8">; +def QASX : AAI<0b01100010, 0b11110011, "qasx">; +def QSAX : AAI<0b01100010, 0b11110101, "qsax">; +def QSUB16 : AAI<0b01100010, 0b11110111, "qsub16">; +def QSUB8 : AAI<0b01100010, 0b11111111, "qsub8">; +def UQADD16 : AAI<0b01100110, 0b11110001, "uqadd16">; +def UQADD8 : AAI<0b01100110, 0b11111001, "uqadd8">; +def UQASX : AAI<0b01100110, 0b11110011, "uqasx">; +def UQSAX : AAI<0b01100110, 0b11110101, "uqsax">; +def UQSUB16 : AAI<0b01100110, 0b11110111, "uqsub16">; +def UQSUB8 : AAI<0b01100110, 0b11111111, "uqsub8">; + +// Signed/Unsigned add/subtract + +def SASX : AAI<0b01100001, 0b11110011, "sasx">; +def SADD16 : AAI<0b01100001, 0b11110001, "sadd16">; +def SADD8 : AAI<0b01100001, 0b11111001, "sadd8">; +def SSAX : AAI<0b01100001, 0b11110101, "ssax">; +def SSUB16 : AAI<0b01100001, 0b11110111, "ssub16">; +def SSUB8 : AAI<0b01100001, 0b11111111, "ssub8">; +def UASX : AAI<0b01100101, 0b11110011, "uasx">; +def UADD16 : AAI<0b01100101, 0b11110001, "uadd16">; +def UADD8 : AAI<0b01100101, 0b11111001, "uadd8">; +def USAX : AAI<0b01100101, 0b11110101, "usax">; +def USUB16 : AAI<0b01100101, 0b11110111, "usub16">; +def USUB8 : AAI<0b01100101, 0b11111111, "usub8">; + +// Signed/Unsigned halving add/subtract + +def SHASX : AAI<0b01100011, 0b11110011, "shasx">; +def SHADD16 : AAI<0b01100011, 0b11110001, "shadd16">; +def SHADD8 : AAI<0b01100011, 0b11111001, "shadd8">; +def SHSAX : AAI<0b01100011, 0b11110101, "shsax">; +def SHSUB16 : AAI<0b01100011, 0b11110111, "shsub16">; +def SHSUB8 : AAI<0b01100011, 0b11111111, "shsub8">; +def UHASX : AAI<0b01100111, 0b11110011, "uhasx">; +def UHADD16 : AAI<0b01100111, 0b11110001, "uhadd16">; +def UHADD8 : AAI<0b01100111, 0b11111001, "uhadd8">; +def UHSAX : AAI<0b01100111, 0b11110101, "uhsax">; +def UHSUB16 : AAI<0b01100111, 0b11110111, "uhsub16">; +def UHSUB8 : AAI<0b01100111, 0b11111111, "uhsub8">; + +// Unsigned Sum of Absolute Differences [and Accumulate]. + +def USAD8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + MulFrm /* for convenience */, NoItinerary, "usad8", + "\t$Rd, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + let Inst{27-20} = 0b01111000; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b0001; + let Inst{19-16} = Rd; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} +def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + MulFrm /* for convenience */, NoItinerary, "usada8", + "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsARM, HasV6]> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + bits<4> Ra; + let Inst{27-20} = 0b01111000; + let Inst{7-4} = 0b0001; + let Inst{19-16} = Rd; + let Inst{15-12} = Ra; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} + +// Signed/Unsigned saturate + +def SSAT : AI<(outs GPRnopc:$Rd), + (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), + SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> { + bits<4> Rd; + bits<5> sat_imm; + bits<4> Rn; + bits<8> sh; + let Inst{27-21} = 0b0110101; + let Inst{5-4} = 0b01; + let Inst{20-16} = sat_imm; + let Inst{15-12} = Rd; + let Inst{11-7} = sh{4-0}; + let Inst{6} = sh{5}; + let Inst{3-0} = Rn; +} + +def SSAT16 : AI<(outs GPRnopc:$Rd), + (ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm, + NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []> { + bits<4> Rd; + bits<4> sat_imm; + bits<4> Rn; + let Inst{27-20} = 0b01101010; + let Inst{11-4} = 0b11110011; + let Inst{15-12} = Rd; + let Inst{19-16} = sat_imm; + let Inst{3-0} = Rn; +} + +def USAT : AI<(outs GPRnopc:$Rd), + (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh), + SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> { + bits<4> Rd; + bits<5> sat_imm; + bits<4> Rn; + bits<8> sh; + let Inst{27-21} = 0b0110111; + let Inst{5-4} = 0b01; + let Inst{15-12} = Rd; + let Inst{11-7} = sh{4-0}; + let Inst{6} = sh{5}; + let Inst{20-16} = sat_imm; + let Inst{3-0} = Rn; +} + +def USAT16 : AI<(outs GPRnopc:$Rd), + (ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm, + NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []> { + bits<4> Rd; + bits<4> sat_imm; + bits<4> Rn; + let Inst{27-20} = 0b01101110; + let Inst{11-4} = 0b11110011; + let Inst{15-12} = Rd; + let Inst{19-16} = sat_imm; + let Inst{3-0} = Rn; +} + +def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm:$pos), + (SSAT imm:$pos, GPRnopc:$a, 0)>; +def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm:$pos), + (USAT imm:$pos, GPRnopc:$a, 0)>; + +//===----------------------------------------------------------------------===// +// Bitwise Instructions. +// + +defm AND : AsI1_bin_irs<0b0000, "and", + IIC_iBITi, IIC_iBITr, IIC_iBITsr, + BinOpFrag<(and node:$LHS, node:$RHS)>, "AND", 1>; +defm ORR : AsI1_bin_irs<0b1100, "orr", + IIC_iBITi, IIC_iBITr, IIC_iBITsr, + BinOpFrag<(or node:$LHS, node:$RHS)>, "ORR", 1>; +defm EOR : AsI1_bin_irs<0b0001, "eor", + IIC_iBITi, IIC_iBITr, IIC_iBITsr, + BinOpFrag<(xor node:$LHS, node:$RHS)>, "EOR", 1>; +defm BIC : AsI1_bin_irs<0b1110, "bic", + IIC_iBITi, IIC_iBITr, IIC_iBITsr, + BinOpFrag<(and node:$LHS, (not node:$RHS))>, "BIC">; + +// FIXME: bf_inv_mask_imm should be two operands, the lsb and the msb, just +// like in the actual instruction encoding. The complexity of mapping the mask +// to the lsb/msb pair should be handled by ISel, not encapsulated in the +// instruction description. +def BFC : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm), + AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, + "bfc", "\t$Rd, $imm", "$src = $Rd", + [(set GPR:$Rd, (and GPR:$src, bf_inv_mask_imm:$imm))]>, + Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<10> imm; + let Inst{27-21} = 0b0111110; + let Inst{6-0} = 0b0011111; + let Inst{15-12} = Rd; + let Inst{11-7} = imm{4-0}; // lsb + let Inst{20-16} = imm{9-5}; // msb +} + +// A8.6.18 BFI - Bitfield insert (Encoding A1) +def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm), + AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi, + "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd", + [(set GPRnopc:$Rd, (ARMbfi GPRnopc:$src, GPR:$Rn, + bf_inv_mask_imm:$imm))]>, + Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<4> Rn; + bits<10> imm; + let Inst{27-21} = 0b0111110; + let Inst{6-4} = 0b001; // Rn: Inst{3-0} != 15 + let Inst{15-12} = Rd; + let Inst{11-7} = imm{4-0}; // lsb + let Inst{20-16} = imm{9-5}; // width + let Inst{3-0} = Rn; +} + +def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr, + "mvn", "\t$Rd, $Rm", + [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP { + bits<4> Rd; + bits<4> Rm; + let Inst{25} = 0; + let Inst{19-16} = 0b0000; + let Inst{11-4} = 0b00000000; + let Inst{15-12} = Rd; + let Inst{3-0} = Rm; +} +def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift), + DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift", + [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP { + bits<4> Rd; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = 0b0000; + let Inst{15-12} = Rd; + let Inst{11-5} = shift{11-5}; + let Inst{4} = 0; + let Inst{3-0} = shift{3-0}; +} +def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift), + DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift", + [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP { + bits<4> Rd; + bits<12> shift; + let Inst{25} = 0; + let Inst{19-16} = 0b0000; + let Inst{15-12} = Rd; + let Inst{11-8} = shift{11-8}; + let Inst{7} = 0; + let Inst{6-5} = shift{6-5}; + let Inst{4} = 1; + let Inst{3-0} = shift{3-0}; +} +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in +def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, + IIC_iMVNi, "mvn", "\t$Rd, $imm", + [(set GPR:$Rd, so_imm_not:$imm)]>,UnaryDP { + bits<4> Rd; + bits<12> imm; + let Inst{25} = 1; + let Inst{19-16} = 0b0000; + let Inst{15-12} = Rd; + let Inst{11-0} = imm; +} + +def : ARMPat<(and GPR:$src, so_imm_not:$imm), + (BICri GPR:$src, so_imm_not:$imm)>; + +//===----------------------------------------------------------------------===// +// Multiply Instructions. +// +class AsMul1I32<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = Rd; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} +class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} + +// FIXME: The v5 pseudos are only necessary for the additional Constraint +// property. Remove them when it's possible to add those properties +// on an individual MachineInstr, not just an instuction description. +let isCommutable = 1 in { +def MUL : AsMul1I32<0b0000000, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (mul GPR:$Rn, GPR:$Rm))]>, + Requires<[IsARM, HasV6]> { + let Inst{15-12} = 0b0000; +} + +let Constraints = "@earlyclobber $Rd" in +def MULv5: ARMPseudoExpand<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, + pred:$p, cc_out:$s), + 4, IIC_iMUL32, + [(set GPR:$Rd, (mul GPR:$Rn, GPR:$Rm))], + (MUL GPR:$Rd, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +} + +def MLA : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, + Requires<[IsARM, HasV6]> { + bits<4> Ra; + let Inst{15-12} = Ra; +} + +let Constraints = "@earlyclobber $Rd" in +def MLAv5: ARMPseudoExpand<(outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s), + 4, IIC_iMAC32, + [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))], + (MLA GPR:$Rd, GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; + +def MLS : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>, + Requires<[IsARM, HasV6T2]> { + bits<4> Rd; + bits<4> Rm; + bits<4> Rn; + bits<4> Ra; + let Inst{19-16} = Rd; + let Inst{15-12} = Ra; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} + +// Extra precision multiplies with low / high results +let neverHasSideEffects = 1 in { +let isCommutable = 1 in { +def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64, + "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]>; + +def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64, + "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]>; + +let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { +def SMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + 4, IIC_iMUL64, [], + (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; + +def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + 4, IIC_iMUL64, [], + (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +} +} + +// Multiply + accumulate +def SMLAL : AsMul1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, + "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]>; +def UMLAL : AsMul1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, + "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]>; + +def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, + "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rm; + bits<4> Rn; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; + let Inst{11-8} = Rm; + let Inst{3-0} = Rn; +} + +let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { +def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + 4, IIC_iMAC64, [], + (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + 4, IIC_iMAC64, [], + (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +def UMAALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi), + (ins GPR:$Rn, GPR:$Rm, pred:$p), + 4, IIC_iMAC64, [], + (UMAAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p)>, + Requires<[IsARM, NoV6]>; +} + +} // neverHasSideEffects + +// Most significant word multiply +def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>, + Requires<[IsARM, HasV6]> { + let Inst{15-12} = 0b1111; +} + +def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsARM, HasV6]> { + let Inst{15-12} = 0b1111; +} + +def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, + Requires<[IsARM, HasV6]>; + +def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsARM, HasV6]>; + +def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", + [(set GPR:$Rd, (sub GPR:$Ra, (mulhs GPR:$Rn, GPR:$Rm)))]>, + Requires<[IsARM, HasV6]>; + +def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, GPR:$Ra), + IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsARM, HasV6]>; + +multiclass AI_smul<string opc, PatFrag opnode> { + def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16), + (sext_inreg GPR:$Rm, i16)))]>, + Requires<[IsARM, HasV5TE]>; + + def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16), + (sra GPR:$Rm, (i32 16))))]>, + Requires<[IsARM, HasV5TE]>; + + def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)), + (sext_inreg GPR:$Rm, i16)))]>, + Requires<[IsARM, HasV5TE]>; + + def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)), + (sra GPR:$Rm, (i32 16))))]>, + Requires<[IsARM, HasV5TE]>; + + def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (sra (opnode GPR:$Rn, + (sext_inreg GPR:$Rm, i16)), (i32 16)))]>, + Requires<[IsARM, HasV5TE]>; + + def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm", + [(set GPR:$Rd, (sra (opnode GPR:$Rn, + (sra GPR:$Rm, (i32 16))), (i32 16)))]>, + Requires<[IsARM, HasV5TE]>; +} + + +multiclass AI_smla<string opc, PatFrag opnode> { + let DecoderMethod = "DecodeSMLAInstruction" in { + def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPRnopc:$Rd, (add GPR:$Ra, + (opnode (sext_inreg GPRnopc:$Rn, i16), + (sext_inreg GPRnopc:$Rm, i16))))]>, + Requires<[IsARM, HasV5TE]>; + + def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPRnopc:$Rd, + (add GPR:$Ra, (opnode (sext_inreg GPRnopc:$Rn, i16), + (sra GPRnopc:$Rm, (i32 16)))))]>, + Requires<[IsARM, HasV5TE]>; + + def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPRnopc:$Rd, + (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)), + (sext_inreg GPRnopc:$Rm, i16))))]>, + Requires<[IsARM, HasV5TE]>; + + def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPRnopc:$Rd, + (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)), + (sra GPRnopc:$Rm, (i32 16)))))]>, + Requires<[IsARM, HasV5TE]>; + + def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPRnopc:$Rd, + (add GPR:$Ra, (sra (opnode GPRnopc:$Rn, + (sext_inreg GPRnopc:$Rm, i16)), (i32 16))))]>, + Requires<[IsARM, HasV5TE]>; + + def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set GPRnopc:$Rd, + (add GPR:$Ra, (sra (opnode GPRnopc:$Rn, + (sra GPRnopc:$Rm, (i32 16))), (i32 16))))]>, + Requires<[IsARM, HasV5TE]>; + } +} + +defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>; +defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; + +// Halfword multiply accumulate long: SMLAL<x><y>. +def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm), + IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV5TE]>; + +def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm), + IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV5TE]>; + +def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm), + IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV5TE]>; + +def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm), + IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsARM, HasV5TE]>; + +// Helper class for AI_smld. +class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> { + bits<4> Rn; + bits<4> Rm; + let Inst{27-23} = 0b01110; + let Inst{22} = long; + let Inst{21-20} = 0b00; + let Inst{11-8} = Rm; + let Inst{7} = 0; + let Inst{6} = sub; + let Inst{5} = swap; + let Inst{4} = 1; + let Inst{3-0} = Rn; +} +class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> { + bits<4> Rd; + let Inst{15-12} = 0b1111; + let Inst{19-16} = Rd; +} +class AMulDualIa<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> { + bits<4> Ra; + bits<4> Rd; + let Inst{19-16} = Rd; + let Inst{15-12} = Ra; +} +class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops, + InstrItinClass itin, string opc, string asm> + : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> { + bits<4> RdLo; + bits<4> RdHi; + let Inst{19-16} = RdHi; + let Inst{15-12} = RdLo; +} + +multiclass AI_smld<bit sub, string opc> { + + def D : AMulDualIa<0, sub, 0, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">; + + def DX: AMulDualIa<0, sub, 1, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra), + NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">; + + def LD: AMulDualI64<1, sub, 0, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary, + !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">; + + def LDX : AMulDualI64<1, sub, 1, (outs GPRnopc:$RdLo, GPRnopc:$RdHi), + (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary, + !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">; + +} + +defm SMLA : AI_smld<0, "smla">; +defm SMLS : AI_smld<1, "smls">; + +multiclass AI_sdml<bit sub, string opc> { + + def D:AMulDualI<0, sub, 0, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), + NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">; + def DX:AMulDualI<0, sub, 1, (outs GPRnopc:$Rd),(ins GPRnopc:$Rn, GPRnopc:$Rm), + NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">; +} + +defm SMUA : AI_sdml<0, "smua">; +defm SMUS : AI_sdml<1, "smus">; + +//===----------------------------------------------------------------------===// +// Misc. Arithmetic Instructions. +// + +def CLZ : AMiscA1I<0b000010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm), + IIC_iUNAr, "clz", "\t$Rd, $Rm", + [(set GPR:$Rd, (ctlz GPR:$Rm))]>, Requires<[IsARM, HasV5T]>; + +def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm), + IIC_iUNAr, "rbit", "\t$Rd, $Rm", + [(set GPR:$Rd, (ARMrbit GPR:$Rm))]>, + Requires<[IsARM, HasV6T2]>; + +def REV : AMiscA1I<0b01101011, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm), + IIC_iUNAr, "rev", "\t$Rd, $Rm", + [(set GPR:$Rd, (bswap GPR:$Rm))]>, Requires<[IsARM, HasV6]>; + +let AddedComplexity = 5 in +def REV16 : AMiscA1I<0b01101011, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), + IIC_iUNAr, "rev16", "\t$Rd, $Rm", + [(set GPR:$Rd, (rotr (bswap GPR:$Rm), (i32 16)))]>, + Requires<[IsARM, HasV6]>; + +let AddedComplexity = 5 in +def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), + IIC_iUNAr, "revsh", "\t$Rd, $Rm", + [(set GPR:$Rd, (sra (bswap GPR:$Rm), (i32 16)))]>, + Requires<[IsARM, HasV6]>; + +def : ARMV6Pat<(or (sra (shl GPR:$Rm, (i32 24)), (i32 16)), + (and (srl GPR:$Rm, (i32 8)), 0xFF)), + (REVSH GPR:$Rm)>; + +def PKHBT : APKHI<0b01101000, 0, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_lsl_amt:$sh), + IIC_iALUsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh", + [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF), + (and (shl GPRnopc:$Rm, pkh_lsl_amt:$sh), + 0xFFFF0000)))]>, + Requires<[IsARM, HasV6]>; + +// Alternate cases for PKHBT where identities eliminate some nodes. +def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (and GPRnopc:$Rm, 0xFFFF0000)), + (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, 0)>; +def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (shl GPRnopc:$Rm, imm16_31:$sh)), + (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, imm16_31:$sh)>; + +// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and +// will match the pattern below. +def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_asr_amt:$sh), + IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh", + [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF0000), + (and (sra GPRnopc:$Rm, pkh_asr_amt:$sh), + 0xFFFF)))]>, + Requires<[IsARM, HasV6]>; + +// Alternate cases for PKHTB where identities eliminate some nodes. Note that +// a shift amount of 0 is *not legal* here, it is PKHBT instead. +def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000), + (srl GPRnopc:$src2, imm16_31:$sh)), + (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16_31:$sh)>; +def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000), + (and (srl GPRnopc:$src2, imm1_15:$sh), 0xFFFF)), + (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>; + +//===----------------------------------------------------------------------===// +// Comparison Instructions... +// + +defm CMP : AI1_cmp_irs<0b1010, "cmp", + IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, + BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; + +// ARMcmpZ can re-use the above instruction definitions. +def : ARMPat<(ARMcmpZ GPR:$src, so_imm:$imm), + (CMPri GPR:$src, so_imm:$imm)>; +def : ARMPat<(ARMcmpZ GPR:$src, GPR:$rhs), + (CMPrr GPR:$src, GPR:$rhs)>; +def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs), + (CMPrsi GPR:$src, so_reg_imm:$rhs)>; +def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs), + (CMPrsr GPR:$src, so_reg_reg:$rhs)>; + +// FIXME: We have to be careful when using the CMN instruction and comparison +// with 0. One would expect these two pieces of code should give identical +// results: +// +// rsbs r1, r1, 0 +// cmp r0, r1 +// mov r0, #0 +// it ls +// mov r0, #1 +// +// and: +// +// cmn r0, r1 +// mov r0, #0 +// it ls +// mov r0, #1 +// +// However, the CMN gives the *opposite* result when r1 is 0. This is because +// the carry flag is set in the CMP case but not in the CMN case. In short, the +// CMP instruction doesn't perform a truncate of the (logical) NOT of 0 plus the +// value of r0 and the carry bit (because the "carry bit" parameter to +// AddWithCarry is defined as 1 in this case, the carry flag will always be set +// when r0 >= 0). The CMN instruction doesn't perform a NOT of 0 so there is +// never a "carry" when this AddWithCarry is performed (because the "carry bit" +// parameter to AddWithCarry is defined as 0). +// +// When x is 0 and unsigned: +// +// x = 0 +// ~x = 0xFFFF FFFF +// ~x + 1 = 0x1 0000 0000 +// (-x = 0) != (0x1 0000 0000 = ~x + 1) +// +// Therefore, we should disable CMN when comparing against zero, until we can +// limit when the CMN instruction is used (when we know that the RHS is not 0 or +// when it's a comparison which doesn't look at the 'carry' flag). +// +// (See the ARM docs for the "AddWithCarry" pseudo-code.) +// +// This is related to <rdar://problem/7569620>. +// +//defm CMN : AI1_cmp_irs<0b1011, "cmn", +// BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; + +// Note that TST/TEQ don't set all the same flags that CMP does! +defm TST : AI1_cmp_irs<0b1000, "tst", + IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr, + BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>, 1>; +defm TEQ : AI1_cmp_irs<0b1001, "teq", + IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr, + BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>; + +defm CMNz : AI1_cmp_irs<0b1011, "cmn", + IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, + BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>; + +//def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm), +// (CMNri GPR:$src, so_imm_neg:$imm)>; + +def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm), + (CMNzri GPR:$src, so_imm_neg:$imm)>; + +// Pseudo i64 compares for some floating point compares. +let usesCustomInserter = 1, isBranch = 1, isTerminator = 1, + Defs = [CPSR] in { +def BCCi64 : PseudoInst<(outs), + (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst), + IIC_Br, + [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>; + +def BCCZi64 : PseudoInst<(outs), + (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), IIC_Br, + [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>; +} // usesCustomInserter + + +// Conditional moves +// FIXME: should be able to write a pattern for ARMcmov, but can't use +// a two-value operand where a dag node expects two operands. :( +let neverHasSideEffects = 1 in { +def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, GPR:$Rm, pred:$p), + 4, IIC_iCMOVr, + [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">; +def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, so_reg_imm:$shift, pred:$p), + 4, IIC_iCMOVsr, + [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_imm:$shift, + imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">; +def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, so_reg_reg:$shift, pred:$p), + 4, IIC_iCMOVsr, + [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift, + imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">; + + +let isMoveImm = 1 in +def MOVCCi16 : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, imm0_65535_expr:$imm, pred:$p), + 4, IIC_iMOVi, + []>, + RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>; + +let isMoveImm = 1 in +def MOVCCi : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, so_imm:$imm, pred:$p), + 4, IIC_iCMOVi, + [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">; + +// Two instruction predicate mov immediate. +let isMoveImm = 1 in +def MOVCCi32imm : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, i32imm:$src, pred:$p), + 8, IIC_iCMOVix2, []>, RegConstraint<"$false = $Rd">; + +let isMoveImm = 1 in +def MVNCCi : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, so_imm:$imm, pred:$p), + 4, IIC_iCMOVi, + [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">; +} // neverHasSideEffects + +//===----------------------------------------------------------------------===// +// Atomic operations intrinsics +// + +def MemBarrierOptOperand : AsmOperandClass { + let Name = "MemBarrierOpt"; + let ParserMethod = "parseMemBarrierOptOperand"; +} +def memb_opt : Operand<i32> { + let PrintMethod = "printMemBOption"; + let ParserMatchClass = MemBarrierOptOperand; + let DecoderMethod = "DecodeMemBarrierOption"; +} + +// memory barriers protect the atomic sequences +let hasSideEffects = 1 in { +def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, + "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>, + Requires<[IsARM, HasDB]> { + bits<4> opt; + let Inst{31-4} = 0xf57ff05; + let Inst{3-0} = opt; +} +} + +def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, + "dsb", "\t$opt", []>, + Requires<[IsARM, HasDB]> { + bits<4> opt; + let Inst{31-4} = 0xf57ff04; + let Inst{3-0} = opt; +} + +// ISB has only full system option +def ISB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, + "isb", "\t$opt", []>, + Requires<[IsARM, HasDB]> { + bits<4> opt; + let Inst{31-4} = 0xf57ff06; + let Inst{3-0} = opt; +} + +// Pseudo isntruction that combines movs + predicated rsbmi +// to implement integer ABS +let usesCustomInserter = 1, Defs = [CPSR] in { +def ABS : ARMPseudoInst< + (outs GPR:$dst), (ins GPR:$src), + 8, NoItinerary, []>; +} + +let usesCustomInserter = 1 in { + let Defs = [CPSR] in { + def ATOMIC_LOAD_ADD_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_SUB_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_sub_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_AND_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_and_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_OR_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_or_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_XOR_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_xor_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_NAND_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_MIN_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_MAX_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMIN_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMAX_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_ADD_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_SUB_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_sub_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_AND_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_and_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_OR_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_or_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_XOR_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_xor_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_NAND_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_MIN_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_MAX_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMIN_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMAX_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_ADD_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_SUB_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_AND_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_OR_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_XOR_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_NAND_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, + [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_MIN_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_MAX_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMIN_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMAX_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>; + + def ATOMIC_SWAP_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, + [(set GPR:$dst, (atomic_swap_8 GPR:$ptr, GPR:$new))]>; + def ATOMIC_SWAP_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, + [(set GPR:$dst, (atomic_swap_16 GPR:$ptr, GPR:$new))]>; + def ATOMIC_SWAP_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, + [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$new))]>; + + def ATOMIC_CMP_SWAP_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary, + [(set GPR:$dst, (atomic_cmp_swap_8 GPR:$ptr, GPR:$old, GPR:$new))]>; + def ATOMIC_CMP_SWAP_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary, + [(set GPR:$dst, (atomic_cmp_swap_16 GPR:$ptr, GPR:$old, GPR:$new))]>; + def ATOMIC_CMP_SWAP_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary, + [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$old, GPR:$new))]>; +} +} + +let mayLoad = 1 in { +def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, + "ldrexb", "\t$Rt, $addr", []>; +def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldrexh", "\t$Rt, $addr", []>; +def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr), + NoItinerary, "ldrex", "\t$Rt, $addr", []>; +let hasExtraDefRegAllocReq = 1 in +def LDREXD: AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2),(ins addr_offset_none:$addr), + NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []> { + let DecoderMethod = "DecodeDoubleRegLoad"; +} +} + +let mayStore = 1, Constraints = "@earlyclobber $Rd" in { +def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "strexb", "\t$Rd, $Rt, $addr", []>; +def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "strexh", "\t$Rd, $Rt, $addr", []>; +def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr), + NoItinerary, "strex", "\t$Rd, $Rt, $addr", []>; +} + +let hasExtraSrcRegAllocReq = 1, Constraints = "@earlyclobber $Rd" in +def STREXD : AIstrex<0b01, (outs GPR:$Rd), + (ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr), + NoItinerary, "strexd", "\t$Rd, $Rt, $Rt2, $addr", []> { + let DecoderMethod = "DecodeDoubleRegStore"; +} + +def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", []>, + Requires<[IsARM, HasV7]> { + let Inst{31-0} = 0b11110101011111111111000000011111; +} + +// SWP/SWPB are deprecated in V6/V7. +let mayLoad = 1, mayStore = 1 in { +def SWP : AIswp<0, (outs GPR:$Rt), (ins GPR:$Rt2, addr_offset_none:$addr), + "swp", []>; +def SWPB: AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, addr_offset_none:$addr), + "swpb", []>; +} + +//===----------------------------------------------------------------------===// +// Coprocessor Instructions. +// + +def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, + c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), + NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", + [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, + imm:$CRm, imm:$opc2)]> { + bits<4> opc1; + bits<4> CRn; + bits<4> CRd; + bits<4> cop; + bits<3> opc2; + bits<4> CRm; + + let Inst{3-0} = CRm; + let Inst{4} = 0; + let Inst{7-5} = opc2; + let Inst{11-8} = cop; + let Inst{15-12} = CRd; + let Inst{19-16} = CRn; + let Inst{23-20} = opc1; +} + +def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, + c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), + NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", + [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, + imm:$CRm, imm:$opc2)]> { + let Inst{31-28} = 0b1111; + bits<4> opc1; + bits<4> CRn; + bits<4> CRd; + bits<4> cop; + bits<3> opc2; + bits<4> CRm; + + let Inst{3-0} = CRm; + let Inst{4} = 0; + let Inst{7-5} = opc2; + let Inst{11-8} = cop; + let Inst{15-12} = CRd; + let Inst{19-16} = CRn; + let Inst{23-20} = opc1; +} + +class ACI<dag oops, dag iops, string opc, string asm, + IndexMode im = IndexModeNone> + : I<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary, + opc, asm, "", []> { + let Inst{27-25} = 0b110; +} +class ACInoP<dag oops, dag iops, string opc, string asm, + IndexMode im = IndexModeNone> + : InoP<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary, + opc, asm, "", []> { + let Inst{31-28} = 0b1111; + let Inst{27-25} = 0b110; +} +multiclass LdStCop<bit load, bit Dbit, string asm> { + def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), + asm, "\t$cop, $CRd, $addr"> { + bits<13> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 1; // P = 1 + let Inst{23} = addr{8}; + let Inst{22} = Dbit; + let Inst{21} = 0; // W = 0 + let Inst{20} = load; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = addr{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), + asm, "\t$cop, $CRd, $addr!", IndexModePre> { + bits<13> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 1; // P = 1 + let Inst{23} = addr{8}; + let Inst{22} = Dbit; + let Inst{21} = 1; // W = 1 + let Inst{20} = load; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = addr{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _POST: ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, + postidx_imm8s4:$offset), + asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> { + bits<9> offset; + bits<4> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 0; // P = 0 + let Inst{23} = offset{8}; + let Inst{22} = Dbit; + let Inst{21} = 1; // W = 1 + let Inst{20} = load; + let Inst{19-16} = addr; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = offset{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _OPTION : ACI<(outs), + (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, + coproc_option_imm:$option), + asm, "\t$cop, $CRd, $addr, $option"> { + bits<8> option; + bits<4> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 0; // P = 0 + let Inst{23} = 1; // U = 1 + let Inst{22} = Dbit; + let Inst{21} = 0; // W = 0 + let Inst{20} = load; + let Inst{19-16} = addr; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = option; + let DecoderMethod = "DecodeCopMemInstruction"; + } +} +multiclass LdSt2Cop<bit load, bit Dbit, string asm> { + def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), + asm, "\t$cop, $CRd, $addr"> { + bits<13> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 1; // P = 1 + let Inst{23} = addr{8}; + let Inst{22} = Dbit; + let Inst{21} = 0; // W = 0 + let Inst{20} = load; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = addr{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), + asm, "\t$cop, $CRd, $addr!", IndexModePre> { + bits<13> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 1; // P = 1 + let Inst{23} = addr{8}; + let Inst{22} = Dbit; + let Inst{21} = 1; // W = 1 + let Inst{20} = load; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = addr{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _POST: ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, + postidx_imm8s4:$offset), + asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> { + bits<9> offset; + bits<4> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 0; // P = 0 + let Inst{23} = offset{8}; + let Inst{22} = Dbit; + let Inst{21} = 1; // W = 1 + let Inst{20} = load; + let Inst{19-16} = addr; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = offset{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _OPTION : ACInoP<(outs), + (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, + coproc_option_imm:$option), + asm, "\t$cop, $CRd, $addr, $option"> { + bits<8> option; + bits<4> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 0; // P = 0 + let Inst{23} = 1; // U = 1 + let Inst{22} = Dbit; + let Inst{21} = 0; // W = 0 + let Inst{20} = load; + let Inst{19-16} = addr; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = option; + let DecoderMethod = "DecodeCopMemInstruction"; + } +} + +defm LDC : LdStCop <1, 0, "ldc">; +defm LDCL : LdStCop <1, 1, "ldcl">; +defm STC : LdStCop <0, 0, "stc">; +defm STCL : LdStCop <0, 1, "stcl">; +defm LDC2 : LdSt2Cop<1, 0, "ldc2">; +defm LDC2L : LdSt2Cop<1, 1, "ldc2l">; +defm STC2 : LdSt2Cop<0, 0, "stc2">; +defm STC2L : LdSt2Cop<0, 1, "stc2l">; + +//===----------------------------------------------------------------------===// +// Move between coprocessor and ARM core register. +// + +class MovRCopro<string opc, bit direction, dag oops, dag iops, + list<dag> pattern> + : ABI<0b1110, oops, iops, NoItinerary, opc, + "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", pattern> { + let Inst{20} = direction; + let Inst{4} = 1; + + bits<4> Rt; + bits<4> cop; + bits<3> opc1; + bits<3> opc2; + bits<4> CRm; + bits<4> CRn; + + let Inst{15-12} = Rt; + let Inst{11-8} = cop; + let Inst{23-21} = opc1; + let Inst{7-5} = opc2; + let Inst{3-0} = CRm; + let Inst{19-16} = CRn; +} + +def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, + (outs), + (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, imm0_7:$opc2), + [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, + imm:$CRm, imm:$opc2)]>; +def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */, + (outs GPR:$Rt), + (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, + imm0_7:$opc2), []>; + +def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), + (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; + +class MovRCopro2<string opc, bit direction, dag oops, dag iops, + list<dag> pattern> + : ABXI<0b1110, oops, iops, NoItinerary, + !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> { + let Inst{31-28} = 0b1111; + let Inst{20} = direction; + let Inst{4} = 1; + + bits<4> Rt; + bits<4> cop; + bits<3> opc1; + bits<3> opc2; + bits<4> CRm; + bits<4> CRn; + + let Inst{15-12} = Rt; + let Inst{11-8} = cop; + let Inst{23-21} = opc1; + let Inst{7-5} = opc2; + let Inst{3-0} = CRm; + let Inst{19-16} = CRn; +} + +def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */, + (outs), + (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, imm0_7:$opc2), + [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, + imm:$CRm, imm:$opc2)]>; +def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */, + (outs GPR:$Rt), + (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, + imm0_7:$opc2), []>; + +def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, + imm:$CRm, imm:$opc2), + (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; + +class MovRRCopro<string opc, bit direction, list<dag> pattern = []> + : ABI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1, + GPR:$Rt, GPR:$Rt2, c_imm:$CRm), + NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> { + let Inst{23-21} = 0b010; + let Inst{20} = direction; + + bits<4> Rt; + bits<4> Rt2; + bits<4> cop; + bits<4> opc1; + bits<4> CRm; + + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + let Inst{11-8} = cop; + let Inst{7-4} = opc1; + let Inst{3-0} = CRm; +} + +def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */, + [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2, + imm:$CRm)]>; +def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>; + +class MovRRCopro2<string opc, bit direction, list<dag> pattern = []> + : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1, + GPR:$Rt, GPR:$Rt2, c_imm:$CRm), NoItinerary, + !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> { + let Inst{31-28} = 0b1111; + let Inst{23-21} = 0b010; + let Inst{20} = direction; + + bits<4> Rt; + bits<4> Rt2; + bits<4> cop; + bits<4> opc1; + bits<4> CRm; + + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + let Inst{11-8} = cop; + let Inst{7-4} = opc1; + let Inst{3-0} = CRm; +} + +def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */, + [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2, + imm:$CRm)]>; +def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>; + +//===----------------------------------------------------------------------===// +// Move between special register and ARM core register +// + +// Move to ARM core register from Special Register +def MRS : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary, + "mrs", "\t$Rd, apsr", []> { + bits<4> Rd; + let Inst{23-16} = 0b00001111; + let Inst{15-12} = Rd; + let Inst{7-4} = 0b0000; +} + +def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPR:$Rd, pred:$p)>, Requires<[IsARM]>; + +def MRSsys : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary, + "mrs", "\t$Rd, spsr", []> { + bits<4> Rd; + let Inst{23-16} = 0b01001111; + let Inst{15-12} = Rd; + let Inst{7-4} = 0b0000; +} + +// Move from ARM core register to Special Register +// +// No need to have both system and application versions, the encodings are the +// same and the assembly parser has no way to distinguish between them. The mask +// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains +// the mask with the fields to be accessed in the special register. +def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary, + "msr", "\t$mask, $Rn", []> { + bits<5> mask; + bits<4> Rn; + + let Inst{23} = 0; + let Inst{22} = mask{4}; // R bit + let Inst{21-20} = 0b10; + let Inst{19-16} = mask{3-0}; + let Inst{15-12} = 0b1111; + let Inst{11-4} = 0b00000000; + let Inst{3-0} = Rn; +} + +def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, so_imm:$a), NoItinerary, + "msr", "\t$mask, $a", []> { + bits<5> mask; + bits<12> a; + + let Inst{23} = 0; + let Inst{22} = mask{4}; // R bit + let Inst{21-20} = 0b10; + let Inst{19-16} = mask{3-0}; + let Inst{15-12} = 0b1111; + let Inst{11-0} = a; +} + +//===----------------------------------------------------------------------===// +// TLS Instructions +// + +// __aeabi_read_tp preserves the registers r1-r3. +// This is a pseudo inst so that we can get the encoding right, +// complete with fixup for the aeabi_read_tp function. +let isCall = 1, + Defs = [R0, R12, LR, CPSR], Uses = [SP] in { + def TPsoft : PseudoInst<(outs), (ins), IIC_Br, + [(set R0, ARMthread_pointer)]>; +} + +//===----------------------------------------------------------------------===// +// SJLJ Exception handling intrinsics +// eh_sjlj_setjmp() is an instruction sequence to store the return +// address and save #0 in R0 for the non-longjmp case. +// Since by its nature we may be coming from some other function to get +// here, and we're using the stack frame for the containing function to +// save/restore registers, we can't keep anything live in regs across +// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon +// when we get here from a longjmp(). We force everything out of registers +// except for our own input by listing the relevant registers in Defs. By +// doing so, we also cause the prologue/epilogue code to actively preserve +// all of the callee-saved resgisters, which is exactly what we want. +// A constant value is passed in $val, and we use the location as a scratch. +// +// These are pseudo-instructions and are lowered to individual MC-insts, so +// no encoding information is necessary. +let Defs = + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR, + QQQQ0, QQQQ1, QQQQ2, QQQQ3 ], hasSideEffects = 1, isBarrier = 1 in { + def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), + NoItinerary, + [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, + Requires<[IsARM, HasVFP2]>; +} + +let Defs = + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ], + hasSideEffects = 1, isBarrier = 1 in { + def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), + NoItinerary, + [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, + Requires<[IsARM, NoVFP]>; +} + +// FIXME: Non-Darwin version(s) +let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, + Defs = [ R7, LR, SP ] in { +def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch), + NoItinerary, + [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, + Requires<[IsARM, IsDarwin]>; +} + +// eh.sjlj.dispatchsetup pseudo-instruction. +// This pseudo is used for ARM, Thumb1 and Thumb2. Any differences are +// handled when the pseudo is expanded (which happens before any passes +// that need the instruction size). +let isBarrier = 1, hasSideEffects = 1 in +def Int_eh_sjlj_dispatchsetup : + PseudoInst<(outs), (ins GPR:$src), NoItinerary, + [(ARMeh_sjlj_dispatchsetup GPR:$src)]>, + Requires<[IsDarwin]>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +// + +// ARMv4 indirect branch using (MOVr PC, dst) +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in + def MOVPCRX : ARMPseudoExpand<(outs), (ins GPR:$dst), + 4, IIC_Br, [(brind GPR:$dst)], + (MOVr PC, GPR:$dst, (ops 14, zero_reg), zero_reg)>, + Requires<[IsARM, NoV4T]>; + +// Large immediate handling. + +// 32-bit immediate using two piece so_imms or movw + movt. +// This is a single pseudo instruction, the benefit is that it can be remat'd +// as a single unit instead of having to handle reg inputs. +// FIXME: Remove this when we can do generalized remat. +let isReMaterializable = 1, isMoveImm = 1 in +def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2, + [(set GPR:$dst, (arm_i32imm:$src))]>, + Requires<[IsARM]>; + +// Pseudo instruction that combines movw + movt + add pc (if PIC). +// It also makes it possible to rematerialize the instructions. +// FIXME: Remove this when we can do generalized remat and when machine licm +// can properly the instructions. +let isReMaterializable = 1 in { +def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2addpc, + [(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>, + Requires<[IsARM, UseMovt]>; + +def MOV_ga_dyn : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2, + [(set GPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>, + Requires<[IsARM, UseMovt]>; + +let AddedComplexity = 10 in +def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2ld, + [(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>, + Requires<[IsARM, UseMovt]>; +} // isReMaterializable + +// ConstantPool, GlobalAddress, and JumpTable +def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>, + Requires<[IsARM, DontUseMovt]>; +def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; +def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, + Requires<[IsARM, UseMovt]>; +def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (LEApcrelJT tjumptable:$dst, imm:$id)>; + +// TODO: add,sub,and, 3-instr forms? + +// Tail calls +def : ARMPat<(ARMtcret tcGPR:$dst), + (TCRETURNri tcGPR:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), + (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), + (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret tcGPR:$dst), + (TCRETURNriND tcGPR:$dst)>, Requires<[IsNotDarwin]>; + +def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), + (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; + +def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), + (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; + +// Direct calls +def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>, + Requires<[IsARM, IsNotDarwin]>; +def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>, + Requires<[IsARM, IsDarwin]>; + +// zextload i1 -> zextload i8 +def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(zextloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; + +// extload -> zextload +def : ARMPat<(extloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(extloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; +def : ARMPat<(extloadi8 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(extloadi8 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; + +def : ARMPat<(extloadi16 addrmode3:$addr), (LDRH addrmode3:$addr)>; + +def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>; +def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>; + +// smul* and smla* +def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16))), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra GPR:$b, (i32 16))), + (SMULBT GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))), + (SMULBT GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16))), + (SMULTB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b), + (SMULTB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))), + (i32 16)), + (SMULWB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)), + (SMULWB GPR:$a, GPR:$b)>; + +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16)))), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul sext_16_node:$a, sext_16_node:$b)), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra GPR:$b, (i32 16)))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra GPR:$a, (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16)))), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))), + (i32 16))), + (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (sra (mul GPR:$a, sext_16_node:$b), (i32 16))), + (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; + + +// Pre-v7 uses MCR for synchronization barriers. +def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>, + Requires<[IsARM, HasV6]>; + +// SXT/UXT with no rotate +let AddedComplexity = 16 in { +def : ARMV6Pat<(and GPR:$Src, 0x000000FF), (UXTB GPR:$Src, 0)>; +def : ARMV6Pat<(and GPR:$Src, 0x0000FFFF), (UXTH GPR:$Src, 0)>; +def : ARMV6Pat<(and GPR:$Src, 0x00FF00FF), (UXTB16 GPR:$Src, 0)>; +def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0x00FF)), + (UXTAB GPR:$Rn, GPR:$Rm, 0)>; +def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0xFFFF)), + (UXTAH GPR:$Rn, GPR:$Rm, 0)>; +} + +def : ARMV6Pat<(sext_inreg GPR:$Src, i8), (SXTB GPR:$Src, 0)>; +def : ARMV6Pat<(sext_inreg GPR:$Src, i16), (SXTH GPR:$Src, 0)>; + +def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i8)), + (SXTAB GPR:$Rn, GPRnopc:$Rm, 0)>; +def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i16)), + (SXTAH GPR:$Rn, GPRnopc:$Rm, 0)>; + +// Atomic load/store patterns +def : ARMPat<(atomic_load_8 ldst_so_reg:$src), + (LDRBrs ldst_so_reg:$src)>; +def : ARMPat<(atomic_load_8 addrmode_imm12:$src), + (LDRBi12 addrmode_imm12:$src)>; +def : ARMPat<(atomic_load_16 addrmode3:$src), + (LDRH addrmode3:$src)>; +def : ARMPat<(atomic_load_32 ldst_so_reg:$src), + (LDRrs ldst_so_reg:$src)>; +def : ARMPat<(atomic_load_32 addrmode_imm12:$src), + (LDRi12 addrmode_imm12:$src)>; +def : ARMPat<(atomic_store_8 ldst_so_reg:$ptr, GPR:$val), + (STRBrs GPR:$val, ldst_so_reg:$ptr)>; +def : ARMPat<(atomic_store_8 addrmode_imm12:$ptr, GPR:$val), + (STRBi12 GPR:$val, addrmode_imm12:$ptr)>; +def : ARMPat<(atomic_store_16 addrmode3:$ptr, GPR:$val), + (STRH GPR:$val, addrmode3:$ptr)>; +def : ARMPat<(atomic_store_32 ldst_so_reg:$ptr, GPR:$val), + (STRrs GPR:$val, ldst_so_reg:$ptr)>; +def : ARMPat<(atomic_store_32 addrmode_imm12:$ptr, GPR:$val), + (STRi12 GPR:$val, addrmode_imm12:$ptr)>; + + +//===----------------------------------------------------------------------===// +// Thumb Support +// + +include "ARMInstrThumb.td" + +//===----------------------------------------------------------------------===// +// Thumb2 Support +// + +include "ARMInstrThumb2.td" + +//===----------------------------------------------------------------------===// +// Floating Point Support +// + +include "ARMInstrVFP.td" + +//===----------------------------------------------------------------------===// +// Advanced SIMD (NEON) Support +// + +include "ARMInstrNEON.td" + +//===----------------------------------------------------------------------===// +// Assembler aliases +// + +// Memory barriers +def : InstAlias<"dmb", (DMB 0xf)>, Requires<[IsARM, HasDB]>; +def : InstAlias<"dsb", (DSB 0xf)>, Requires<[IsARM, HasDB]>; +def : InstAlias<"isb", (ISB 0xf)>, Requires<[IsARM, HasDB]>; + +// System instructions +def : MnemonicAlias<"swi", "svc">; + +// Load / Store Multiple +def : MnemonicAlias<"ldmfd", "ldm">; +def : MnemonicAlias<"ldmia", "ldm">; +def : MnemonicAlias<"ldmea", "ldmdb">; +def : MnemonicAlias<"stmfd", "stmdb">; +def : MnemonicAlias<"stmia", "stm">; +def : MnemonicAlias<"stmea", "stm">; + +// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the +// shift amount is zero (i.e., unspecified). +def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm", + (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>, + Requires<[IsARM, HasV6]>; +def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm", + (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>, + Requires<[IsARM, HasV6]>; + +// PUSH/POP aliases for STM/LDM +def : ARMInstAlias<"push${p} $regs", (STMDB_UPD SP, pred:$p, reglist:$regs)>; +def : ARMInstAlias<"pop${p} $regs", (LDMIA_UPD SP, pred:$p, reglist:$regs)>; + +// SSAT/USAT optional shift operand. +def : ARMInstAlias<"ssat${p} $Rd, $sat_imm, $Rn", + (SSAT GPRnopc:$Rd, imm1_32:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>; +def : ARMInstAlias<"usat${p} $Rd, $sat_imm, $Rn", + (USAT GPRnopc:$Rd, imm0_31:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>; + + +// Extend instruction optional rotate operand. +def : ARMInstAlias<"sxtab${p} $Rd, $Rn, $Rm", + (SXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"sxtah${p} $Rd, $Rn, $Rm", + (SXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"sxtab16${p} $Rd, $Rn, $Rm", + (SXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"sxtb${p} $Rd, $Rm", + (SXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"sxtb16${p} $Rd, $Rm", + (SXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"sxth${p} $Rd, $Rm", + (SXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>; + +def : ARMInstAlias<"uxtab${p} $Rd, $Rn, $Rm", + (UXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"uxtah${p} $Rd, $Rn, $Rm", + (UXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"uxtab16${p} $Rd, $Rn, $Rm", + (UXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"uxtb${p} $Rd, $Rm", + (UXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"uxtb16${p} $Rd, $Rm", + (UXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>; +def : ARMInstAlias<"uxth${p} $Rd, $Rm", + (UXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>; + + +// RFE aliases +def : MnemonicAlias<"rfefa", "rfeda">; +def : MnemonicAlias<"rfeea", "rfedb">; +def : MnemonicAlias<"rfefd", "rfeia">; +def : MnemonicAlias<"rfeed", "rfeib">; +def : MnemonicAlias<"rfe", "rfeia">; + +// SRS aliases +def : MnemonicAlias<"srsfa", "srsda">; +def : MnemonicAlias<"srsea", "srsdb">; +def : MnemonicAlias<"srsfd", "srsia">; +def : MnemonicAlias<"srsed", "srsib">; +def : MnemonicAlias<"srs", "srsia">; + +// QSAX == QSUBADDX +def : MnemonicAlias<"qsubaddx", "qsax">; +// SASX == SADDSUBX +def : MnemonicAlias<"saddsubx", "sasx">; +// SHASX == SHADDSUBX +def : MnemonicAlias<"shaddsubx", "shasx">; +// SHSAX == SHSUBADDX +def : MnemonicAlias<"shsubaddx", "shsax">; +// SSAX == SSUBADDX +def : MnemonicAlias<"ssubaddx", "ssax">; +// UASX == UADDSUBX +def : MnemonicAlias<"uaddsubx", "uasx">; +// UHASX == UHADDSUBX +def : MnemonicAlias<"uhaddsubx", "uhasx">; +// UHSAX == UHSUBADDX +def : MnemonicAlias<"uhsubaddx", "uhsax">; +// UQASX == UQADDSUBX +def : MnemonicAlias<"uqaddsubx", "uqasx">; +// UQSAX == UQSUBADDX +def : MnemonicAlias<"uqsubaddx", "uqsax">; +// USAX == USUBADDX +def : MnemonicAlias<"usubaddx", "usax">; + +// LDRSBT/LDRHT/LDRSHT post-index offset if optional. +// Note that the write-back output register is a dummy operand for MC (it's +// only meaningful for codegen), so we just pass zero here. +// FIXME: tblgen not cooperating with argument conversions. +//def : InstAlias<"ldrsbt${p} $Rt, $addr", +// (LDRSBTi GPR:$Rt, GPR:$Rt, addr_offset_none:$addr, 0,pred:$p)>; +//def : InstAlias<"ldrht${p} $Rt, $addr", +// (LDRHTi GPR:$Rt, GPR:$Rt, addr_offset_none:$addr, 0, pred:$p)>; +//def : InstAlias<"ldrsht${p} $Rt, $addr", +// (LDRSHTi GPR:$Rt, GPR:$Rt, addr_offset_none:$addr, 0, pred:$p)>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td new file mode 100644 index 0000000..7aad186 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -0,0 +1,5030 @@ +//===- ARMInstrNEON.td - NEON support for ARM -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the ARM NEON instruction set. +// +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// NEON-specific Operands. +//===----------------------------------------------------------------------===// +def VectorIndex8Operand : AsmOperandClass { let Name = "VectorIndex8"; } +def VectorIndex16Operand : AsmOperandClass { let Name = "VectorIndex16"; } +def VectorIndex32Operand : AsmOperandClass { let Name = "VectorIndex32"; } +def VectorIndex8 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint64_t)Imm) < 8; +}]> { + let ParserMatchClass = VectorIndex8Operand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i32imm); +} +def VectorIndex16 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint64_t)Imm) < 4; +}]> { + let ParserMatchClass = VectorIndex16Operand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i32imm); +} +def VectorIndex32 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint64_t)Imm) < 2; +}]> { + let ParserMatchClass = VectorIndex32Operand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i32imm); +} + +//===----------------------------------------------------------------------===// +// NEON-specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDTARMVCMP : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>; +def SDTARMVCMPZ : SDTypeProfile<1, 1, []>; + +def NEONvceq : SDNode<"ARMISD::VCEQ", SDTARMVCMP>; +def NEONvceqz : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>; +def NEONvcge : SDNode<"ARMISD::VCGE", SDTARMVCMP>; +def NEONvcgez : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>; +def NEONvclez : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>; +def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>; +def NEONvcgt : SDNode<"ARMISD::VCGT", SDTARMVCMP>; +def NEONvcgtz : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>; +def NEONvcltz : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>; +def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>; +def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>; + +// Types for vector shift by immediates. The "SHX" version is for long and +// narrow operations where the source and destination vectors have different +// types. The "SHINS" version is for shift and insert operations. +def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def SDTARMVSHX : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; +def SDTARMVSHINS : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; + +def NEONvshl : SDNode<"ARMISD::VSHL", SDTARMVSH>; +def NEONvshrs : SDNode<"ARMISD::VSHRs", SDTARMVSH>; +def NEONvshru : SDNode<"ARMISD::VSHRu", SDTARMVSH>; +def NEONvshlls : SDNode<"ARMISD::VSHLLs", SDTARMVSHX>; +def NEONvshllu : SDNode<"ARMISD::VSHLLu", SDTARMVSHX>; +def NEONvshlli : SDNode<"ARMISD::VSHLLi", SDTARMVSHX>; +def NEONvshrn : SDNode<"ARMISD::VSHRN", SDTARMVSHX>; + +def NEONvrshrs : SDNode<"ARMISD::VRSHRs", SDTARMVSH>; +def NEONvrshru : SDNode<"ARMISD::VRSHRu", SDTARMVSH>; +def NEONvrshrn : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>; + +def NEONvqshls : SDNode<"ARMISD::VQSHLs", SDTARMVSH>; +def NEONvqshlu : SDNode<"ARMISD::VQSHLu", SDTARMVSH>; +def NEONvqshlsu : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>; +def NEONvqshrns : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>; +def NEONvqshrnu : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>; +def NEONvqshrnsu : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>; + +def NEONvqrshrns : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>; +def NEONvqrshrnu : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>; +def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>; + +def NEONvsli : SDNode<"ARMISD::VSLI", SDTARMVSHINS>; +def NEONvsri : SDNode<"ARMISD::VSRI", SDTARMVSHINS>; + +def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; +def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; +def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; + +def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; +def NEONvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>; +def NEONvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>; + +def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def NEONvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>; +def NEONvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>; + +def NEONvbsl : SDNode<"ARMISD::VBSL", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>>; + +def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; + +// VDUPLANE can produce a quad-register result from a double-register source, +// so the result is not constrained to match the source. +def NEONvduplane : SDNode<"ARMISD::VDUPLANE", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisVT<2, i32>]>>; + +def SDTARMVEXT : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; +def NEONvext : SDNode<"ARMISD::VEXT", SDTARMVEXT>; + +def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; +def NEONvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>; +def NEONvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>; +def NEONvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>; + +def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>; +def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>; +def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>; +def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>; + +def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisSameAs<1, 2>]>; +def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>; +def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>; + +def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def NEONfmax : SDNode<"ARMISD::FMAX", SDTARMFMAX>; +def NEONfmin : SDNode<"ARMISD::FMIN", SDTARMFMAX>; + +def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{ + ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); + unsigned EltBits = 0; + uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); + return (EltBits == 32 && EltVal == 0); +}]>; + +def NEONimmAllOnesV: PatLeaf<(NEONvmovImm (i32 timm)), [{ + ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); + unsigned EltBits = 0; + uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); + return (EltBits == 8 && EltVal == 0xff); +}]>; + +//===----------------------------------------------------------------------===// +// NEON operand definitions +//===----------------------------------------------------------------------===// + +def nModImm : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; +} + +//===----------------------------------------------------------------------===// +// NEON load / store instructions +//===----------------------------------------------------------------------===// + +// Use VLDM to load a Q register as a D register pair. +// This is a pseudo instruction that is expanded to VLDMD after reg alloc. +def VLDMQIA + : PseudoVFPLdStM<(outs QPR:$dst), (ins GPR:$Rn), + IIC_fpLoad_m, "", + [(set QPR:$dst, (v2f64 (load GPR:$Rn)))]>; + +// Use VSTM to store a Q register as a D register pair. +// This is a pseudo instruction that is expanded to VSTMD after reg alloc. +def VSTMQIA + : PseudoVFPLdStM<(outs), (ins QPR:$src, GPR:$Rn), + IIC_fpStore_m, "", + [(store (v2f64 QPR:$src), GPR:$Rn)]>; + +// Classes for VLD* pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VLDQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst), (ins addrmode6:$addr), itin, "">; +class VLDQWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), itin, + "$addr.addr = $wb">; +class VLDQQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr), itin, "">; +class VLDQQWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), itin, + "$addr.addr = $wb">; +class VLDQQQQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin, + "$src = $dst">; +class VLDQQQQWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin, + "$addr.addr = $wb, $src = $dst">; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { + +// VLD1 : Vector Load (multiple single elements) +class VLD1D<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$Vd), + (ins addrmode6:$Rn), IIC_VLD1, + "vld1", Dt, "\\{$Vd\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDInstruction"; +} +class VLD1Q<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$Vd, DPR:$dst2), + (ins addrmode6:$Rn), IIC_VLD1x2, + "vld1", Dt, "\\{$Vd, $dst2\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDInstruction"; +} + +def VLD1d8 : VLD1D<{0,0,0,?}, "8">; +def VLD1d16 : VLD1D<{0,1,0,?}, "16">; +def VLD1d32 : VLD1D<{1,0,0,?}, "32">; +def VLD1d64 : VLD1D<{1,1,0,?}, "64">; + +def VLD1q8 : VLD1Q<{0,0,?,?}, "8">; +def VLD1q16 : VLD1Q<{0,1,?,?}, "16">; +def VLD1q32 : VLD1Q<{1,0,?,?}, "32">; +def VLD1q64 : VLD1Q<{1,1,?,?}, "64">; + +def VLD1q8Pseudo : VLDQPseudo<IIC_VLD1x2>; +def VLD1q16Pseudo : VLDQPseudo<IIC_VLD1x2>; +def VLD1q32Pseudo : VLDQPseudo<IIC_VLD1x2>; +def VLD1q64Pseudo : VLDQPseudo<IIC_VLD1x2>; + +// ...with address register writeback: +class VLD1DWB<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$Vd, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1u, + "vld1", Dt, "\\{$Vd\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDInstruction"; +} +class VLD1QWB<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1x2u, + "vld1", Dt, "\\{$Vd, $dst2\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDInstruction"; +} + +def VLD1d8_UPD : VLD1DWB<{0,0,0,?}, "8">; +def VLD1d16_UPD : VLD1DWB<{0,1,0,?}, "16">; +def VLD1d32_UPD : VLD1DWB<{1,0,0,?}, "32">; +def VLD1d64_UPD : VLD1DWB<{1,1,0,?}, "64">; + +def VLD1q8_UPD : VLD1QWB<{0,0,?,?}, "8">; +def VLD1q16_UPD : VLD1QWB<{0,1,?,?}, "16">; +def VLD1q32_UPD : VLD1QWB<{1,0,?,?}, "32">; +def VLD1q64_UPD : VLD1QWB<{1,1,?,?}, "64">; + +def VLD1q8Pseudo_UPD : VLDQWBPseudo<IIC_VLD1x2u>; +def VLD1q16Pseudo_UPD : VLDQWBPseudo<IIC_VLD1x2u>; +def VLD1q32Pseudo_UPD : VLDQWBPseudo<IIC_VLD1x2u>; +def VLD1q64Pseudo_UPD : VLDQWBPseudo<IIC_VLD1x2u>; + +// ...with 3 registers (some of these are only for the disassembler): +class VLD1D3<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$Rn), IIC_VLD1x3, "vld1", Dt, + "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDInstruction"; +} +class VLD1D3WB<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1x3u, "vld1", Dt, + "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDInstruction"; +} + +def VLD1d8T : VLD1D3<{0,0,0,?}, "8">; +def VLD1d16T : VLD1D3<{0,1,0,?}, "16">; +def VLD1d32T : VLD1D3<{1,0,0,?}, "32">; +def VLD1d64T : VLD1D3<{1,1,0,?}, "64">; + +def VLD1d8T_UPD : VLD1D3WB<{0,0,0,?}, "8">; +def VLD1d16T_UPD : VLD1D3WB<{0,1,0,?}, "16">; +def VLD1d32T_UPD : VLD1D3WB<{1,0,0,?}, "32">; +def VLD1d64T_UPD : VLD1D3WB<{1,1,0,?}, "64">; + +def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>; +def VLD1d64TPseudo_UPD : VLDQQWBPseudo<IIC_VLD1x3u>; + +// ...with 4 registers (some of these are only for the disassembler): +class VLD1D4<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$Rn), IIC_VLD1x4, "vld1", Dt, + "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDInstruction"; +} +class VLD1D4WB<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0010,op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1x4u, "vld1", Dt, + "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", "$Rn.addr = $wb", + []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDInstruction"; +} + +def VLD1d8Q : VLD1D4<{0,0,?,?}, "8">; +def VLD1d16Q : VLD1D4<{0,1,?,?}, "16">; +def VLD1d32Q : VLD1D4<{1,0,?,?}, "32">; +def VLD1d64Q : VLD1D4<{1,1,?,?}, "64">; + +def VLD1d8Q_UPD : VLD1D4WB<{0,0,?,?}, "8">; +def VLD1d16Q_UPD : VLD1D4WB<{0,1,?,?}, "16">; +def VLD1d32Q_UPD : VLD1D4WB<{1,0,?,?}, "32">; +def VLD1d64Q_UPD : VLD1D4WB<{1,1,?,?}, "64">; + +def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>; +def VLD1d64QPseudo_UPD : VLDQQWBPseudo<IIC_VLD1x4u>; + +// VLD2 : Vector Load (multiple 2-element structures) +class VLD2D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2), + (ins addrmode6:$Rn), IIC_VLD2, + "vld2", Dt, "\\{$Vd, $dst2\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDInstruction"; +} +class VLD2Q<bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, 0b0011, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$Rn), IIC_VLD2x2, + "vld2", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDInstruction"; +} + +def VLD2d8 : VLD2D<0b1000, {0,0,?,?}, "8">; +def VLD2d16 : VLD2D<0b1000, {0,1,?,?}, "16">; +def VLD2d32 : VLD2D<0b1000, {1,0,?,?}, "32">; + +def VLD2q8 : VLD2Q<{0,0,?,?}, "8">; +def VLD2q16 : VLD2Q<{0,1,?,?}, "16">; +def VLD2q32 : VLD2Q<{1,0,?,?}, "32">; + +def VLD2d8Pseudo : VLDQPseudo<IIC_VLD2>; +def VLD2d16Pseudo : VLDQPseudo<IIC_VLD2>; +def VLD2d32Pseudo : VLDQPseudo<IIC_VLD2>; + +def VLD2q8Pseudo : VLDQQPseudo<IIC_VLD2x2>; +def VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>; +def VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>; + +// ...with address register writeback: +class VLD2DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD2u, + "vld2", Dt, "\\{$Vd, $dst2\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDInstruction"; +} +class VLD2QWB<bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, 0b0011, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD2x2u, + "vld2", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDInstruction"; +} + +def VLD2d8_UPD : VLD2DWB<0b1000, {0,0,?,?}, "8">; +def VLD2d16_UPD : VLD2DWB<0b1000, {0,1,?,?}, "16">; +def VLD2d32_UPD : VLD2DWB<0b1000, {1,0,?,?}, "32">; + +def VLD2q8_UPD : VLD2QWB<{0,0,?,?}, "8">; +def VLD2q16_UPD : VLD2QWB<{0,1,?,?}, "16">; +def VLD2q32_UPD : VLD2QWB<{1,0,?,?}, "32">; + +def VLD2d8Pseudo_UPD : VLDQWBPseudo<IIC_VLD2u>; +def VLD2d16Pseudo_UPD : VLDQWBPseudo<IIC_VLD2u>; +def VLD2d32Pseudo_UPD : VLDQWBPseudo<IIC_VLD2u>; + +def VLD2q8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD2x2u>; +def VLD2q16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD2x2u>; +def VLD2q32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD2x2u>; + +// ...with double-spaced registers (for disassembly only): +def VLD2b8 : VLD2D<0b1001, {0,0,?,?}, "8">; +def VLD2b16 : VLD2D<0b1001, {0,1,?,?}, "16">; +def VLD2b32 : VLD2D<0b1001, {1,0,?,?}, "32">; +def VLD2b8_UPD : VLD2DWB<0b1001, {0,0,?,?}, "8">; +def VLD2b16_UPD : VLD2DWB<0b1001, {0,1,?,?}, "16">; +def VLD2b32_UPD : VLD2DWB<0b1001, {1,0,?,?}, "32">; + +// VLD3 : Vector Load (multiple 3-element structures) +class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$Rn), IIC_VLD3, + "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDInstruction"; +} + +def VLD3d8 : VLD3D<0b0100, {0,0,0,?}, "8">; +def VLD3d16 : VLD3D<0b0100, {0,1,0,?}, "16">; +def VLD3d32 : VLD3D<0b0100, {1,0,0,?}, "32">; + +def VLD3d8Pseudo : VLDQQPseudo<IIC_VLD3>; +def VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>; +def VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>; + +// ...with address register writeback: +class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u, + "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLDInstruction"; +} + +def VLD3d8_UPD : VLD3DWB<0b0100, {0,0,0,?}, "8">; +def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">; +def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">; + +def VLD3d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>; +def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>; +def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>; + +// ...with double-spaced registers: +def VLD3q8 : VLD3D<0b0101, {0,0,0,?}, "8">; +def VLD3q16 : VLD3D<0b0101, {0,1,0,?}, "16">; +def VLD3q32 : VLD3D<0b0101, {1,0,0,?}, "32">; +def VLD3q8_UPD : VLD3DWB<0b0101, {0,0,0,?}, "8">; +def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">; +def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">; + +def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; + +// ...alternate versions to be allocated odd register numbers: +def VLD3q8oddPseudo : VLDQQQQPseudo<IIC_VLD3>; +def VLD3q16oddPseudo : VLDQQQQPseudo<IIC_VLD3>; +def VLD3q32oddPseudo : VLDQQQQPseudo<IIC_VLD3>; + +def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; + +// VLD4 : Vector Load (multiple 4-element structures) +class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$Rn), IIC_VLD4, + "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDInstruction"; +} + +def VLD4d8 : VLD4D<0b0000, {0,0,?,?}, "8">; +def VLD4d16 : VLD4D<0b0000, {0,1,?,?}, "16">; +def VLD4d32 : VLD4D<0b0000, {1,0,?,?}, "32">; + +def VLD4d8Pseudo : VLDQQPseudo<IIC_VLD4>; +def VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>; +def VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>; + +// ...with address register writeback: +class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u, + "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVLDInstruction"; +} + +def VLD4d8_UPD : VLD4DWB<0b0000, {0,0,?,?}, "8">; +def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">; +def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">; + +def VLD4d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>; +def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>; +def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>; + +// ...with double-spaced registers: +def VLD4q8 : VLD4D<0b0001, {0,0,?,?}, "8">; +def VLD4q16 : VLD4D<0b0001, {0,1,?,?}, "16">; +def VLD4q32 : VLD4D<0b0001, {1,0,?,?}, "32">; +def VLD4q8_UPD : VLD4DWB<0b0001, {0,0,?,?}, "8">; +def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">; +def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">; + +def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; + +// ...alternate versions to be allocated odd register numbers: +def VLD4q8oddPseudo : VLDQQQQPseudo<IIC_VLD4>; +def VLD4q16oddPseudo : VLDQQQQPseudo<IIC_VLD4>; +def VLD4q32oddPseudo : VLDQQQQPseudo<IIC_VLD4>; + +def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; + +} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 + +// Classes for VLD*LN pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VLDQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst), + (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane), + itin, "$src = $dst">; +class VLDQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; +class VLDQQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst), + (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane), + itin, "$src = $dst">; +class VLDQQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; +class VLDQQQQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQQQPR:$dst), + (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane), + itin, "$src = $dst">; +class VLDQQQQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; + +// VLD1LN : Vector Load (single element to one lane) +class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag LoadOp> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd), + (ins addrmode6:$Rn, DPR:$src, nohash_imm:$lane), + IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn", + "$src = $Vd", + [(set DPR:$Vd, (vector_insert (Ty DPR:$src), + (i32 (LoadOp addrmode6:$Rn)), + imm:$lane))]> { + let Rm = 0b1111; + let DecoderMethod = "DecodeVLD1LN"; +} +class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag LoadOp> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd), + (ins addrmode6oneL32:$Rn, DPR:$src, nohash_imm:$lane), + IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn", + "$src = $Vd", + [(set DPR:$Vd, (vector_insert (Ty DPR:$src), + (i32 (LoadOp addrmode6oneL32:$Rn)), + imm:$lane))]> { + let Rm = 0b1111; + let DecoderMethod = "DecodeVLD1LN"; +} +class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> { + let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src), + (i32 (LoadOp addrmode6:$addr)), + imm:$lane))]; +} + +def VLD1LNd8 : VLD1LN<0b0000, {?,?,?,0}, "8", v8i8, extloadi8> { + let Inst{7-5} = lane{2-0}; +} +def VLD1LNd16 : VLD1LN<0b0100, {?,?,0,?}, "16", v4i16, extloadi16> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{4}; +} +def VLD1LNd32 : VLD1LN32<0b1000, {?,0,?,?}, "32", v2i32, load> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{4}; + let Inst{4} = Rn{4}; +} + +def VLD1LNq8Pseudo : VLD1QLNPseudo<v16i8, extloadi8>; +def VLD1LNq16Pseudo : VLD1QLNPseudo<v8i16, extloadi16>; +def VLD1LNq32Pseudo : VLD1QLNPseudo<v4i32, load>; + +def : Pat<(vector_insert (v2f32 DPR:$src), + (f32 (load addrmode6:$addr)), imm:$lane), + (VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>; +def : Pat<(vector_insert (v4f32 QPR:$src), + (f32 (load addrmode6:$addr)), imm:$lane), + (VLD1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { + +// ...with address register writeback: +class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt, + "\\{$Vd[$lane]\\}, $Rn$Rm", + "$src = $Vd, $Rn.addr = $wb", []> { + let DecoderMethod = "DecodeVLD1LN"; +} + +def VLD1LNd8_UPD : VLD1LNWB<0b0000, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD1LNd16_UPD : VLD1LNWB<0b0100, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{4}; +} +def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{4}; + let Inst{4} = Rn{4}; +} + +def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; +def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; +def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; + +// VLD2LN : Vector Load (single 2-element structure to one lane) +class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2), + (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane), + IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn", + "$src1 = $Vd, $src2 = $dst2", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD2LN"; +} + +def VLD2LNd8 : VLD2LN<0b0001, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD2LNd8Pseudo : VLDQLNPseudo<IIC_VLD2ln>; +def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>; +def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>; + +// ...with double-spaced registers: +def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>; +def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>; + +// ...with address register writeback: +class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2lnu, "vld2", Dt, + "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn$Rm", + "$src1 = $Vd, $src2 = $dst2, $Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD2LN"; +} + +def VLD2LNd8_UPD : VLD2LNWB<0b0001, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; +def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; +def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; + +def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>; +def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>; + +// VLD3LN : Vector Load (single 3-element structure to one lane) +class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), + (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, + nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt, + "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn", + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> { + let Rm = 0b1111; + let DecoderMethod = "DecodeVLD3LN"; +} + +def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD3LNd8Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; +def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; +def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; + +// ...with double-spaced registers: +def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>; +def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>; + +// ...with address register writeback: +class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), + IIC_VLD3lnu, "vld3", Dt, + "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm", + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb", + []> { + let DecoderMethod = "DecodeVLD3LN"; +} + +def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; +def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; +def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; + +def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>; +def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>; + +// VLD4LN : Vector Load (single 4-element structure to one lane) +class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, + nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt, + "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn", + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD4LN"; +} + +def VLD4LNd8 : VLD4LN<0b0011, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VLD4LNd8Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; +def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; +def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; + +// ...with double-spaced registers: +def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>; +def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>; + +// ...with address register writeback: +class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b10, op11_8, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), + IIC_VLD4lnu, "vld4", Dt, +"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn$Rm", +"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $Rn.addr = $wb", + []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD4LN" ; +} + +def VLD4LNd8_UPD : VLD4LNWB<0b0011, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; +def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; +def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; + +def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>; +def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>; + +} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 + +// VLD1DUP : Vector Load (single element to all lanes) +class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp> + : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd), (ins addrmode6dup:$Rn), + IIC_VLD1dup, "vld1", Dt, "\\{$Vd[]\\}, $Rn", "", + [(set DPR:$Vd, (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD1DupInstruction"; +} +class VLD1QDUPPseudo<ValueType Ty, PatFrag LoadOp> : VLDQPseudo<IIC_VLD1dup> { + let Pattern = [(set QPR:$dst, + (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$addr)))))]; +} + +def VLD1DUPd8 : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8>; +def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16>; +def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load>; + +def VLD1DUPq8Pseudo : VLD1QDUPPseudo<v16i8, extloadi8>; +def VLD1DUPq16Pseudo : VLD1QDUPPseudo<v8i16, extloadi16>; +def VLD1DUPq32Pseudo : VLD1QDUPPseudo<v4i32, load>; + +def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), + (VLD1DUPd32 addrmode6:$addr)>; +def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), + (VLD1DUPq32Pseudo addrmode6:$addr)>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { + +class VLD1QDUP<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, DPR:$dst2), + (ins addrmode6dup:$Rn), IIC_VLD1dup, + "vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD1DupInstruction"; +} + +def VLD1DUPq8 : VLD1QDUP<{0,0,1,0}, "8">; +def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16">; +def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32">; + +// ...with address register writeback: +class VLD1DUPWB<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, GPR:$wb), + (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD1dupu, + "vld1", Dt, "\\{$Vd[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD1DupInstruction"; +} +class VLD1QDUPWB<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb), + (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD1dupu, + "vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD1DupInstruction"; +} + +def VLD1DUPd8_UPD : VLD1DUPWB<{0,0,0,0}, "8">; +def VLD1DUPd16_UPD : VLD1DUPWB<{0,1,0,?}, "16">; +def VLD1DUPd32_UPD : VLD1DUPWB<{1,0,0,?}, "32">; + +def VLD1DUPq8_UPD : VLD1QDUPWB<{0,0,1,0}, "8">; +def VLD1DUPq16_UPD : VLD1QDUPWB<{0,1,1,?}, "16">; +def VLD1DUPq32_UPD : VLD1QDUPWB<{1,0,1,?}, "32">; + +def VLD1DUPq8Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>; +def VLD1DUPq16Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>; +def VLD1DUPq32Pseudo_UPD : VLDQWBPseudo<IIC_VLD1dupu>; + +// VLD2DUP : Vector Load (single 2-element structure to all lanes) +class VLD2DUP<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1101, op7_4, (outs DPR:$Vd, DPR:$dst2), + (ins addrmode6dup:$Rn), IIC_VLD2dup, + "vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD2DupInstruction"; +} + +def VLD2DUPd8 : VLD2DUP<{0,0,0,?}, "8">; +def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16">; +def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32">; + +def VLD2DUPd8Pseudo : VLDQPseudo<IIC_VLD2dup>; +def VLD2DUPd16Pseudo : VLDQPseudo<IIC_VLD2dup>; +def VLD2DUPd32Pseudo : VLDQPseudo<IIC_VLD2dup>; + +// ...with double-spaced registers (not used for codegen): +def VLD2DUPd8x2 : VLD2DUP<{0,0,1,?}, "8">; +def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16">; +def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32">; + +// ...with address register writeback: +class VLD2DUPWB<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1101, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb), + (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD2dupu, + "vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD2DupInstruction"; +} + +def VLD2DUPd8_UPD : VLD2DUPWB<{0,0,0,0}, "8">; +def VLD2DUPd16_UPD : VLD2DUPWB<{0,1,0,?}, "16">; +def VLD2DUPd32_UPD : VLD2DUPWB<{1,0,0,?}, "32">; + +def VLD2DUPd8x2_UPD : VLD2DUPWB<{0,0,1,0}, "8">; +def VLD2DUPd16x2_UPD : VLD2DUPWB<{0,1,1,?}, "16">; +def VLD2DUPd32x2_UPD : VLD2DUPWB<{1,0,1,?}, "32">; + +def VLD2DUPd8Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>; +def VLD2DUPd16Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>; +def VLD2DUPd32Pseudo_UPD : VLDQWBPseudo<IIC_VLD2dupu>; + +// VLD3DUP : Vector Load (single 3-element structure to all lanes) +class VLD3DUP<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), + (ins addrmode6dup:$Rn), IIC_VLD3dup, + "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = 0; + let DecoderMethod = "DecodeVLD3DupInstruction"; +} + +def VLD3DUPd8 : VLD3DUP<{0,0,0,?}, "8">; +def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">; +def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">; + +def VLD3DUPd8Pseudo : VLDQQPseudo<IIC_VLD3dup>; +def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>; +def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>; + +// ...with double-spaced registers (not used for codegen): +def VLD3DUPd8x2 : VLD3DUP<{0,0,1,?}, "8">; +def VLD3DUPd16x2 : VLD3DUP<{0,1,1,?}, "16">; +def VLD3DUPd32x2 : VLD3DUP<{1,0,1,?}, "32">; + +// ...with address register writeback: +class VLD3DUPWB<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), + (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD3dupu, + "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = 0; + let DecoderMethod = "DecodeVLD3DupInstruction"; +} + +def VLD3DUPd8_UPD : VLD3DUPWB<{0,0,0,0}, "8">; +def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16">; +def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32">; + +def VLD3DUPd8x2_UPD : VLD3DUPWB<{0,0,1,0}, "8">; +def VLD3DUPd16x2_UPD : VLD3DUPWB<{0,1,1,?}, "16">; +def VLD3DUPd32x2_UPD : VLD3DUPWB<{1,0,1,?}, "32">; + +def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>; +def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>; +def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>; + +// VLD4DUP : Vector Load (single 4-element structure to all lanes) +class VLD4DUP<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1111, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6dup:$Rn), IIC_VLD4dup, + "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD4DupInstruction"; +} + +def VLD4DUPd8 : VLD4DUP<{0,0,0,?}, "8">; +def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">; +def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; } + +def VLD4DUPd8Pseudo : VLDQQPseudo<IIC_VLD4dup>; +def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>; +def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>; + +// ...with double-spaced registers (not used for codegen): +def VLD4DUPd8x2 : VLD4DUP<{0,0,1,?}, "8">; +def VLD4DUPd16x2 : VLD4DUP<{0,1,1,?}, "16">; +def VLD4DUPd32x2 : VLD4DUP<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; } + +// ...with address register writeback: +class VLD4DUPWB<bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, 0b1111, op7_4, + (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), + (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD4dupu, + "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVLD4DupInstruction"; +} + +def VLD4DUPd8_UPD : VLD4DUPWB<{0,0,0,0}, "8">; +def VLD4DUPd16_UPD : VLD4DUPWB<{0,1,0,?}, "16">; +def VLD4DUPd32_UPD : VLD4DUPWB<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; } + +def VLD4DUPd8x2_UPD : VLD4DUPWB<{0,0,1,0}, "8">; +def VLD4DUPd16x2_UPD : VLD4DUPWB<{0,1,1,?}, "16">; +def VLD4DUPd32x2_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; } + +def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>; +def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>; +def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>; + +} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 + +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { + +// Classes for VST* pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VSTQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src), itin, "">; +class VSTQWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src), itin, + "$addr.addr = $wb">; +class VSTQQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), itin, "">; +class VSTQQWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src), itin, + "$addr.addr = $wb">; +class VSTQQQQPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src), itin, "">; +class VSTQQQQWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin, + "$addr.addr = $wb">; + +// VST1 : Vector Store (multiple single elements) +class VST1D<bits<4> op7_4, string Dt> + : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd), + IIC_VST1, "vst1", Dt, "\\{$Vd\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVSTInstruction"; +} +class VST1Q<bits<4> op7_4, string Dt> + : NLdSt<0,0b00,0b1010,op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2), IIC_VST1x2, + "vst1", Dt, "\\{$Vd, $src2\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; +} + +def VST1d8 : VST1D<{0,0,0,?}, "8">; +def VST1d16 : VST1D<{0,1,0,?}, "16">; +def VST1d32 : VST1D<{1,0,0,?}, "32">; +def VST1d64 : VST1D<{1,1,0,?}, "64">; + +def VST1q8 : VST1Q<{0,0,?,?}, "8">; +def VST1q16 : VST1Q<{0,1,?,?}, "16">; +def VST1q32 : VST1Q<{1,0,?,?}, "32">; +def VST1q64 : VST1Q<{1,1,?,?}, "64">; + +def VST1q8Pseudo : VSTQPseudo<IIC_VST1x2>; +def VST1q16Pseudo : VSTQPseudo<IIC_VST1x2>; +def VST1q32Pseudo : VSTQPseudo<IIC_VST1x2>; +def VST1q64Pseudo : VSTQPseudo<IIC_VST1x2>; + +// ...with address register writeback: +class VST1DWB<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0111, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd), IIC_VST1u, + "vst1", Dt, "\\{$Vd\\}, $Rn$Rm", "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVSTInstruction"; +} +class VST1QWB<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b1010, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2), + IIC_VST1x2u, "vst1", Dt, "\\{$Vd, $src2\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; +} + +def VST1d8_UPD : VST1DWB<{0,0,0,?}, "8">; +def VST1d16_UPD : VST1DWB<{0,1,0,?}, "16">; +def VST1d32_UPD : VST1DWB<{1,0,0,?}, "32">; +def VST1d64_UPD : VST1DWB<{1,1,0,?}, "64">; + +def VST1q8_UPD : VST1QWB<{0,0,?,?}, "8">; +def VST1q16_UPD : VST1QWB<{0,1,?,?}, "16">; +def VST1q32_UPD : VST1QWB<{1,0,?,?}, "32">; +def VST1q64_UPD : VST1QWB<{1,1,?,?}, "64">; + +def VST1q8Pseudo_UPD : VSTQWBPseudo<IIC_VST1x2u>; +def VST1q16Pseudo_UPD : VSTQWBPseudo<IIC_VST1x2u>; +def VST1q32Pseudo_UPD : VSTQWBPseudo<IIC_VST1x2u>; +def VST1q64Pseudo_UPD : VSTQWBPseudo<IIC_VST1x2u>; + +// ...with 3 registers (some of these are only for the disassembler): +class VST1D3<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0110, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), + IIC_VST1x3, "vst1", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVSTInstruction"; +} +class VST1D3WB<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0110, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3), + IIC_VST1x3u, "vst1", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVSTInstruction"; +} + +def VST1d8T : VST1D3<{0,0,0,?}, "8">; +def VST1d16T : VST1D3<{0,1,0,?}, "16">; +def VST1d32T : VST1D3<{1,0,0,?}, "32">; +def VST1d64T : VST1D3<{1,1,0,?}, "64">; + +def VST1d8T_UPD : VST1D3WB<{0,0,0,?}, "8">; +def VST1d16T_UPD : VST1D3WB<{0,1,0,?}, "16">; +def VST1d32T_UPD : VST1D3WB<{1,0,0,?}, "32">; +def VST1d64T_UPD : VST1D3WB<{1,1,0,?}, "64">; + +def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>; +def VST1d64TPseudo_UPD : VSTQQWBPseudo<IIC_VST1x3u>; + +// ...with 4 registers (some of these are only for the disassembler): +class VST1D4<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0010, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST1x4, "vst1", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", "", + []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; +} +class VST1D4WB<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0010, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST1x4u, + "vst1", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; +} + +def VST1d8Q : VST1D4<{0,0,?,?}, "8">; +def VST1d16Q : VST1D4<{0,1,?,?}, "16">; +def VST1d32Q : VST1D4<{1,0,?,?}, "32">; +def VST1d64Q : VST1D4<{1,1,?,?}, "64">; + +def VST1d8Q_UPD : VST1D4WB<{0,0,?,?}, "8">; +def VST1d16Q_UPD : VST1D4WB<{0,1,?,?}, "16">; +def VST1d32Q_UPD : VST1D4WB<{1,0,?,?}, "32">; +def VST1d64Q_UPD : VST1D4WB<{1,1,?,?}, "64">; + +def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>; +def VST1d64QPseudo_UPD : VSTQQWBPseudo<IIC_VST1x4u>; + +// VST2 : Vector Store (multiple 2-element structures) +class VST2D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2), + IIC_VST2, "vst2", Dt, "\\{$Vd, $src2\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; +} +class VST2Q<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0011, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST2x2, "vst2", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", + "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; +} + +def VST2d8 : VST2D<0b1000, {0,0,?,?}, "8">; +def VST2d16 : VST2D<0b1000, {0,1,?,?}, "16">; +def VST2d32 : VST2D<0b1000, {1,0,?,?}, "32">; + +def VST2q8 : VST2Q<{0,0,?,?}, "8">; +def VST2q16 : VST2Q<{0,1,?,?}, "16">; +def VST2q32 : VST2Q<{1,0,?,?}, "32">; + +def VST2d8Pseudo : VSTQPseudo<IIC_VST2>; +def VST2d16Pseudo : VSTQPseudo<IIC_VST2>; +def VST2d32Pseudo : VSTQPseudo<IIC_VST2>; + +def VST2q8Pseudo : VSTQQPseudo<IIC_VST2x2>; +def VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>; +def VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>; + +// ...with address register writeback: +class VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2), + IIC_VST2u, "vst2", Dt, "\\{$Vd, $src2\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; +} +class VST2QWB<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST2x2u, + "vst2", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; +} + +def VST2d8_UPD : VST2DWB<0b1000, {0,0,?,?}, "8">; +def VST2d16_UPD : VST2DWB<0b1000, {0,1,?,?}, "16">; +def VST2d32_UPD : VST2DWB<0b1000, {1,0,?,?}, "32">; + +def VST2q8_UPD : VST2QWB<{0,0,?,?}, "8">; +def VST2q16_UPD : VST2QWB<{0,1,?,?}, "16">; +def VST2q32_UPD : VST2QWB<{1,0,?,?}, "32">; + +def VST2d8Pseudo_UPD : VSTQWBPseudo<IIC_VST2u>; +def VST2d16Pseudo_UPD : VSTQWBPseudo<IIC_VST2u>; +def VST2d32Pseudo_UPD : VSTQWBPseudo<IIC_VST2u>; + +def VST2q8Pseudo_UPD : VSTQQWBPseudo<IIC_VST2x2u>; +def VST2q16Pseudo_UPD : VSTQQWBPseudo<IIC_VST2x2u>; +def VST2q32Pseudo_UPD : VSTQQWBPseudo<IIC_VST2x2u>; + +// ...with double-spaced registers (for disassembly only): +def VST2b8 : VST2D<0b1001, {0,0,?,?}, "8">; +def VST2b16 : VST2D<0b1001, {0,1,?,?}, "16">; +def VST2b32 : VST2D<0b1001, {1,0,?,?}, "32">; +def VST2b8_UPD : VST2DWB<0b1001, {0,0,?,?}, "8">; +def VST2b16_UPD : VST2DWB<0b1001, {0,1,?,?}, "16">; +def VST2b32_UPD : VST2DWB<0b1001, {1,0,?,?}, "32">; + +// VST3 : Vector Store (multiple 3-element structures) +class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3, + "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVSTInstruction"; +} + +def VST3d8 : VST3D<0b0100, {0,0,0,?}, "8">; +def VST3d16 : VST3D<0b0100, {0,1,0,?}, "16">; +def VST3d32 : VST3D<0b0100, {1,0,0,?}, "32">; + +def VST3d8Pseudo : VSTQQPseudo<IIC_VST3>; +def VST3d16Pseudo : VSTQQPseudo<IIC_VST3>; +def VST3d32Pseudo : VSTQQPseudo<IIC_VST3>; + +// ...with address register writeback: +class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3u, + "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVSTInstruction"; +} + +def VST3d8_UPD : VST3DWB<0b0100, {0,0,0,?}, "8">; +def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">; +def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">; + +def VST3d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>; +def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>; +def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>; + +// ...with double-spaced registers: +def VST3q8 : VST3D<0b0101, {0,0,0,?}, "8">; +def VST3q16 : VST3D<0b0101, {0,1,0,?}, "16">; +def VST3q32 : VST3D<0b0101, {1,0,0,?}, "32">; +def VST3q8_UPD : VST3DWB<0b0101, {0,0,0,?}, "8">; +def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">; +def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">; + +def VST3q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; + +// ...alternate versions to be allocated odd register numbers: +def VST3q8oddPseudo : VSTQQQQPseudo<IIC_VST3>; +def VST3q16oddPseudo : VSTQQQQPseudo<IIC_VST3>; +def VST3q32oddPseudo : VSTQQQQPseudo<IIC_VST3>; + +def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; + +// VST4 : Vector Store (multiple 4-element structures) +class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST4, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", + "", []> { + let Rm = 0b1111; + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; +} + +def VST4d8 : VST4D<0b0000, {0,0,?,?}, "8">; +def VST4d16 : VST4D<0b0000, {0,1,?,?}, "16">; +def VST4d32 : VST4D<0b0000, {1,0,?,?}, "32">; + +def VST4d8Pseudo : VSTQQPseudo<IIC_VST4>; +def VST4d16Pseudo : VSTQQPseudo<IIC_VST4>; +def VST4d32Pseudo : VSTQQPseudo<IIC_VST4>; + +// ...with address register writeback: +class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4u, + "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{5-4} = Rn{5-4}; + let DecoderMethod = "DecodeVSTInstruction"; +} + +def VST4d8_UPD : VST4DWB<0b0000, {0,0,?,?}, "8">; +def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">; +def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">; + +def VST4d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>; +def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>; +def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>; + +// ...with double-spaced registers: +def VST4q8 : VST4D<0b0001, {0,0,?,?}, "8">; +def VST4q16 : VST4D<0b0001, {0,1,?,?}, "16">; +def VST4q32 : VST4D<0b0001, {1,0,?,?}, "32">; +def VST4q8_UPD : VST4DWB<0b0001, {0,0,?,?}, "8">; +def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">; +def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">; + +def VST4q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; + +// ...alternate versions to be allocated odd register numbers: +def VST4q8oddPseudo : VSTQQQQPseudo<IIC_VST4>; +def VST4q16oddPseudo : VSTQQQQPseudo<IIC_VST4>; +def VST4q32oddPseudo : VSTQQQQPseudo<IIC_VST4>; + +def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; + +} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 + +// Classes for VST*LN pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VSTQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane), + itin, "">; +class VSTQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb">; +class VSTQQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane), + itin, "">; +class VSTQQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb">; +class VSTQQQQLNPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane), + itin, "">; +class VSTQQQQLNWBPseudo<InstrItinClass itin> + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb">; + +// VST1LN : Vector Store (single element from one lane) +class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag StoreOp, SDNode ExtractOp> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, nohash_imm:$lane), + IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "", + [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6:$Rn)]> { + let Rm = 0b1111; + let DecoderMethod = "DecodeVST1LN"; +} +class VST1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag StoreOp, SDNode ExtractOp> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6oneL32:$Rn, DPR:$Vd, nohash_imm:$lane), + IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "", + [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6oneL32:$Rn)]>{ + let Rm = 0b1111; + let DecoderMethod = "DecodeVST1LN"; +} +class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp> + : VSTQLNPseudo<IIC_VST1ln> { + let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), + addrmode6:$addr)]; +} + +def VST1LNd8 : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8, + NEONvgetlaneu> { + let Inst{7-5} = lane{2-0}; +} +def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16, + NEONvgetlaneu> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{5}; +} + +def VST1LNd32 : VST1LN32<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt> { + let Inst{7} = lane{0}; + let Inst{5-4} = Rn{5-4}; +} + +def VST1LNq8Pseudo : VST1QLNPseudo<v16i8, truncstorei8, NEONvgetlaneu>; +def VST1LNq16Pseudo : VST1QLNPseudo<v8i16, truncstorei16, NEONvgetlaneu>; +def VST1LNq32Pseudo : VST1QLNPseudo<v4i32, store, extractelt>; + +def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr), + (VST1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>; +def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr), + (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>; + +// ...with address register writeback: +class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, + PatFrag StoreOp, SDNode ExtractOp> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, nohash_imm:$lane), IIC_VST1lnu, "vst1", Dt, + "\\{$Vd[$lane]\\}, $Rn$Rm", + "$Rn.addr = $wb", + [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), + addrmode6:$Rn, am6offset:$Rm))]> { + let DecoderMethod = "DecodeVST1LN"; +} +class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp> + : VSTQLNWBPseudo<IIC_VST1lnu> { + let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), + addrmode6:$addr, am6offset:$offset))]; +} + +def VST1LNd8_UPD : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8, + NEONvgetlaneu> { + let Inst{7-5} = lane{2-0}; +} +def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16, + NEONvgetlaneu> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{5}; +} +def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store, + extractelt> { + let Inst{7} = lane{0}; + let Inst{5-4} = Rn{5-4}; +} + +def VST1LNq8Pseudo_UPD : VST1QLNWBPseudo<v16i8, post_truncsti8, NEONvgetlaneu>; +def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16,NEONvgetlaneu>; +def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo<v4i32, post_store, extractelt>; + +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { + +// VST2LN : Vector Store (single 2-element structure from one lane) +class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, nohash_imm:$lane), + IIC_VST2ln, "vst2", Dt, "\\{$Vd[$lane], $src2[$lane]\\}, $Rn", + "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVST2LN"; +} + +def VST2LNd8 : VST2LN<0b0001, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VST2LNd8Pseudo : VSTQLNPseudo<IIC_VST2ln>; +def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>; +def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>; + +// ...with double-spaced registers: +def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; + let Inst{4} = Rn{4}; +} +def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{4} = Rn{4}; +} + +def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>; +def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>; + +// ...with address register writeback: +class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, + DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST2lnu, "vst2", Dt, + "\\{$src1[$lane], $src2[$lane]\\}, $addr$offset", + "$addr.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVST2LN"; +} + +def VST2LNd8_UPD : VST2LNWB<0b0001, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>; +def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>; +def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>; + +def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> { + let Inst{7} = lane{0}; +} + +def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>; +def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>; + +// VST3LN : Vector Store (single 3-element structure from one lane) +class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, + nohash_imm:$lane), IIC_VST3ln, "vst3", Dt, + "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> { + let Rm = 0b1111; + let DecoderMethod = "DecodeVST3LN"; +} + +def VST3LNd8 : VST3LN<0b0010, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VST3LNd8Pseudo : VSTQQLNPseudo<IIC_VST3ln>; +def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>; +def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>; + +// ...with double-spaced registers: +def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VST3LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>; +def VST3LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>; + +// ...with address register writeback: +class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane), + IIC_VST3lnu, "vst3", Dt, + "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let DecoderMethod = "DecodeVST3LN"; +} + +def VST3LNd8_UPD : VST3LNWB<0b0010, {?,?,?,0}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; +def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; +def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; + +def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> { + let Inst{7} = lane{0}; +} + +def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>; +def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>; + +// VST4LN : Vector Store (single 4-element structure from one lane) +class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs), + (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, + nohash_imm:$lane), IIC_VST4ln, "vst4", Dt, + "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn", + "", []> { + let Rm = 0b1111; + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVST4LN"; +} + +def VST4LNd8 : VST4LN<0b0011, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VST4LNd8Pseudo : VSTQQLNPseudo<IIC_VST4ln>; +def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>; +def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>; + +// ...with double-spaced registers: +def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>; +def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>; + +// ...with address register writeback: +class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), + (ins addrmode6:$Rn, am6offset:$Rm, + DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), + IIC_VST4lnu, "vst4", Dt, + "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn$Rm", + "$Rn.addr = $wb", []> { + let Inst{4} = Rn{4}; + let DecoderMethod = "DecodeVST4LN"; +} + +def VST4LNd8_UPD : VST4LNWB<0b0011, {?,?,?,?}, "8"> { + let Inst{7-5} = lane{2-0}; +} +def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; +def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; +def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; + +def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> { + let Inst{7-6} = lane{1-0}; +} +def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> { + let Inst{7} = lane{0}; + let Inst{5} = Rn{5}; +} + +def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>; +def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>; + +} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 + + +//===----------------------------------------------------------------------===// +// NEON pattern fragments +//===----------------------------------------------------------------------===// + +// Extract D sub-registers of Q registers. +def DSubReg_i8_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, MVT::i32); +}]>; +def DSubReg_i16_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, MVT::i32); +}]>; +def DSubReg_i32_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, MVT::i32); +}]>; +def DSubReg_f64_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), MVT::i32); +}]>; + +// Extract S sub-registers of Q/D registers. +def SSubReg_f32_reg : SDNodeXForm<imm, [{ + assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), MVT::i32); +}]>; + +// Translate lane numbers from Q registers to D subregs. +def SubReg_i8_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 7, MVT::i32); +}]>; +def SubReg_i16_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 3, MVT::i32); +}]>; +def SubReg_i32_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 1, MVT::i32); +}]>; + +//===----------------------------------------------------------------------===// +// Instruction Classes +//===----------------------------------------------------------------------===// + +// Basic 2-register operations: double- and quad-register. +class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VUNAD, OpcodeStr, Dt,"$Vd, $Vm", "", + [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm))))]>; +class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VUNAQ, OpcodeStr, Dt,"$Vd, $Vm", "", + [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm))))]>; + +// Basic 2-register intrinsics, both double- and quad-register. +class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), + (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; +class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), + (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; + +// Narrow 2-register operations. +class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyD, ValueType TyQ, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd), + (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (TyD (OpNode (TyQ QPR:$Vm))))]>; + +// Narrow 2-register intrinsics. +class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyD, ValueType TyQ, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd), + (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vm))))]>; + +// Long 2-register operations (currently only used for VMOVL). +class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd), + (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vm))))]>; + +// Long 2-register intrinsics. +class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd), + (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vm))))]>; + +// 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register. +class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt> + : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$Vd, DPR:$Vm), + (ins DPR:$src1, DPR:$src2), IIC_VPERMD, + OpcodeStr, Dt, "$Vd, $Vm", + "$src1 = $Vd, $src2 = $Vm", []>; +class N2VQShuffle<bits<2> op19_18, bits<5> op11_7, + InstrItinClass itin, string OpcodeStr, string Dt> + : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$Vd, QPR:$Vm), + (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$Vd, $Vm", + "$src1 = $Vd, $src2 = $Vm", []>; + +// Basic 3-register operations: double- and quad-register. +class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> { + let isCommutable = Commutable; +} +// Same as N3VD but no data type. +class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, + ValueType ResTy, ValueType OpTy, + SDNode OpNode, bit Commutable> + : N3VX<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>{ + let isCommutable = Commutable; +} + +class N3VDSL<bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, SDNode ShOp> + : N3VLane32<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (Ty DPR:$Vd), + (Ty (ShOp (Ty DPR:$Vn), + (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VDSL16<bits<2> op21_20, bits<4> op11_8, + string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + : N3VLane16<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm[$lane]","", + [(set (Ty DPR:$Vd), + (Ty (ShOp (Ty DPR:$Vn), + (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { + let isCommutable = 0; +} + +class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> { + let isCommutable = Commutable; +} +class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, + ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> + : N3VX<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>{ + let isCommutable = Commutable; +} +class N3VQSL<bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode ShOp> + : N3VLane32<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (ResTy QPR:$Vd), + (ResTy (ShOp (ResTy QPR:$Vn), + (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode ShOp> + : N3VLane16<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm[$lane]","", + [(set (ResTy QPR:$Vd), + (ResTy (ShOp (ResTy QPR:$Vn), + (ResTy (NEONvduplane (OpTy DPR_8:$Vm), + imm:$lane)))))]> { + let isCommutable = 0; +} + +// Basic 3-register intrinsics, both double- and quad-register. +class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> { + let isCommutable = Commutable; +} +class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> + : N3VLane32<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (Ty DPR:$Vd), + (Ty (IntOp (Ty DPR:$Vn), + (Ty (NEONvduplane (Ty DPR_VFP2:$Vm), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> + : N3VLane16<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (Ty DPR:$Vd), + (Ty (IntOp (Ty DPR:$Vn), + (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm, DPR:$Vn), f, itin, + OpcodeStr, Dt, "$Vd, $Vm, $Vn", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (OpTy DPR:$Vn))))]> { + let isCommutable = 0; +} + +class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> { + let isCommutable = Commutable; +} +class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3VLane32<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (ResTy QPR:$Vn), + (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3VLane16<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (ResTy QPR:$Vn), + (ResTy (NEONvduplane (OpTy DPR_8:$Vm), + imm:$lane)))))]> { + let isCommutable = 0; +} +class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm, QPR:$Vn), f, itin, + OpcodeStr, Dt, "$Vd, $Vm, $Vn", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (OpTy QPR:$Vn))))]> { + let isCommutable = 0; +} + +// Multiply-Add/Sub operations: double- and quad-register. +class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, SDPatternOperator MulOp, SDPatternOperator OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, (Ty (OpNode DPR:$src1, + (Ty (MulOp DPR:$Vn, DPR:$Vm)))))]>; + +class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp> + : N3VLane32<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set (Ty DPR:$Vd), + (Ty (ShOp (Ty DPR:$src1), + (Ty (MulOp DPR:$Vn, + (Ty (NEONvduplane (Ty DPR_VFP2:$Vm), + imm:$lane)))))))]>; +class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType Ty, SDNode MulOp, SDNode ShOp> + : N3VLane16<0, 1, op21_20, op11_8, 1, 0, + (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set (Ty DPR:$Vd), + (Ty (ShOp (Ty DPR:$src1), + (Ty (MulOp DPR:$Vn, + (Ty (NEONvduplane (Ty DPR_8:$Vm), + imm:$lane)))))))]>; + +class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, + SDPatternOperator MulOp, SDPatternOperator OpNode> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (Ty (OpNode QPR:$src1, + (Ty (MulOp QPR:$Vn, QPR:$Vm)))))]>; +class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, + SDPatternOperator MulOp, SDPatternOperator ShOp> + : N3VLane32<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set (ResTy QPR:$Vd), + (ResTy (ShOp (ResTy QPR:$src1), + (ResTy (MulOp QPR:$Vn, + (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + imm:$lane)))))))]>; +class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, + SDNode MulOp, SDNode ShOp> + : N3VLane16<1, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set (ResTy QPR:$Vd), + (ResTy (ShOp (ResTy QPR:$src1), + (ResTy (MulOp QPR:$Vn, + (ResTy (NEONvduplane (OpTy DPR_8:$Vm), + imm:$lane)))))))]>; + +// Neon Intrinsic-Op instructions (VABA): double- and quad-register. +class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, Intrinsic IntOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, (Ty (OpNode DPR:$src1, + (Ty (IntOp (Ty DPR:$Vn), (Ty DPR:$Vm))))))]>; +class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType Ty, Intrinsic IntOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (Ty (OpNode QPR:$src1, + (Ty (IntOp (Ty QPR:$Vn), (Ty QPR:$Vm))))))]>; + +// Neon 3-argument intrinsics, both double- and quad-register. +// The destination register is also used as the first source operand register. +class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$src1), + (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>; +class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src1), + (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>; + +// Long Multiply-Add/Sub operations. +class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (OpNode (TyQ QPR:$src1), + (TyQ (MulOp (TyD DPR:$Vn), + (TyD DPR:$Vm)))))]>; +class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), + (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set QPR:$Vd, + (OpNode (TyQ QPR:$src1), + (TyQ (MulOp (TyD DPR:$Vn), + (TyD (NEONvduplane (TyD DPR_VFP2:$Vm), + imm:$lane))))))]>; +class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), + (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set QPR:$Vd, + (OpNode (TyQ QPR:$src1), + (TyQ (MulOp (TyD DPR:$Vn), + (TyD (NEONvduplane (TyD DPR_8:$Vm), + imm:$lane))))))]>; + +// Long Intrinsic-Op vector operations with explicit extend (VABAL). +class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp, + SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (OpNode (TyQ QPR:$src1), + (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn), + (TyD DPR:$Vm)))))))]>; + +// Neon Long 3-argument intrinsic. The destination register is +// a quad-register and is also used as the first source operand register. +class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, + (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$Vn), (TyD DPR:$Vm))))]>; +class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), + (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (ResTy QPR:$src1), + (OpTy DPR:$Vn), + (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + imm:$lane)))))]>; +class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), + (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (ResTy QPR:$src1), + (OpTy DPR:$Vn), + (OpTy (NEONvduplane (OpTy DPR_8:$Vm), + imm:$lane)))))]>; + +// Narrowing 3-register intrinsics. +class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, + Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINi4D, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vn), (TyQ QPR:$Vm))))]> { + let isCommutable = Commutable; +} + +// Long 3-register operations. +class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), (TyD DPR:$Vm))))]> { + let isCommutable = Commutable; +} +class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode> + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set QPR:$Vd, + (TyQ (OpNode (TyD DPR:$Vn), + (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>; +class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode> + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set QPR:$Vd, + (TyQ (OpNode (TyD DPR:$Vn), + (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>; + +// Long 3-register operations with explicitly extended operands. +class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp, + bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (OpNode (TyQ (ExtOp (TyD DPR:$Vn))), + (TyQ (ExtOp (TyD DPR:$Vm)))))]> { + let isCommutable = Commutable; +} + +// Long 3-register intrinsics with explicit extend (VABDL). +class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp, SDNode ExtOp, + bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn), + (TyD DPR:$Vm))))))]> { + let isCommutable = Commutable; +} + +// Long 3-register intrinsics. +class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vn), (TyD DPR:$Vm))))]> { + let isCommutable = Commutable; +} +class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (OpTy DPR:$Vn), + (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm), + imm:$lane)))))]>; +class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, + (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", + [(set (ResTy QPR:$Vd), + (ResTy (IntOp (OpTy DPR:$Vn), + (OpTy (NEONvduplane (OpTy DPR_8:$Vm), + imm:$lane)))))]>; + +// Wide 3-register operations. +class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, + SDNode OpNode, SDNode ExtOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$Vd), (ins QPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VSUBiD, + OpcodeStr, Dt, "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (OpNode (TyQ QPR:$Vn), + (TyQ (ExtOp (TyD DPR:$Vm)))))]> { + let isCommutable = Commutable; +} + +// Pairwise long 2-register intrinsics, both double- and quad-register. +class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; +class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; + +// Pairwise long 2-register accumulate intrinsics, +// both double- and quad-register. +// The destination register is also used as the first source operand register. +class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vm), IIC_VPALiD, + OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$Vm))))]>; +class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vm), IIC_VPALiQ, + OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$Vm))))]>; + +// Shift by immediate, +// both double- and quad-register. +class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Format f, InstrItinClass itin, Operand ImmTy, + string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> + : N2VImm<op24, op23, op11_8, op7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), f, itin, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set DPR:$Vd, (Ty (OpNode (Ty DPR:$Vm), (i32 imm:$SIMM))))]>; +class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Format f, InstrItinClass itin, Operand ImmTy, + string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> + : N2VImm<op24, op23, op11_8, op7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), f, itin, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>; + +// Long shift by immediate. +class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, + string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2VImm<op24, op23, op11_8, op7, op6, op4, + (outs QPR:$Vd), (ins DPR:$Vm, i32imm:$SIMM), N2RegVShLFrm, + IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set QPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm), + (i32 imm:$SIMM))))]>; + +// Narrow shift by immediate. +class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Operand ImmTy, SDNode OpNode> + : N2VImm<op24, op23, op11_8, op7, op6, op4, + (outs DPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, itin, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set DPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm), + (i32 imm:$SIMM))))]>; + +// Shift right by immediate and accumulate, +// both double- and quad-register. +class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Operand ImmTy, string OpcodeStr, string Dt, + ValueType Ty, SDNode ShOp> + : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", + [(set DPR:$Vd, (Ty (add DPR:$src1, + (Ty (ShOp DPR:$Vm, (i32 imm:$SIMM))))))]>; +class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Operand ImmTy, string OpcodeStr, string Dt, + ValueType Ty, SDNode ShOp> + : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", + [(set QPR:$Vd, (Ty (add QPR:$src1, + (Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>; + +// Shift by immediate and insert, +// both double- and quad-register. +class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Operand ImmTy, Format f, string OpcodeStr, string Dt, + ValueType Ty,SDNode ShOp> + : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiD, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", + [(set DPR:$Vd, (Ty (ShOp DPR:$src1, DPR:$Vm, (i32 imm:$SIMM))))]>; +class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + Operand ImmTy, Format f, string OpcodeStr, string Dt, + ValueType Ty,SDNode ShOp> + : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiQ, + OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", + [(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>; + +// Convert, with fractional bits immediate, +// both double- and quad-register. +class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, + Intrinsic IntOp> + : N2VImm<op24, op23, op11_8, op7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm, + IIC_VUNAD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>; +class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, + string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, + Intrinsic IntOp> + : N2VImm<op24, op23, op11_8, op7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm, + IIC_VUNAQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", + [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (i32 imm:$SIMM))))]>; + +//===----------------------------------------------------------------------===// +// Multiclasses +//===----------------------------------------------------------------------===// + +// Abbreviations used in multiclass suffixes: +// Q = quarter int (8 bit) elements +// H = half int (16 bit) elements +// S = single int (32 bit) elements +// D = double int (64 bit) elements + +// Neon 2-register vector operations and intrinsics. + +// Neon 2-register comparisons. +// source operand element sizes of 8, 16 and 32 bits: +multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, string opc, string Dt, + string asm, SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "8"), asm, "", + [(set DPR:$Vd, (v8i8 (OpNode (v8i8 DPR:$Vm))))]>; + def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "16"), asm, "", + [(set DPR:$Vd, (v4i16 (OpNode (v4i16 DPR:$Vm))))]>; + def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "32"), asm, "", + [(set DPR:$Vd, (v2i32 (OpNode (v2i32 DPR:$Vm))))]>; + def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, "f32", asm, "", + [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> { + let Inst{10} = 1; // overwrite F = 1 + } + + // 128-bit vector types. + def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "8"), asm, "", + [(set QPR:$Vd, (v16i8 (OpNode (v16i8 QPR:$Vm))))]>; + def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "16"), asm, "", + [(set QPR:$Vd, (v8i16 (OpNode (v8i16 QPR:$Vm))))]>; + def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, !strconcat(Dt, "32"), asm, "", + [(set QPR:$Vd, (v4i32 (OpNode (v4i32 QPR:$Vm))))]>; + def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, "f32", asm, "", + [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> { + let Inst{10} = 1; // overwrite F = 1 + } +} + + +// Neon 2-register vector intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, Intrinsic IntOp> { + // 64-bit vector types. + def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; + def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + itinD, OpcodeStr, !strconcat(Dt, "16"),v4i16,v4i16,IntOp>; + def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + itinD, OpcodeStr, !strconcat(Dt, "32"),v2i32,v2i32,IntOp>; + + // 128-bit vector types. + def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8,v16i8,IntOp>; + def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + itinQ, OpcodeStr, !strconcat(Dt, "16"),v8i16,v8i16,IntOp>; + def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + itinQ, OpcodeStr, !strconcat(Dt, "32"),v4i32,v4i32,IntOp>; +} + + +// Neon Narrowing 2-register vector operations, +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + def v8i8 : N2VN<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, OpNode>; + def v4i16 : N2VN<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, OpNode>; + def v2i32 : N2VN<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, OpNode>; +} + +// Neon Narrowing 2-register vector intrinsics, +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + Intrinsic IntOp> { + def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, IntOp>; + def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, IntOp>; + def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4, + itin, OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, IntOp>; +} + + +// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL). +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VL_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4, + string OpcodeStr, string Dt, SDNode OpNode> { + def v8i16 : N2VL<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode>; + def v4i32 : N2VL<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode>; + def v2i64 : N2VL<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode>; +} + + +// Neon 3-register vector operations. + +// First with only element sizes of 8, 16 and 32 bits: +multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + SDNode OpNode, bit Commutable = 0> { + // 64-bit vector types. + def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "8"), + v8i8, v8i8, OpNode, Commutable>; + def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "16"), + v4i16, v4i16, OpNode, Commutable>; + def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, itinD32, + OpcodeStr, !strconcat(Dt, "32"), + v2i32, v2i32, OpNode, Commutable>; + + // 128-bit vector types. + def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), + v16i8, v16i8, OpNode, Commutable>; + def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), + v8i16, v8i16, OpNode, Commutable>; + def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), + v4i32, v4i32, OpNode, Commutable>; +} + +multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, string Dt, SDNode ShOp> { + def v4i16 : N3VDSL16<0b01, op11_8, OpcodeStr, !strconcat(Dt, "16"), + v4i16, ShOp>; + def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D, OpcodeStr, !strconcat(Dt,"32"), + v2i32, ShOp>; + def v8i16 : N3VQSL16<0b01, op11_8, OpcodeStr, !strconcat(Dt, "16"), + v8i16, v4i16, ShOp>; + def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q, OpcodeStr, !strconcat(Dt,"32"), + v4i32, v2i32, ShOp>; +} + +// ....then also with element size 64 bits: +multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, + SDNode OpNode, bit Commutable = 0> + : N3V_QHS<op24, op23, op11_8, op4, itinD, itinD, itinQ, itinQ, + OpcodeStr, Dt, OpNode, Commutable> { + def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "64"), + v1i64, v1i64, OpNode, Commutable>; + def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "64"), + v2i64, v2i64, OpNode, Commutable>; +} + + +// Neon 3-register vector intrinsics. + +// First with only element sizes of 16 and 32 bits: +multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> { + // 64-bit vector types. + def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16, + OpcodeStr, !strconcat(Dt, "16"), + v4i16, v4i16, IntOp, Commutable>; + def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, f, itinD32, + OpcodeStr, !strconcat(Dt, "32"), + v2i32, v2i32, IntOp, Commutable>; + + // 128-bit vector types. + def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, f, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), + v8i16, v8i16, IntOp, Commutable>; + def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, f, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), + v4i32, v4i32, IntOp, Commutable>; +} +multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + Intrinsic IntOp> { + // 64-bit vector types. + def v4i16 : N3VDIntSh<op24, op23, 0b01, op11_8, op4, f, itinD16, + OpcodeStr, !strconcat(Dt, "16"), + v4i16, v4i16, IntOp>; + def v2i32 : N3VDIntSh<op24, op23, 0b10, op11_8, op4, f, itinD32, + OpcodeStr, !strconcat(Dt, "32"), + v2i32, v2i32, IntOp>; + + // 128-bit vector types. + def v8i16 : N3VQIntSh<op24, op23, 0b01, op11_8, op4, f, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), + v8i16, v8i16, IntOp>; + def v4i32 : N3VQIntSh<op24, op23, 0b10, op11_8, op4, f, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), + v4i32, v4i32, IntOp>; +} + +multiclass N3VIntSL_HS<bits<4> op11_8, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, Intrinsic IntOp> { + def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp>; + def v2i32 : N3VDIntSL<0b10, op11_8, itinD32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp>; + def v8i16 : N3VQIntSL16<0b01, op11_8, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, IntOp>; + def v4i32 : N3VQIntSL<0b10, op11_8, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, IntOp>; +} + +// ....then also with element size of 8 bits: +multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> + : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, Dt, IntOp, Commutable> { + def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16, + OpcodeStr, !strconcat(Dt, "8"), + v8i8, v8i8, IntOp, Commutable>; + def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, f, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), + v16i8, v16i8, IntOp, Commutable>; +} +multiclass N3VInt_QHSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + Intrinsic IntOp> + : N3VInt_HSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, Dt, IntOp> { + def v8i8 : N3VDIntSh<op24, op23, 0b00, op11_8, op4, f, itinD16, + OpcodeStr, !strconcat(Dt, "8"), + v8i8, v8i8, IntOp>; + def v16i8 : N3VQIntSh<op24, op23, 0b00, op11_8, op4, f, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), + v16i8, v16i8, IntOp>; +} + + +// ....then also with element size of 64 bits: +multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> + : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, Dt, IntOp, Commutable> { + def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32, + OpcodeStr, !strconcat(Dt, "64"), + v1i64, v1i64, IntOp, Commutable>; + def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, f, itinQ32, + OpcodeStr, !strconcat(Dt, "64"), + v2i64, v2i64, IntOp, Commutable>; +} +multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, + Intrinsic IntOp> + : N3VInt_QHSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, + OpcodeStr, Dt, IntOp> { + def v1i64 : N3VDIntSh<op24, op23, 0b11, op11_8, op4, f, itinD32, + OpcodeStr, !strconcat(Dt, "64"), + v1i64, v1i64, IntOp>; + def v2i64 : N3VQIntSh<op24, op23, 0b11, op11_8, op4, f, itinQ32, + OpcodeStr, !strconcat(Dt, "64"), + v2i64, v2i64, IntOp>; +} + +// Neon Narrowing 3-register vector intrinsics, +// source operand element sizes of 16, 32 and 64 bits: +multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> { + def v8i8 : N3VNInt<op24, op23, 0b00, op11_8, op4, + OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, IntOp, Commutable>; + def v4i16 : N3VNInt<op24, op23, 0b01, op11_8, op4, + OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, IntOp, Commutable>; + def v2i32 : N3VNInt<op24, op23, 0b10, op11_8, op4, + OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, IntOp, Commutable>; +} + + +// Neon Long 3-register vector operations. + +multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + SDNode OpNode, bit Commutable = 0> { + def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, OpNode, Commutable>; + def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, OpNode, Commutable>; + def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, OpNode, Commutable>; +} + +multiclass N3VLSL_HS<bit op24, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + def v4i16 : N3VLSL16<op24, 0b01, op11_8, itin, OpcodeStr, + !strconcat(Dt, "16"), v4i32, v4i16, OpNode>; + def v2i32 : N3VLSL<op24, 0b10, op11_8, itin, OpcodeStr, + !strconcat(Dt, "32"), v2i64, v2i32, OpNode>; +} + +multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + SDNode OpNode, SDNode ExtOp, bit Commutable = 0> { + def v8i16 : N3VLExt<op24, op23, 0b00, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, OpNode, ExtOp, Commutable>; + def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, OpNode, ExtOp, Commutable>; + def v2i64 : N3VLExt<op24, op23, 0b10, op11_8, op4, itin32, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, OpNode, ExtOp, Commutable>; +} + +// Neon Long 3-register vector intrinsics. + +// First with only element sizes of 16 and 32 bits: +multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> { + def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, IntOp, Commutable>; + def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin32, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, IntOp, Commutable>; +} + +multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8, + InstrItinClass itin, string OpcodeStr, string Dt, + Intrinsic IntOp> { + def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; + def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; +} + +// ....then also with element size of 8 bits: +multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, + Intrinsic IntOp, bit Commutable = 0> + : N3VLInt_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, + IntOp, Commutable> { + def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, IntOp, Commutable>; +} + +// ....with explicit extend (VABDL). +multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + Intrinsic IntOp, SDNode ExtOp, bit Commutable = 0> { + def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, IntOp, ExtOp, Commutable>; + def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, IntOp, ExtOp, Commutable>; + def v2i64 : N3VLIntExt<op24, op23, 0b10, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, IntOp, ExtOp, Commutable>; +} + + +// Neon Wide 3-register vector intrinsics, +// source operand element sizes of 8, 16 and 32 bits: +multiclass N3VW_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, + SDNode OpNode, SDNode ExtOp, bit Commutable = 0> { + def v8i16 : N3VW<op24, op23, 0b00, op11_8, op4, + OpcodeStr, !strconcat(Dt, "8"), + v8i16, v8i8, OpNode, ExtOp, Commutable>; + def v4i32 : N3VW<op24, op23, 0b01, op11_8, op4, + OpcodeStr, !strconcat(Dt, "16"), + v4i32, v4i16, OpNode, ExtOp, Commutable>; + def v2i64 : N3VW<op24, op23, 0b10, op11_8, op4, + OpcodeStr, !strconcat(Dt, "32"), + v2i64, v2i32, OpNode, ExtOp, Commutable>; +} + + +// Neon Multiply-Op vector operations, +// element sizes of 8, 16 and 32 bits: +multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N3VDMulOp<op24, op23, 0b00, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "8"), v8i8, mul, OpNode>; + def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, itinD16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, OpNode>; + def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, itinD32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, OpNode>; + + // 128-bit vector types. + def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "8"), v16i8, mul, OpNode>; + def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, mul, OpNode>; + def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, mul, OpNode>; +} + +multiclass N3VMulOpSL_HS<bits<4> op11_8, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, SDNode ShOp> { + def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, ShOp>; + def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, ShOp>; + def v8i16 : N3VQMulOpSL16<0b01, op11_8, itinQ16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, + mul, ShOp>; + def v4i32 : N3VQMulOpSL<0b10, op11_8, itinQ32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, + mul, ShOp>; +} + +// Neon Intrinsic-Op vector operations, +// element sizes of 8, 16 and 32 bits: +multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, Intrinsic IntOp, + SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "8"), v8i8, IntOp, OpNode>; + def v4i16 : N3VDIntOp<op24, op23, 0b01, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp, OpNode>; + def v2i32 : N3VDIntOp<op24, op23, 0b10, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp, OpNode>; + + // 128-bit vector types. + def v16i8 : N3VQIntOp<op24, op23, 0b00, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "8"), v16i8, IntOp, OpNode>; + def v8i16 : N3VQIntOp<op24, op23, 0b01, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "16"), v8i16, IntOp, OpNode>; + def v4i32 : N3VQIntOp<op24, op23, 0b10, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "32"), v4i32, IntOp, OpNode>; +} + +// Neon 3-argument intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, + string OpcodeStr, string Dt, Intrinsic IntOp> { + // 64-bit vector types. + def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; + def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>; + def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, itinD, + OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>; + + // 128-bit vector types. + def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>; + def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>; + def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, itinQ, + OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>; +} + + +// Neon Long Multiply-Op vector operations, +// element sizes of 8, 16 and 32 bits: +multiclass N3VLMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, SDNode MulOp, + SDNode OpNode> { + def v8i16 : N3VLMulOp<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr, + !strconcat(Dt, "8"), v8i16, v8i8, MulOp, OpNode>; + def v4i32 : N3VLMulOp<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr, + !strconcat(Dt, "16"), v4i32, v4i16, MulOp, OpNode>; + def v2i64 : N3VLMulOp<op24, op23, 0b10, op11_8, op4, itin32, OpcodeStr, + !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>; +} + +multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr, + string Dt, SDNode MulOp, SDNode OpNode> { + def v4i16 : N3VLMulOpSL16<op24, 0b01, op11_8, IIC_VMACi16D, OpcodeStr, + !strconcat(Dt,"16"), v4i32, v4i16, MulOp, OpNode>; + def v2i32 : N3VLMulOpSL<op24, 0b10, op11_8, IIC_VMACi32D, OpcodeStr, + !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>; +} + + +// Neon Long 3-argument intrinsics. + +// First with only element sizes of 16 and 32 bits: +multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, Intrinsic IntOp> { + def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; + def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin32, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; +} + +multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8, + string OpcodeStr, string Dt, Intrinsic IntOp> { + def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D, + OpcodeStr, !strconcat(Dt,"16"), v4i32, v4i16, IntOp>; + def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; +} + +// ....then also with element size of 8 bits: +multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin16, InstrItinClass itin32, + string OpcodeStr, string Dt, Intrinsic IntOp> + : N3VLInt3_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp> { + def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin16, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>; +} + +// ....with explicit extend (VABAL). +multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + Intrinsic IntOp, SDNode ExtOp, SDNode OpNode> { + def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, + IntOp, ExtOp, OpNode>; + def v4i32 : N3VLIntExtOp<op24, op23, 0b01, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, + IntOp, ExtOp, OpNode>; + def v2i64 : N3VLIntExtOp<op24, op23, 0b10, op11_8, op4, itin, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, + IntOp, ExtOp, OpNode>; +} + + +// Neon Pairwise long 2-register intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, Intrinsic IntOp> { + // 64-bit vector types. + def v8i8 : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>; + def v4i16 : N2VDPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>; + def v2i32 : N2VDPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>; + + // 128-bit vector types. + def v16i8 : N2VQPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>; + def v8i16 : N2VQPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>; + def v4i32 : N2VQPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>; +} + + +// Neon Pairwise long 2-register accumulate intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, + string OpcodeStr, string Dt, Intrinsic IntOp> { + // 64-bit vector types. + def v8i8 : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>; + def v4i16 : N2VDPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>; + def v2i32 : N2VDPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>; + + // 128-bit vector types. + def v16i8 : N2VQPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>; + def v8i16 : N2VQPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>; + def v4i32 : N2VQPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>; +} + + +// Neon 2-register vector shift by immediate, +// with f of either N2RegVShLFrm or N2RegVShRFrm +// element sizes of 8, 16, 32 and 64 bits: +multiclass N2VShL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; + // imm6 = xxxxxx +} +multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8, + OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64, + OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8, + OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64, + OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; + // imm6 = xxxxxx +} + +// Neon Shift-Accumulate vector operations, +// element sizes of 8, 16, 32 and 64 bits: +multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, string Dt, SDNode ShOp> { + // 64-bit vector types. + def v8i8 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm8, + OpcodeStr, !strconcat(Dt, "8"), v8i8, ShOp> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm16, + OpcodeStr, !strconcat(Dt, "16"), v4i16, ShOp> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm32, + OpcodeStr, !strconcat(Dt, "32"), v2i32, ShOp> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4, shr_imm64, + OpcodeStr, !strconcat(Dt, "64"), v1i64, ShOp>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm8, + OpcodeStr, !strconcat(Dt, "8"), v16i8, ShOp> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm16, + OpcodeStr, !strconcat(Dt, "16"), v8i16, ShOp> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm32, + OpcodeStr, !strconcat(Dt, "32"), v4i32, ShOp> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4, shr_imm64, + OpcodeStr, !strconcat(Dt, "64"), v2i64, ShOp>; + // imm6 = xxxxxx +} + +// Neon Shift-Insert vector operations, +// with f of either N2RegVShLFrm or N2RegVShRFrm +// element sizes of 8, 16, 32 and 64 bits: +multiclass N2VShInsL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr> { + // 64-bit vector types. + def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsli> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsli> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsli> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsli>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsli> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsli> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsli> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsli>; + // imm6 = xxxxxx +} +multiclass N2VShInsR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr> { + // 64-bit vector types. + def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm8, + N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsri> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm16, + N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsri> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm32, + N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsri> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, shr_imm64, + N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsri>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm8, + N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsri> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm16, + N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsri> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm32, + N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsri> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, shr_imm64, + N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsri>; + // imm6 = xxxxxx +} + +// Neon Shift Long operations, +// element sizes of 8, 16, 32 bits: +multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, + bit op4, string OpcodeStr, string Dt, SDNode OpNode> { + def v8i16 : N2VLSh<op24, op23, op11_8, op7, op6, op4, + OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i32 : N2VLSh<op24, op23, op11_8, op7, op6, op4, + OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i64 : N2VLSh<op24, op23, op11_8, op7, op6, op4, + OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } +} + +// Neon Shift Narrow operations, +// element sizes of 16, 32, 64 bits: +multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, + bit op4, InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, + OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, shr_imm8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, + OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, shr_imm16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, + OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, shr_imm32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } +} + +//===----------------------------------------------------------------------===// +// Instruction Definitions. +//===----------------------------------------------------------------------===// + +// Vector Add Operations. + +// VADD : Vector Add (integer and floating-point) +defm VADD : N3V_QHSD<0, 0, 0b1000, 0, IIC_VBINiD, IIC_VBINiQ, "vadd", "i", + add, 1>; +def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32", + v2f32, v2f32, fadd, 1>; +def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32", + v4f32, v4f32, fadd, 1>; +// VADDL : Vector Add Long (Q = D + D) +defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, + "vaddl", "s", add, sext, 1>; +defm VADDLu : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, + "vaddl", "u", add, zext, 1>; +// VADDW : Vector Add Wide (Q = Q + D) +defm VADDWs : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>; +defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>; +// VHADD : Vector Halving Add +defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "s", int_arm_neon_vhadds, 1>; +defm VHADDu : N3VInt_QHS<1, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "u", int_arm_neon_vhaddu, 1>; +// VRHADD : Vector Rounding Halving Add +defm VRHADDs : N3VInt_QHS<0, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "s", int_arm_neon_vrhadds, 1>; +defm VRHADDu : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "u", int_arm_neon_vrhaddu, 1>; +// VQADD : Vector Saturating Add +defm VQADDs : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "s", int_arm_neon_vqadds, 1>; +defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "u", int_arm_neon_vqaddu, 1>; +// VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) +defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", + int_arm_neon_vaddhn, 1>; +// VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q) +defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i", + int_arm_neon_vraddhn, 1>; + +// Vector Multiply Operations. + +// VMUL : Vector Multiply (integer, polynomial and floating-point) +defm VMUL : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, "vmul", "i", mul, 1>; +def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul", + "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>; +def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul", + "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>; +def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32", + v2f32, v2f32, fmul, 1>; +def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32", + v4f32, v4f32, fmul, 1>; +defm VMULsl : N3VSL_HS<0b1000, "vmul", "i", mul>; +def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>; +def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32, + v2f32, fmul>; + +def : Pat<(v8i16 (mul (v8i16 QPR:$src1), + (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))), + (v8i16 (VMULslv8i16 (v8i16 QPR:$src1), + (v4i16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (mul (v4i32 QPR:$src1), + (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))), + (v4i32 (VMULslv4i32 (v4i32 QPR:$src1), + (v2i32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; +def : Pat<(v4f32 (fmul (v4f32 QPR:$src1), + (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))), + (v4f32 (VMULslfq (v4f32 QPR:$src1), + (v2f32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +// VQDMULH : Vector Saturating Doubling Multiply Returning High Half +defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, + "vqdmulh", "s", int_arm_neon_vqdmulh, 1>; +defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, + "vqdmulh", "s", int_arm_neon_vqdmulh>; +def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1), + (v8i16 (NEONvduplane (v8i16 QPR:$src2), + imm:$lane)))), + (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1), + (v4i16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1), + (v4i32 (NEONvduplane (v4i32 QPR:$src2), + imm:$lane)))), + (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1), + (v2i32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +// VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half +defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm, + IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q, + "vqrdmulh", "s", int_arm_neon_vqrdmulh, 1>; +defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D, + IIC_VMULi16Q, IIC_VMULi32Q, + "vqrdmulh", "s", int_arm_neon_vqrdmulh>; +def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1), + (v8i16 (NEONvduplane (v8i16 QPR:$src2), + imm:$lane)))), + (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1), + (v4i16 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1), + (v4i32 (NEONvduplane (v4i32 QPR:$src2), + imm:$lane)))), + (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1), + (v2i32 (EXTRACT_SUBREG QPR:$src2, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D) +defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, + "vmull", "s", NEONvmulls, 1>; +defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, + "vmull", "u", NEONvmullu, 1>; +def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8", + v8i16, v8i8, int_arm_neon_vmullp, 1>; +defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>; +defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>; + +// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D) +defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D, + "vqdmull", "s", int_arm_neon_vqdmull, 1>; +defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D, + "vqdmull", "s", int_arm_neon_vqdmull>; + +// Vector Multiply-Accumulate and Multiply-Subtract Operations. + +// VMLA : Vector Multiply Accumulate (integer and floating-point) +defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; +def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", + v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; +def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", + v4f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; +defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; +def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", + v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; +def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32", + v4f32, v2f32, fmul_su, fadd_mlx>, + Requires<[HasNEON, UseFPVMLx]>; + +def : Pat<(v8i16 (add (v8i16 QPR:$src1), + (mul (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + +def : Pat<(v4i32 (add (v4i32 QPR:$src1), + (mul (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1), + (fmul_su (v4f32 QPR:$src2), + (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (VMLAslfq (v4f32 QPR:$src1), + (v4f32 QPR:$src2), + (v2f32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>, + Requires<[HasNEON, UseFPVMLx]>; + +// VMLAL : Vector Multiply Accumulate Long (Q += D * D) +defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlal", "s", NEONvmulls, add>; +defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlal", "u", NEONvmullu, add>; + +defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>; +defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>; + +// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D) +defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, + "vqdmlal", "s", int_arm_neon_vqdmlal>; +defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>; + +// VMLS : Vector Multiply Subtract (integer and floating-point) +defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; +def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", + v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; +def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", + v4f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; +defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; +def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", + v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; +def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32", + v4f32, v2f32, fmul_su, fsub_mlx>, + Requires<[HasNEON, UseFPVMLx]>; + +def : Pat<(v8i16 (sub (v8i16 QPR:$src1), + (mul (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))), + (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + +def : Pat<(v4i32 (sub (v4i32 QPR:$src1), + (mul (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))), + (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1), + (fmul_su (v4f32 QPR:$src2), + (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))), + (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2), + (v2f32 (EXTRACT_SUBREG QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>, + Requires<[HasNEON, UseFPVMLx]>; + +// VMLSL : Vector Multiply Subtract Long (Q -= D * D) +defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlsl", "s", NEONvmulls, sub>; +defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, + "vmlsl", "u", NEONvmullu, sub>; + +defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>; +defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>; + +// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D) +defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, + "vqdmlsl", "s", int_arm_neon_vqdmlsl>; +defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; + +// Vector Subtract Operations. + +// VSUB : Vector Subtract (integer and floating-point) +defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, IIC_VSUBiD, IIC_VSUBiQ, + "vsub", "i", sub, 0>; +def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32", + v2f32, v2f32, fsub, 0>; +def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32", + v4f32, v4f32, fsub, 0>; +// VSUBL : Vector Subtract Long (Q = D - D) +defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, + "vsubl", "s", sub, sext, 0>; +defm VSUBLu : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, + "vsubl", "u", sub, zext, 0>; +// VSUBW : Vector Subtract Wide (Q = Q - D) +defm VSUBWs : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>; +defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>; +// VHSUB : Vector Halving Subtract +defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vhsub", "s", int_arm_neon_vhsubs, 0>; +defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vhsub", "u", int_arm_neon_vhsubu, 0>; +// VQSUB : Vector Saturing Subtract +defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vqsub", "s", int_arm_neon_vqsubs, 0>; +defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vqsub", "u", int_arm_neon_vqsubu, 0>; +// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q) +defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", + int_arm_neon_vsubhn, 0>; +// VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q) +defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i", + int_arm_neon_vrsubhn, 0>; + +// Vector Comparisons. + +// VCEQ : Vector Compare Equal +defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vceq", "i", NEONvceq, 1>; +def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, + NEONvceq, 1>; +def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, + NEONvceq, 1>; + +defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", + "$Vd, $Vm, #0", NEONvceqz>; + +// VCGE : Vector Compare Greater Than or Equal +defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>; +defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>; +def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, + NEONvcge, 0>; +def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, + NEONvcge, 0>; + +defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", + "$Vd, $Vm, #0", NEONvcgez>; +defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s", + "$Vd, $Vm, #0", NEONvclez>; + +// VCGT : Vector Compare Greater Than +defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcgt", "s", NEONvcgt, 0>; +defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, + IIC_VSUBi4Q, "vcgt", "u", NEONvcgtu, 0>; +def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, + NEONvcgt, 0>; +def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, + NEONvcgt, 0>; + +defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", + "$Vd, $Vm, #0", NEONvcgtz>; +defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", + "$Vd, $Vm, #0", NEONvcltz>; + +// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) +def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", + "f32", v2i32, v2f32, int_arm_neon_vacged, 0>; +def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", + "f32", v4i32, v4f32, int_arm_neon_vacgeq, 0>; +// VACGT : Vector Absolute Compare Greater Than (aka VCAGT) +def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", + "f32", v2i32, v2f32, int_arm_neon_vacgtd, 0>; +def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", + "f32", v4i32, v4f32, int_arm_neon_vacgtq, 0>; +// VTST : Vector Test Bits +defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, + IIC_VBINi4Q, "vtst", "", NEONvtst, 1>; + +// Vector Bitwise Operations. + +def vnotd : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v8i8 NEONimmAllOnesV)))>; +def vnotq : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 NEONimmAllOnesV)))>; + + +// VAND : Vector Bitwise AND +def VANDd : N3VDX<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand", + v2i32, v2i32, and, 1>; +def VANDq : N3VQX<0, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "vand", + v4i32, v4i32, and, 1>; + +// VEOR : Vector Bitwise Exclusive OR +def VEORd : N3VDX<1, 0, 0b00, 0b0001, 1, IIC_VBINiD, "veor", + v2i32, v2i32, xor, 1>; +def VEORq : N3VQX<1, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "veor", + v4i32, v4i32, xor, 1>; + +// VORR : Vector Bitwise OR +def VORRd : N3VDX<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr", + v2i32, v2i32, or, 1>; +def VORRq : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr", + v4i32, v4i32, or, 1>; + +def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1, + (outs DPR:$Vd), (ins nModImm:$SIMM, DPR:$src), + IIC_VMOVImm, + "vorr", "i16", "$Vd, $SIMM", "$src = $Vd", + [(set DPR:$Vd, + (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VORRiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 0, 1, + (outs DPR:$Vd), (ins nModImm:$SIMM, DPR:$src), + IIC_VMOVImm, + "vorr", "i32", "$Vd, $SIMM", "$src = $Vd", + [(set DPR:$Vd, + (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> { + let Inst{10-9} = SIMM{10-9}; +} + +def VORRiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 0, 1, + (outs QPR:$Vd), (ins nModImm:$SIMM, QPR:$src), + IIC_VMOVImm, + "vorr", "i16", "$Vd, $SIMM", "$src = $Vd", + [(set QPR:$Vd, + (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1, + (outs QPR:$Vd), (ins nModImm:$SIMM, QPR:$src), + IIC_VMOVImm, + "vorr", "i32", "$Vd, $SIMM", "$src = $Vd", + [(set QPR:$Vd, + (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> { + let Inst{10-9} = SIMM{10-9}; +} + + +// VBIC : Vector Bitwise Bit Clear (AND NOT) +def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD, + "vbic", "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (v2i32 (and DPR:$Vn, + (vnotd DPR:$Vm))))]>; +def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), + (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ, + "vbic", "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (v4i32 (and QPR:$Vn, + (vnotq QPR:$Vm))))]>; + +def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1, + (outs DPR:$Vd), (ins nModImm:$SIMM, DPR:$src), + IIC_VMOVImm, + "vbic", "i16", "$Vd, $SIMM", "$src = $Vd", + [(set DPR:$Vd, + (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VBICiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 1, 1, + (outs DPR:$Vd), (ins nModImm:$SIMM, DPR:$src), + IIC_VMOVImm, + "vbic", "i32", "$Vd, $SIMM", "$src = $Vd", + [(set DPR:$Vd, + (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> { + let Inst{10-9} = SIMM{10-9}; +} + +def VBICiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 1, 1, + (outs QPR:$Vd), (ins nModImm:$SIMM, QPR:$src), + IIC_VMOVImm, + "vbic", "i16", "$Vd, $SIMM", "$src = $Vd", + [(set QPR:$Vd, + (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VBICiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 1, 1, + (outs QPR:$Vd), (ins nModImm:$SIMM, QPR:$src), + IIC_VMOVImm, + "vbic", "i32", "$Vd, $SIMM", "$src = $Vd", + [(set QPR:$Vd, + (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> { + let Inst{10-9} = SIMM{10-9}; +} + +// VORN : Vector Bitwise OR NOT +def VORNd : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD, + "vorn", "$Vd, $Vn, $Vm", "", + [(set DPR:$Vd, (v2i32 (or DPR:$Vn, + (vnotd DPR:$Vm))))]>; +def VORNq : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$Vd), + (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ, + "vorn", "$Vd, $Vn, $Vm", "", + [(set QPR:$Vd, (v4i32 (or QPR:$Vn, + (vnotq QPR:$Vm))))]>; + +// VMVN : Vector Bitwise NOT (Immediate) + +let isReMaterializable = 1 in { + +def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$Vd), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmvn", "i16", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v4i16 (NEONvmvnImm timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$Vd), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmvn", "i16", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v8i16 (NEONvmvnImm timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$Vd), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmvn", "i32", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v2i32 (NEONvmvnImm timm:$SIMM)))]> { + let Inst{11-8} = SIMM{11-8}; +} + +def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$Vd), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmvn", "i32", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v4i32 (NEONvmvnImm timm:$SIMM)))]> { + let Inst{11-8} = SIMM{11-8}; +} +} + +// VMVN : Vector Bitwise NOT +def VMVNd : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0, + (outs DPR:$Vd), (ins DPR:$Vm), IIC_VSUBiD, + "vmvn", "$Vd, $Vm", "", + [(set DPR:$Vd, (v2i32 (vnotd DPR:$Vm)))]>; +def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vm), IIC_VSUBiD, + "vmvn", "$Vd, $Vm", "", + [(set QPR:$Vd, (v4i32 (vnotq QPR:$Vm)))]>; +def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>; +def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>; + +// VBSL : Vector Bitwise Select +def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vn, DPR:$Vm), + N3RegFrm, IIC_VCNTiD, + "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set DPR:$Vd, + (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>; + +def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd), + (and DPR:$Vm, (vnotd DPR:$Vd)))), + (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; + +def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vn, QPR:$Vm), + N3RegFrm, IIC_VCNTiQ, + "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", + [(set QPR:$Vd, + (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>; + +def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd), + (and QPR:$Vm, (vnotq QPR:$Vd)))), + (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; + +// VBIF : Vector Bitwise Insert if False +// like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", +// FIXME: This instruction's encoding MAY NOT BE correct. +def VBIFd : N3VX<1, 0, 0b11, 0b0001, 0, 1, + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), + N3RegFrm, IIC_VBINiD, + "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd", + [/* For disassembly only; pattern left blank */]>; +def VBIFq : N3VX<1, 0, 0b11, 0b0001, 1, 1, + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), + N3RegFrm, IIC_VBINiQ, + "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd", + [/* For disassembly only; pattern left blank */]>; + +// VBIT : Vector Bitwise Insert if True +// like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst", +// FIXME: This instruction's encoding MAY NOT BE correct. +def VBITd : N3VX<1, 0, 0b10, 0b0001, 0, 1, + (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), + N3RegFrm, IIC_VBINiD, + "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd", + [/* For disassembly only; pattern left blank */]>; +def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, + (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), + N3RegFrm, IIC_VBINiQ, + "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd", + [/* For disassembly only; pattern left blank */]>; + +// VBIT/VBIF are not yet implemented. The TwoAddress pass will not go looking +// for equivalent operations with different register constraints; it just +// inserts copies. + +// Vector Absolute Differences. + +// VABD : Vector Absolute Difference +defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vabd", "s", int_arm_neon_vabds, 1>; +defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vabd", "u", int_arm_neon_vabdu, 1>; +def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND, + "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>; +def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ, + "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>; + +// VABDL : Vector Absolute Difference Long (Q = | D - D |) +defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, + "vabdl", "s", int_arm_neon_vabds, zext, 1>; +defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, + "vabdl", "u", int_arm_neon_vabdu, zext, 1>; + +// VABA : Vector Absolute Difference and Accumulate +defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, + "vaba", "s", int_arm_neon_vabds, add>; +defm VABAu : N3VIntOp_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ, + "vaba", "u", int_arm_neon_vabdu, add>; + +// VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |) +defm VABALs : N3VLIntExtOp_QHS<0,1,0b0101,0, IIC_VABAD, + "vabal", "s", int_arm_neon_vabds, zext, add>; +defm VABALu : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD, + "vabal", "u", int_arm_neon_vabdu, zext, add>; + +// Vector Maximum and Minimum. + +// VMAX : Vector Maximum +defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vmax", "s", int_arm_neon_vmaxs, 1>; +defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vmax", "u", int_arm_neon_vmaxu, 1>; +def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmax", "f32", + v2f32, v2f32, int_arm_neon_vmaxs, 1>; +def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmax", "f32", + v4f32, v4f32, int_arm_neon_vmaxs, 1>; + +// VMIN : Vector Minimum +defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vmin", "s", int_arm_neon_vmins, 1>; +defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm, + IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, + "vmin", "u", int_arm_neon_vminu, 1>; +def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmin", "f32", + v2f32, v2f32, int_arm_neon_vmins, 1>; +def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmin", "f32", + v4f32, v4f32, int_arm_neon_vmins, 1>; + +// Vector Pairwise Operations. + +// VPADD : Vector Pairwise Add +def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, N3RegFrm, IIC_VSHLiD, + "vpadd", "i8", + v8i8, v8i8, int_arm_neon_vpadd, 0>; +def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VSHLiD, + "vpadd", "i16", + v4i16, v4i16, int_arm_neon_vpadd, 0>; +def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD, + "vpadd", "i32", + v2i32, v2i32, int_arm_neon_vpadd, 0>; +def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, + IIC_VPBIND, "vpadd", "f32", + v2f32, v2f32, int_arm_neon_vpadd, 0>; + +// VPADDL : Vector Pairwise Add Long +defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s", + int_arm_neon_vpaddls>; +defm VPADDLu : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpaddl", "u", + int_arm_neon_vpaddlu>; + +// VPADAL : Vector Pairwise Add and Accumulate Long +defm VPADALs : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01100, 0, "vpadal", "s", + int_arm_neon_vpadals>; +defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01101, 0, "vpadal", "u", + int_arm_neon_vpadalu>; + +// VPMAX : Vector Pairwise Maximum +def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "s8", v8i8, v8i8, int_arm_neon_vpmaxs, 0>; +def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "s16", v4i16, v4i16, int_arm_neon_vpmaxs, 0>; +def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "s32", v2i32, v2i32, int_arm_neon_vpmaxs, 0>; +def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "u8", v8i8, v8i8, int_arm_neon_vpmaxu, 0>; +def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>; +def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", + "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>; +def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax", + "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>; + +// VPMIN : Vector Pairwise Minimum +def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "s8", v8i8, v8i8, int_arm_neon_vpmins, 0>; +def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "s16", v4i16, v4i16, int_arm_neon_vpmins, 0>; +def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "s32", v2i32, v2i32, int_arm_neon_vpmins, 0>; +def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "u8", v8i8, v8i8, int_arm_neon_vpminu, 0>; +def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>; +def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", + "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>; +def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin", + "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>; + +// Vector Reciprocal and Reciprocal Square Root Estimate and Step. + +// VRECPE : Vector Reciprocal Estimate +def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, + IIC_VUNAD, "vrecpe", "u32", + v2i32, v2i32, int_arm_neon_vrecpe>; +def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, + IIC_VUNAQ, "vrecpe", "u32", + v4i32, v4i32, int_arm_neon_vrecpe>; +def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, + IIC_VUNAD, "vrecpe", "f32", + v2f32, v2f32, int_arm_neon_vrecpe>; +def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, + IIC_VUNAQ, "vrecpe", "f32", + v4f32, v4f32, int_arm_neon_vrecpe>; + +// VRECPS : Vector Reciprocal Step +def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrecps", "f32", + v2f32, v2f32, int_arm_neon_vrecps, 1>; +def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrecps", "f32", + v4f32, v4f32, int_arm_neon_vrecps, 1>; + +// VRSQRTE : Vector Reciprocal Square Root Estimate +def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, + IIC_VUNAD, "vrsqrte", "u32", + v2i32, v2i32, int_arm_neon_vrsqrte>; +def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, + IIC_VUNAQ, "vrsqrte", "u32", + v4i32, v4i32, int_arm_neon_vrsqrte>; +def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, + IIC_VUNAD, "vrsqrte", "f32", + v2f32, v2f32, int_arm_neon_vrsqrte>; +def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, + IIC_VUNAQ, "vrsqrte", "f32", + v4f32, v4f32, int_arm_neon_vrsqrte>; + +// VRSQRTS : Vector Reciprocal Square Root Step +def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrsqrts", "f32", + v2f32, v2f32, int_arm_neon_vrsqrts, 1>; +def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrsqrts", "f32", + v4f32, v4f32, int_arm_neon_vrsqrts, 1>; + +// Vector Shifts. + +// VSHL : Vector Shift +defm VSHLs : N3VInt_QHSDSh<0, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "s", int_arm_neon_vshifts>; +defm VSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "u", int_arm_neon_vshiftu>; + +// VSHL : Vector Shift Left (Immediate) +defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>; + +// VSHR : Vector Shift Right (Immediate) +defm VSHRs : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s",NEONvshrs>; +defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u",NEONvshru>; + +// VSHLL : Vector Shift Left Long +defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", NEONvshlls>; +defm VSHLLu : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u", NEONvshllu>; + +// VSHLL : Vector Shift Left Long (with maximum shift count) +class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy, + ValueType OpTy, SDNode OpNode> + : N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, Dt, + ResTy, OpTy, OpNode> { + let Inst{21-16} = op21_16; + let DecoderMethod = "DecodeVSHLMaxInstruction"; +} +def VSHLLi8 : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll", "i8", + v8i16, v8i8, NEONvshlli>; +def VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll", "i16", + v4i32, v4i16, NEONvshlli>; +def VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32", + v2i64, v2i32, NEONvshlli>; + +// VSHRN : Vector Shift Right and Narrow +defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", + NEONvshrn>; + +// VRSHL : Vector Rounding Shift +defm VRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "s", int_arm_neon_vrshifts>; +defm VRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "u", int_arm_neon_vrshiftu>; +// VRSHR : Vector Rounding Shift Right +defm VRSHRs : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s",NEONvrshrs>; +defm VRSHRu : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u",NEONvrshru>; + +// VRSHRN : Vector Rounding Shift Right and Narrow +defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", + NEONvrshrn>; + +// VQSHL : Vector Saturating Shift +defm VQSHLs : N3VInt_QHSDSh<0, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "s", int_arm_neon_vqshifts>; +defm VQSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "u", int_arm_neon_vqshiftu>; +// VQSHL : Vector Saturating Shift Left (Immediate) +defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls>; +defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu>; + +// VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) +defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu>; + +// VQSHRN : Vector Saturating Shift Right and Narrow +defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s", + NEONvqshrns>; +defm VQSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "u", + NEONvqshrnu>; + +// VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned) +defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s", + NEONvqshrnsu>; + +// VQRSHL : Vector Saturating Rounding Shift +defm VQRSHLs : N3VInt_QHSDSh<0, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "s", int_arm_neon_vqrshifts>; +defm VQRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "u", int_arm_neon_vqrshiftu>; + +// VQRSHRN : Vector Saturating Rounding Shift Right and Narrow +defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s", + NEONvqrshrns>; +defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "u", + NEONvqrshrnu>; + +// VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned) +defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun", "s", + NEONvqrshrnsu>; + +// VSRA : Vector Shift Right and Accumulate +defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrs>; +defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshru>; +// VRSRA : Vector Rounding Shift Right and Accumulate +defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>; +defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>; + +// VSLI : Vector Shift Left and Insert +defm VSLI : N2VShInsL_QHSD<1, 1, 0b0101, 1, "vsli">; + +// VSRI : Vector Shift Right and Insert +defm VSRI : N2VShInsR_QHSD<1, 1, 0b0100, 1, "vsri">; + +// Vector Absolute and Saturating Absolute. + +// VABS : Vector Absolute Value +defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, + IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s", + int_arm_neon_vabs>; +def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, + IIC_VUNAD, "vabs", "f32", + v2f32, v2f32, int_arm_neon_vabs>; +def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, + IIC_VUNAQ, "vabs", "f32", + v4f32, v4f32, int_arm_neon_vabs>; + +// VQABS : Vector Saturating Absolute Value +defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, + IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs", "s", + int_arm_neon_vqabs>; + +// Vector Negate. + +def vnegd : PatFrag<(ops node:$in), + (sub (bitconvert (v2i32 NEONimmAllZerosV)), node:$in)>; +def vnegq : PatFrag<(ops node:$in), + (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>; + +class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm), + IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (Ty (vnegd DPR:$Vm)))]>; +class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), + IIC_VSHLiQ, OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (Ty (vnegq QPR:$Vm)))]>; + +// VNEG : Vector Negate (integer) +def VNEGs8d : VNEGD<0b00, "vneg", "s8", v8i8>; +def VNEGs16d : VNEGD<0b01, "vneg", "s16", v4i16>; +def VNEGs32d : VNEGD<0b10, "vneg", "s32", v2i32>; +def VNEGs8q : VNEGQ<0b00, "vneg", "s8", v16i8>; +def VNEGs16q : VNEGQ<0b01, "vneg", "s16", v8i16>; +def VNEGs32q : VNEGQ<0b10, "vneg", "s32", v4i32>; + +// VNEG : Vector Negate (floating-point) +def VNEGfd : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0, + (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD, + "vneg", "f32", "$Vd, $Vm", "", + [(set DPR:$Vd, (v2f32 (fneg DPR:$Vm)))]>; +def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ, + "vneg", "f32", "$Vd, $Vm", "", + [(set QPR:$Vd, (v4f32 (fneg QPR:$Vm)))]>; + +def : Pat<(v8i8 (vnegd DPR:$src)), (VNEGs8d DPR:$src)>; +def : Pat<(v4i16 (vnegd DPR:$src)), (VNEGs16d DPR:$src)>; +def : Pat<(v2i32 (vnegd DPR:$src)), (VNEGs32d DPR:$src)>; +def : Pat<(v16i8 (vnegq QPR:$src)), (VNEGs8q QPR:$src)>; +def : Pat<(v8i16 (vnegq QPR:$src)), (VNEGs16q QPR:$src)>; +def : Pat<(v4i32 (vnegq QPR:$src)), (VNEGs32q QPR:$src)>; + +// VQNEG : Vector Saturating Negate +defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, + IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg", "s", + int_arm_neon_vqneg>; + +// Vector Bit Counting Operations. + +// VCLS : Vector Count Leading Sign Bits +defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, + IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s", + int_arm_neon_vcls>; +// VCLZ : Vector Count Leading Zeros +defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, + IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i", + int_arm_neon_vclz>; +// VCNT : Vector Count One Bits +def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, + IIC_VCNTiD, "vcnt", "8", + v8i8, v8i8, int_arm_neon_vcnt>; +def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, + IIC_VCNTiQ, "vcnt", "8", + v16i8, v16i8, int_arm_neon_vcnt>; + +// Vector Swap -- for disassembly only. +def VSWPd : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + "vswp", "$Vd, $Vm", "", []>; +def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + "vswp", "$Vd, $Vm", "", []>; + +// Vector Move Operations. + +// VMOV : Vector Move (Register) +def : InstAlias<"vmov${p} $Vd, $Vm", + (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>; +def : InstAlias<"vmov${p} $Vd, $Vm", + (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>; + +// VMOV : Vector Move (Immediate) + +let isReMaterializable = 1 in { +def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmov", "i8", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v8i8 (NEONvmovImm timm:$SIMM)))]>; +def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$Vd), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmov", "i8", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v16i8 (NEONvmovImm timm:$SIMM)))]>; + +def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$Vd), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmov", "i16", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v4i16 (NEONvmovImm timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$Vd), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmov", "i16", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v8i16 (NEONvmovImm timm:$SIMM)))]> { + let Inst{9} = SIMM{9}; +} + +def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$Vd), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmov", "i32", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v2i32 (NEONvmovImm timm:$SIMM)))]> { + let Inst{11-8} = SIMM{11-8}; +} + +def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$Vd), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmov", "i32", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v4i32 (NEONvmovImm timm:$SIMM)))]> { + let Inst{11-8} = SIMM{11-8}; +} + +def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$Vd), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmov", "i64", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v1i64 (NEONvmovImm timm:$SIMM)))]>; +def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$Vd), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmov", "i64", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v2i64 (NEONvmovImm timm:$SIMM)))]>; +} // isReMaterializable + +// VMOV : Vector Get Lane (move scalar to ARM core register) + +def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?}, + (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "s8", "$R, $V[$lane]", + [(set GPR:$R, (NEONvgetlanes (v8i8 DPR:$V), + imm:$lane))]> { + let Inst{21} = lane{2}; + let Inst{6-5} = lane{1-0}; +} +def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1}, + (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "s16", "$R, $V[$lane]", + [(set GPR:$R, (NEONvgetlanes (v4i16 DPR:$V), + imm:$lane))]> { + let Inst{21} = lane{1}; + let Inst{6} = lane{0}; +} +def VGETLNu8 : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?}, + (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "u8", "$R, $V[$lane]", + [(set GPR:$R, (NEONvgetlaneu (v8i8 DPR:$V), + imm:$lane))]> { + let Inst{21} = lane{2}; + let Inst{6-5} = lane{1-0}; +} +def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1}, + (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "u16", "$R, $V[$lane]", + [(set GPR:$R, (NEONvgetlaneu (v4i16 DPR:$V), + imm:$lane))]> { + let Inst{21} = lane{1}; + let Inst{6} = lane{0}; +} +def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00, + (outs GPR:$R), (ins DPR:$V, nohash_imm:$lane), + IIC_VMOVSI, "vmov", "32", "$R, $V[$lane]", + [(set GPR:$R, (extractelt (v2i32 DPR:$V), + imm:$lane))]> { + let Inst{21} = lane{0}; +} +// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td +def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane), + (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i8_reg imm:$lane))), + (SubReg_i8_lane imm:$lane))>; +def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane), + (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane))>; +def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane), + (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i8_reg imm:$lane))), + (SubReg_i8_lane imm:$lane))>; +def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane), + (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane))>; +def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane), + (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane))>; +def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2), + (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)), + (SSubReg_f32_reg imm:$src2))>; +def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2), + (EXTRACT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4f32 QPR:$src1),QPR_VFP2)), + (SSubReg_f32_reg imm:$src2))>; +//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2), +// (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>; +def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2), + (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>; + + +// VMOV : Vector Set Lane (move ARM core register to scalar) + +let Constraints = "$src1 = $V" in { +def VSETLNi8 : NVSetLane<{1,1,1,0,0,1,?,0}, 0b1011, {?,?}, (outs DPR:$V), + (ins DPR:$src1, GPR:$R, nohash_imm:$lane), + IIC_VMOVISL, "vmov", "8", "$V[$lane], $R", + [(set DPR:$V, (vector_insert (v8i8 DPR:$src1), + GPR:$R, imm:$lane))]> { + let Inst{21} = lane{2}; + let Inst{6-5} = lane{1-0}; +} +def VSETLNi16 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, {?,1}, (outs DPR:$V), + (ins DPR:$src1, GPR:$R, nohash_imm:$lane), + IIC_VMOVISL, "vmov", "16", "$V[$lane], $R", + [(set DPR:$V, (vector_insert (v4i16 DPR:$src1), + GPR:$R, imm:$lane))]> { + let Inst{21} = lane{1}; + let Inst{6} = lane{0}; +} +def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V), + (ins DPR:$src1, GPR:$R, nohash_imm:$lane), + IIC_VMOVISL, "vmov", "32", "$V[$lane], $R", + [(set DPR:$V, (insertelt (v2i32 DPR:$src1), + GPR:$R, imm:$lane))]> { + let Inst{21} = lane{0}; +} +} +def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane), + (v16i8 (INSERT_SUBREG QPR:$src1, + (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1, + (DSubReg_i8_reg imm:$lane))), + GPR:$src2, (SubReg_i8_lane imm:$lane))), + (DSubReg_i8_reg imm:$lane)))>; +def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane), + (v8i16 (INSERT_SUBREG QPR:$src1, + (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1, + (DSubReg_i16_reg imm:$lane))), + GPR:$src2, (SubReg_i16_lane imm:$lane))), + (DSubReg_i16_reg imm:$lane)))>; +def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane), + (v4i32 (INSERT_SUBREG QPR:$src1, + (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1, + (DSubReg_i32_reg imm:$lane))), + GPR:$src2, (SubReg_i32_lane imm:$lane))), + (DSubReg_i32_reg imm:$lane)))>; + +def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)), + (INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)), + SPR:$src2, (SSubReg_f32_reg imm:$src3))>; +def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)), + (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)), + SPR:$src2, (SSubReg_f32_reg imm:$src3))>; + +//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)), +// (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>; +def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)), + (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>; + +def : Pat<(v2f32 (scalar_to_vector SPR:$src)), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; +def : Pat<(v2f64 (scalar_to_vector (f64 DPR:$src))), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, dsub_0)>; +def : Pat<(v4f32 (scalar_to_vector SPR:$src)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; + +def : Pat<(v8i8 (scalar_to_vector GPR:$src)), + (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0))>; +def : Pat<(v4i16 (scalar_to_vector GPR:$src)), + (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0))>; +def : Pat<(v2i32 (scalar_to_vector GPR:$src)), + (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0))>; + +def : Pat<(v16i8 (scalar_to_vector GPR:$src)), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0)), + dsub_0)>; +def : Pat<(v8i16 (scalar_to_vector GPR:$src)), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0)), + dsub_0)>; +def : Pat<(v4i32 (scalar_to_vector GPR:$src)), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)), + dsub_0)>; + +// VDUP : Vector Duplicate (from ARM core register to all elements) + +class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty> + : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$V), (ins GPR:$R), + IIC_VMOVIS, "vdup", Dt, "$V, $R", + [(set DPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>; +class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty> + : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$V), (ins GPR:$R), + IIC_VMOVIS, "vdup", Dt, "$V, $R", + [(set QPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>; + +def VDUP8d : VDUPD<0b11101100, 0b00, "8", v8i8>; +def VDUP16d : VDUPD<0b11101000, 0b01, "16", v4i16>; +def VDUP32d : VDUPD<0b11101000, 0b00, "32", v2i32>; +def VDUP8q : VDUPQ<0b11101110, 0b00, "8", v16i8>; +def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>; +def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>; + +def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>; +def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>; + +// VDUP : Vector Duplicate Lane (from scalar to all elements) + +class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt, + ValueType Ty, Operand IdxTy> + : NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, IdxTy:$lane), + IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm$lane", + [(set DPR:$Vd, (Ty (NEONvduplane (Ty DPR:$Vm), imm:$lane)))]>; + +class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt, + ValueType ResTy, ValueType OpTy, Operand IdxTy> + : NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, IdxTy:$lane), + IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm$lane", + [(set QPR:$Vd, (ResTy (NEONvduplane (OpTy DPR:$Vm), + VectorIndex32:$lane)))]>; + +// Inst{19-16} is partially specified depending on the element size. + +def VDUPLN8d : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8, VectorIndex8> { + bits<3> lane; + let Inst{19-17} = lane{2-0}; +} +def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16, VectorIndex16> { + bits<2> lane; + let Inst{19-18} = lane{1-0}; +} +def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32, VectorIndex32> { + bits<1> lane; + let Inst{19} = lane{0}; +} +def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8, VectorIndex8> { + bits<3> lane; + let Inst{19-17} = lane{2-0}; +} +def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16, VectorIndex16> { + bits<2> lane; + let Inst{19-18} = lane{1-0}; +} +def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32, VectorIndex32> { + bits<1> lane; + let Inst{19} = lane{0}; +} + +def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), + (VDUPLN32d DPR:$Vm, imm:$lane)>; + +def : Pat<(v4f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), + (VDUPLN32q DPR:$Vm, imm:$lane)>; + +def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)), + (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i8_reg imm:$lane))), + (SubReg_i8_lane imm:$lane)))>; +def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)), + (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; +def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)), + (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; +def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)), + (v4f32 (VDUPLN32q (v2f32 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + +def VDUPfdf : PseudoNeonI<(outs DPR:$dst), (ins SPR:$src), IIC_VMOVD, "", + [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]>; +def VDUPfqf : PseudoNeonI<(outs QPR:$dst), (ins SPR:$src), IIC_VMOVD, "", + [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>; + +// VMOVN : Vector Narrowing Move +defm VMOVN : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN, + "vmovn", "i", trunc>; +// VQMOVN : Vector Saturating Narrowing Move +defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD, + "vqmovn", "s", int_arm_neon_vqmovns>; +defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD, + "vqmovn", "u", int_arm_neon_vqmovnu>; +defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD, + "vqmovun", "s", int_arm_neon_vqmovnsu>; +// VMOVL : Vector Lengthening Move +defm VMOVLs : N2VL_QHS<0b01,0b10100,0,1, "vmovl", "s", sext>; +defm VMOVLu : N2VL_QHS<0b11,0b10100,0,1, "vmovl", "u", zext>; + +// Vector Conversions. + +// VCVT : Vector Convert Between Floating-Point and Integers +def VCVTf2sd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32", + v2i32, v2f32, fp_to_sint>; +def VCVTf2ud : N2VD<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32", + v2i32, v2f32, fp_to_uint>; +def VCVTs2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", + v2f32, v2i32, sint_to_fp>; +def VCVTu2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", + v2f32, v2i32, uint_to_fp>; + +def VCVTf2sq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32", + v4i32, v4f32, fp_to_sint>; +def VCVTf2uq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32", + v4i32, v4f32, fp_to_uint>; +def VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", + v4f32, v4i32, sint_to_fp>; +def VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", + v4f32, v4i32, uint_to_fp>; + +// VCVT : Vector Convert Between Floating-Point and Fixed-Point. +def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32", + v2i32, v2f32, int_arm_neon_vcvtfp2fxs>; +def VCVTf2xud : N2VCvtD<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32", + v2i32, v2f32, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", + v2f32, v2i32, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", + v2f32, v2i32, int_arm_neon_vcvtfxu2fp>; + +def VCVTf2xsq : N2VCvtQ<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32", + v4i32, v4f32, int_arm_neon_vcvtfp2fxs>; +def VCVTf2xuq : N2VCvtQ<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32", + v4i32, v4f32, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", + v4f32, v4i32, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", + v4f32, v4i32, int_arm_neon_vcvtfxu2fp>; + +// VCVT : Vector Convert Between Half-Precision and Single-Precision. +def VCVTf2h : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0, + IIC_VUNAQ, "vcvt", "f16.f32", + v4i16, v4f32, int_arm_neon_vcvtfp2hf>, + Requires<[HasNEON, HasFP16]>; +def VCVTh2f : N2VLInt<0b11, 0b11, 0b01, 0b10, 0b01110, 0, 0, + IIC_VUNAQ, "vcvt", "f32.f16", + v4f32, v4i16, int_arm_neon_vcvthf2fp>, + Requires<[HasNEON, HasFP16]>; + +// Vector Reverse. + +// VREV64 : Vector Reverse elements within 64-bit doublewords + +class VREV64D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VMOVD, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (Ty (NEONvrev64 (Ty DPR:$Vm))))]>; +class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VMOVQ, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (Ty (NEONvrev64 (Ty QPR:$Vm))))]>; + +def VREV64d8 : VREV64D<0b00, "vrev64", "8", v8i8>; +def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>; +def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>; +def : Pat<(v2f32 (NEONvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>; + +def VREV64q8 : VREV64Q<0b00, "vrev64", "8", v16i8>; +def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>; +def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>; +def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>; + +// VREV32 : Vector Reverse elements within 32-bit words + +class VREV32D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VMOVD, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (Ty (NEONvrev32 (Ty DPR:$Vm))))]>; +class VREV32Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VMOVQ, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (Ty (NEONvrev32 (Ty QPR:$Vm))))]>; + +def VREV32d8 : VREV32D<0b00, "vrev32", "8", v8i8>; +def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>; + +def VREV32q8 : VREV32Q<0b00, "vrev32", "8", v16i8>; +def VREV32q16 : VREV32Q<0b01, "vrev32", "16", v8i16>; + +// VREV16 : Vector Reverse elements within 16-bit halfwords + +class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$Vd), + (ins DPR:$Vm), IIC_VMOVD, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set DPR:$Vd, (Ty (NEONvrev16 (Ty DPR:$Vm))))]>; +class VREV16Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> + : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$Vd), + (ins QPR:$Vm), IIC_VMOVQ, + OpcodeStr, Dt, "$Vd, $Vm", "", + [(set QPR:$Vd, (Ty (NEONvrev16 (Ty QPR:$Vm))))]>; + +def VREV16d8 : VREV16D<0b00, "vrev16", "8", v8i8>; +def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>; + +// Other Vector Shuffles. + +// Aligned extractions: really just dropping registers + +class AlignedVEXTq<ValueType DestTy, ValueType SrcTy, SDNodeXForm LaneCVT> + : Pat<(DestTy (vector_extract_subvec (SrcTy QPR:$src), (i32 imm:$start))), + (EXTRACT_SUBREG (SrcTy QPR:$src), (LaneCVT imm:$start))>; + +def : AlignedVEXTq<v8i8, v16i8, DSubReg_i8_reg>; + +def : AlignedVEXTq<v4i16, v8i16, DSubReg_i16_reg>; + +def : AlignedVEXTq<v2i32, v4i32, DSubReg_i32_reg>; + +def : AlignedVEXTq<v1i64, v2i64, DSubReg_f64_reg>; + +def : AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>; + + +// VEXT : Vector Extract + +class VEXTd<string OpcodeStr, string Dt, ValueType Ty> + : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$Vm, i32imm:$index), NVExtFrm, + IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "", + [(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn), + (Ty DPR:$Vm), imm:$index)))]> { + bits<4> index; + let Inst{11-8} = index{3-0}; +} + +class VEXTq<string OpcodeStr, string Dt, ValueType Ty> + : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$Vd), + (ins QPR:$Vn, QPR:$Vm, i32imm:$index), NVExtFrm, + IIC_VEXTQ, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "", + [(set QPR:$Vd, (Ty (NEONvext (Ty QPR:$Vn), + (Ty QPR:$Vm), imm:$index)))]> { + bits<4> index; + let Inst{11-8} = index{3-0}; +} + +def VEXTd8 : VEXTd<"vext", "8", v8i8> { + let Inst{11-8} = index{3-0}; +} +def VEXTd16 : VEXTd<"vext", "16", v4i16> { + let Inst{11-9} = index{2-0}; + let Inst{8} = 0b0; +} +def VEXTd32 : VEXTd<"vext", "32", v2i32> { + let Inst{11-10} = index{1-0}; + let Inst{9-8} = 0b00; +} +def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn), + (v2f32 DPR:$Vm), + (i32 imm:$index))), + (VEXTd32 DPR:$Vn, DPR:$Vm, imm:$index)>; + +def VEXTq8 : VEXTq<"vext", "8", v16i8> { + let Inst{11-8} = index{3-0}; +} +def VEXTq16 : VEXTq<"vext", "16", v8i16> { + let Inst{11-9} = index{2-0}; + let Inst{8} = 0b0; +} +def VEXTq32 : VEXTq<"vext", "32", v4i32> { + let Inst{11-10} = index{1-0}; + let Inst{9-8} = 0b00; +} +def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn), + (v4f32 QPR:$Vm), + (i32 imm:$index))), + (VEXTq32 QPR:$Vn, QPR:$Vm, imm:$index)>; + +// VTRN : Vector Transpose + +def VTRNd8 : N2VDShuffle<0b00, 0b00001, "vtrn", "8">; +def VTRNd16 : N2VDShuffle<0b01, 0b00001, "vtrn", "16">; +def VTRNd32 : N2VDShuffle<0b10, 0b00001, "vtrn", "32">; + +def VTRNq8 : N2VQShuffle<0b00, 0b00001, IIC_VPERMQ, "vtrn", "8">; +def VTRNq16 : N2VQShuffle<0b01, 0b00001, IIC_VPERMQ, "vtrn", "16">; +def VTRNq32 : N2VQShuffle<0b10, 0b00001, IIC_VPERMQ, "vtrn", "32">; + +// VUZP : Vector Unzip (Deinterleave) + +def VUZPd8 : N2VDShuffle<0b00, 0b00010, "vuzp", "8">; +def VUZPd16 : N2VDShuffle<0b01, 0b00010, "vuzp", "16">; +def VUZPd32 : N2VDShuffle<0b10, 0b00010, "vuzp", "32">; + +def VUZPq8 : N2VQShuffle<0b00, 0b00010, IIC_VPERMQ3, "vuzp", "8">; +def VUZPq16 : N2VQShuffle<0b01, 0b00010, IIC_VPERMQ3, "vuzp", "16">; +def VUZPq32 : N2VQShuffle<0b10, 0b00010, IIC_VPERMQ3, "vuzp", "32">; + +// VZIP : Vector Zip (Interleave) + +def VZIPd8 : N2VDShuffle<0b00, 0b00011, "vzip", "8">; +def VZIPd16 : N2VDShuffle<0b01, 0b00011, "vzip", "16">; +def VZIPd32 : N2VDShuffle<0b10, 0b00011, "vzip", "32">; + +def VZIPq8 : N2VQShuffle<0b00, 0b00011, IIC_VPERMQ3, "vzip", "8">; +def VZIPq16 : N2VQShuffle<0b01, 0b00011, IIC_VPERMQ3, "vzip", "16">; +def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">; + +// Vector Table Lookup and Table Extension. + +// VTBL : Vector Table Lookup +let DecoderMethod = "DecodeTBLInstruction" in { +def VTBL1 + : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB1, + "vtbl", "8", "$Vd, \\{$Vn\\}, $Vm", "", + [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbl1 DPR:$Vn, DPR:$Vm)))]>; +let hasExtraSrcRegAllocReq = 1 in { +def VTBL2 + : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$tbl2, DPR:$Vm), NVTBLFrm, IIC_VTB2, + "vtbl", "8", "$Vd, \\{$Vn, $tbl2\\}, $Vm", "", []>; +def VTBL3 + : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$tbl2, DPR:$tbl3, DPR:$Vm), NVTBLFrm, IIC_VTB3, + "vtbl", "8", "$Vd, \\{$Vn, $tbl2, $tbl3\\}, $Vm", "", []>; +def VTBL4 + : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$Vd), + (ins DPR:$Vn, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$Vm), + NVTBLFrm, IIC_VTB4, + "vtbl", "8", "$Vd, \\{$Vn, $tbl2, $tbl3, $tbl4\\}, $Vm", "", []>; +} // hasExtraSrcRegAllocReq = 1 + +def VTBL2Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins QPR:$tbl, DPR:$src), IIC_VTB2, "", []>; +def VTBL3Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB3, "", []>; +def VTBL4Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB4, "", []>; + +// VTBX : Vector Table Extension +def VTBX1 + : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$Vd), + (ins DPR:$orig, DPR:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX1, + "vtbx", "8", "$Vd, \\{$Vn\\}, $Vm", "$orig = $Vd", + [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbx1 + DPR:$orig, DPR:$Vn, DPR:$Vm)))]>; +let hasExtraSrcRegAllocReq = 1 in { +def VTBX2 + : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$Vd), + (ins DPR:$orig, DPR:$Vn, DPR:$tbl2, DPR:$Vm), NVTBLFrm, IIC_VTBX2, + "vtbx", "8", "$Vd, \\{$Vn, $tbl2\\}, $Vm", "$orig = $Vd", []>; +def VTBX3 + : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$Vd), + (ins DPR:$orig, DPR:$Vn, DPR:$tbl2, DPR:$tbl3, DPR:$Vm), + NVTBLFrm, IIC_VTBX3, + "vtbx", "8", "$Vd, \\{$Vn, $tbl2, $tbl3\\}, $Vm", + "$orig = $Vd", []>; +def VTBX4 + : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$Vd), (ins DPR:$orig, DPR:$Vn, + DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$Vm), NVTBLFrm, IIC_VTBX4, + "vtbx", "8", "$Vd, \\{$Vn, $tbl2, $tbl3, $tbl4\\}, $Vm", + "$orig = $Vd", []>; +} // hasExtraSrcRegAllocReq = 1 + +def VTBX2Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QPR:$tbl, DPR:$src), + IIC_VTBX2, "$orig = $dst", []>; +def VTBX3Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src), + IIC_VTBX3, "$orig = $dst", []>; +def VTBX4Pseudo + : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src), + IIC_VTBX4, "$orig = $dst", []>; +} // DecoderMethod = "DecodeTBLInstruction" + +//===----------------------------------------------------------------------===// +// NEON instructions for single-precision FP math +//===----------------------------------------------------------------------===// + +class N2VSPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$a)), + (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (Inst + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$a, ssub_0)), DPR_VFP2)), ssub_0)>; + +class N3VSPat<SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)), + (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (Inst + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$a, ssub_0), + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>; + +class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst> + : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))), + (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (Inst + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$acc, ssub_0), + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$a, ssub_0), + (INSERT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)), + SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>; + +def : N3VSPat<fadd, VADDfd>; +def : N3VSPat<fsub, VSUBfd>; +def : N3VSPat<fmul, VMULfd>; +def : N3VSMulOpPat<fmul, fadd, VMLAfd>, + Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; +def : N3VSMulOpPat<fmul, fsub, VMLSfd>, + Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>; +def : N2VSPat<fabs, VABSfd>; +def : N2VSPat<fneg, VNEGfd>; +def : N3VSPat<NEONfmax, VMAXfd>; +def : N3VSPat<NEONfmin, VMINfd>; +def : N2VSPat<arm_ftosi, VCVTf2sd>; +def : N2VSPat<arm_ftoui, VCVTf2ud>; +def : N2VSPat<arm_sitof, VCVTs2fd>; +def : N2VSPat<arm_uitof, VCVTu2fd>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// bit_convert +def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>; + +def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; +def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; +def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; +def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; +def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>; +def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; +def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; +def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td new file mode 100644 index 0000000..cedb547 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -0,0 +1,1436 @@ +//===- ARMInstrThumb.td - Thumb support for ARM ------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Thumb instruction set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Thumb specific DAG Nodes. +// + +def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + +def imm_sr_XFORM: SDNodeXForm<imm, [{ + unsigned Imm = N->getZExtValue(); + return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), MVT::i32); +}]>; +def ThumbSRImmAsmOperand: AsmOperandClass { let Name = "ImmThumbSR"; } +def imm_sr : Operand<i32>, PatLeaf<(imm), [{ + uint64_t Imm = N->getZExtValue(); + return Imm > 0 && Imm <= 32; +}], imm_sr_XFORM> { + let PrintMethod = "printThumbSRImm"; + let ParserMatchClass = ThumbSRImmAsmOperand; +} + +def imm_neg_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32); +}]>; +def imm_comp_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32); +}]>; + +def imm0_7_neg : PatLeaf<(i32 imm), [{ + return (uint32_t)-N->getZExtValue() < 8; +}], imm_neg_XFORM>; + +def imm0_255_comp : PatLeaf<(i32 imm), [{ + return ~((uint32_t)N->getZExtValue()) < 256; +}]>; + +def imm8_255 : ImmLeaf<i32, [{ + return Imm >= 8 && Imm < 256; +}]>; +def imm8_255_neg : PatLeaf<(i32 imm), [{ + unsigned Val = -N->getZExtValue(); + return Val >= 8 && Val < 256; +}], imm_neg_XFORM>; + +// Break imm's up into two pieces: an immediate + a left shift. This uses +// thumb_immshifted to match and thumb_immshifted_val and thumb_immshifted_shamt +// to get the val/shift pieces. +def thumb_immshifted : PatLeaf<(imm), [{ + return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue()); +}]>; + +def thumb_immshifted_val : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getZExtValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +def thumb_immshifted_shamt : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getZExtValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +// ADR instruction labels. +def t_adrlabel : Operand<i32> { + let EncoderMethod = "getThumbAdrLabelOpValue"; +} + +// Scaled 4 immediate. +def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; } +def t_imm0_1020s4 : Operand<i32> { + let PrintMethod = "printThumbS4ImmOperand"; + let ParserMatchClass = t_imm0_1020s4_asmoperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def t_imm0_508s4_asmoperand: AsmOperandClass { let Name = "Imm0_508s4"; } +def t_imm0_508s4 : Operand<i32> { + let PrintMethod = "printThumbS4ImmOperand"; + let ParserMatchClass = t_imm0_508s4_asmoperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// Define Thumb specific addressing modes. + +let OperandType = "OPERAND_PCREL" in { +def t_brtarget : Operand<OtherVT> { + let EncoderMethod = "getThumbBRTargetOpValue"; + let DecoderMethod = "DecodeThumbBROperand"; +} + +def t_bcctarget : Operand<i32> { + let EncoderMethod = "getThumbBCCTargetOpValue"; + let DecoderMethod = "DecodeThumbBCCTargetOperand"; +} + +def t_cbtarget : Operand<i32> { + let EncoderMethod = "getThumbCBTargetOpValue"; + let DecoderMethod = "DecodeThumbCmpBROperand"; +} + +def t_bltarget : Operand<i32> { + let EncoderMethod = "getThumbBLTargetOpValue"; + let DecoderMethod = "DecodeThumbBLTargetOperand"; +} + +def t_blxtarget : Operand<i32> { + let EncoderMethod = "getThumbBLXTargetOpValue"; + let DecoderMethod = "DecodeThumbBLXOffset"; +} +} + +// t_addrmode_rr := reg + reg +// +def t_addrmode_rr_asm_operand : AsmOperandClass { let Name = "MemThumbRR"; } +def t_addrmode_rr : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; + let PrintMethod = "printThumbAddrModeRROperand"; + let DecoderMethod = "DecodeThumbAddrModeRR"; + let ParserMatchClass = t_addrmode_rr_asm_operand; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); +} + +// t_addrmode_rrs := reg + reg +// +// We use separate scaled versions because the Select* functions need +// to explicitly check for a matching constant and return false here so that +// the reg+imm forms will match instead. This is a horrible way to do that, +// as it forces tight coupling between the methods, but it's how selectiondag +// currently works. +def t_addrmode_rrs1 : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S1", []> { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; + let PrintMethod = "printThumbAddrModeRROperand"; + let DecoderMethod = "DecodeThumbAddrModeRR"; + let ParserMatchClass = t_addrmode_rr_asm_operand; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); +} +def t_addrmode_rrs2 : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S2", []> { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; + let DecoderMethod = "DecodeThumbAddrModeRR"; + let PrintMethod = "printThumbAddrModeRROperand"; + let ParserMatchClass = t_addrmode_rr_asm_operand; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); +} +def t_addrmode_rrs4 : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S4", []> { + let EncoderMethod = "getThumbAddrModeRegRegOpValue"; + let DecoderMethod = "DecodeThumbAddrModeRR"; + let PrintMethod = "printThumbAddrModeRROperand"; + let ParserMatchClass = t_addrmode_rr_asm_operand; + let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg); +} + +// t_addrmode_is4 := reg + imm5 * 4 +// +def t_addrmode_is4_asm_operand : AsmOperandClass { let Name = "MemThumbRIs4"; } +def t_addrmode_is4 : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S4", []> { + let EncoderMethod = "getAddrModeISOpValue"; + let DecoderMethod = "DecodeThumbAddrModeIS"; + let PrintMethod = "printThumbAddrModeImm5S4Operand"; + let ParserMatchClass = t_addrmode_is4_asm_operand; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm); +} + +// t_addrmode_is2 := reg + imm5 * 2 +// +def t_addrmode_is2_asm_operand : AsmOperandClass { let Name = "MemThumbRIs2"; } +def t_addrmode_is2 : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S2", []> { + let EncoderMethod = "getAddrModeISOpValue"; + let DecoderMethod = "DecodeThumbAddrModeIS"; + let PrintMethod = "printThumbAddrModeImm5S2Operand"; + let ParserMatchClass = t_addrmode_is2_asm_operand; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm); +} + +// t_addrmode_is1 := reg + imm5 +// +def t_addrmode_is1_asm_operand : AsmOperandClass { let Name = "MemThumbRIs1"; } +def t_addrmode_is1 : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S1", []> { + let EncoderMethod = "getAddrModeISOpValue"; + let DecoderMethod = "DecodeThumbAddrModeIS"; + let PrintMethod = "printThumbAddrModeImm5S1Operand"; + let ParserMatchClass = t_addrmode_is1_asm_operand; + let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm); +} + +// t_addrmode_sp := sp + imm8 * 4 +// +// FIXME: This really shouldn't have an explicit SP operand at all. It should +// be implicit, just like in the instruction encoding itself. +def t_addrmode_sp_asm_operand : AsmOperandClass { let Name = "MemThumbSPI"; } +def t_addrmode_sp : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> { + let EncoderMethod = "getAddrModeThumbSPOpValue"; + let DecoderMethod = "DecodeThumbAddrModeSP"; + let PrintMethod = "printThumbAddrModeSPOperand"; + let ParserMatchClass = t_addrmode_sp_asm_operand; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +// t_addrmode_pc := <label> => pc + imm8 * 4 +// +def t_addrmode_pc : Operand<i32> { + let EncoderMethod = "getAddrModePCOpValue"; + let DecoderMethod = "DecodeThumbAddrModePC"; +} + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE +// from removing one half of the matched pairs. That breaks PEI, which assumes +// these will always be in pairs, and asserts if it finds otherwise. Better way? +let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { +def tADJCALLSTACKUP : + PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary, + [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, + Requires<[IsThumb, IsThumb1Only]>; + +def tADJCALLSTACKDOWN : + PseudoInst<(outs), (ins i32imm:$amt), NoItinerary, + [(ARMcallseq_start imm:$amt)]>, + Requires<[IsThumb, IsThumb1Only]>; +} + +class T1SystemEncoding<bits<8> opc> + : T1Encoding<0b101111> { + let Inst{9-8} = 0b11; + let Inst{7-0} = opc; +} + +def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", []>, + T1SystemEncoding<0x00>, // A8.6.110 + Requires<[IsThumb2]>; + +def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", []>, + T1SystemEncoding<0x10>; // A8.6.410 + +def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", []>, + T1SystemEncoding<0x20>; // A8.6.408 + +def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", []>, + T1SystemEncoding<0x30>; // A8.6.409 + +def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", []>, + T1SystemEncoding<0x40>; // A8.6.157 + +// The imm operand $val can be used by a debugger to store more information +// about the breakpoint. +def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val", + []>, + T1Encoding<0b101111> { + let Inst{9-8} = 0b10; + // A8.6.22 + bits<8> val; + let Inst{7-0} = val; +} + +def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end", + []>, T1Encoding<0b101101> { + bits<1> end; + // A8.6.156 + let Inst{9-5} = 0b10010; + let Inst{4} = 1; + let Inst{3} = end; + let Inst{2-0} = 0b000; +} + +// Change Processor State is a system instruction -- for disassembly only. +def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags), + NoItinerary, "cps$imod $iflags", []>, + T1Misc<0b0110011> { + // A8.6.38 & B6.1.1 + bit imod; + bits<3> iflags; + + let Inst{4} = imod; + let Inst{3} = 0; + let Inst{2-0} = iflags; + let DecoderMethod = "DecodeThumbCPS"; +} + +// For both thumb1 and thumb2. +let isNotDuplicable = 1, isCodeGenOnly = 1 in +def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "", + [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>, + T1Special<{0,0,?,?}> { + // A8.6.6 + bits<3> dst; + let Inst{6-3} = 0b1111; // Rm = pc + let Inst{2-0} = dst; +} + +// ADD <Rd>, sp, #<imm8> +// FIXME: This should not be marked as having side effects, and it should be +// rematerializable. Clearing the side effect bit causes miscompilations, +// probably because the instruction can be moved around. +def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm), + IIC_iALUi, "add", "\t$dst, $sp, $imm", []>, + T1Encoding<{1,0,1,0,1,?}> { + // A6.2 & A8.6.8 + bits<3> dst; + bits<8> imm; + let Inst{10-8} = dst; + let Inst{7-0} = imm; + let DecoderMethod = "DecodeThumbAddSpecialReg"; +} + +// ADD sp, sp, #<imm7> +def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm), + IIC_iALUi, "add", "\t$Rdn, $imm", []>, + T1Misc<{0,0,0,0,0,?,?}> { + // A6.2.5 & A8.6.8 + bits<7> imm; + let Inst{6-0} = imm; + let DecoderMethod = "DecodeThumbAddSPImm"; +} + +// SUB sp, sp, #<imm7> +// FIXME: The encoding and the ASM string don't match up. +def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm), + IIC_iALUi, "sub", "\t$Rdn, $imm", []>, + T1Misc<{0,0,0,0,1,?,?}> { + // A6.2.5 & A8.6.214 + bits<7> imm; + let Inst{6-0} = imm; + let DecoderMethod = "DecodeThumbAddSPImm"; +} + +// Can optionally specify SP as a three operand instruction. +def : tInstAlias<"add${p} sp, sp, $imm", + (tADDspi SP, t_imm0_508s4:$imm, pred:$p)>; +def : tInstAlias<"sub${p} sp, sp, $imm", + (tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>; + +// ADD <Rm>, sp +def tADDrSP : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPRsp:$sp), IIC_iALUr, + "add", "\t$Rdn, $sp, $Rn", []>, + T1Special<{0,0,?,?}> { + // A8.6.9 Encoding T1 + bits<4> Rdn; + let Inst{7} = Rdn{3}; + let Inst{6-3} = 0b1101; + let Inst{2-0} = Rdn{2-0}; + let DecoderMethod = "DecodeThumbAddSPReg"; +} + +// ADD sp, <Rm> +def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr, + "add", "\t$Rdn, $Rm", []>, + T1Special<{0,0,?,?}> { + // A8.6.9 Encoding T2 + bits<4> Rm; + let Inst{7} = 1; + let Inst{6-3} = Rm; + let Inst{2-0} = 0b101; + let DecoderMethod = "DecodeThumbAddSPReg"; +} + +//===----------------------------------------------------------------------===// +// Control Flow Instructions. +// + +// Indirect branches +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>, + T1Special<{1,1,0,?}> { + // A6.2.3 & A8.6.25 + bits<4> Rm; + let Inst{6-3} = Rm; + let Inst{2-0} = 0b000; + } +} + +let isReturn = 1, isTerminator = 1, isBarrier = 1 in { + def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br, + [(ARMretflag)], (tBX LR, pred:$p)>; + + // Alternative return instruction used by vararg functions. + def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p), + 2, IIC_Br, [], + (tBX GPR:$Rm, pred:$p)>; +} + +// All calls clobber the non-callee saved registers. SP is marked as a use to +// prevent stack-pointer assignments that appear immediately before calls from +// potentially appearing dead. +let isCall = 1, + // On non-Darwin platforms R9 is callee-saved. + Defs = [R0, R1, R2, R3, R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR], + Uses = [SP] in { + // Also used for Thumb2 + def tBL : TIx2<0b11110, 0b11, 1, + (outs), (ins pred:$p, t_bltarget:$func, variable_ops), IIC_Br, + "bl${p}\t$func", + [(ARMtcall tglobaladdr:$func)]>, + Requires<[IsThumb, IsNotDarwin]> { + bits<22> func; + let Inst{26} = func{21}; + let Inst{25-16} = func{20-11}; + let Inst{13} = 1; + let Inst{11} = 1; + let Inst{10-0} = func{10-0}; + } + + // ARMv5T and above, also used for Thumb2 + def tBLXi : TIx2<0b11110, 0b11, 0, + (outs), (ins pred:$p, t_blxtarget:$func, variable_ops), IIC_Br, + "blx${p}\t$func", + [(ARMcall tglobaladdr:$func)]>, + Requires<[IsThumb, HasV5T, IsNotDarwin]> { + bits<21> func; + let Inst{25-16} = func{20-11}; + let Inst{13} = 1; + let Inst{11} = 1; + let Inst{10-1} = func{10-1}; + let Inst{0} = 0; // func{0} is assumed zero + } + + // Also used for Thumb2 + def tBLXr : TI<(outs), (ins pred:$p, GPR:$func, variable_ops), IIC_Br, + "blx${p}\t$func", + [(ARMtcall GPR:$func)]>, + Requires<[IsThumb, HasV5T, IsNotDarwin]>, + T1Special<{1,1,1,?}> { // A6.2.3 & A8.6.24; + bits<4> func; + let Inst{6-3} = func; + let Inst{2-0} = 0b000; + } + + // ARMv4T + def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops), + 4, IIC_Br, + [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsThumb, IsThumb1Only, IsNotDarwin]>; +} + +let isCall = 1, + // On Darwin R9 is call-clobbered. + // R7 is marked as a use to prevent frame-pointer assignments from being + // moved above / below calls. + Defs = [R0, R1, R2, R3, R9, R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR], + Uses = [R7, SP] in { + // Also used for Thumb2 + def tBLr9 : tPseudoExpand<(outs), (ins pred:$p, t_bltarget:$func, variable_ops), + 4, IIC_Br, [(ARMtcall tglobaladdr:$func)], + (tBL pred:$p, t_bltarget:$func)>, + Requires<[IsThumb, IsDarwin]>; + + // ARMv5T and above, also used for Thumb2 + def tBLXi_r9 : tPseudoExpand<(outs), (ins pred:$p, t_blxtarget:$func, variable_ops), + 4, IIC_Br, [(ARMcall tglobaladdr:$func)], + (tBLXi pred:$p, t_blxtarget:$func)>, + Requires<[IsThumb, HasV5T, IsDarwin]>; + + // Also used for Thumb2 + def tBLXr_r9 : tPseudoExpand<(outs), (ins pred:$p, GPR:$func, variable_ops), + 2, IIC_Br, [(ARMtcall GPR:$func)], + (tBLXr pred:$p, GPR:$func)>, + Requires<[IsThumb, HasV5T, IsDarwin]>; + + // ARMv4T + def tBXr9_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops), + 4, IIC_Br, + [(ARMcall_nolink tGPR:$func)]>, + Requires<[IsThumb, IsThumb1Only, IsDarwin]>; +} + +let isBranch = 1, isTerminator = 1, isBarrier = 1 in { + let isPredicable = 1 in + def tB : T1pI<(outs), (ins t_brtarget:$target), IIC_Br, + "b", "\t$target", [(br bb:$target)]>, + T1Encoding<{1,1,1,0,0,?}> { + bits<11> target; + let Inst{10-0} = target; + } + + // Far jump + // Just a pseudo for a tBL instruction. Needed to let regalloc know about + // the clobber of LR. + let Defs = [LR] in + def tBfar : tPseudoExpand<(outs), (ins t_bltarget:$target, pred:$p), + 4, IIC_Br, [], (tBL pred:$p, t_bltarget:$target)>; + + def tBR_JTr : tPseudoInst<(outs), + (ins tGPR:$target, i32imm:$jt, i32imm:$id), + 0, IIC_Br, + [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]> { + list<Predicate> Predicates = [IsThumb, IsThumb1Only]; + } +} + +// FIXME: should be able to write a pattern for ARMBrcond, but can't use +// a two-value operand where a dag node expects two operands. :( +let isBranch = 1, isTerminator = 1 in + def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br, + "b${p}\t$target", + [/*(ARMbrcond bb:$target, imm:$cc)*/]>, + T1BranchCond<{1,1,0,1}> { + bits<4> p; + bits<8> target; + let Inst{11-8} = p; + let Inst{7-0} = target; +} + +// Tail calls +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { + // Darwin versions. + let Defs = [R0, R1, R2, R3, R9, R12, QQQQ0, QQQQ2, QQQQ3, PC], + Uses = [SP] in { + // tTAILJMPd: Darwin version uses a Thumb2 branch (no Thumb1 tail calls + // on Darwin), so it's in ARMInstrThumb2.td. + def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops), + 4, IIC_Br, [], + (tBX GPR:$dst, (ops 14, zero_reg))>, + Requires<[IsThumb, IsDarwin]>; + } + // Non-Darwin versions (the difference is R9). + let Defs = [R0, R1, R2, R3, R12, QQQQ0, QQQQ2, QQQQ3, PC], + Uses = [SP] in { + def tTAILJMPdND : tPseudoExpand<(outs), + (ins t_brtarget:$dst, pred:$p, variable_ops), + 4, IIC_Br, [], + (tB t_brtarget:$dst, pred:$p)>, + Requires<[IsThumb, IsNotDarwin]>; + def tTAILJMPrND : tPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops), + 4, IIC_Br, [], + (tBX GPR:$dst, (ops 14, zero_reg))>, + Requires<[IsThumb, IsNotDarwin]>; + } +} + + +// A8.6.218 Supervisor Call (Software Interrupt) +// A8.6.16 B: Encoding T1 +// If Inst{11-8} == 0b1111 then SEE SVC +let isCall = 1, Uses = [SP] in +def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br, + "svc", "\t$imm", []>, Encoding16 { + bits<8> imm; + let Inst{15-12} = 0b1101; + let Inst{11-8} = 0b1111; + let Inst{7-0} = imm; +} + +// The assembler uses 0xDEFE for a trap instruction. +let isBarrier = 1, isTerminator = 1 in +def tTRAP : TI<(outs), (ins), IIC_Br, + "trap", [(trap)]>, Encoding16 { + let Inst = 0xdefe; +} + +//===----------------------------------------------------------------------===// +// Load Store Instructions. +// + +// Loads: reg/reg and reg/imm5 +let canFoldAsLoad = 1, isReMaterializable = 1 in +multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, + Operand AddrMode_r, Operand AddrMode_i, + AddrMode am, InstrItinClass itin_r, + InstrItinClass itin_i, string asm, + PatFrag opnode> { + def r : // reg/reg + T1pILdStEncode<reg_opc, + (outs tGPR:$Rt), (ins AddrMode_r:$addr), + am, itin_r, asm, "\t$Rt, $addr", + [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>; + def i : // reg/imm5 + T1pILdStEncodeImm<imm_opc, 1 /* Load */, + (outs tGPR:$Rt), (ins AddrMode_i:$addr), + am, itin_i, asm, "\t$Rt, $addr", + [(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>; +} +// Stores: reg/reg and reg/imm5 +multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, + Operand AddrMode_r, Operand AddrMode_i, + AddrMode am, InstrItinClass itin_r, + InstrItinClass itin_i, string asm, + PatFrag opnode> { + def r : // reg/reg + T1pILdStEncode<reg_opc, + (outs), (ins tGPR:$Rt, AddrMode_r:$addr), + am, itin_r, asm, "\t$Rt, $addr", + [(opnode tGPR:$Rt, AddrMode_r:$addr)]>; + def i : // reg/imm5 + T1pILdStEncodeImm<imm_opc, 0 /* Store */, + (outs), (ins tGPR:$Rt, AddrMode_i:$addr), + am, itin_i, asm, "\t$Rt, $addr", + [(opnode tGPR:$Rt, AddrMode_i:$addr)]>; +} + +// A8.6.57 & A8.6.60 +defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rrs4, + t_addrmode_is4, AddrModeT1_4, + IIC_iLoad_r, IIC_iLoad_i, "ldr", + UnOpFrag<(load node:$Src)>>; + +// A8.6.64 & A8.6.61 +defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rrs1, + t_addrmode_is1, AddrModeT1_1, + IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb", + UnOpFrag<(zextloadi8 node:$Src)>>; + +// A8.6.76 & A8.6.73 +defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rrs2, + t_addrmode_is2, AddrModeT1_2, + IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh", + UnOpFrag<(zextloadi16 node:$Src)>>; + +let AddedComplexity = 10 in +def tLDRSB : // A8.6.80 + T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr), + AddrModeT1_1, IIC_iLoad_bh_r, + "ldrsb", "\t$Rt, $addr", + [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr:$addr))]>; + +let AddedComplexity = 10 in +def tLDRSH : // A8.6.84 + T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr), + AddrModeT1_2, IIC_iLoad_bh_r, + "ldrsh", "\t$Rt, $addr", + [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>; + +let canFoldAsLoad = 1 in +def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>, + T1LdStSP<{1,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + +// Load tconstpool +// FIXME: Use ldr.n to work around a Darwin assembler bug. +let canFoldAsLoad = 1, isReMaterializable = 1, isCodeGenOnly = 1 in +def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, + "ldr", ".n\t$Rt, $addr", + [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>, + T1Encoding<{0,1,0,0,1,?}> { + // A6.2 & A8.6.59 + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + +// FIXME: Remove this entry when the above ldr.n workaround is fixed. +// For disassembly use only. +def tLDRpciDIS : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [/* disassembly only */]>, + T1Encoding<{0,1,0,0,1,?}> { + // A6.2 & A8.6.59 + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + +// A8.6.194 & A8.6.192 +defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4, + t_addrmode_is4, AddrModeT1_4, + IIC_iStore_r, IIC_iStore_i, "str", + BinOpFrag<(store node:$LHS, node:$RHS)>>; + +// A8.6.197 & A8.6.195 +defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rrs1, + t_addrmode_is1, AddrModeT1_1, + IIC_iStore_bh_r, IIC_iStore_bh_i, "strb", + BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; + +// A8.6.207 & A8.6.205 +defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rrs2, + t_addrmode_is2, AddrModeT1_2, + IIC_iStore_bh_r, IIC_iStore_bh_i, "strh", + BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; + + +def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, + "str", "\t$Rt, $addr", + [(store tGPR:$Rt, t_addrmode_sp:$addr)]>, + T1LdStSP<{0,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// + +// These require base address to be written back or one of the loaded regs. +let neverHasSideEffects = 1 in { + +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops), + IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> { + bits<3> Rn; + bits<8> regs; + let Inst{10-8} = Rn; + let Inst{7-0} = regs; +} + +// Writeback version is just a pseudo, as there's no encoding difference. +// Writeback happens iff the base register is not in the destination register +// list. +def tLDMIA_UPD : + InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain, + "$Rn = $wb", IIC_iLoad_mu>, + PseudoInstExpansion<(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)> { + let Size = 2; + let OutOperandList = (outs GPR:$wb); + let InOperandList = (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops); + let Pattern = []; + let isCodeGenOnly = 1; + let isPseudo = 1; + list<Predicate> Predicates = [IsThumb]; +} + +// There is no non-writeback version of STM for Thumb. +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +def tSTMIA_UPD : Thumb1I<(outs GPR:$wb), + (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops), + AddrModeNone, 2, IIC_iStore_mu, + "stm${p}\t$Rn!, $regs", "$Rn = $wb", []>, + T1Encoding<{1,1,0,0,0,?}> { + bits<3> Rn; + bits<8> regs; + let Inst{10-8} = Rn; + let Inst{7-0} = regs; +} + +} // neverHasSideEffects + +def : InstAlias<"ldm${p} $Rn!, $regs", + (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)>, + Requires<[IsThumb, IsThumb1Only]>; + +let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in +def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), + IIC_iPop, + "pop${p}\t$regs", []>, + T1Misc<{1,1,0,?,?,?,?}> { + bits<16> regs; + let Inst{8} = regs{15}; + let Inst{7-0} = regs{7-0}; +} + +let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in +def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), + IIC_iStore_m, + "push${p}\t$regs", []>, + T1Misc<{0,1,0,?,?,?,?}> { + bits<16> regs; + let Inst{8} = regs{14}; + let Inst{7-0} = regs{7-0}; +} + +//===----------------------------------------------------------------------===// +// Arithmetic Instructions. +// + +// Helper classes for encoding T1pI patterns: +class T1pIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1pI<oops, iops, itin, opc, asm, pattern>, + T1DataProcessing<opA> { + bits<3> Rm; + bits<3> Rn; + let Inst{5-3} = Rm; + let Inst{2-0} = Rn; +} +class T1pIMiscEncode<bits<7> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1pI<oops, iops, itin, opc, asm, pattern>, + T1Misc<opA> { + bits<3> Rm; + bits<3> Rd; + let Inst{5-3} = Rm; + let Inst{2-0} = Rd; +} + +// Helper classes for encoding T1sI patterns: +class T1sIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sI<oops, iops, itin, opc, asm, pattern>, + T1DataProcessing<opA> { + bits<3> Rd; + bits<3> Rn; + let Inst{5-3} = Rn; + let Inst{2-0} = Rd; +} +class T1sIGenEncode<bits<5> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sI<oops, iops, itin, opc, asm, pattern>, + T1General<opA> { + bits<3> Rm; + bits<3> Rn; + bits<3> Rd; + let Inst{8-6} = Rm; + let Inst{5-3} = Rn; + let Inst{2-0} = Rd; +} +class T1sIGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sI<oops, iops, itin, opc, asm, pattern>, + T1General<opA> { + bits<3> Rd; + bits<3> Rm; + let Inst{5-3} = Rm; + let Inst{2-0} = Rd; +} + +// Helper classes for encoding T1sIt patterns: +class T1sItDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sIt<oops, iops, itin, opc, asm, pattern>, + T1DataProcessing<opA> { + bits<3> Rdn; + bits<3> Rm; + let Inst{5-3} = Rm; + let Inst{2-0} = Rdn; +} +class T1sItGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T1sIt<oops, iops, itin, opc, asm, pattern>, + T1General<opA> { + bits<3> Rdn; + bits<8> imm8; + let Inst{10-8} = Rdn; + let Inst{7-0} = imm8; +} + +// Add with carry register +let isCommutable = 1, Uses = [CPSR] in +def tADC : // A8.6.2 + T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr, + "adc", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>; + +// Add immediate +def tADDi3 : // A8.6.4 T1 + T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3), + IIC_iALUi, + "add", "\t$Rd, $Rm, $imm3", + [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]> { + bits<3> imm3; + let Inst{8-6} = imm3; +} + +def tADDi8 : // A8.6.4 T2 + T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn), + (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi, + "add", "\t$Rdn, $imm8", + [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>; + +// Add register +let isCommutable = 1 in +def tADDrr : // A8.6.6 T1 + T1sIGenEncode<0b01100, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iALUr, + "add", "\t$Rd, $Rn, $Rm", + [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>; + +let neverHasSideEffects = 1 in +def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr, + "add", "\t$Rdn, $Rm", []>, + T1Special<{0,0,?,?}> { + // A8.6.6 T2 + bits<4> Rdn; + bits<4> Rm; + let Inst{7} = Rdn{3}; + let Inst{6-3} = Rm; + let Inst{2-0} = Rdn{2-0}; +} + +// AND register +let isCommutable = 1 in +def tAND : // A8.6.12 + T1sItDPEncode<0b0000, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iBITr, + "and", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>; + +// ASR immediate +def tASRri : // A8.6.14 + T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5), + IIC_iMOVsi, + "asr", "\t$Rd, $Rm, $imm5", + [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]> { + bits<5> imm5; + let Inst{10-6} = imm5; +} + +// ASR register +def tASRrr : // A8.6.15 + T1sItDPEncode<0b0100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMOVsr, + "asr", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>; + +// BIC register +def tBIC : // A8.6.20 + T1sItDPEncode<0b1110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iBITr, + "bic", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>; + +// CMN register +let isCompare = 1, Defs = [CPSR] in { +//FIXME: Disable CMN, as CCodes are backwards from compare expectations +// Compare-to-zero still works out, just not the relationals +//def tCMN : // A8.6.33 +// T1pIDPEncode<0b1011, (outs), (ins tGPR:$lhs, tGPR:$rhs), +// IIC_iCMPr, +// "cmn", "\t$lhs, $rhs", +// [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>; + +def tCMNz : // A8.6.33 + T1pIDPEncode<0b1011, (outs), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iCMPr, + "cmn", "\t$Rn, $Rm", + [(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>; + +} // isCompare = 1, Defs = [CPSR] + +// CMP immediate +let isCompare = 1, Defs = [CPSR] in { +def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, imm0_255:$imm8), IIC_iCMPi, + "cmp", "\t$Rn, $imm8", + [(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>, + T1General<{1,0,1,?,?}> { + // A8.6.35 + bits<3> Rn; + bits<8> imm8; + let Inst{10-8} = Rn; + let Inst{7-0} = imm8; +} + +// CMP register +def tCMPr : // A8.6.36 T1 + T1pIDPEncode<0b1010, (outs), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iCMPr, + "cmp", "\t$Rn, $Rm", + [(ARMcmp tGPR:$Rn, tGPR:$Rm)]>; + +def tCMPhir : T1pI<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_iCMPr, + "cmp", "\t$Rn, $Rm", []>, + T1Special<{0,1,?,?}> { + // A8.6.36 T2 + bits<4> Rm; + bits<4> Rn; + let Inst{7} = Rn{3}; + let Inst{6-3} = Rm; + let Inst{2-0} = Rn{2-0}; +} +} // isCompare = 1, Defs = [CPSR] + + +// XOR register +let isCommutable = 1 in +def tEOR : // A8.6.45 + T1sItDPEncode<0b0001, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iBITr, + "eor", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>; + +// LSL immediate +def tLSLri : // A8.6.88 + T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_31:$imm5), + IIC_iMOVsi, + "lsl", "\t$Rd, $Rm, $imm5", + [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]> { + bits<5> imm5; + let Inst{10-6} = imm5; +} + +// LSL register +def tLSLrr : // A8.6.89 + T1sItDPEncode<0b0010, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMOVsr, + "lsl", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>; + +// LSR immediate +def tLSRri : // A8.6.90 + T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5), + IIC_iMOVsi, + "lsr", "\t$Rd, $Rm, $imm5", + [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]> { + bits<5> imm5; + let Inst{10-6} = imm5; +} + +// LSR register +def tLSRrr : // A8.6.91 + T1sItDPEncode<0b0011, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMOVsr, + "lsr", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>; + +// Move register +let isMoveImm = 1 in +def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255:$imm8), IIC_iMOVi, + "mov", "\t$Rd, $imm8", + [(set tGPR:$Rd, imm0_255:$imm8)]>, + T1General<{1,0,0,?,?}> { + // A8.6.96 + bits<3> Rd; + bits<8> imm8; + let Inst{10-8} = Rd; + let Inst{7-0} = imm8; +} +// Because we have an explicit tMOVSr below, we need an alias to handle +// the immediate "movs" form here. Blech. +def : tInstAlias <"movs $Rdn, $imm", + (tMOVi8 tGPR:$Rdn, CPSR, imm0_255:$imm, 14, 0)>; + +// A7-73: MOV(2) - mov setting flag. + +let neverHasSideEffects = 1 in { +def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone, + 2, IIC_iMOVr, + "mov", "\t$Rd, $Rm", "", []>, + T1Special<{1,0,?,?}> { + // A8.6.97 + bits<4> Rd; + bits<4> Rm; + let Inst{7} = Rd{3}; + let Inst{6-3} = Rm; + let Inst{2-0} = Rd{2-0}; +} +let Defs = [CPSR] in +def tMOVSr : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr, + "movs\t$Rd, $Rm", []>, Encoding16 { + // A8.6.97 + bits<3> Rd; + bits<3> Rm; + let Inst{15-6} = 0b0000000000; + let Inst{5-3} = Rm; + let Inst{2-0} = Rd; +} +} // neverHasSideEffects + +// Multiply register +let isCommutable = 1 in +def tMUL : // A8.6.105 T1 + Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2, + IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd", + [(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>, + T1DataProcessing<0b1101> { + bits<3> Rd; + bits<3> Rn; + let Inst{5-3} = Rn; + let Inst{2-0} = Rd; + let AsmMatchConverter = "cvtThumbMultiply"; +} + +def :tInstAlias<"mul${s}${p} $Rdm, $Rn", (tMUL tGPR:$Rdm, s_cc_out:$s, tGPR:$Rn, + pred:$p)>; + +// Move inverse register +def tMVN : // A8.6.107 + T1sIDPEncode<0b1111, (outs tGPR:$Rd), (ins tGPR:$Rn), IIC_iMVNr, + "mvn", "\t$Rd, $Rn", + [(set tGPR:$Rd, (not tGPR:$Rn))]>; + +// Bitwise or register +let isCommutable = 1 in +def tORR : // A8.6.114 + T1sItDPEncode<0b1100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iBITr, + "orr", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>; + +// Swaps +def tREV : // A8.6.134 + T1pIMiscEncode<{1,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "rev", "\t$Rd, $Rm", + [(set tGPR:$Rd, (bswap tGPR:$Rm))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +def tREV16 : // A8.6.135 + T1pIMiscEncode<{1,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "rev16", "\t$Rd, $Rm", + [(set tGPR:$Rd, (rotr (bswap tGPR:$Rm), (i32 16)))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +def tREVSH : // A8.6.136 + T1pIMiscEncode<{1,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "revsh", "\t$Rd, $Rm", + [(set tGPR:$Rd, (sra (bswap tGPR:$Rm), (i32 16)))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +// Rotate right register +def tROR : // A8.6.139 + T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iMOVsr, + "ror", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>; + +// Negate register +def tRSB : // A8.6.141 + T1sIDPEncode<0b1001, (outs tGPR:$Rd), (ins tGPR:$Rn), + IIC_iALUi, + "rsb", "\t$Rd, $Rn, #0", + [(set tGPR:$Rd, (ineg tGPR:$Rn))]>; + +def : tInstAlias<"neg${s}${p} $Rd, $Rm", + (tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>; + +// Subtract with carry register +let Uses = [CPSR] in +def tSBC : // A8.6.151 + T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iALUr, + "sbc", "\t$Rdn, $Rm", + [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>; + +// Subtract immediate +def tSUBi3 : // A8.6.210 T1 + T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3), + IIC_iALUi, + "sub", "\t$Rd, $Rm, $imm3", + [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]> { + bits<3> imm3; + let Inst{8-6} = imm3; +} + +def tSUBi8 : // A8.6.210 T2 + T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn), + (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi, + "sub", "\t$Rdn, $imm8", + [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>; + +// Subtract register +def tSUBrr : // A8.6.212 + T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), + IIC_iALUr, + "sub", "\t$Rd, $Rn, $Rm", + [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>; + +// Sign-extend byte +def tSXTB : // A8.6.222 + T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "sxtb", "\t$Rd, $Rm", + [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i8))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +// Sign-extend short +def tSXTH : // A8.6.224 + T1pIMiscEncode<{0,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "sxth", "\t$Rd, $Rm", + [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i16))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +// Test +let isCompare = 1, isCommutable = 1, Defs = [CPSR] in +def tTST : // A8.6.230 + T1pIDPEncode<0b1000, (outs), (ins tGPR:$Rn, tGPR:$Rm), IIC_iTSTr, + "tst", "\t$Rn, $Rm", + [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>; + +// Zero-extend byte +def tUXTB : // A8.6.262 + T1pIMiscEncode<{0,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "uxtb", "\t$Rd, $Rm", + [(set tGPR:$Rd, (and tGPR:$Rm, 0xFF))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +// Zero-extend short +def tUXTH : // A8.6.264 + T1pIMiscEncode<{0,0,1,0,1,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), + IIC_iUNAr, + "uxth", "\t$Rd, $Rm", + [(set tGPR:$Rd, (and tGPR:$Rm, 0xFFFF))]>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation. +// Expanded after instruction selection into a branch sequence. +let usesCustomInserter = 1 in // Expanded after instruction selection. + def tMOVCCr_pseudo : + PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc), + NoItinerary, + [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>; + +// tLEApcrel - Load a pc-relative address into a register without offending the +// assembler. + +def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p), + IIC_iALUi, "adr{$p}\t$Rd, $addr", []>, + T1Encoding<{1,0,1,0,0,?}> { + bits<3> Rd; + bits<8> addr; + let Inst{10-8} = Rd; + let Inst{7-0} = addr; + let DecoderMethod = "DecodeThumbAddSpecialReg"; +} + +let neverHasSideEffects = 1, isReMaterializable = 1 in +def tLEApcrel : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p), + 2, IIC_iALUi, []>; + +def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), + (ins i32imm:$label, nohash_imm:$id, pred:$p), + 2, IIC_iALUi, []>; + +//===----------------------------------------------------------------------===// +// TLS Instructions +// + +// __aeabi_read_tp preserves the registers r1-r3. +// This is a pseudo inst so that we can get the encoding right, +// complete with fixup for the aeabi_read_tp function. +let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in +def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br, + [(set R0, ARMthread_pointer)]>; + +//===----------------------------------------------------------------------===// +// SJLJ Exception handling intrinsics +// + +// eh_sjlj_setjmp() is an instruction sequence to store the return address and +// save #0 in R0 for the non-longjmp case. Since by its nature we may be coming +// from some other function to get here, and we're using the stack frame for the +// containing function to save/restore registers, we can't keep anything live in +// regs across the eh_sjlj_setjmp(), else it will almost certainly have been +// tromped upon when we get here from a longjmp(). We force everything out of +// registers except for our own input by listing the relevant registers in +// Defs. By doing so, we also cause the prologue/epilogue code to actively +// preserve all of the callee-saved resgisters, which is exactly what we want. +// $val is a scratch register for our use. +let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12, CPSR ], + hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in +def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val), + AddrModeNone, 0, NoItinerary, "","", + [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>; + +// FIXME: Non-Darwin version(s) +let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1, + Defs = [ R7, LR, SP ] in +def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch), + AddrModeNone, 0, IndexModeNone, + Pseudo, NoItinerary, "", "", + [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, + Requires<[IsThumb, IsDarwin]>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +// + +// Comparisons +def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8), + (tCMPi8 tGPR:$Rn, imm0_255:$imm8)>; +def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm), + (tCMPr tGPR:$Rn, tGPR:$Rm)>; + +// Add with carry +def : T1Pat<(addc tGPR:$lhs, imm0_7:$rhs), + (tADDi3 tGPR:$lhs, imm0_7:$rhs)>; +def : T1Pat<(addc tGPR:$lhs, imm8_255:$rhs), + (tADDi8 tGPR:$lhs, imm8_255:$rhs)>; +def : T1Pat<(addc tGPR:$lhs, tGPR:$rhs), + (tADDrr tGPR:$lhs, tGPR:$rhs)>; + +// Subtract with carry +def : T1Pat<(addc tGPR:$lhs, imm0_7_neg:$rhs), + (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>; +def : T1Pat<(addc tGPR:$lhs, imm8_255_neg:$rhs), + (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>; +def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs), + (tSUBrr tGPR:$lhs, tGPR:$rhs)>; + +// ConstantPool, GlobalAddress +def : T1Pat<(ARMWrapper tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>; +def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>; + +// JumpTable +def : T1Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (tLEApcrelJT tjumptable:$dst, imm:$id)>; + +// Direct calls +def : T1Pat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>, + Requires<[IsThumb, IsNotDarwin]>; +def : T1Pat<(ARMtcall texternalsym:$func), (tBLr9 texternalsym:$func)>, + Requires<[IsThumb, IsDarwin]>; + +def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>, + Requires<[IsThumb, HasV5T, IsNotDarwin]>; +def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi_r9 texternalsym:$func)>, + Requires<[IsThumb, HasV5T, IsDarwin]>; + +// Indirect calls to ARM routines +def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>, + Requires<[IsThumb, HasV5T, IsNotDarwin]>; +def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr_r9 GPR:$dst)>, + Requires<[IsThumb, HasV5T, IsDarwin]>; + +// zextload i1 -> zextload i8 +def : T1Pat<(zextloadi1 t_addrmode_rrs1:$addr), + (tLDRBr t_addrmode_rrs1:$addr)>; +def : T1Pat<(zextloadi1 t_addrmode_is1:$addr), + (tLDRBi t_addrmode_is1:$addr)>; + +// extload -> zextload +def : T1Pat<(extloadi1 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>; +def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_rrs2:$addr), (tLDRHr t_addrmode_rrs2:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>; + +// If it's impossible to use [r,r] address mode for sextload, select to +// ldr{b|h} + sxt{b|h} instead. +def : T1Pat<(sextloadi8 t_addrmode_is1:$addr), + (tSXTB (tLDRBi t_addrmode_is1:$addr))>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; +def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr), + (tSXTB (tLDRBr t_addrmode_rrs1:$addr))>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; +def : T1Pat<(sextloadi16 t_addrmode_is2:$addr), + (tSXTH (tLDRHi t_addrmode_is2:$addr))>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; +def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr), + (tSXTH (tLDRHr t_addrmode_rrs2:$addr))>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr), + (tASRri (tLSLri (tLDRBr t_addrmode_rrs1:$addr), 24), 24)>; +def : T1Pat<(sextloadi8 t_addrmode_is1:$addr), + (tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>; +def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr), + (tASRri (tLSLri (tLDRHr t_addrmode_rrs2:$addr), 16), 16)>; +def : T1Pat<(sextloadi16 t_addrmode_is2:$addr), + (tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>; + +def : T1Pat<(atomic_load_8 t_addrmode_is1:$src), + (tLDRBi t_addrmode_is1:$src)>; +def : T1Pat<(atomic_load_8 t_addrmode_rrs1:$src), + (tLDRBr t_addrmode_rrs1:$src)>; +def : T1Pat<(atomic_load_16 t_addrmode_is2:$src), + (tLDRHi t_addrmode_is2:$src)>; +def : T1Pat<(atomic_load_16 t_addrmode_rrs2:$src), + (tLDRHr t_addrmode_rrs2:$src)>; +def : T1Pat<(atomic_load_32 t_addrmode_is4:$src), + (tLDRi t_addrmode_is4:$src)>; +def : T1Pat<(atomic_load_32 t_addrmode_rrs4:$src), + (tLDRr t_addrmode_rrs4:$src)>; +def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val), + (tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>; +def : T1Pat<(atomic_store_8 t_addrmode_rrs1:$ptr, tGPR:$val), + (tSTRBr tGPR:$val, t_addrmode_rrs1:$ptr)>; +def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val), + (tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>; +def : T1Pat<(atomic_store_16 t_addrmode_rrs2:$ptr, tGPR:$val), + (tSTRHr tGPR:$val, t_addrmode_rrs2:$ptr)>; +def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val), + (tSTRi tGPR:$val, t_addrmode_is4:$ptr)>; +def : T1Pat<(atomic_store_32 t_addrmode_rrs4:$ptr, tGPR:$val), + (tSTRr tGPR:$val, t_addrmode_rrs4:$ptr)>; + +// Large immediate handling. + +// Two piece imms. +def : T1Pat<(i32 thumb_immshifted:$src), + (tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)), + (thumb_immshifted_shamt imm:$src))>; + +def : T1Pat<(i32 imm0_255_comp:$src), + (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>; + +// Pseudo instruction that combines ldr from constpool and add pc. This should +// be expanded into two instructions late to allow if-conversion and +// scheduling. +let isReMaterializable = 1 in +def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), + NoItinerary, + [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), + imm:$cp))]>, + Requires<[IsThumb, IsThumb1Only]>; + +// Pseudo-instruction for merged POP and return. +// FIXME: remove when we have a way to marking a MI with these properties. +let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, + hasExtraDefRegAllocReq = 1 in +def tPOP_RET : tPseudoExpand<(outs), (ins pred:$p, reglist:$regs, variable_ops), + 2, IIC_iPop_Br, [], + (tPOP pred:$p, reglist:$regs)>; + +// Indirect branch using "mov pc, $Rm" +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def tBRIND : tPseudoExpand<(outs), (ins GPR:$Rm, pred:$p), + 2, IIC_Br, [(brind GPR:$Rm)], + (tMOVr PC, GPR:$Rm, pred:$p)>; +} + + +// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00 +// encoding is available on ARMv6K, but we don't differentiate that finely. +def : InstAlias<"nop", (tMOVr R8, R8, 14, 0)>,Requires<[IsThumb, IsThumb1Only]>; + + +// For round-trip assembly/disassembly, we have to handle a CPS instruction +// without any iflags. That's not, strictly speaking, valid syntax, but it's +// a useful extention and assembles to defined behaviour (the insn does +// nothing). +def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>; +def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td new file mode 100644 index 0000000..471ec29 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -0,0 +1,4022 @@ +//===- ARMInstrThumb2.td - Thumb2 support for ARM -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Thumb2 instruction set. +// +//===----------------------------------------------------------------------===// + +// IT block predicate field +def it_pred_asmoperand : AsmOperandClass { + let Name = "ITCondCode"; + let ParserMethod = "parseITCondCode"; +} +def it_pred : Operand<i32> { + let PrintMethod = "printMandatoryPredicateOperand"; + let ParserMatchClass = it_pred_asmoperand; +} + +// IT block condition mask +def it_mask_asmoperand : AsmOperandClass { let Name = "ITMask"; } +def it_mask : Operand<i32> { + let PrintMethod = "printThumbITMask"; + let ParserMatchClass = it_mask_asmoperand; +} + +// t2_shift_imm: An integer that encodes a shift amount and the type of shift +// (asr or lsl). The 6-bit immediate encodes as: +// {5} 0 ==> lsl +// 1 asr +// {4-0} imm5 shift amount. +// asr #32 not allowed +def t2_shift_imm : Operand<i32> { + let PrintMethod = "printShiftImmOperand"; + let ParserMatchClass = ShifterImmAsmOperand; + let DecoderMethod = "DecodeT2ShifterImmOperand"; +} + +// Shifted operands. No register controlled shifts for Thumb2. +// Note: We do not support rrx shifted operands yet. +def t2_so_reg : Operand<i32>, // reg imm + ComplexPattern<i32, 2, "SelectT2ShifterOperandReg", + [shl,srl,sra,rotr]> { + let EncoderMethod = "getT2SORegOpValue"; + let PrintMethod = "printT2SOOperand"; + let DecoderMethod = "DecodeSORegImmOperand"; + let ParserMatchClass = ShiftedImmAsmOperand; + let MIOperandInfo = (ops rGPR, i32imm); +} + +// t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value +def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32); +}]>; + +// t2_so_imm_neg_XFORM - Return the negation of a t2_so_imm value +def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(-((int)N->getZExtValue()), MVT::i32); +}]>; + +// t2_so_imm - Match a 32-bit immediate operand, which is an +// 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit +// immediate splatted into multiple bytes of the word. +def t2_so_imm_asmoperand : AsmOperandClass { let Name = "T2SOImm"; } +def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{ + return ARM_AM::getT2SOImmVal(Imm) != -1; + }]> { + let ParserMatchClass = t2_so_imm_asmoperand; + let EncoderMethod = "getT2SOImmOpValue"; + let DecoderMethod = "DecodeT2SOImm"; +} + +// t2_so_imm_not - Match an immediate that is a complement +// of a t2_so_imm. +def t2_so_imm_not : Operand<i32>, + PatLeaf<(imm), [{ + return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1; +}], t2_so_imm_not_XFORM>; + +// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm. +def t2_so_imm_neg : Operand<i32>, + PatLeaf<(imm), [{ + return ARM_AM::getT2SOImmVal(-((uint32_t)N->getZExtValue())) != -1; +}], t2_so_imm_neg_XFORM>; + +/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095]. +def imm0_4095 : Operand<i32>, + ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 4096; +}]>; + +def imm0_4095_neg : PatLeaf<(i32 imm), [{ + return (uint32_t)(-N->getZExtValue()) < 4096; +}], imm_neg_XFORM>; + +def imm0_255_neg : PatLeaf<(i32 imm), [{ + return (uint32_t)(-N->getZExtValue()) < 255; +}], imm_neg_XFORM>; + +def imm0_255_not : PatLeaf<(i32 imm), [{ + return (uint32_t)(~N->getZExtValue()) < 255; +}], imm_comp_XFORM>; + +def lo5AllOne : PatLeaf<(i32 imm), [{ + // Returns true if all low 5-bits are 1. + return (((uint32_t)N->getZExtValue()) & 0x1FUL) == 0x1FUL; +}]>; + +// Define Thumb2 specific addressing modes. + +// t2addrmode_imm12 := reg + imm12 +def t2addrmode_imm12_asmoperand : AsmOperandClass {let Name="MemUImm12Offset";} +def t2addrmode_imm12 : Operand<i32>, + ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> { + let PrintMethod = "printAddrModeImm12Operand"; + let EncoderMethod = "getAddrModeImm12OpValue"; + let DecoderMethod = "DecodeT2AddrModeImm12"; + let ParserMatchClass = t2addrmode_imm12_asmoperand; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +// t2ldrlabel := imm12 +def t2ldrlabel : Operand<i32> { + let EncoderMethod = "getAddrModeImm12OpValue"; + let PrintMethod = "printT2LdrLabelOperand"; +} + + +// ADR instruction labels. +def t2adrlabel : Operand<i32> { + let EncoderMethod = "getT2AdrLabelOpValue"; +} + + +// t2addrmode_posimm8 := reg + imm8 +def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";} +def t2addrmode_posimm8 : Operand<i32> { + let PrintMethod = "printT2AddrModeImm8Operand"; + let EncoderMethod = "getT2AddrModeImm8OpValue"; + let DecoderMethod = "DecodeT2AddrModeImm8"; + let ParserMatchClass = MemPosImm8OffsetAsmOperand; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +// t2addrmode_negimm8 := reg - imm8 +def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";} +def t2addrmode_negimm8 : Operand<i32>, + ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> { + let PrintMethod = "printT2AddrModeImm8Operand"; + let EncoderMethod = "getT2AddrModeImm8OpValue"; + let DecoderMethod = "DecodeT2AddrModeImm8"; + let ParserMatchClass = MemNegImm8OffsetAsmOperand; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +// t2addrmode_imm8 := reg +/- imm8 +def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; } +def t2addrmode_imm8 : Operand<i32>, + ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> { + let PrintMethod = "printT2AddrModeImm8Operand"; + let EncoderMethod = "getT2AddrModeImm8OpValue"; + let DecoderMethod = "DecodeT2AddrModeImm8"; + let ParserMatchClass = MemImm8OffsetAsmOperand; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +def t2am_imm8_offset : Operand<i32>, + ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset", + [], [SDNPWantRoot]> { + let PrintMethod = "printT2AddrModeImm8OffsetOperand"; + let EncoderMethod = "getT2AddrModeImm8OffsetOpValue"; + let DecoderMethod = "DecodeT2Imm8"; +} + +// t2addrmode_imm8s4 := reg +/- (imm8 << 2) +def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";} +def t2addrmode_imm8s4 : Operand<i32> { + let PrintMethod = "printT2AddrModeImm8s4Operand"; + let EncoderMethod = "getT2AddrModeImm8s4OpValue"; + let DecoderMethod = "DecodeT2AddrModeImm8s4"; + let ParserMatchClass = MemImm8s4OffsetAsmOperand; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +def t2am_imm8s4_offset_asmoperand : AsmOperandClass { let Name = "Imm8s4"; } +def t2am_imm8s4_offset : Operand<i32> { + let PrintMethod = "printT2AddrModeImm8s4OffsetOperand"; + let EncoderMethod = "getT2Imm8s4OpValue"; + let DecoderMethod = "DecodeT2Imm8S4"; +} + +// t2addrmode_imm0_1020s4 := reg + (imm8 << 2) +def MemImm0_1020s4OffsetAsmOperand : AsmOperandClass { + let Name = "MemImm0_1020s4Offset"; +} +def t2addrmode_imm0_1020s4 : Operand<i32> { + let PrintMethod = "printT2AddrModeImm0_1020s4Operand"; + let EncoderMethod = "getT2AddrModeImm0_1020s4OpValue"; + let DecoderMethod = "DecodeT2AddrModeImm0_1020s4"; + let ParserMatchClass = MemImm0_1020s4OffsetAsmOperand; + let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm); +} + +// t2addrmode_so_reg := reg + (reg << imm2) +def t2addrmode_so_reg_asmoperand : AsmOperandClass {let Name="T2MemRegOffset";} +def t2addrmode_so_reg : Operand<i32>, + ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> { + let PrintMethod = "printT2AddrModeSoRegOperand"; + let EncoderMethod = "getT2AddrModeSORegOpValue"; + let DecoderMethod = "DecodeT2AddrModeSOReg"; + let ParserMatchClass = t2addrmode_so_reg_asmoperand; + let MIOperandInfo = (ops GPR:$base, rGPR:$offsreg, i32imm:$offsimm); +} + +// Addresses for the TBB/TBH instructions. +def addrmode_tbb_asmoperand : AsmOperandClass { let Name = "MemTBB"; } +def addrmode_tbb : Operand<i32> { + let PrintMethod = "printAddrModeTBB"; + let ParserMatchClass = addrmode_tbb_asmoperand; + let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm); +} +def addrmode_tbh_asmoperand : AsmOperandClass { let Name = "MemTBH"; } +def addrmode_tbh : Operand<i32> { + let PrintMethod = "printAddrModeTBH"; + let ParserMatchClass = addrmode_tbh_asmoperand; + let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm); +} + +//===----------------------------------------------------------------------===// +// Multiclass helpers... +// + + +class T2OneRegImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<12> imm; + + let Inst{11-8} = Rd; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + + +class T2sOneRegImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + + let Inst{11-8} = Rd; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + +class T2OneRegCmpImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rn; + bits<12> imm; + + let Inst{19-16} = Rn; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + + +class T2OneRegShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<12> ShiftedRm; + + let Inst{11-8} = Rd; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2sOneRegShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<12> ShiftedRm; + + let Inst{11-8} = Rd; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2OneRegCmpShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rn; + bits<12> ShiftedRm; + + let Inst{19-16} = Rn; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2TwoReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; +} + +class T2sTwoReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; +} + +class T2TwoRegCmp<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rn; + bits<4> Rm; + + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; +} + + +class T2TwoRegImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + +class T2sTwoRegImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + +class T2TwoRegShiftImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + bits<5> imm; + + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; + let Inst{14-12} = imm{4-2}; + let Inst{7-6} = imm{1-0}; +} + +class T2sTwoRegShiftImm<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rm; + bits<5> imm; + + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; + let Inst{14-12} = imm{4-2}; + let Inst{7-6} = imm{1-0}; +} + +class T2ThreeReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; +} + +class T2sThreeReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; +} + +class T2TwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> ShiftedRm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2sTwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2sI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<12> ShiftedRm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = ShiftedRm{3-0}; + let Inst{5-4} = ShiftedRm{6-5}; + let Inst{14-12} = ShiftedRm{11-9}; + let Inst{7-6} = ShiftedRm{8-7}; +} + +class T2FourReg<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + bits<4> Ra; + + let Inst{19-16} = Rn; + let Inst{15-12} = Ra; + let Inst{11-8} = Rd; + let Inst{3-0} = Rm; +} + +class T2MulLong<bits<3> opc22_20, bits<4> opc7_4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> RdLo; + bits<4> RdHi; + bits<4> Rn; + bits<4> Rm; + + let Inst{31-23} = 0b111110111; + let Inst{22-20} = opc22_20; + let Inst{19-16} = Rn; + let Inst{15-12} = RdLo; + let Inst{11-8} = RdHi; + let Inst{7-4} = opc7_4; + let Inst{3-0} = Rm; +} + + +/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a +/// binary operation that produces a value. These are predicable and can be +/// changed to modify CPSR. +multiclass T2I_bin_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, string baseOpc, bit Commutable = 0, + string wide = ""> { + // shifted imm + def ri : T2sTwoRegImm< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), iii, + opc, "\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{15} = 0; + } + // register + def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), iir, + opc, !strconcat(wide, "\t$Rd, $Rn, $Rm"), + [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> { + let isCommutable = Commutable; + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2sTwoRegShiftedReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), iis, + opc, !strconcat(wide, "\t$Rd, $Rn, $ShiftedRm"), + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + } + // Assembly aliases for optional destination operand when it's the same + // as the source operand. + def : t2InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn, + t2_so_imm:$imm, pred:$p, + cc_out:$s)>; + def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $Rm"), + (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn, + rGPR:$Rm, pred:$p, + cc_out:$s)>; + def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $shift"), + (!cast<Instruction>(!strconcat(baseOpc, "rs")) rGPR:$Rdn, rGPR:$Rdn, + t2_so_reg:$shift, pred:$p, + cc_out:$s)>; +} + +/// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need +// the ".w" suffix to indicate that they are wide. +multiclass T2I_bin_w_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, string baseOpc, bit Commutable = 0> : + T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, baseOpc, Commutable, ".w"> { + // Assembler aliases w/o the ".w" suffix. + def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"), + (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rd, rGPR:$Rn, + rGPR:$Rm, pred:$p, + cc_out:$s)>; + def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $shift"), + (!cast<Instruction>(!strconcat(baseOpc, "rs")) rGPR:$Rd, rGPR:$Rn, + t2_so_reg:$shift, pred:$p, + cc_out:$s)>; + + // and with the optional destination operand, too. + def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"), + (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn, + rGPR:$Rm, pred:$p, + cc_out:$s)>; + def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $shift"), + (!cast<Instruction>(!strconcat(baseOpc, "rs")) rGPR:$Rdn, rGPR:$Rdn, + t2_so_reg:$shift, pred:$p, + cc_out:$s)>; +} + +/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are +/// reversed. The 'rr' form is only defined for the disassembler; for codegen +/// it is equivalent to the T2I_bin_irs counterpart. +multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> { + // shifted imm + def ri : T2sTwoRegImm< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi, + opc, ".w\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{15} = 0; + } + // register + def rr : T2sThreeReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr, + opc, "\t$Rd, $Rn, $Rm", + [/* For disassembly only; pattern left blank */]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2sTwoRegShiftedReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), + IIC_iALUsir, opc, "\t$Rd, $Rn, $ShiftedRm", + [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + } +} + +/// T2I_bin_s_irs - Similar to T2I_bin_irs except it sets the 's' bit so the +/// instruction modifies the CPSR register. +/// +/// These opcodes will be converted to the real non-S opcodes by +/// AdjustInstrPostInstrSelection after giving then an optional CPSR operand. +let hasPostISelHook = 1, isCodeGenOnly = 1, isPseudo = 1, Defs = [CPSR] in { +multiclass T2I_bin_s_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Commutable = 0> { + // shifted imm + def ri : T2sTwoRegImm< + (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm), iii, + opc, ".w\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, CPSR, (opnode GPR:$Rn, t2_so_imm:$imm))]>; + // register + def rr : T2sThreeReg< + (outs rGPR:$Rd), (ins GPR:$Rn, rGPR:$Rm), iir, + opc, ".w\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, CPSR, (opnode GPR:$Rn, rGPR:$Rm))]>; + // shifted register + def rs : T2sTwoRegShiftedReg< + (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_reg:$ShiftedRm), iis, + opc, ".w\t$Rd, $Rn, $ShiftedRm", + [(set rGPR:$Rd, CPSR, (opnode GPR:$Rn, t2_so_reg:$ShiftedRm))]>; +} +} + +/// T2I_bin_ii12rs - Defines a set of (op reg, {so_imm|imm0_4095|r|so_reg}) +/// patterns for a binary operation that produces a value. +multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode, + bit Commutable = 0> { + // shifted imm + // The register-immediate version is re-materializable. This is useful + // in particular for taking the address of a local. + let isReMaterializable = 1 in { + def ri : T2sTwoRegImm< + (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi, + opc, ".w\t$Rd, $Rn, $imm", + [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24} = 1; + let Inst{23-21} = op23_21; + let Inst{15} = 0; + } + } + // 12-bit imm + def ri12 : T2I< + (outs GPRnopc:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi, + !strconcat(opc, "w"), "\t$Rd, $Rn, $imm", + [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]> { + bits<4> Rd; + bits<4> Rn; + bits<12> imm; + let Inst{31-27} = 0b11110; + let Inst{26} = imm{11}; + let Inst{25-24} = 0b10; + let Inst{23-21} = op23_21; + let Inst{20} = 0; // The S bit. + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14-12} = imm{10-8}; + let Inst{11-8} = Rd; + let Inst{7-0} = imm{7-0}; + } + // register + def rr : T2sThreeReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm), + IIC_iALUr, opc, ".w\t$Rd, $Rn, $Rm", + [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, rGPR:$Rm))]> { + let isCommutable = Commutable; + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24} = 1; + let Inst{23-21} = op23_21; + let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2sTwoRegShiftedReg< + (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), + IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm", + [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24} = 1; + let Inst{23-21} = op23_21; + } +} + +/// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns +/// for a binary operation that produces a value and use the carry +/// bit. It's not predicable. +let Defs = [CPSR], Uses = [CPSR] in { +multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, + bit Commutable = 0> { + // shifted imm + def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), + IIC_iALUi, opc, "\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_imm:$imm, CPSR))]>, + Requires<[IsThumb2]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{15} = 0; + } + // register + def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr, + opc, ".w\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, rGPR:$Rm, CPSR))]>, + Requires<[IsThumb2]> { + let isCommutable = Commutable; + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2sTwoRegShiftedReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), + IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm", + [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm, CPSR))]>, + Requires<[IsThumb2]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + } +} +} + +/// T2I_rbin_s_is - Same as T2I_rbin_irs except sets 's' bit and the register +/// version is not needed since this is only for codegen. +/// +/// These opcodes will be converted to the real non-S opcodes by +/// AdjustInstrPostInstrSelection after giving then an optional CPSR operand. +let hasPostISelHook = 1, isCodeGenOnly = 1, isPseudo = 1, Defs = [CPSR] in { +multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> { + // shifted imm + def ri : T2sTwoRegImm< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi, + opc, ".w\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm, rGPR:$Rn))]>; + // shifted register + def rs : T2sTwoRegShiftedReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), + IIC_iALUsi, opc, "\t$Rd, $Rn, $ShiftedRm", + [(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]>; +} +} + +/// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift / +// rotate operation that produces a value. +multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode, + string baseOpc> { + // 5-bit imm + def ri : T2sTwoRegShiftImm< + (outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi, + opc, ".w\t$Rd, $Rm, $imm", + [(set rGPR:$Rd, (opnode rGPR:$Rm, (i32 ty:$imm)))]> { + let Inst{31-27} = 0b11101; + let Inst{26-21} = 0b010010; + let Inst{19-16} = 0b1111; // Rn + let Inst{5-4} = opcod; + } + // register + def rr : T2sThreeReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMOVsr, + opc, ".w\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-21} = opcod; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b0000; + } + + // Optional destination register + def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn, + ty:$imm, pred:$p, + cc_out:$s)>; + def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $Rm"), + (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn, + rGPR:$Rm, pred:$p, + cc_out:$s)>; + + // Assembler aliases w/o the ".w" suffix. + def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rd, rGPR:$Rn, + ty:$imm, pred:$p, + cc_out:$s)>; + def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"), + (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rd, rGPR:$Rn, + rGPR:$Rm, pred:$p, + cc_out:$s)>; + + // and with the optional destination operand, too. + def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn, + ty:$imm, pred:$p, + cc_out:$s)>; + def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"), + (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn, + rGPR:$Rm, pred:$p, + cc_out:$s)>; +} + +/// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test +/// patterns. Similar to T2I_bin_irs except the instruction does not produce +/// a explicit result, only implicitly set CPSR. +multiclass T2I_cmp_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, string baseOpc> { +let isCompare = 1, Defs = [CPSR] in { + // shifted imm + def ri : T2OneRegCmpImm< + (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), iii, + opc, ".w\t$Rn, $imm", + [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + let Inst{15} = 0; + let Inst{11-8} = 0b1111; // Rd + } + // register + def rr : T2TwoRegCmp< + (outs), (ins GPRnopc:$Rn, rGPR:$Rm), iir, + opc, ".w\t$Rn, $Rm", + [(opnode GPRnopc:$Rn, rGPR:$Rm)]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + let Inst{14-12} = 0b000; // imm3 + let Inst{11-8} = 0b1111; // Rd + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def rs : T2OneRegCmpShiftedReg< + (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), iis, + opc, ".w\t$Rn, $ShiftedRm", + [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{20} = 1; // The S bit. + let Inst{11-8} = 0b1111; // Rd + } +} + + // Assembler aliases w/o the ".w" suffix. + // No alias here for 'rr' version as not all instantiations of this + // multiclass want one (CMP in particular, does not). + def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPRnopc:$Rn, + t2_so_imm:$imm, pred:$p)>; + def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $shift"), + (!cast<Instruction>(!strconcat(baseOpc, "rs")) GPRnopc:$Rn, + t2_so_reg:$shift, + pred:$p)>; +} + +/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns. +multiclass T2I_ld<bit signed, bits<2> opcod, string opc, + InstrItinClass iii, InstrItinClass iis, RegisterClass target, + PatFrag opnode> { + def i12 : T2Ii12<(outs target:$Rt), (ins t2addrmode_imm12:$addr), iii, + opc, ".w\t$Rt, $addr", + [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]> { + bits<4> Rt; + bits<17> addr; + let Inst{31-25} = 0b1111100; + let Inst{24} = signed; + let Inst{23} = 1; + let Inst{22-21} = opcod; + let Inst{20} = 1; // load + let Inst{19-16} = addr{16-13}; // Rn + let Inst{15-12} = Rt; + let Inst{11-0} = addr{11-0}; // imm + } + def i8 : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii, + opc, "\t$Rt, $addr", + [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]> { + bits<4> Rt; + bits<13> addr; + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = opcod; + let Inst{20} = 1; // load + let Inst{19-16} = addr{12-9}; // Rn + let Inst{15-12} = Rt; + let Inst{11} = 1; + // Offset: index==TRUE, wback==FALSE + let Inst{10} = 1; // The P bit. + let Inst{9} = addr{8}; // U + let Inst{8} = 0; // The W bit. + let Inst{7-0} = addr{7-0}; // imm + } + def s : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis, + opc, ".w\t$Rt, $addr", + [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]> { + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = opcod; + let Inst{20} = 1; // load + let Inst{11-6} = 0b000000; + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<10> addr; + let Inst{19-16} = addr{9-6}; // Rn + let Inst{3-0} = addr{5-2}; // Rm + let Inst{5-4} = addr{1-0}; // imm + + let DecoderMethod = "DecodeT2LoadShift"; + } + + // FIXME: Is the pci variant actually needed? + def pci : T2Ipc <(outs target:$Rt), (ins t2ldrlabel:$addr), iii, + opc, ".w\t$Rt, $addr", + [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> { + let isReMaterializable = 1; + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = ?; // add = (U == '1') + let Inst{22-21} = opcod; + let Inst{20} = 1; // load + let Inst{19-16} = 0b1111; // Rn + bits<4> Rt; + bits<12> addr; + let Inst{15-12} = Rt{3-0}; + let Inst{11-0} = addr{11-0}; + } +} + +/// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns. +multiclass T2I_st<bits<2> opcod, string opc, + InstrItinClass iii, InstrItinClass iis, RegisterClass target, + PatFrag opnode> { + def i12 : T2Ii12<(outs), (ins target:$Rt, t2addrmode_imm12:$addr), iii, + opc, ".w\t$Rt, $addr", + [(opnode target:$Rt, t2addrmode_imm12:$addr)]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0001; + let Inst{22-21} = opcod; + let Inst{20} = 0; // !load + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<17> addr; + let addr{12} = 1; // add = TRUE + let Inst{19-16} = addr{16-13}; // Rn + let Inst{23} = addr{12}; // U + let Inst{11-0} = addr{11-0}; // imm + } + def i8 : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii, + opc, "\t$Rt, $addr", + [(opnode target:$Rt, t2addrmode_negimm8:$addr)]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0000; + let Inst{22-21} = opcod; + let Inst{20} = 0; // !load + let Inst{11} = 1; + // Offset: index==TRUE, wback==FALSE + let Inst{10} = 1; // The P bit. + let Inst{8} = 0; // The W bit. + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<13> addr; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{9} = addr{8}; // U + let Inst{7-0} = addr{7-0}; // imm + } + def s : T2Iso <(outs), (ins target:$Rt, t2addrmode_so_reg:$addr), iis, + opc, ".w\t$Rt, $addr", + [(opnode target:$Rt, t2addrmode_so_reg:$addr)]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0000; + let Inst{22-21} = opcod; + let Inst{20} = 0; // !load + let Inst{11-6} = 0b000000; + + bits<4> Rt; + let Inst{15-12} = Rt; + + bits<10> addr; + let Inst{19-16} = addr{9-6}; // Rn + let Inst{3-0} = addr{5-2}; // Rm + let Inst{5-4} = addr{1-0}; // imm + } +} + +/// T2I_ext_rrot - A unary operation with two forms: one whose operand is a +/// register and one whose operand is a register rotated by 8/16/24. +class T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode> + : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr, + opc, ".w\t$Rd, $Rm$rot", + [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>, + Requires<[IsThumb2]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + + bits<2> rot; + let Inst{5-4} = rot{1-0}; // rotate +} + +// UXTB16 - Requres T2ExtractPack, does not need the .w qualifier. +class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> + : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), + IIC_iEXTr, opc, "\t$Rd, $Rm$rot", + [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>, + Requires<[HasT2ExtractPack, IsThumb2]> { + bits<2> rot; + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = rot; +} + +// SXTB16 - Requres T2ExtractPack, does not need the .w qualifier, no pattern +// supported yet. +class T2I_ext_rrot_sxtb16<bits<3> opcod, string opc> + : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr, + opc, "\t$Rd, $Rm$rot", []>, + Requires<[IsThumb2, HasT2ExtractPack]> { + bits<2> rot; + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = rot; +} + +/// T2I_exta_rrot - A binary operation with two forms: one whose operand is a +/// register and one whose operand is a register rotated by 8/16/24. +class T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode> + : T2ThreeReg<(outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot), + IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", + [(set rGPR:$Rd, (opnode rGPR:$Rn, (rotr rGPR:$Rm,rot_imm:$rot)))]>, + Requires<[HasT2ExtractPack, IsThumb2]> { + bits<2> rot; + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = rot; +} + +class T2I_exta_rrot_np<bits<3> opcod, string opc> + : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm,rot_imm:$rot), + IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []> { + bits<2> rot; + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0100; + let Inst{22-20} = opcod; + let Inst{15-12} = 0b1111; + let Inst{7} = 1; + let Inst{5-4} = rot; +} + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +class T2PCOneRegImm<dag oops, dag iops, InstrItinClass itin, + string asm, list<dag> pattern> + : T2XI<oops, iops, itin, asm, pattern> { + bits<4> Rd; + bits<12> label; + + let Inst{11-8} = Rd; + let Inst{26} = label{11}; + let Inst{14-12} = label{10-8}; + let Inst{7-0} = label{7-0}; +} + +// LEApcrel - Load a pc-relative address into a register without offending the +// assembler. +def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd), + (ins t2adrlabel:$addr, pred:$p), + IIC_iALUi, "adr{$p}.w\t$Rd, $addr", []> { + let Inst{31-27} = 0b11110; + let Inst{25-24} = 0b10; + // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE) + let Inst{22} = 0; + let Inst{20} = 0; + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; + + bits<4> Rd; + bits<13> addr; + let Inst{11-8} = Rd; + let Inst{23} = addr{12}; + let Inst{21} = addr{12}; + let Inst{26} = addr{11}; + let Inst{14-12} = addr{10-8}; + let Inst{7-0} = addr{7-0}; + + let DecoderMethod = "DecodeT2Adr"; +} + +let neverHasSideEffects = 1, isReMaterializable = 1 in +def t2LEApcrel : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p), + 4, IIC_iALUi, []>; +def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd), + (ins i32imm:$label, nohash_imm:$id, pred:$p), + 4, IIC_iALUi, + []>; + + +//===----------------------------------------------------------------------===// +// Load / store Instructions. +// + +// Load +let canFoldAsLoad = 1, isReMaterializable = 1 in +defm t2LDR : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR, + UnOpFrag<(load node:$Src)>>; + +// Loads with zero extension +defm t2LDRH : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si, + rGPR, UnOpFrag<(zextloadi16 node:$Src)>>; +defm t2LDRB : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si, + rGPR, UnOpFrag<(zextloadi8 node:$Src)>>; + +// Loads with sign extension +defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si, + rGPR, UnOpFrag<(sextloadi16 node:$Src)>>; +defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si, + rGPR, UnOpFrag<(sextloadi8 node:$Src)>>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { +// Load doubleword +def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2), + (ins t2addrmode_imm8s4:$addr), + IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>; +} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 + +// zextload i1 -> zextload i8 +def : T2Pat<(zextloadi1 t2addrmode_imm12:$addr), + (t2LDRBi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(zextloadi1 t2addrmode_negimm8:$addr), + (t2LDRBi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(zextloadi1 t2addrmode_so_reg:$addr), + (t2LDRBs t2addrmode_so_reg:$addr)>; +def : T2Pat<(zextloadi1 (ARMWrapper tconstpool:$addr)), + (t2LDRBpci tconstpool:$addr)>; + +// extload -> zextload +// FIXME: Reduce the number of patterns by legalizing extload to zextload +// earlier? +def : T2Pat<(extloadi1 t2addrmode_imm12:$addr), + (t2LDRBi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(extloadi1 t2addrmode_negimm8:$addr), + (t2LDRBi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(extloadi1 t2addrmode_so_reg:$addr), + (t2LDRBs t2addrmode_so_reg:$addr)>; +def : T2Pat<(extloadi1 (ARMWrapper tconstpool:$addr)), + (t2LDRBpci tconstpool:$addr)>; + +def : T2Pat<(extloadi8 t2addrmode_imm12:$addr), + (t2LDRBi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(extloadi8 t2addrmode_negimm8:$addr), + (t2LDRBi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(extloadi8 t2addrmode_so_reg:$addr), + (t2LDRBs t2addrmode_so_reg:$addr)>; +def : T2Pat<(extloadi8 (ARMWrapper tconstpool:$addr)), + (t2LDRBpci tconstpool:$addr)>; + +def : T2Pat<(extloadi16 t2addrmode_imm12:$addr), + (t2LDRHi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(extloadi16 t2addrmode_negimm8:$addr), + (t2LDRHi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(extloadi16 t2addrmode_so_reg:$addr), + (t2LDRHs t2addrmode_so_reg:$addr)>; +def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)), + (t2LDRHpci tconstpool:$addr)>; + +// FIXME: The destination register of the loads and stores can't be PC, but +// can be SP. We need another regclass (similar to rGPR) to represent +// that. Not a pressing issue since these are selected manually, +// not via pattern. + +// Indexed loads + +let mayLoad = 1, neverHasSideEffects = 1 in { +def t2LDR_PRE : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoad_iu, + "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", + []> { + let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8"; +} + +def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoad_iu, + "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>; + +def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu, + "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", + []> { + let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8"; +} +def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, + "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>; + +def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu, + "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", + []> { + let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8"; +} +def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, + "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>; + +def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu, + "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", + []> { + let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8"; +} +def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, + "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>; + +def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), + (ins t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu, + "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", + []> { + let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8"; +} +def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), + (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, + "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>; +} // mayLoad = 1, neverHasSideEffects = 1 + +// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110). +// Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4 +class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii> + : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc, + "\t$Rt, $addr", []> { + bits<4> Rt; + bits<13> addr; + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = signed; + let Inst{23} = 0; + let Inst{22-21} = type; + let Inst{20} = 1; // load + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = Rt; + let Inst{11} = 1; + let Inst{10-8} = 0b110; // PUW. + let Inst{7-0} = addr{7-0}; +} + +def t2LDRT : T2IldT<0, 0b10, "ldrt", IIC_iLoad_i>; +def t2LDRBT : T2IldT<0, 0b00, "ldrbt", IIC_iLoad_bh_i>; +def t2LDRHT : T2IldT<0, 0b01, "ldrht", IIC_iLoad_bh_i>; +def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>; +def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>; + +// Store +defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR, + BinOpFrag<(store node:$LHS, node:$RHS)>>; +defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si, + rGPR, BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; +defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si, + rGPR, BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; + +// Store doubleword +let mayLoad = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in +def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs), + (ins GPR:$Rt, GPR:$Rt2, t2addrmode_imm8s4:$addr), + IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>; + +// Indexed stores +def t2STR_PRE : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iStore_iu, + "str", "\t$Rt, $addr!", + "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { + let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8"; +} +def t2STRH_PRE : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iStore_iu, + "strh", "\t$Rt, $addr!", + "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { + let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8"; +} + +def t2STRB_PRE : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, t2addrmode_imm8:$addr), + AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu, + "strb", "\t$Rt, $addr!", + "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> { + let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8"; +} + +def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, addr_offset_none:$Rn, + t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iStore_iu, + "str", "\t$Rt, $Rn$offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", + [(set GPRnopc:$Rn_wb, + (post_store rGPR:$Rt, addr_offset_none:$Rn, + t2am_imm8_offset:$offset))]>; + +def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, addr_offset_none:$Rn, + t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu, + "strh", "\t$Rt, $Rn$offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", + [(set GPRnopc:$Rn_wb, + (post_truncsti16 rGPR:$Rt, addr_offset_none:$Rn, + t2am_imm8_offset:$offset))]>; + +def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, addr_offset_none:$Rn, + t2am_imm8_offset:$offset), + AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu, + "strb", "\t$Rt, $Rn$offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", + [(set GPRnopc:$Rn_wb, + (post_truncsti8 rGPR:$Rt, addr_offset_none:$Rn, + t2am_imm8_offset:$offset))]>; + +// Pseudo-instructions for pattern matching the pre-indexed stores. We can't +// put the patterns on the instruction definitions directly as ISel wants +// the address base and offset to be separate operands, not a single +// complex operand like we represent the instructions themselves. The +// pseudos map between the two. +let usesCustomInserter = 1, + Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in { +def t2STR_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPRnopc:$Rn_wb, + (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>; +def t2STRB_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPRnopc:$Rn_wb, + (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>; +def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb), + (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p), + 4, IIC_iStore_ru, + [(set GPRnopc:$Rn_wb, + (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>; +} + + +// STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly +// only. +// Ref: A8.6.193 STR (immediate, Thumb) Encoding T4 +class T2IstT<bits<2> type, string opc, InstrItinClass ii> + : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, + "\t$Rt, $addr", []> { + let Inst{31-27} = 0b11111; + let Inst{26-25} = 0b00; + let Inst{24} = 0; // not signed + let Inst{23} = 0; + let Inst{22-21} = type; + let Inst{20} = 0; // store + let Inst{11} = 1; + let Inst{10-8} = 0b110; // PUW + + bits<4> Rt; + bits<13> addr; + let Inst{15-12} = Rt; + let Inst{19-16} = addr{12-9}; + let Inst{7-0} = addr{7-0}; +} + +def t2STRT : T2IstT<0b10, "strt", IIC_iStore_i>; +def t2STRBT : T2IstT<0b00, "strbt", IIC_iStore_bh_i>; +def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>; + +// ldrd / strd pre / post variants +// For disassembly only. + +def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb), + (ins t2addrmode_imm8s4:$addr), IIC_iLoad_d_ru, + "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> { + let AsmMatchConverter = "cvtT2LdrdPre"; + let DecoderMethod = "DecodeT2LDRDPreInstruction"; +} + +def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb), + (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm), + IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm", + "$addr.base = $wb", []>; + +def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb), + (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr), + IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!", + "$addr.base = $wb", []> { + let AsmMatchConverter = "cvtT2StrdPre"; + let DecoderMethod = "DecodeT2STRDPreInstruction"; +} + +def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb), + (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr, + t2am_imm8s4_offset:$imm), + IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm", + "$addr.base = $wb", []>; + +// T2Ipl (Preload Data/Instruction) signals the memory system of possible future +// data/instruction access. These are for disassembly only. +// instr_write is inverted for Thumb mode: (prefetch 3) -> (preload 0), +// (prefetch 1) -> (preload 2), (prefetch 2) -> (preload 1). +multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> { + + def i12 : T2Ii12<(outs), (ins t2addrmode_imm12:$addr), IIC_Preload, opc, + "\t$addr", + [(ARMPreload t2addrmode_imm12:$addr, (i32 write), (i32 instr))]> { + let Inst{31-25} = 0b1111100; + let Inst{24} = instr; + let Inst{22} = 0; + let Inst{21} = write; + let Inst{20} = 1; + let Inst{15-12} = 0b1111; + + bits<17> addr; + let addr{12} = 1; // add = TRUE + let Inst{19-16} = addr{16-13}; // Rn + let Inst{23} = addr{12}; // U + let Inst{11-0} = addr{11-0}; // imm12 + } + + def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc, + "\t$addr", + [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]> { + let Inst{31-25} = 0b1111100; + let Inst{24} = instr; + let Inst{23} = 0; // U = 0 + let Inst{22} = 0; + let Inst{21} = write; + let Inst{20} = 1; + let Inst{15-12} = 0b1111; + let Inst{11-8} = 0b1100; + + bits<13> addr; + let Inst{19-16} = addr{12-9}; // Rn + let Inst{7-0} = addr{7-0}; // imm8 + } + + def s : T2Iso<(outs), (ins t2addrmode_so_reg:$addr), IIC_Preload, opc, + "\t$addr", + [(ARMPreload t2addrmode_so_reg:$addr, (i32 write), (i32 instr))]> { + let Inst{31-25} = 0b1111100; + let Inst{24} = instr; + let Inst{23} = 0; // add = TRUE for T1 + let Inst{22} = 0; + let Inst{21} = write; + let Inst{20} = 1; + let Inst{15-12} = 0b1111; + let Inst{11-6} = 0000000; + + bits<10> addr; + let Inst{19-16} = addr{9-6}; // Rn + let Inst{3-0} = addr{5-2}; // Rm + let Inst{5-4} = addr{1-0}; // imm2 + + let DecoderMethod = "DecodeT2LoadShift"; + } +} + +defm t2PLD : T2Ipl<0, 0, "pld">, Requires<[IsThumb2]>; +defm t2PLDW : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>; +defm t2PLI : T2Ipl<0, 1, "pli">, Requires<[IsThumb2,HasV7]>; + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// + +multiclass thumb2_ld_mult<string asm, InstrItinClass itin, + InstrItinClass itin_upd, bit L_bit> { + def IA : + T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b01; // Increment After + let Inst{22} = 0; + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14-0} = regs{14-0}; + } + def IA_UPD : + T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b01; // Increment After + let Inst{22} = 0; + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14-0} = regs{14-0}; + } + def DB : + T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b10; // Decrement Before + let Inst{22} = 0; + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14-0} = regs{14-0}; + } + def DB_UPD : + T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b10; // Decrement Before + let Inst{22} = 0; + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14-0} = regs{14-0}; + } +} + +let neverHasSideEffects = 1 in { + +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +defm t2LDM : thumb2_ld_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>; + +multiclass thumb2_st_mult<string asm, InstrItinClass itin, + InstrItinClass itin_upd, bit L_bit> { + def IA : + T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b01; // Increment After + let Inst{22} = 0; + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14} = regs{14}; + let Inst{13} = 0; + let Inst{12-0} = regs{12-0}; + } + def IA_UPD : + T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b01; // Increment After + let Inst{22} = 0; + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14} = regs{14}; + let Inst{13} = 0; + let Inst{12-0} = regs{12-0}; + } + def DB : + T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b10; // Decrement Before + let Inst{22} = 0; + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14} = regs{14}; + let Inst{13} = 0; + let Inst{12-0} = regs{12-0}; + } + def DB_UPD : + T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops), + itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + bits<4> Rn; + bits<16> regs; + + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b00; + let Inst{24-23} = 0b10; // Decrement Before + let Inst{22} = 0; + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + let Inst{19-16} = Rn; + let Inst{15} = 0; + let Inst{14} = regs{14}; + let Inst{13} = 0; + let Inst{12-0} = regs{12-0}; + } +} + + +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>; + +} // neverHasSideEffects + + +//===----------------------------------------------------------------------===// +// Move Instructions. +// + +let neverHasSideEffects = 1 in +def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr, + "mov", ".w\t$Rd, $Rm", []> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{19-16} = 0b1111; // Rn + let Inst{14-12} = 0b000; + let Inst{7-4} = 0b0000; +} +def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm, + pred:$p, CPSR)>; +def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm, + pred:$p, CPSR)>; + +// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16. +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1, + AddedComplexity = 1 in +def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi, + "mov", ".w\t$Rd, $imm", + [(set rGPR:$Rd, t2_so_imm:$imm)]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = 0b0010; + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; +} + +// cc_out is handled as part of the explicit mnemonic in the parser for 'mov'. +// Use aliases to get that to play nice here. +def : t2InstAlias<"movs${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm, + pred:$p, CPSR)>; +def : t2InstAlias<"movs${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm, + pred:$p, CPSR)>; + +def : t2InstAlias<"mov${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm, + pred:$p, zero_reg)>; +def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm, + pred:$p, zero_reg)>; + +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in +def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi, + "movw", "\t$Rd, $imm", + [(set rGPR:$Rd, imm0_65535:$imm)]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-21} = 0b0010; + let Inst{20} = 0; // The S bit. + let Inst{15} = 0; + + bits<4> Rd; + bits<16> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = imm{15-12}; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; + let DecoderMethod = "DecodeT2MOVTWInstruction"; +} + +def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd), + (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>; + +let Constraints = "$src = $Rd" in { +def t2MOVTi16 : T2I<(outs rGPR:$Rd), + (ins rGPR:$src, imm0_65535_expr:$imm), IIC_iMOVi, + "movt", "\t$Rd, $imm", + [(set rGPR:$Rd, + (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-21} = 0b0110; + let Inst{20} = 0; // The S bit. + let Inst{15} = 0; + + bits<4> Rd; + bits<16> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = imm{15-12}; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; + let DecoderMethod = "DecodeT2MOVTWInstruction"; +} + +def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd), + (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>; +} // Constraints + +def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>; + +//===----------------------------------------------------------------------===// +// Extend Instructions. +// + +// Sign extenders + +def t2SXTB : T2I_ext_rrot<0b100, "sxtb", + UnOpFrag<(sext_inreg node:$Src, i8)>>; +def t2SXTH : T2I_ext_rrot<0b000, "sxth", + UnOpFrag<(sext_inreg node:$Src, i16)>>; +def t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">; + +def t2SXTAB : T2I_exta_rrot<0b100, "sxtab", + BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; +def t2SXTAH : T2I_exta_rrot<0b000, "sxtah", + BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>; +def t2SXTAB16 : T2I_exta_rrot_np<0b010, "sxtab16">; + +// Zero extenders + +let AddedComplexity = 16 in { +def t2UXTB : T2I_ext_rrot<0b101, "uxtb", + UnOpFrag<(and node:$Src, 0x000000FF)>>; +def t2UXTH : T2I_ext_rrot<0b001, "uxth", + UnOpFrag<(and node:$Src, 0x0000FFFF)>>; +def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16", + UnOpFrag<(and node:$Src, 0x00FF00FF)>>; + +// FIXME: This pattern incorrectly assumes the shl operator is a rotate. +// The transformation should probably be done as a combiner action +// instead so we can include a check for masking back in the upper +// eight bits of the source into the lower eight bits of the result. +//def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF), +// (t2UXTB16 rGPR:$Src, 3)>, +// Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF), + (t2UXTB16 rGPR:$Src, 1)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +def t2UXTAB : T2I_exta_rrot<0b101, "uxtab", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; +def t2UXTAH : T2I_exta_rrot<0b001, "uxtah", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>; +def t2UXTAB16 : T2I_exta_rrot_np<0b011, "uxtab16">; +} + +//===----------------------------------------------------------------------===// +// Arithmetic Instructions. +// + +defm t2ADD : T2I_bin_ii12rs<0b000, "add", + BinOpFrag<(add node:$LHS, node:$RHS)>, 1>; +defm t2SUB : T2I_bin_ii12rs<0b101, "sub", + BinOpFrag<(sub node:$LHS, node:$RHS)>>; + +// ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants. +// +// Currently, t2ADDS/t2SUBS are pseudo opcodes that exist only in the +// selection DAG. They are "lowered" to real t2ADD/t2SUB opcodes by +// AdjustInstrPostInstrSelection where we determine whether or not to +// set the "s" bit based on CPSR liveness. +// +// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen +// support for an optional CPSR definition that corresponds to the DAG +// node's second value. We can then eliminate the implicit def of CPSR. +defm t2ADDS : T2I_bin_s_irs <0b1000, "add", + IIC_iALUi, IIC_iALUr, IIC_iALUsi, + BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>; +defm t2SUBS : T2I_bin_s_irs <0b1101, "sub", + IIC_iALUi, IIC_iALUr, IIC_iALUsi, + BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>; + +let hasPostISelHook = 1 in { +defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", + BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>; +defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", + BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>; +} + +// RSB +defm t2RSB : T2I_rbin_irs <0b1110, "rsb", + BinOpFrag<(sub node:$LHS, node:$RHS)>>; + +// FIXME: Eliminate them if we can write def : Pat patterns which defines +// CPSR and the implicit def of CPSR is not needed. +defm t2RSBS : T2I_rbin_s_is <0b1110, "rsb", + BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>; + +// (sub X, imm) gets canonicalized to (add X, -imm). Match this form. +// The assume-no-carry-in form uses the negation of the input since add/sub +// assume opposite meanings of the carry flag (i.e., carry == !borrow). +// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory +// details. +// The AddedComplexity preferences the first variant over the others since +// it can be shrunk to a 16-bit wide encoding, while the others cannot. +let AddedComplexity = 1 in +def : T2Pat<(add GPR:$src, imm0_255_neg:$imm), + (t2SUBri GPR:$src, imm0_255_neg:$imm)>; +def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm), + (t2SUBri GPR:$src, t2_so_imm_neg:$imm)>; +def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm), + (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>; +let AddedComplexity = 1 in +def : T2Pat<(ARMaddc rGPR:$src, imm0_255_neg:$imm), + (t2SUBSri rGPR:$src, imm0_255_neg:$imm)>; +def : T2Pat<(ARMaddc rGPR:$src, t2_so_imm_neg:$imm), + (t2SUBSri rGPR:$src, t2_so_imm_neg:$imm)>; +// The with-carry-in form matches bitwise not instead of the negation. +// Effectively, the inverse interpretation of the carry flag already accounts +// for part of the negation. +let AddedComplexity = 1 in +def : T2Pat<(ARMadde rGPR:$src, imm0_255_not:$imm, CPSR), + (t2SBCri rGPR:$src, imm0_255_not:$imm)>; +def : T2Pat<(ARMadde rGPR:$src, t2_so_imm_not:$imm, CPSR), + (t2SBCri rGPR:$src, t2_so_imm_not:$imm)>; + +// Select Bytes -- for disassembly only + +def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-24} = 0b010; + let Inst{23} = 0b1; + let Inst{22-20} = 0b010; + let Inst{15-12} = 0b1111; + let Inst{7} = 0b1; + let Inst{6-4} = 0b000; +} + +// A6.3.13, A6.3.14, A6.3.15 Parallel addition and subtraction (signed/unsigned) +// And Miscellaneous operations -- for disassembly only +class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc, + list<dag> pat = [/* For disassembly only; pattern left blank */], + dag iops = (ins rGPR:$Rn, rGPR:$Rm), + string asm = "\t$Rd, $Rn, $Rm"> + : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0101; + let Inst{22-20} = op22_20; + let Inst{15-12} = 0b1111; + let Inst{7-4} = op7_4; + + bits<4> Rd; + bits<4> Rn; + bits<4> Rm; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{3-0} = Rm; +} + +// Saturating add/subtract -- for disassembly only + +def t2QADD : T2I_pam<0b000, 0b1000, "qadd", + [(set rGPR:$Rd, (int_arm_qadd rGPR:$Rn, rGPR:$Rm))], + (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">; +def t2QADD16 : T2I_pam<0b001, 0b0001, "qadd16">; +def t2QADD8 : T2I_pam<0b000, 0b0001, "qadd8">; +def t2QASX : T2I_pam<0b010, 0b0001, "qasx">; +def t2QDADD : T2I_pam<0b000, 0b1001, "qdadd", [], + (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">; +def t2QDSUB : T2I_pam<0b000, 0b1011, "qdsub", [], + (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">; +def t2QSAX : T2I_pam<0b110, 0b0001, "qsax">; +def t2QSUB : T2I_pam<0b000, 0b1010, "qsub", + [(set rGPR:$Rd, (int_arm_qsub rGPR:$Rn, rGPR:$Rm))], + (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">; +def t2QSUB16 : T2I_pam<0b101, 0b0001, "qsub16">; +def t2QSUB8 : T2I_pam<0b100, 0b0001, "qsub8">; +def t2UQADD16 : T2I_pam<0b001, 0b0101, "uqadd16">; +def t2UQADD8 : T2I_pam<0b000, 0b0101, "uqadd8">; +def t2UQASX : T2I_pam<0b010, 0b0101, "uqasx">; +def t2UQSAX : T2I_pam<0b110, 0b0101, "uqsax">; +def t2UQSUB16 : T2I_pam<0b101, 0b0101, "uqsub16">; +def t2UQSUB8 : T2I_pam<0b100, 0b0101, "uqsub8">; + +// Signed/Unsigned add/subtract -- for disassembly only + +def t2SASX : T2I_pam<0b010, 0b0000, "sasx">; +def t2SADD16 : T2I_pam<0b001, 0b0000, "sadd16">; +def t2SADD8 : T2I_pam<0b000, 0b0000, "sadd8">; +def t2SSAX : T2I_pam<0b110, 0b0000, "ssax">; +def t2SSUB16 : T2I_pam<0b101, 0b0000, "ssub16">; +def t2SSUB8 : T2I_pam<0b100, 0b0000, "ssub8">; +def t2UASX : T2I_pam<0b010, 0b0100, "uasx">; +def t2UADD16 : T2I_pam<0b001, 0b0100, "uadd16">; +def t2UADD8 : T2I_pam<0b000, 0b0100, "uadd8">; +def t2USAX : T2I_pam<0b110, 0b0100, "usax">; +def t2USUB16 : T2I_pam<0b101, 0b0100, "usub16">; +def t2USUB8 : T2I_pam<0b100, 0b0100, "usub8">; + +// Signed/Unsigned halving add/subtract -- for disassembly only + +def t2SHASX : T2I_pam<0b010, 0b0010, "shasx">; +def t2SHADD16 : T2I_pam<0b001, 0b0010, "shadd16">; +def t2SHADD8 : T2I_pam<0b000, 0b0010, "shadd8">; +def t2SHSAX : T2I_pam<0b110, 0b0010, "shsax">; +def t2SHSUB16 : T2I_pam<0b101, 0b0010, "shsub16">; +def t2SHSUB8 : T2I_pam<0b100, 0b0010, "shsub8">; +def t2UHASX : T2I_pam<0b010, 0b0110, "uhasx">; +def t2UHADD16 : T2I_pam<0b001, 0b0110, "uhadd16">; +def t2UHADD8 : T2I_pam<0b000, 0b0110, "uhadd8">; +def t2UHSAX : T2I_pam<0b110, 0b0110, "uhsax">; +def t2UHSUB16 : T2I_pam<0b101, 0b0110, "uhsub16">; +def t2UHSUB8 : T2I_pam<0b100, 0b0110, "uhsub8">; + +// Helper class for disassembly only +// A6.3.16 & A6.3.17 +// T2Imac - Thumb2 multiply [accumulate, and absolute difference] instructions. +class T2ThreeReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> + : T2ThreeReg<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11111; + let Inst{26-24} = 0b011; + let Inst{23} = long; + let Inst{22-20} = op22_20; + let Inst{7-4} = op7_4; +} + +class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> + : T2FourReg<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11111; + let Inst{26-24} = 0b011; + let Inst{23} = long; + let Inst{22-20} = op22_20; + let Inst{7-4} = op7_4; +} + +// Unsigned Sum of Absolute Differences [and Accumulate]. +def t2USAD8 : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm), + NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{15-12} = 0b1111; +} +def t2USADA8 : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary, + "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasThumb2DSP]>; + +// Signed/Unsigned saturate. +class T2SatI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<4> Rn; + bits<5> sat_imm; + bits<7> sh; + + let Inst{11-8} = Rd; + let Inst{19-16} = Rn; + let Inst{4-0} = sat_imm; + let Inst{21} = sh{5}; + let Inst{14-12} = sh{4-2}; + let Inst{7-6} = sh{1-0}; +} + +def t2SSAT: T2SatI< + (outs rGPR:$Rd), + (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), + NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1100; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{5} = 0; +} + +def t2SSAT16: T2SatI< + (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary, + "ssat16", "\t$Rd, $sat_imm, $Rn", []>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1100; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 1; // sh = '1' + let Inst{14-12} = 0b000; // imm3 = '000' + let Inst{7-6} = 0b00; // imm2 = '00' + let Inst{5-4} = 0b00; +} + +def t2USAT: T2SatI< + (outs rGPR:$Rd), + (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), + NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> { + let Inst{31-27} = 0b11110; + let Inst{25-22} = 0b1110; + let Inst{20} = 0; + let Inst{15} = 0; +} + +def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn), + NoItinerary, + "usat16", "\t$Rd, $sat_imm, $Rn", []>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-22} = 0b1111001110; + let Inst{20} = 0; + let Inst{15} = 0; + let Inst{21} = 1; // sh = '1' + let Inst{14-12} = 0b000; // imm3 = '000' + let Inst{7-6} = 0b00; // imm2 = '00' + let Inst{5-4} = 0b00; +} + +def : T2Pat<(int_arm_ssat GPR:$a, imm:$pos), (t2SSAT imm:$pos, GPR:$a, 0)>; +def : T2Pat<(int_arm_usat GPR:$a, imm:$pos), (t2USAT imm:$pos, GPR:$a, 0)>; + +//===----------------------------------------------------------------------===// +// Shift and rotate Instructions. +// + +defm t2LSL : T2I_sh_ir<0b00, "lsl", imm0_31, + BinOpFrag<(shl node:$LHS, node:$RHS)>, "t2LSL">; +defm t2LSR : T2I_sh_ir<0b01, "lsr", imm_sr, + BinOpFrag<(srl node:$LHS, node:$RHS)>, "t2LSR">; +defm t2ASR : T2I_sh_ir<0b10, "asr", imm_sr, + BinOpFrag<(sra node:$LHS, node:$RHS)>, "t2ASR">; +defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31, + BinOpFrag<(rotr node:$LHS, node:$RHS)>, "t2ROR">; + +// (rotr x, (and y, 0x...1f)) ==> (ROR x, y) +def : Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)), + (t2RORrr rGPR:$lhs, rGPR:$rhs)>; + +let Uses = [CPSR] in { +def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, + "rrx", "\t$Rd, $Rm", + [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{19-16} = 0b1111; // Rn + let Inst{14-12} = 0b000; + let Inst{7-4} = 0b0011; +} +} + +let isCodeGenOnly = 1, Defs = [CPSR] in { +def t2MOVsrl_flag : T2TwoRegShiftImm< + (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, + "lsrs", ".w\t$Rd, $Rm, #1", + [(set rGPR:$Rd, (ARMsrl_flag rGPR:$Rm))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{20} = 1; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{5-4} = 0b01; // Shift type. + // Shift amount = Inst{14-12:7-6} = 1. + let Inst{14-12} = 0b000; + let Inst{7-6} = 0b01; +} +def t2MOVsra_flag : T2TwoRegShiftImm< + (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, + "asrs", ".w\t$Rd, $Rm, #1", + [(set rGPR:$Rd, (ARMsra_flag rGPR:$Rm))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{20} = 1; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{5-4} = 0b10; // Shift type. + // Shift amount = Inst{14-12:7-6} = 1. + let Inst{14-12} = 0b000; + let Inst{7-6} = 0b01; +} +} + +//===----------------------------------------------------------------------===// +// Bitwise Instructions. +// + +defm t2AND : T2I_bin_w_irs<0b0000, "and", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, + BinOpFrag<(and node:$LHS, node:$RHS)>, "t2AND", 1>; +defm t2ORR : T2I_bin_w_irs<0b0010, "orr", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, + BinOpFrag<(or node:$LHS, node:$RHS)>, "t2ORR", 1>; +defm t2EOR : T2I_bin_w_irs<0b0100, "eor", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, + BinOpFrag<(xor node:$LHS, node:$RHS)>, "t2EOR", 1>; + +defm t2BIC : T2I_bin_w_irs<0b0001, "bic", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, + BinOpFrag<(and node:$LHS, (not node:$RHS))>, + "t2BIC">; + +class T2BitFI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<4> Rd; + bits<5> msb; + bits<5> lsb; + + let Inst{11-8} = Rd; + let Inst{4-0} = msb{4-0}; + let Inst{14-12} = lsb{4-2}; + let Inst{7-6} = lsb{1-0}; +} + +class T2TwoRegBitFI<dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2BitFI<oops, iops, itin, opc, asm, pattern> { + bits<4> Rn; + + let Inst{19-16} = Rn; +} + +let Constraints = "$src = $Rd" in +def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm), + IIC_iUNAsi, "bfc", "\t$Rd, $imm", + [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. + let Inst{25} = 1; + let Inst{24-20} = 0b10110; + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; + let Inst{5} = 0; // should be 0. + + bits<10> imm; + let msb{4-0} = imm{9-5}; + let lsb{4-0} = imm{4-0}; +} + +def t2SBFX: T2TwoRegBitFI< + (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb), + IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-20} = 0b10100; + let Inst{15} = 0; +} + +def t2UBFX: T2TwoRegBitFI< + (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb), + IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-20} = 0b11100; + let Inst{15} = 0; +} + +// A8.6.18 BFI - Bitfield insert (Encoding T1) +let Constraints = "$src = $Rd" in { + def t2BFI : T2TwoRegBitFI<(outs rGPR:$Rd), + (ins rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm), + IIC_iBITi, "bfi", "\t$Rd, $Rn, $imm", + [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn, + bf_inv_mask_imm:$imm))]> { + let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. + let Inst{25} = 1; + let Inst{24-20} = 0b10110; + let Inst{15} = 0; + let Inst{5} = 0; // should be 0. + + bits<10> imm; + let msb{4-0} = imm{9-5}; + let lsb{4-0} = imm{4-0}; + } +} + +defm t2ORN : T2I_bin_irs<0b0011, "orn", + IIC_iBITi, IIC_iBITr, IIC_iBITsi, + BinOpFrag<(or node:$LHS, (not node:$RHS))>, + "t2ORN", 0, "">; + +/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a +/// unary operation that produces a value. These are predicable and can be +/// changed to modify CPSR. +multiclass T2I_un_irs<bits<4> opcod, string opc, + InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, + PatFrag opnode, bit Cheap = 0, bit ReMat = 0> { + // shifted imm + def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii, + opc, "\t$Rd, $imm", + [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> { + let isAsCheapAsAMove = Cheap; + let isReMaterializable = ReMat; + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; + } + // register + def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir, + opc, ".w\t$Rd, $Rm", + [(set rGPR:$Rd, (opnode rGPR:$Rm))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{19-16} = 0b1111; // Rn + let Inst{14-12} = 0b000; // imm3 + let Inst{7-6} = 0b00; // imm2 + let Inst{5-4} = 0b00; // type + } + // shifted register + def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis, + opc, ".w\t$Rd, $ShiftedRm", + [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = opcod; + let Inst{19-16} = 0b1111; // Rn + } +} + +// Prefer over of t2EORri ra, rb, -1 because mvn has 16-bit version +let AddedComplexity = 1 in +defm t2MVN : T2I_un_irs <0b0011, "mvn", + IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi, + UnOpFrag<(not node:$Src)>, 1, 1>; + +let AddedComplexity = 1 in +def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm), + (t2BICri rGPR:$src, t2_so_imm_not:$imm)>; + +// FIXME: Disable this pattern on Darwin to workaround an assembler bug. +def : T2Pat<(or rGPR:$src, t2_so_imm_not:$imm), + (t2ORNri rGPR:$src, t2_so_imm_not:$imm)>, + Requires<[IsThumb2]>; + +def : T2Pat<(t2_so_imm_not:$src), + (t2MVNi t2_so_imm_not:$src)>; + +//===----------------------------------------------------------------------===// +// Multiply Instructions. +// +let isCommutable = 1 in +def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, + "mul", "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b000; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-4} = 0b0000; // Multiply +} + +def t2MLA: T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "mla", "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add (mul rGPR:$Rn, rGPR:$Rm), rGPR:$Ra))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b000; + let Inst{7-4} = 0b0000; // Multiply +} + +def t2MLS: T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "mls", "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (sub rGPR:$Ra, (mul rGPR:$Rn, rGPR:$Rm)))]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b000; + let Inst{7-4} = 0b0001; // Multiply and Subtract +} + +// Extra precision multiplies with low / high results +let neverHasSideEffects = 1 in { +let isCommutable = 1 in { +def t2SMULL : T2MulLong<0b000, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64, + "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>; + +def t2UMULL : T2MulLong<0b010, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64, + "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>; +} // isCommutable + +// Multiply + accumulate +def t2SMLAL : T2MulLong<0b100, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, + "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>; + +def t2UMLAL : T2MulLong<0b110, 0b0000, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, + "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>; + +def t2UMAAL : T2MulLong<0b110, 0b0110, + (outs rGPR:$RdLo, rGPR:$RdHi), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, + "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + Requires<[IsThumb2, HasThumb2DSP]>; +} // neverHasSideEffects + +// Rounding variants of the below included for disassembly only + +// Most significant word multiply +def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, + "smmul", "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (mulhs rGPR:$Rn, rGPR:$Rm))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) +} + +def t2SMMULR : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, + "smmulr", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) +} + +def t2SMMLA : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "smmla", "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) +} + +def t2SMMLAR: T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b101; + let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) +} + +def t2SMMLS: T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "smmls", "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (sub rGPR:$Ra, (mulhs rGPR:$Rn, rGPR:$Rm)))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b110; + let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0) +} + +def t2SMMLSR:T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, + "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b110; + let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1) +} + +multiclass T2I_smul<string opc, PatFrag opnode> { + def BB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16), + (sext_inreg rGPR:$Rm, i16)))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b00; + } + + def BT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16), + (sra rGPR:$Rm, (i32 16))))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b01; + } + + def TB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)), + (sext_inreg rGPR:$Rm, i16)))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b10; + } + + def TT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)), + (sra rGPR:$Rm, (i32 16))))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b11; + } + + def WB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (sra (opnode rGPR:$Rn, + (sext_inreg rGPR:$Rm, i16)), (i32 16)))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b011; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b00; + } + + def WT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, + !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (sra (opnode rGPR:$Rn, + (sra rGPR:$Rm, (i32 16))), (i32 16)))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b011; + let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate) + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b01; + } +} + + +multiclass T2I_smla<string opc, PatFrag opnode> { + def BB : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, + !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add rGPR:$Ra, + (opnode (sext_inreg rGPR:$Rn, i16), + (sext_inreg rGPR:$Rm, i16))))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b00; + } + + def BT : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, + !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16), + (sra rGPR:$Rm, (i32 16)))))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b01; + } + + def TB : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, + !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)), + (sext_inreg rGPR:$Rm, i16))))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b10; + } + + def TT : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, + !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)), + (sra rGPR:$Rm, (i32 16)))))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b001; + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b11; + } + + def WB : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, + !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn, + (sext_inreg rGPR:$Rm, i16)), (i32 16))))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b011; + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b00; + } + + def WT : T2FourReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, + !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra", + [(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn, + (sra rGPR:$Rm, (i32 16))), (i32 16))))]>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{31-27} = 0b11111; + let Inst{26-23} = 0b0110; + let Inst{22-20} = 0b011; + let Inst{7-6} = 0b00; + let Inst{5-4} = 0b01; + } +} + +defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>; +defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; + +// Halfword multiple accumulate long: SMLAL<x><y> +def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbb", "\t$Ra, $Rd, $Rn, $Rm", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsThumb2, HasThumb2DSP]>; +def t2SMLALBT : T2FourReg_mac<1, 0b100, 0b1001, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbt", "\t$Ra, $Rd, $Rn, $Rm", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsThumb2, HasThumb2DSP]>; +def t2SMLALTB : T2FourReg_mac<1, 0b100, 0b1010, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltb", "\t$Ra, $Rd, $Rn, $Rm", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsThumb2, HasThumb2DSP]>; +def t2SMLALTT : T2FourReg_mac<1, 0b100, 0b1011, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltt", "\t$Ra, $Rd, $Rn, $Rm", + [/* For disassembly only; pattern left blank */]>, + Requires<[IsThumb2, HasThumb2DSP]>; + +// Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD +def t2SMUAD: T2ThreeReg_mac< + 0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + IIC_iMAC32, "smuad", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{15-12} = 0b1111; +} +def t2SMUADX:T2ThreeReg_mac< + 0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + IIC_iMAC32, "smuadx", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{15-12} = 0b1111; +} +def t2SMUSD: T2ThreeReg_mac< + 0, 0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + IIC_iMAC32, "smusd", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{15-12} = 0b1111; +} +def t2SMUSDX:T2ThreeReg_mac< + 0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + IIC_iMAC32, "smusdx", "\t$Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasThumb2DSP]> { + let Inst{15-12} = 0b1111; +} +def t2SMLAD : T2FourReg_mac< + 0, 0b010, 0b0000, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlad", + "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasThumb2DSP]>; +def t2SMLADX : T2FourReg_mac< + 0, 0b010, 0b0001, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smladx", + "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasThumb2DSP]>; +def t2SMLSD : T2FourReg_mac<0, 0b100, 0b0000, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsd", + "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasThumb2DSP]>; +def t2SMLSDX : T2FourReg_mac<0, 0b100, 0b0001, (outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsdx", + "\t$Rd, $Rn, $Rm, $Ra", []>, + Requires<[IsThumb2, HasThumb2DSP]>; +def t2SMLALD : T2FourReg_mac<1, 0b100, 0b1100, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "smlald", + "\t$Ra, $Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasThumb2DSP]>; +def t2SMLALDX : T2FourReg_mac<1, 0b100, 0b1101, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaldx", + "\t$Ra, $Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasThumb2DSP]>; +def t2SMLSLD : T2FourReg_mac<1, 0b101, 0b1100, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlsld", + "\t$Ra, $Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasThumb2DSP]>; +def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd), + (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsldx", + "\t$Ra, $Rd, $Rn, $Rm", []>, + Requires<[IsThumb2, HasThumb2DSP]>; + +//===----------------------------------------------------------------------===// +// Division Instructions. +// Signed and unsigned division on v7-M +// +def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUi, + "sdiv", "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>, + Requires<[HasDivide, IsThumb2]> { + let Inst{31-27} = 0b11111; + let Inst{26-21} = 0b011100; + let Inst{20} = 0b1; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b1111; +} + +def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUi, + "udiv", "\t$Rd, $Rn, $Rm", + [(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>, + Requires<[HasDivide, IsThumb2]> { + let Inst{31-27} = 0b11111; + let Inst{26-21} = 0b011101; + let Inst{20} = 0b1; + let Inst{15-12} = 0b1111; + let Inst{7-4} = 0b1111; +} + +//===----------------------------------------------------------------------===// +// Misc. Arithmetic Instructions. +// + +class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, list<dag> pattern> + : T2ThreeReg<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11111; + let Inst{26-22} = 0b01010; + let Inst{21-20} = op1; + let Inst{15-12} = 0b1111; + let Inst{7-6} = 0b10; + let Inst{5-4} = op2; + let Rn{3-0} = Rm; +} + +def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "clz", "\t$Rd, $Rm", [(set rGPR:$Rd, (ctlz rGPR:$Rm))]>; + +def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "rbit", "\t$Rd, $Rm", + [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>; + +def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "rev", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (bswap rGPR:$Rm))]>; + +def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "rev16", ".w\t$Rd, $Rm", + [(set rGPR:$Rd, (rotr (bswap rGPR:$Rm), (i32 16)))]>; + +def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, + "revsh", ".w\t$Rd, $Rm", + [(set rGPR:$Rd, (sra (bswap rGPR:$Rm), (i32 16)))]>; + +def : T2Pat<(or (sra (shl rGPR:$Rm, (i32 24)), (i32 16)), + (and (srl rGPR:$Rm, (i32 8)), 0xFF)), + (t2REVSH rGPR:$Rm)>; + +def t2PKHBT : T2ThreeReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_lsl_amt:$sh), + IIC_iBITsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh", + [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF), + (and (shl rGPR:$Rm, pkh_lsl_amt:$sh), + 0xFFFF0000)))]>, + Requires<[HasT2ExtractPack, IsThumb2]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-20} = 0b01100; + let Inst{5} = 0; // BT form + let Inst{4} = 0; + + bits<5> sh; + let Inst{14-12} = sh{4-2}; + let Inst{7-6} = sh{1-0}; +} + +// Alternate cases for PKHBT where identities eliminate some nodes. +def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)), + (t2PKHBT rGPR:$src1, rGPR:$src2, 0)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)), + (t2PKHBT rGPR:$src1, rGPR:$src2, imm16_31:$sh)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and +// will match the pattern below. +def t2PKHTB : T2ThreeReg< + (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_asr_amt:$sh), + IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh", + [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000), + (and (sra rGPR:$Rm, pkh_asr_amt:$sh), + 0xFFFF)))]>, + Requires<[HasT2ExtractPack, IsThumb2]> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-20} = 0b01100; + let Inst{5} = 1; // TB form + let Inst{4} = 0; + + bits<5> sh; + let Inst{14-12} = sh{4-2}; + let Inst{7-6} = sh{1-0}; +} + +// Alternate cases for PKHTB where identities eliminate some nodes. Note that +// a shift amount of 0 is *not legal* here, it is PKHBT instead. +def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16_31:$sh)), + (t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), + (and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)), + (t2PKHTB rGPR:$src1, rGPR:$src2, imm1_15:$sh)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +//===----------------------------------------------------------------------===// +// Comparison Instructions... +// +defm t2CMP : T2I_cmp_irs<0b1101, "cmp", + IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, + BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>, "t2CMP">; + +def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_imm:$imm), + (t2CMPri GPRnopc:$lhs, t2_so_imm:$imm)>; +def : T2Pat<(ARMcmpZ GPRnopc:$lhs, rGPR:$rhs), + (t2CMPrr GPRnopc:$lhs, rGPR:$rhs)>; +def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_reg:$rhs), + (t2CMPrs GPRnopc:$lhs, t2_so_reg:$rhs)>; + +//FIXME: Disable CMN, as CCodes are backwards from compare expectations +// Compare-to-zero still works out, just not the relationals +//defm t2CMN : T2I_cmp_irs<0b1000, "cmn", +// BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; +defm t2CMNz : T2I_cmp_irs<0b1000, "cmn", + IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, + BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>, + "t2CMNz">; + +//def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm), +// (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>; + +def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm), + (t2CMNzri GPRnopc:$src, t2_so_imm_neg:$imm)>; + +defm t2TST : T2I_cmp_irs<0b0000, "tst", + IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi, + BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>, + "t2TST">; +defm t2TEQ : T2I_cmp_irs<0b0100, "teq", + IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi, + BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, + "t2TEQ">; + +// Conditional moves +// FIXME: should be able to write a pattern for ARMcmov, but can't use +// a two-value operand where a dag node expects two operands. :( +let neverHasSideEffects = 1 in { +def t2MOVCCr : t2PseudoInst<(outs rGPR:$Rd), + (ins rGPR:$false, rGPR:$Rm, pred:$p), + 4, IIC_iCMOVr, + [/*(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">; + +let isMoveImm = 1 in +def t2MOVCCi : t2PseudoInst<(outs rGPR:$Rd), + (ins rGPR:$false, t2_so_imm:$imm, pred:$p), + 4, IIC_iCMOVi, +[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm:$imm, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">; + +// FIXME: Pseudo-ize these. For now, just mark codegen only. +let isCodeGenOnly = 1 in { +let isMoveImm = 1 in +def t2MOVCCi16 : T2I<(outs rGPR:$Rd), (ins rGPR:$false, imm0_65535_expr:$imm), + IIC_iCMOVi, + "movw", "\t$Rd, $imm", []>, + RegConstraint<"$false = $Rd"> { + let Inst{31-27} = 0b11110; + let Inst{25} = 1; + let Inst{24-21} = 0b0010; + let Inst{20} = 0; // The S bit. + let Inst{15} = 0; + + bits<4> Rd; + bits<16> imm; + + let Inst{11-8} = Rd; + let Inst{19-16} = imm{15-12}; + let Inst{26} = imm{11}; + let Inst{14-12} = imm{10-8}; + let Inst{7-0} = imm{7-0}; +} + +let isMoveImm = 1 in +def t2MOVCCi32imm : PseudoInst<(outs rGPR:$dst), + (ins rGPR:$false, i32imm:$src, pred:$p), + IIC_iCMOVix2, []>, RegConstraint<"$false = $dst">; + +let isMoveImm = 1 in +def t2MVNCCi : T2OneRegImm<(outs rGPR:$Rd), (ins rGPR:$false, t2_so_imm:$imm), + IIC_iCMOVi, "mvn", ".w\t$Rd, $imm", +[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm_not:$imm, + imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd"> { + let Inst{31-27} = 0b11110; + let Inst{25} = 0; + let Inst{24-21} = 0b0011; + let Inst{20} = 0; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{15} = 0; +} + +class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2TwoRegShiftImm<oops, iops, itin, opc, asm, pattern> { + let Inst{31-27} = 0b11101; + let Inst{26-25} = 0b01; + let Inst{24-21} = 0b0010; + let Inst{20} = 0; // The S bit. + let Inst{19-16} = 0b1111; // Rn + let Inst{5-4} = opcod; // Shift type. +} +def t2MOVCClsl : T2I_movcc_sh<0b00, (outs rGPR:$Rd), + (ins rGPR:$false, rGPR:$Rm, i32imm:$imm), + IIC_iCMOVsi, "lsl", ".w\t$Rd, $Rm, $imm", []>, + RegConstraint<"$false = $Rd">; +def t2MOVCClsr : T2I_movcc_sh<0b01, (outs rGPR:$Rd), + (ins rGPR:$false, rGPR:$Rm, i32imm:$imm), + IIC_iCMOVsi, "lsr", ".w\t$Rd, $Rm, $imm", []>, + RegConstraint<"$false = $Rd">; +def t2MOVCCasr : T2I_movcc_sh<0b10, (outs rGPR:$Rd), + (ins rGPR:$false, rGPR:$Rm, i32imm:$imm), + IIC_iCMOVsi, "asr", ".w\t$Rd, $Rm, $imm", []>, + RegConstraint<"$false = $Rd">; +def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd), + (ins rGPR:$false, rGPR:$Rm, i32imm:$imm), + IIC_iCMOVsi, "ror", ".w\t$Rd, $Rm, $imm", []>, + RegConstraint<"$false = $Rd">; +} // isCodeGenOnly = 1 +} // neverHasSideEffects + +//===----------------------------------------------------------------------===// +// Atomic operations intrinsics +// + +// memory barriers protect the atomic sequences +let hasSideEffects = 1 in { +def t2DMB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary, + "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>, + Requires<[IsThumb, HasDB]> { + bits<4> opt; + let Inst{31-4} = 0xf3bf8f5; + let Inst{3-0} = opt; +} +} + +def t2DSB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary, + "dsb", "\t$opt", []>, + Requires<[IsThumb, HasDB]> { + bits<4> opt; + let Inst{31-4} = 0xf3bf8f4; + let Inst{3-0} = opt; +} + +def t2ISB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary, + "isb", "\t$opt", + []>, Requires<[IsThumb2, HasDB]> { + bits<4> opt; + let Inst{31-4} = 0xf3bf8f6; + let Inst{3-0} = opt; +} + +class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern, bits<4> rt2 = 0b1111> + : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001101; + let Inst{11-8} = rt2; + let Inst{7-6} = 0b01; + let Inst{5-4} = opcod; + let Inst{3-0} = 0b1111; + + bits<4> addr; + bits<4> Rt; + let Inst{19-16} = addr; + let Inst{15-12} = Rt; +} +class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern, bits<4> rt2 = 0b1111> + : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> { + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0001100; + let Inst{11-8} = rt2; + let Inst{7-6} = 0b01; + let Inst{5-4} = opcod; + + bits<4> Rd; + bits<4> addr; + bits<4> Rt; + let Inst{3-0} = Rd; + let Inst{19-16} = addr; + let Inst{15-12} = Rt; +} + +let mayLoad = 1 in { +def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "ldrexb", "\t$Rt, $addr", "", []>; +def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "ldrexh", "\t$Rt, $addr", "", []>; +def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr), + AddrModeNone, 4, NoItinerary, + "ldrex", "\t$Rt, $addr", "", []> { + bits<4> Rt; + bits<12> addr; + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000101; + let Inst{19-16} = addr{11-8}; + let Inst{15-12} = Rt; + let Inst{11-8} = 0b1111; + let Inst{7-0} = addr{7-0}; +} +let hasExtraDefRegAllocReq = 1 in +def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), + (ins addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "ldrexd", "\t$Rt, $Rt2, $addr", "", + [], {?, ?, ?, ?}> { + bits<4> Rt2; + let Inst{11-8} = Rt2; +} +} + +let mayStore = 1, Constraints = "@earlyclobber $Rd" in { +def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd), + (ins rGPR:$Rt, addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "strexb", "\t$Rd, $Rt, $addr", "", []>; +def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd), + (ins rGPR:$Rt, addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "strexh", "\t$Rd, $Rt, $addr", "", []>; +def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, + t2addrmode_imm0_1020s4:$addr), + AddrModeNone, 4, NoItinerary, + "strex", "\t$Rd, $Rt, $addr", "", + []> { + bits<4> Rd; + bits<4> Rt; + bits<12> addr; + let Inst{31-27} = 0b11101; + let Inst{26-20} = 0b0000100; + let Inst{19-16} = addr{11-8}; + let Inst{15-12} = Rt; + let Inst{11-8} = Rd; + let Inst{7-0} = addr{7-0}; +} +} + +let hasExtraSrcRegAllocReq = 1, Constraints = "@earlyclobber $Rd" in +def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd), + (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr), + AddrModeNone, 4, NoItinerary, + "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [], + {?, ?, ?, ?}> { + bits<4> Rt2; + let Inst{11-8} = Rt2; +} + +def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", []>, + Requires<[IsThumb2, HasV7]> { + let Inst{31-16} = 0xf3bf; + let Inst{15-14} = 0b10; + let Inst{13} = 0; + let Inst{12} = 0; + let Inst{11-8} = 0b1111; + let Inst{7-4} = 0b0010; + let Inst{3-0} = 0b1111; +} + +//===----------------------------------------------------------------------===// +// SJLJ Exception handling intrinsics +// eh_sjlj_setjmp() is an instruction sequence to store the return +// address and save #0 in R0 for the non-longjmp case. +// Since by its nature we may be coming from some other function to get +// here, and we're using the stack frame for the containing function to +// save/restore registers, we can't keep anything live in regs across +// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon +// when we get here from a longjmp(). We force everything out of registers +// except for our own input by listing the relevant registers in Defs. By +// doing so, we also cause the prologue/epilogue code to actively preserve +// all of the callee-saved resgisters, which is exactly what we want. +// $val is a scratch register for our use. +let Defs = + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR, + QQQQ0, QQQQ1, QQQQ2, QQQQ3 ], + hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in { + def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val), + AddrModeNone, 0, NoItinerary, "", "", + [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>, + Requires<[IsThumb2, HasVFP2]>; +} + +let Defs = + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ], + hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in { + def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val), + AddrModeNone, 0, NoItinerary, "", "", + [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>, + Requires<[IsThumb2, NoVFP]>; +} + + +//===----------------------------------------------------------------------===// +// Control-Flow Instructions +// + +// FIXME: remove when we have a way to marking a MI with these properties. +// FIXME: Should pc be an implicit operand like PICADD, etc? +let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, + hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in +def t2LDMIA_RET: t2PseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, + reglist:$regs, variable_ops), + 4, IIC_iLoad_mBr, [], + (t2LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>, + RegConstraint<"$Rn = $wb">; + +let isBranch = 1, isTerminator = 1, isBarrier = 1 in { +let isPredicable = 1 in +def t2B : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br, + "b", ".w\t$target", + [(br bb:$target)]> { + let Inst{31-27} = 0b11110; + let Inst{15-14} = 0b10; + let Inst{12} = 1; + + bits<20> target; + let Inst{26} = target{19}; + let Inst{11} = target{18}; + let Inst{13} = target{17}; + let Inst{21-16} = target{16-11}; + let Inst{10-0} = target{10-0}; +} + +let isNotDuplicable = 1, isIndirectBranch = 1 in { +def t2BR_JT : t2PseudoInst<(outs), + (ins GPR:$target, GPR:$index, i32imm:$jt, i32imm:$id), + 0, IIC_Br, + [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]>; + +// FIXME: Add a non-pc based case that can be predicated. +def t2TBB_JT : t2PseudoInst<(outs), + (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>; + +def t2TBH_JT : t2PseudoInst<(outs), + (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>; + +def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br, + "tbb", "\t$addr", []> { + bits<4> Rn; + bits<4> Rm; + let Inst{31-20} = 0b111010001101; + let Inst{19-16} = Rn; + let Inst{15-5} = 0b11110000000; + let Inst{4} = 0; // B form + let Inst{3-0} = Rm; + + let DecoderMethod = "DecodeThumbTableBranch"; +} + +def t2TBH : T2I<(outs), (ins addrmode_tbh:$addr), IIC_Br, + "tbh", "\t$addr", []> { + bits<4> Rn; + bits<4> Rm; + let Inst{31-20} = 0b111010001101; + let Inst{19-16} = Rn; + let Inst{15-5} = 0b11110000000; + let Inst{4} = 1; // H form + let Inst{3-0} = Rm; + + let DecoderMethod = "DecodeThumbTableBranch"; +} +} // isNotDuplicable, isIndirectBranch + +} // isBranch, isTerminator, isBarrier + +// FIXME: should be able to write a pattern for ARMBrcond, but can't use +// a two-value operand where a dag node expects ", "two operands. :( +let isBranch = 1, isTerminator = 1 in +def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br, + "b", ".w\t$target", + [/*(ARMbrcond bb:$target, imm:$cc)*/]> { + let Inst{31-27} = 0b11110; + let Inst{15-14} = 0b10; + let Inst{12} = 0; + + bits<4> p; + let Inst{25-22} = p; + + bits<21> target; + let Inst{26} = target{20}; + let Inst{11} = target{19}; + let Inst{13} = target{18}; + let Inst{21-16} = target{17-12}; + let Inst{10-0} = target{11-1}; + + let DecoderMethod = "DecodeThumb2BCCInstruction"; +} + +// Tail calls. The Darwin version of thumb tail calls uses a t2 branch, so +// it goes here. +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { + // Darwin version. + let Defs = [R0, R1, R2, R3, R9, R12, QQQQ0, QQQQ2, QQQQ3, PC], + Uses = [SP] in + def tTAILJMPd: tPseudoExpand<(outs), + (ins uncondbrtarget:$dst, pred:$p, variable_ops), + 4, IIC_Br, [], + (t2B uncondbrtarget:$dst, pred:$p)>, + Requires<[IsThumb2, IsDarwin]>; +} + +// IT block +let Defs = [ITSTATE] in +def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask), + AddrModeNone, 2, IIC_iALUx, + "it$mask\t$cc", "", []> { + // 16-bit instruction. + let Inst{31-16} = 0x0000; + let Inst{15-8} = 0b10111111; + + bits<4> cc; + bits<4> mask; + let Inst{7-4} = cc; + let Inst{3-0} = mask; + + let DecoderMethod = "DecodeIT"; +} + +// Branch and Exchange Jazelle -- for disassembly only +// Rm = Inst{19-16} +def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func", []> { + bits<4> func; + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-20} = 0b111100; + let Inst{19-16} = func; + let Inst{15-0} = 0b1000111100000000; +} + +// Compare and branch on zero / non-zero +let isBranch = 1, isTerminator = 1 in { + def tCBZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br, + "cbz\t$Rn, $target", []>, + T1Misc<{0,0,?,1,?,?,?}>, + Requires<[IsThumb2]> { + // A8.6.27 + bits<6> target; + bits<3> Rn; + let Inst{9} = target{5}; + let Inst{7-3} = target{4-0}; + let Inst{2-0} = Rn; + } + + def tCBNZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br, + "cbnz\t$Rn, $target", []>, + T1Misc<{1,0,?,1,?,?,?}>, + Requires<[IsThumb2]> { + // A8.6.27 + bits<6> target; + bits<3> Rn; + let Inst{9} = target{5}; + let Inst{7-3} = target{4-0}; + let Inst{2-0} = Rn; + } +} + + +// Change Processor State is a system instruction. +// FIXME: Since the asm parser has currently no clean way to handle optional +// operands, create 3 versions of the same instruction. Once there's a clean +// framework to represent optional operands, change this behavior. +class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary, + !strconcat("cps", asm_op), []> { + bits<2> imod; + bits<3> iflags; + bits<5> mode; + bit M; + + let Inst{31-27} = 0b11110; + let Inst{26} = 0; + let Inst{25-20} = 0b111010; + let Inst{19-16} = 0b1111; + let Inst{15-14} = 0b10; + let Inst{12} = 0; + let Inst{10-9} = imod; + let Inst{8} = M; + let Inst{7-5} = iflags; + let Inst{4-0} = mode; + let DecoderMethod = "DecodeT2CPSInstruction"; +} + +let M = 1 in + def t2CPS3p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode), + "$imod.w\t$iflags, $mode">; +let mode = 0, M = 0 in + def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags), + "$imod.w\t$iflags">; +let imod = 0, iflags = 0, M = 1 in + def t2CPS1p : t2CPS<(ins imm0_31:$mode), "\t$mode">; + +// A6.3.4 Branches and miscellaneous control +// Table A6-14 Change Processor State, and hint instructions +class T2I_hint<bits<8> op7_0, string opc, string asm> + : T2I<(outs), (ins), NoItinerary, opc, asm, []> { + let Inst{31-20} = 0xf3a; + let Inst{19-16} = 0b1111; + let Inst{15-14} = 0b10; + let Inst{12} = 0; + let Inst{10-8} = 0b000; + let Inst{7-0} = op7_0; +} + +def t2NOP : T2I_hint<0b00000000, "nop", ".w">; +def t2YIELD : T2I_hint<0b00000001, "yield", ".w">; +def t2WFE : T2I_hint<0b00000010, "wfe", ".w">; +def t2WFI : T2I_hint<0b00000011, "wfi", ".w">; +def t2SEV : T2I_hint<0b00000100, "sev", ".w">; + +def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> { + bits<4> opt; + let Inst{31-20} = 0b111100111010; + let Inst{19-16} = 0b1111; + let Inst{15-8} = 0b10000000; + let Inst{7-4} = 0b1111; + let Inst{3-0} = opt; +} + +// Secure Monitor Call is a system instruction. +// Option = Inst{19-16} +def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", []> { + let Inst{31-27} = 0b11110; + let Inst{26-20} = 0b1111111; + let Inst{15-12} = 0b1000; + + bits<4> opt; + let Inst{19-16} = opt; +} + +class T2SRS<bits<2> Op, bit W, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + bits<5> mode; + let Inst{31-25} = 0b1110100; + let Inst{24-23} = Op; + let Inst{22} = 0; + let Inst{21} = W; + let Inst{20-16} = 0b01101; + let Inst{15-5} = 0b11000000000; + let Inst{4-0} = mode{4-0}; +} + +// Store Return State is a system instruction. +def t2SRSDB_UPD : T2SRS<0b00, 1, (outs), (ins imm0_31:$mode), NoItinerary, + "srsdb", "\tsp!, $mode", []>; +def t2SRSDB : T2SRS<0b00, 0, (outs), (ins imm0_31:$mode), NoItinerary, + "srsdb","\tsp, $mode", []>; +def t2SRSIA_UPD : T2SRS<0b11, 1, (outs), (ins imm0_31:$mode), NoItinerary, + "srsia","\tsp!, $mode", []>; +def t2SRSIA : T2SRS<0b11, 0, (outs), (ins imm0_31:$mode), NoItinerary, + "srsia","\tsp, $mode", []>; + +// Return From Exception is a system instruction. +class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : T2I<oops, iops, itin, opc, asm, pattern> { + let Inst{31-20} = op31_20{11-0}; + + bits<4> Rn; + let Inst{19-16} = Rn; + let Inst{15-0} = 0xc000; +} + +def t2RFEDBW : T2RFE<0b111010000011, + (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn!", + [/* For disassembly only; pattern left blank */]>; +def t2RFEDB : T2RFE<0b111010000001, + (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn", + [/* For disassembly only; pattern left blank */]>; +def t2RFEIAW : T2RFE<0b111010011011, + (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn!", + [/* For disassembly only; pattern left blank */]>; +def t2RFEIA : T2RFE<0b111010011001, + (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn", + [/* For disassembly only; pattern left blank */]>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +// + +// 32-bit immediate using movw + movt. +// This is a single pseudo instruction to make it re-materializable. +// FIXME: Remove this when we can do generalized remat. +let isReMaterializable = 1, isMoveImm = 1 in +def t2MOVi32imm : PseudoInst<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVix2, + [(set rGPR:$dst, (i32 imm:$src))]>, + Requires<[IsThumb, HasV6T2]>; + +// Pseudo instruction that combines movw + movt + add pc (if pic). +// It also makes it possible to rematerialize the instructions. +// FIXME: Remove this when we can do generalized remat and when machine licm +// can properly the instructions. +let isReMaterializable = 1 in { +def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2addpc, + [(set rGPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>, + Requires<[IsThumb2, UseMovt]>; + +def t2MOV_ga_dyn : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2, + [(set rGPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>, + Requires<[IsThumb2, UseMovt]>; +} + +// ConstantPool, GlobalAddress, and JumpTable +def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>, + Requires<[IsThumb2, DontUseMovt]>; +def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>; +def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>, + Requires<[IsThumb2, UseMovt]>; + +def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (t2LEApcrelJT tjumptable:$dst, imm:$id)>; + +// Pseudo instruction that combines ldr from constpool and add pc. This should +// be expanded into two instructions late to allow if-conversion and +// scheduling. +let canFoldAsLoad = 1, isReMaterializable = 1 in +def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp), + IIC_iLoadiALU, + [(set rGPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), + imm:$cp))]>, + Requires<[IsThumb2]>; + +// Pseudo isntruction that combines movs + predicated rsbmi +// to implement integer ABS +let usesCustomInserter = 1, Defs = [CPSR] in { +def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src), + NoItinerary, []>, Requires<[IsThumb2]>; +} + +//===----------------------------------------------------------------------===// +// Coprocessor load/store -- for disassembly only +// +class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm> + : T2I<oops, iops, NoItinerary, opc, asm, []> { + let Inst{31-28} = op31_28; + let Inst{27-25} = 0b110; +} + +multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> { + def _OFFSET : T2CI<op31_28, + (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), + asm, "\t$cop, $CRd, $addr"> { + bits<13> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 1; // P = 1 + let Inst{23} = addr{8}; + let Inst{22} = Dbit; + let Inst{21} = 0; // W = 0 + let Inst{20} = load; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = addr{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _PRE : T2CI<op31_28, + (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr), + asm, "\t$cop, $CRd, $addr!"> { + bits<13> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 1; // P = 1 + let Inst{23} = addr{8}; + let Inst{22} = Dbit; + let Inst{21} = 1; // W = 1 + let Inst{20} = load; + let Inst{19-16} = addr{12-9}; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = addr{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _POST: T2CI<op31_28, + (outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, + postidx_imm8s4:$offset), + asm, "\t$cop, $CRd, $addr, $offset"> { + bits<9> offset; + bits<4> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 0; // P = 0 + let Inst{23} = offset{8}; + let Inst{22} = Dbit; + let Inst{21} = 1; // W = 1 + let Inst{20} = load; + let Inst{19-16} = addr; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = offset{7-0}; + let DecoderMethod = "DecodeCopMemInstruction"; + } + def _OPTION : T2CI<op31_28, (outs), + (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr, + coproc_option_imm:$option), + asm, "\t$cop, $CRd, $addr, $option"> { + bits<8> option; + bits<4> addr; + bits<4> cop; + bits<4> CRd; + let Inst{24} = 0; // P = 0 + let Inst{23} = 1; // U = 1 + let Inst{22} = Dbit; + let Inst{21} = 0; // W = 0 + let Inst{20} = load; + let Inst{19-16} = addr; + let Inst{15-12} = CRd; + let Inst{11-8} = cop; + let Inst{7-0} = option; + let DecoderMethod = "DecodeCopMemInstruction"; + } +} + +defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc">; +defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl">; +defm t2STC : t2LdStCop<0b1110, 0, 0, "stc">; +defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl">; +defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2">; +defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">; +defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2">; +defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">; + + +//===----------------------------------------------------------------------===// +// Move between special register and ARM core register -- for disassembly only +// +// Move to ARM core register from Special Register + +// A/R class MRS. +// +// A/R class can only move from CPSR or SPSR. +def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr", []>, + Requires<[IsThumb2,IsARClass]> { + bits<4> Rd; + let Inst{31-12} = 0b11110011111011111000; + let Inst{11-8} = Rd; + let Inst{7-0} = 0b0000; +} + +def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>; + +def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", []>, + Requires<[IsThumb2,IsARClass]> { + bits<4> Rd; + let Inst{31-12} = 0b11110011111111111000; + let Inst{11-8} = Rd; + let Inst{7-0} = 0b0000; +} + +// M class MRS. +// +// This MRS has a mask field in bits 7-0 and can take more values than +// the A/R class (a full msr_mask). +def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$mask), NoItinerary, + "mrs", "\t$Rd, $mask", []>, + Requires<[IsThumb2,IsMClass]> { + bits<4> Rd; + bits<8> mask; + let Inst{31-12} = 0b11110011111011111000; + let Inst{11-8} = Rd; + let Inst{19-16} = 0b1111; + let Inst{7-0} = mask; +} + + +// Move from ARM core register to Special Register +// +// A/R class MSR. +// +// No need to have both system and application versions, the encodings are the +// same and the assembly parser has no way to distinguish between them. The mask +// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains +// the mask with the fields to be accessed in the special register. +def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn), + NoItinerary, "msr", "\t$mask, $Rn", []>, + Requires<[IsThumb2,IsARClass]> { + bits<5> mask; + bits<4> Rn; + let Inst{31-21} = 0b11110011100; + let Inst{20} = mask{4}; // R Bit + let Inst{19-16} = Rn; + let Inst{15-12} = 0b1000; + let Inst{11-8} = mask{3-0}; + let Inst{7-0} = 0; +} + +// M class MSR. +// +// Move from ARM core register to Special Register +def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn), + NoItinerary, "msr", "\t$SYSm, $Rn", []>, + Requires<[IsThumb2,IsMClass]> { + bits<8> SYSm; + bits<4> Rn; + let Inst{31-21} = 0b11110011100; + let Inst{20} = 0b0; + let Inst{19-16} = Rn; + let Inst{15-12} = 0b1000; + let Inst{7-0} = SYSm; +} + + +//===----------------------------------------------------------------------===// +// Move between coprocessor and ARM core register +// + +class t2MovRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops, + list<dag> pattern> + : T2Cop<Op, oops, iops, + !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), + pattern> { + let Inst{27-24} = 0b1110; + let Inst{20} = direction; + let Inst{4} = 1; + + bits<4> Rt; + bits<4> cop; + bits<3> opc1; + bits<3> opc2; + bits<4> CRm; + bits<4> CRn; + + let Inst{15-12} = Rt; + let Inst{11-8} = cop; + let Inst{23-21} = opc1; + let Inst{7-5} = opc2; + let Inst{3-0} = CRm; + let Inst{19-16} = CRn; +} + +class t2MovRRCopro<bits<4> Op, string opc, bit direction, + list<dag> pattern = []> + : T2Cop<Op, (outs), + (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), + !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> { + let Inst{27-24} = 0b1100; + let Inst{23-21} = 0b010; + let Inst{20} = direction; + + bits<4> Rt; + bits<4> Rt2; + bits<4> cop; + bits<4> opc1; + bits<4> CRm; + + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + let Inst{11-8} = cop; + let Inst{7-4} = opc1; + let Inst{3-0} = CRm; +} + +/* from ARM core register to coprocessor */ +def t2MCR : t2MovRCopro<0b1110, "mcr", 0, + (outs), + (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, imm0_7:$opc2), + [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, + imm:$CRm, imm:$opc2)]>; +def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0, + (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, imm0_7:$opc2), + [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, + imm:$CRm, imm:$opc2)]>; + +/* from coprocessor to ARM core register */ +def t2MRC : t2MovRCopro<0b1110, "mrc", 1, + (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + c_imm:$CRm, imm0_7:$opc2), []>; + +def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1, + (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + c_imm:$CRm, imm0_7:$opc2), []>; + +def : T2v6Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), + (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; + +def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), + (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; + + +/* from ARM core register to coprocessor */ +def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0, + [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2, + imm:$CRm)]>; +def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0, + [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt, + GPR:$Rt2, imm:$CRm)]>; +/* from coprocessor to ARM core register */ +def t2MRRC : t2MovRRCopro<0b1110, "mrrc", 1>; + +def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1>; + +//===----------------------------------------------------------------------===// +// Other Coprocessor Instructions. +// + +def tCDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, + c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), + "cdp\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", + [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, + imm:$CRm, imm:$opc2)]> { + let Inst{27-24} = 0b1110; + + bits<4> opc1; + bits<4> CRn; + bits<4> CRd; + bits<4> cop; + bits<3> opc2; + bits<4> CRm; + + let Inst{3-0} = CRm; + let Inst{4} = 0; + let Inst{7-5} = opc2; + let Inst{11-8} = cop; + let Inst{15-12} = CRd; + let Inst{19-16} = CRn; + let Inst{23-20} = opc1; +} + +def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1, + c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), + "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", + [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, + imm:$CRm, imm:$opc2)]> { + let Inst{27-24} = 0b1110; + + bits<4> opc1; + bits<4> CRn; + bits<4> CRd; + bits<4> cop; + bits<3> opc2; + bits<4> CRm; + + let Inst{3-0} = CRm; + let Inst{4} = 0; + let Inst{7-5} = opc2; + let Inst{11-8} = cop; + let Inst{15-12} = CRd; + let Inst{19-16} = CRn; + let Inst{23-20} = opc1; +} + + + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +// + +// SXT/UXT with no rotate +let AddedComplexity = 16 in { +def : T2Pat<(and rGPR:$Rm, 0x000000FF), (t2UXTB rGPR:$Rm, 0)>, + Requires<[IsThumb2]>; +def : T2Pat<(and rGPR:$Rm, 0x0000FFFF), (t2UXTH rGPR:$Rm, 0)>, + Requires<[IsThumb2]>; +def : T2Pat<(and rGPR:$Rm, 0x00FF00FF), (t2UXTB16 rGPR:$Rm, 0)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0x00FF)), + (t2UXTAB rGPR:$Rn, rGPR:$Rm, 0)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0xFFFF)), + (t2UXTAH rGPR:$Rn, rGPR:$Rm, 0)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +} + +def : T2Pat<(sext_inreg rGPR:$Src, i8), (t2SXTB rGPR:$Src, 0)>, + Requires<[IsThumb2]>; +def : T2Pat<(sext_inreg rGPR:$Src, i16), (t2SXTH rGPR:$Src, 0)>, + Requires<[IsThumb2]>; +def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i8)), + (t2SXTAB rGPR:$Rn, rGPR:$Rm, 0)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i16)), + (t2SXTAH rGPR:$Rn, rGPR:$Rm, 0)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +// Atomic load/store patterns +def : T2Pat<(atomic_load_8 t2addrmode_imm12:$addr), + (t2LDRBi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(atomic_load_8 t2addrmode_negimm8:$addr), + (t2LDRBi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(atomic_load_8 t2addrmode_so_reg:$addr), + (t2LDRBs t2addrmode_so_reg:$addr)>; +def : T2Pat<(atomic_load_16 t2addrmode_imm12:$addr), + (t2LDRHi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(atomic_load_16 t2addrmode_negimm8:$addr), + (t2LDRHi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(atomic_load_16 t2addrmode_so_reg:$addr), + (t2LDRHs t2addrmode_so_reg:$addr)>; +def : T2Pat<(atomic_load_32 t2addrmode_imm12:$addr), + (t2LDRi12 t2addrmode_imm12:$addr)>; +def : T2Pat<(atomic_load_32 t2addrmode_negimm8:$addr), + (t2LDRi8 t2addrmode_negimm8:$addr)>; +def : T2Pat<(atomic_load_32 t2addrmode_so_reg:$addr), + (t2LDRs t2addrmode_so_reg:$addr)>; +def : T2Pat<(atomic_store_8 t2addrmode_imm12:$addr, GPR:$val), + (t2STRBi12 GPR:$val, t2addrmode_imm12:$addr)>; +def : T2Pat<(atomic_store_8 t2addrmode_negimm8:$addr, GPR:$val), + (t2STRBi8 GPR:$val, t2addrmode_negimm8:$addr)>; +def : T2Pat<(atomic_store_8 t2addrmode_so_reg:$addr, GPR:$val), + (t2STRBs GPR:$val, t2addrmode_so_reg:$addr)>; +def : T2Pat<(atomic_store_16 t2addrmode_imm12:$addr, GPR:$val), + (t2STRHi12 GPR:$val, t2addrmode_imm12:$addr)>; +def : T2Pat<(atomic_store_16 t2addrmode_negimm8:$addr, GPR:$val), + (t2STRHi8 GPR:$val, t2addrmode_negimm8:$addr)>; +def : T2Pat<(atomic_store_16 t2addrmode_so_reg:$addr, GPR:$val), + (t2STRHs GPR:$val, t2addrmode_so_reg:$addr)>; +def : T2Pat<(atomic_store_32 t2addrmode_imm12:$addr, GPR:$val), + (t2STRi12 GPR:$val, t2addrmode_imm12:$addr)>; +def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val), + (t2STRi8 GPR:$val, t2addrmode_negimm8:$addr)>; +def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val), + (t2STRs GPR:$val, t2addrmode_so_reg:$addr)>; + + +//===----------------------------------------------------------------------===// +// Assembler aliases +// + +// Aliases for ADC without the ".w" optional width specifier. +def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $Rm", + (t2ADCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $ShiftedRm", + (t2ADCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm, + pred:$p, cc_out:$s)>; + +// Aliases for SBC without the ".w" optional width specifier. +def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $Rm", + (t2SBCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm", + (t2SBCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm, + pred:$p, cc_out:$s)>; + +// Aliases for ADD without the ".w" optional width specifier. +def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm", + (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"add${p} $Rd, $Rn, $imm", + (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>; +def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm", + (t2ADDrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"add${s}${p} $Rd, $Rn, $ShiftedRm", + (t2ADDrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm, + pred:$p, cc_out:$s)>; + +// Aliases for SUB without the ".w" optional width specifier. +def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm", + (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"sub${p} $Rd, $Rn, $imm", + (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>; +def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $Rm", + (t2SUBrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $ShiftedRm", + (t2SUBrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm, + pred:$p, cc_out:$s)>; + +// Alias for compares without the ".w" optional width specifier. +def : t2InstAlias<"cmn${p} $Rn, $Rm", + (t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; +def : t2InstAlias<"teq${p} $Rn, $Rm", + (t2TEQrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; +def : t2InstAlias<"tst${p} $Rn, $Rm", + (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; + +// Memory barriers +def : InstAlias<"dmb", (t2DMB 0xf)>, Requires<[IsThumb2, HasDB]>; +def : InstAlias<"dsb", (t2DSB 0xf)>, Requires<[IsThumb2, HasDB]>; +def : InstAlias<"isb", (t2ISB 0xf)>, Requires<[IsThumb2, HasDB]>; + +// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional +// width specifier. +def : t2InstAlias<"ldr${p} $Rt, $addr", + (t2LDRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"ldrb${p} $Rt, $addr", + (t2LDRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"ldrh${p} $Rt, $addr", + (t2LDRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"ldrsb${p} $Rt, $addr", + (t2LDRSBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"ldrsh${p} $Rt, $addr", + (t2LDRSHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; + +def : t2InstAlias<"ldr${p} $Rt, $addr", + (t2LDRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; +def : t2InstAlias<"ldrb${p} $Rt, $addr", + (t2LDRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; +def : t2InstAlias<"ldrh${p} $Rt, $addr", + (t2LDRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; +def : t2InstAlias<"ldrsb${p} $Rt, $addr", + (t2LDRSBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; +def : t2InstAlias<"ldrsh${p} $Rt, $addr", + (t2LDRSHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; + +// Alias for MVN without the ".w" optional width specifier. +def : t2InstAlias<"mvn${s}${p} $Rd, $Rm", + (t2MVNr rGPR:$Rd, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm", + (t2MVNs rGPR:$Rd, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>; + +// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the +// shift amount is zero (i.e., unspecified). +def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm", + (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; +def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm", + (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>, + Requires<[HasT2ExtractPack, IsThumb2]>; + +// PUSH/POP aliases for STM/LDM +def : t2InstAlias<"push${p}.w $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>; +def : t2InstAlias<"push${p} $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>; +def : t2InstAlias<"pop${p}.w $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>; +def : t2InstAlias<"pop${p} $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>; + +// Alias for REV/REV16/REVSH without the ".w" optional width specifier. +def : t2InstAlias<"rev${p} $Rd, $Rm", (t2REV rGPR:$Rd, rGPR:$Rm, pred:$p)>; +def : t2InstAlias<"rev16${p} $Rd, $Rm", (t2REV16 rGPR:$Rd, rGPR:$Rm, pred:$p)>; +def : t2InstAlias<"revsh${p} $Rd, $Rm", (t2REVSH rGPR:$Rd, rGPR:$Rm, pred:$p)>; + + +// Alias for RSB without the ".w" optional width specifier, and with optional +// implied destination register. +def : t2InstAlias<"rsb${s}${p} $Rd, $Rn, $imm", + (t2RSBri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"rsb${s}${p} $Rdn, $imm", + (t2RSBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"rsb${s}${p} $Rdn, $Rm", + (t2RSBrr rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>; +def : t2InstAlias<"rsb${s}${p} $Rdn, $ShiftedRm", + (t2RSBrs rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$ShiftedRm, pred:$p, + cc_out:$s)>; + +// SSAT/USAT optional shift operand. +def : t2InstAlias<"ssat${p} $Rd, $sat_imm, $Rn", + (t2SSAT rGPR:$Rd, imm1_32:$sat_imm, rGPR:$Rn, 0, pred:$p)>; +def : t2InstAlias<"usat${p} $Rd, $sat_imm, $Rn", + (t2USAT rGPR:$Rd, imm0_31:$sat_imm, rGPR:$Rn, 0, pred:$p)>; + +// STM w/o the .w suffix. +def : t2InstAlias<"stm${p} $Rn, $regs", + (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>; + +// Alias for STR, STRB, and STRH without the ".w" optional +// width specifier. +def : t2InstAlias<"str${p} $Rt, $addr", + (t2STRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"strb${p} $Rt, $addr", + (t2STRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"strh${p} $Rt, $addr", + (t2STRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>; + +def : t2InstAlias<"str${p} $Rt, $addr", + (t2STRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; +def : t2InstAlias<"strb${p} $Rt, $addr", + (t2STRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; +def : t2InstAlias<"strh${p} $Rt, $addr", + (t2STRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>; + +// Extend instruction optional rotate operand. +def : t2InstAlias<"sxtab${p} $Rd, $Rn, $Rm", + (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"sxtah${p} $Rd, $Rn, $Rm", + (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"sxtab16${p} $Rd, $Rn, $Rm", + (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>; + +def : t2InstAlias<"sxtb${p} $Rd, $Rm", + (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"sxtb16${p} $Rd, $Rm", + (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"sxth${p} $Rd, $Rm", + (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"sxtb${p}.w $Rd, $Rm", + (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"sxth${p}.w $Rd, $Rm", + (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; + +def : t2InstAlias<"uxtab${p} $Rd, $Rn, $Rm", + (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"uxtah${p} $Rd, $Rn, $Rm", + (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"uxtab16${p} $Rd, $Rn, $Rm", + (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"uxtb${p} $Rd, $Rm", + (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"uxtb16${p} $Rd, $Rm", + (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"uxth${p} $Rd, $Rm", + (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; + +def : t2InstAlias<"uxtb${p}.w $Rd, $Rm", + (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; +def : t2InstAlias<"uxth${p}.w $Rd, $Rm", + (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>; + +// Extend instruction w/o the ".w" optional width specifier. +def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot", + (t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>; +def : t2InstAlias<"uxtb16${p} $Rd, $Rm$rot", + (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>; +def : t2InstAlias<"uxth${p} $Rd, $Rm$rot", + (t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>; + +def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot", + (t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>; +def : t2InstAlias<"sxtb16${p} $Rd, $Rm$rot", + (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>; +def : t2InstAlias<"sxth${p} $Rd, $Rm$rot", + (t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td new file mode 100644 index 0000000..e746cf2 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -0,0 +1,1165 @@ +//===- ARMInstrVFP.td - VFP support for ARM ----------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the ARM VFP instruction set. +// +//===----------------------------------------------------------------------===// + +def SDT_FTOI : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>; +def SDT_ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>; +def SDT_CMPFP0 : SDTypeProfile<0, 1, [SDTCisFP<0>]>; +def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>]>; + +def arm_ftoui : SDNode<"ARMISD::FTOUI", SDT_FTOI>; +def arm_ftosi : SDNode<"ARMISD::FTOSI", SDT_FTOI>; +def arm_sitof : SDNode<"ARMISD::SITOF", SDT_ITOF>; +def arm_uitof : SDNode<"ARMISD::UITOF", SDT_ITOF>; +def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>; +def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>; +def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>; +def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>; + + +//===----------------------------------------------------------------------===// +// Operand Definitions. +// + +// 8-bit floating-point immediate encodings. +def FPImmOperand : AsmOperandClass { + let Name = "FPImm"; + let ParserMethod = "parseFPImm"; +} + +def vfp_f32imm : Operand<f32>, + PatLeaf<(f32 fpimm), [{ + return ARM_AM::getFP32Imm(N->getValueAPF()) != -1; + }], SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = ARM_AM::getFP32Imm(InVal); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let PrintMethod = "printFPImmOperand"; + let ParserMatchClass = FPImmOperand; +} + +def vfp_f64imm : Operand<f64>, + PatLeaf<(f64 fpimm), [{ + return ARM_AM::getFP64Imm(N->getValueAPF()) != -1; + }], SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = ARM_AM::getFP64Imm(InVal); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let PrintMethod = "printFPImmOperand"; + let ParserMatchClass = FPImmOperand; +} + + +//===----------------------------------------------------------------------===// +// Load / store Instructions. +// + +let canFoldAsLoad = 1, isReMaterializable = 1 in { + +def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr), + IIC_fpLoad64, "vldr", ".64\t$Dd, $addr", + [(set DPR:$Dd, (f64 (load addrmode5:$addr)))]>; + +def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr), + IIC_fpLoad32, "vldr", ".32\t$Sd, $addr", + [(set SPR:$Sd, (load addrmode5:$addr))]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} + +} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in' + +def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr), + IIC_fpStore64, "vstr", ".64\t$Dd, $addr", + [(store (f64 DPR:$Dd), addrmode5:$addr)]>; + +def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr), + IIC_fpStore32, "vstr", ".32\t$Sd, $addr", + [(store SPR:$Sd, addrmode5:$addr)]> { + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// + +multiclass vfp_ldst_mult<string asm, bit L_bit, + InstrItinClass itin, InstrItinClass itin_upd> { + // Double Precision + def DIA : + AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), + IndexModeNone, itin, + !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + } + def DIA_UPD : + AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, + variable_ops), + IndexModeUpd, itin_upd, + !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + } + def DDB_UPD : + AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, + variable_ops), + IndexModeUpd, itin_upd, + !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b10; // Decrement Before + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + } + + // Single Precision + def SIA : + AXSI4<(outs), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops), + IndexModeNone, itin, + !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 0; // No writeback + let Inst{20} = L_bit; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. + let D = VFPNeonDomain; + } + def SIA_UPD : + AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, + variable_ops), + IndexModeUpd, itin_upd, + !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b01; // Increment After + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. + let D = VFPNeonDomain; + } + def SDB_UPD : + AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, + variable_ops), + IndexModeUpd, itin_upd, + !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> { + let Inst{24-23} = 0b10; // Decrement Before + let Inst{21} = 1; // Writeback + let Inst{20} = L_bit; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines. + let D = VFPNeonDomain; + } +} + +let neverHasSideEffects = 1 in { + +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in +defm VLDM : vfp_ldst_mult<"vldm", 1, IIC_fpLoad_m, IIC_fpLoad_mu>; + +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in +defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpLoad_m, IIC_fpLoad_mu>; + +} // neverHasSideEffects + +def : MnemonicAlias<"vldm", "vldmia">; +def : MnemonicAlias<"vstm", "vstmia">; + +def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>, + Requires<[HasVFP2]>; +def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>, + Requires<[HasVFP2]>; +def : InstAlias<"vpop${p} $r", (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>, + Requires<[HasVFP2]>; +def : InstAlias<"vpop${p} $r", (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>, + Requires<[HasVFP2]>; + +// FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores + +//===----------------------------------------------------------------------===// +// FP Binary Operations. +// + +def VADDD : ADbI<0b11100, 0b11, 0, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>; + +def VADDS : ASbIn<0b11100, 0b11, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VSUBD : ADbI<0b11100, 0b11, 1, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>; + +def VSUBS : ASbIn<0b11100, 0b11, 1, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VDIVD : ADbI<0b11101, 0b00, 0, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>; + +def VDIVS : ASbI<0b11101, 0b00, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>; + +def VMULD : ADbI<0b11100, 0b10, 0, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>; + +def VMULS : ASbIn<0b11100, 0b10, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VNMULD : ADbI<0b11100, 0b10, 1, 0, + (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), + IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>; + +def VNMULS : ASbI<0b11100, 0b10, 1, 0, + (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), + IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +// Match reassociated forms only if not sign dependent rounding. +def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), + (VNMULD DPR:$a, DPR:$b)>, Requires<[NoHonorSignDependentRounding]>; +def : Pat<(fmul (fneg SPR:$a), SPR:$b), + (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>; + +// These are encoded as unary instructions. +let Defs = [FPSCR] in { +def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, + (outs), (ins DPR:$Dd, DPR:$Dm), + IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm", + [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>; + +def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, + (outs), (ins SPR:$Sd, SPR:$Sm), + IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm", + [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +// FIXME: Verify encoding after integrated assembler is working. +def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0, + (outs), (ins DPR:$Dd, DPR:$Dm), + IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm", + [/* For disassembly only; pattern left blank */]>; + +def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, + (outs), (ins SPR:$Sd, SPR:$Sm), + IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm", + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} +} // Defs = [FPSCR] + +//===----------------------------------------------------------------------===// +// FP Unary Operations. +// + +def VABSD : ADuI<0b11101, 0b11, 0b0000, 0b11, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + IIC_fpUNA64, "vabs", ".f64\t$Dd, $Dm", + [(set DPR:$Dd, (fabs (f64 DPR:$Dm)))]>; + +def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpUNA32, "vabs", ".f32\t$Sd, $Sm", + [(set SPR:$Sd, (fabs SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +let Defs = [FPSCR] in { +def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, + (outs), (ins DPR:$Dd), + IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0", + [(arm_cmpfp0 (f64 DPR:$Dd))]> { + let Inst{3-0} = 0b0000; + let Inst{5} = 0; +} + +def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, + (outs), (ins SPR:$Sd), + IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0", + [(arm_cmpfp0 SPR:$Sd)]> { + let Inst{3-0} = 0b0000; + let Inst{5} = 0; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +// FIXME: Verify encoding after integrated assembler is working. +def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, + (outs), (ins DPR:$Dd), + IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0", + [/* For disassembly only; pattern left blank */]> { + let Inst{3-0} = 0b0000; + let Inst{5} = 0; +} + +def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, + (outs), (ins SPR:$Sd), + IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0", + [/* For disassembly only; pattern left blank */]> { + let Inst{3-0} = 0b0000; + let Inst{5} = 0; + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} +} // Defs = [FPSCR] + +def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, + (outs DPR:$Dd), (ins SPR:$Sm), + IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm", + [(set DPR:$Dd, (fextend SPR:$Sm))]> { + // Instruction operands. + bits<5> Dd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; +} + +// Special case encoding: bits 11-8 is 0b1011. +def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, + IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (fround DPR:$Dm))]> { + // Instruction operands. + bits<5> Sd; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = 0b11101; + let Inst{21-16} = 0b110111; + let Inst{11-8} = 0b1011; + let Inst{7-6} = 0b11; + let Inst{4} = 0; +} + +// Between half-precision and single-precision. For disassembly only. + +// FIXME: Verify encoding after integrated assembler is working. +def VCVTBSH: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), + /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", + [/* For disassembly only; pattern left blank */]>; + +def : ARMPat<(f32_to_f16 SPR:$a), + (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; + +def VCVTBHS: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), + /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", + [/* For disassembly only; pattern left blank */]>; + +def : ARMPat<(f16_to_f32 GPR:$a), + (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; + +def VCVTTSH: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), + /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", + [/* For disassembly only; pattern left blank */]>; + +def VCVTTHS: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), + /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", + [/* For disassembly only; pattern left blank */]>; + +def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm", + [(set DPR:$Dd, (fneg (f64 DPR:$Dm)))]>; + +def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm", + [(set SPR:$Sd, (fneg SPR:$Sm))]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm", + [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>; + +def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm", + [(set SPR:$Sd, (fsqrt SPR:$Sm))]>; + +let neverHasSideEffects = 1 in { +def VMOVD : ADuI<0b11101, 0b11, 0b0000, 0b01, 0, + (outs DPR:$Dd), (ins DPR:$Dm), + IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>; + +def VMOVS : ASuI<0b11101, 0b11, 0b0000, 0b01, 0, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>; +} // neverHasSideEffects + +//===----------------------------------------------------------------------===// +// FP <-> GPR Copies. Int <-> FP Conversions. +// + +def VMOVRS : AVConv2I<0b11100001, 0b1010, + (outs GPR:$Rt), (ins SPR:$Sn), + IIC_fpMOVSI, "vmov", "\t$Rt, $Sn", + [(set GPR:$Rt, (bitconvert SPR:$Sn))]> { + // Instruction operands. + bits<4> Rt; + bits<5> Sn; + + // Encode instruction operands. + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Rt; + + let Inst{6-5} = 0b00; + let Inst{3-0} = 0b0000; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} + +def VMOVSR : AVConv4I<0b11100000, 0b1010, + (outs SPR:$Sn), (ins GPR:$Rt), + IIC_fpMOVIS, "vmov", "\t$Sn, $Rt", + [(set SPR:$Sn, (bitconvert GPR:$Rt))]> { + // Instruction operands. + bits<5> Sn; + bits<4> Rt; + + // Encode instruction operands. + let Inst{19-16} = Sn{4-1}; + let Inst{7} = Sn{0}; + let Inst{15-12} = Rt; + + let Inst{6-5} = 0b00; + let Inst{3-0} = 0b0000; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} + +let neverHasSideEffects = 1 in { +def VMOVRRD : AVConv3I<0b11000101, 0b1011, + (outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm), + IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm", + [/* FIXME: Can't write pattern for multiple result instr*/]> { + // Instruction operands. + bits<5> Dm; + bits<4> Rt; + bits<4> Rt2; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + + let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} + +def VMOVRRS : AVConv3I<0b11000101, 0b1010, + (outs GPR:$Rt, GPR:$Rt2), (ins SPR:$src1, SPR:$src2), + IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $src1, $src2", + [/* For disassembly only; pattern left blank */]> { + bits<5> src1; + bits<4> Rt; + bits<4> Rt2; + + // Encode instruction operands. + let Inst{3-0} = src1{3-0}; + let Inst{5} = src1{4}; + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + + let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; + let DecoderMethod = "DecodeVMOVRRS"; +} +} // neverHasSideEffects + +// FMDHR: GPR -> SPR +// FMDLR: GPR -> SPR + +def VMOVDRR : AVConv5I<0b11000100, 0b1011, + (outs DPR:$Dm), (ins GPR:$Rt, GPR:$Rt2), + IIC_fpMOVID, "vmov", "\t$Dm, $Rt, $Rt2", + [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]> { + // Instruction operands. + bits<5> Dm; + bits<4> Rt; + bits<4> Rt2; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Rt; + let Inst{19-16} = Rt2; + + let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; +} + +let neverHasSideEffects = 1 in +def VMOVSRR : AVConv5I<0b11000100, 0b1010, + (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2), + IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2", + [/* For disassembly only; pattern left blank */]> { + // Instruction operands. + bits<5> dst1; + bits<4> src1; + bits<4> src2; + + // Encode instruction operands. + let Inst{3-0} = dst1{3-0}; + let Inst{5} = dst1{4}; + let Inst{15-12} = src1; + let Inst{19-16} = src2; + + let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; + + let DecoderMethod = "DecodeVMOVSRR"; +} + +// FMRDH: SPR -> GPR +// FMRDL: SPR -> GPR +// FMRRS: SPR -> GPR +// FMRX: SPR system reg -> GPR +// FMSRR: GPR -> SPR +// FMXR: GPR -> VFP system reg + + +// Int -> FP: + +class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, + bits<4> opcod4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + // Instruction operands. + bits<5> Dd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Dd{3-0}; + let Inst{22} = Dd{4}; +} + +class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, + bits<4> opcod4, dag oops, dag iops,InstrItinClass itin, + string opc, string asm, list<dag> pattern> + : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; +} + +def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, + (outs DPR:$Dd), (ins SPR:$Sm), + IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm", + [(set DPR:$Dd, (f64 (arm_sitof SPR:$Sm)))]> { + let Inst{7} = 1; // s32 +} + +def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, + (outs SPR:$Sd),(ins SPR:$Sm), + IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm", + [(set SPR:$Sd, (arm_sitof SPR:$Sm))]> { + let Inst{7} = 1; // s32 + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, + (outs DPR:$Dd), (ins SPR:$Sm), + IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm", + [(set DPR:$Dd, (f64 (arm_uitof SPR:$Sm)))]> { + let Inst{7} = 0; // u32 +} + +def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm", + [(set SPR:$Sd, (arm_uitof SPR:$Sm))]> { + let Inst{7} = 0; // u32 + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +// FP -> Int: + +class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, + bits<4> opcod4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Dm; + + // Encode instruction operands. + let Inst{3-0} = Dm{3-0}; + let Inst{5} = Dm{4}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; +} + +class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, + bits<4> opcod4, dag oops, dag iops, + InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm, + pattern> { + // Instruction operands. + bits<5> Sd; + bits<5> Sm; + + // Encode instruction operands. + let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; +} + +// Always set Z bit in the instruction, i.e. "round towards zero" variants. +def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, + (outs SPR:$Sd), (ins DPR:$Dm), + IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (arm_ftosi (f64 DPR:$Dm)))]> { + let Inst{7} = 1; // Z bit +} + +def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm", + [(set SPR:$Sd, (arm_ftosi SPR:$Sm))]> { + let Inst{7} = 1; // Z bit + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, + (outs SPR:$Sd), (ins DPR:$Dm), + IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (arm_ftoui (f64 DPR:$Dm)))]> { + let Inst{7} = 1; // Z bit +} + +def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm", + [(set SPR:$Sd, (arm_ftoui SPR:$Sm))]> { + let Inst{7} = 1; // Z bit + + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. +let Uses = [FPSCR] in { +// FIXME: Verify encoding after integrated assembler is working. +def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011, + (outs SPR:$Sd), (ins DPR:$Dm), + IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (int_arm_vcvtr (f64 DPR:$Dm)))]>{ + let Inst{7} = 0; // Z bit +} + +def VTOSIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTSI, "vcvtr", ".s32.f32\t$Sd, $Sm", + [(set SPR:$Sd, (int_arm_vcvtr SPR:$Sm))]> { + let Inst{7} = 0; // Z bit +} + +def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, + (outs SPR:$Sd), (ins DPR:$Dm), + IIC_fpCVTDI, "vcvtr", ".u32.f64\t$Sd, $Dm", + [(set SPR:$Sd, (int_arm_vcvtru(f64 DPR:$Dm)))]>{ + let Inst{7} = 0; // Z bit +} + +def VTOUIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, + (outs SPR:$Sd), (ins SPR:$Sm), + IIC_fpCVTSI, "vcvtr", ".u32.f32\t$Sd, $Sm", + [(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]> { + let Inst{7} = 0; // Z bit +} +} + +// Convert between floating-point and fixed-point +// Data type for fixed-point naming convention: +// S16 (U=0, sx=0) -> SH +// U16 (U=1, sx=0) -> UH +// S32 (U=0, sx=1) -> SL +// U32 (U=1, sx=1) -> UL + +// FIXME: Marking these as codegen only seems wrong. They are real +// instructions(?) +let Constraints = "$a = $dst", isCodeGenOnly = 1 in { + +// FP to Fixed-Point: + +def VTOSHS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 0, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VTOUHS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 0, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VTOSLS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 1, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VTOULS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 1, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VTOSHD : AVConv1XI<0b11101, 0b11, 0b1110, 0b1011, 0, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTDI, "vcvt", ".s16.f64\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VTOUHD : AVConv1XI<0b11101, 0b11, 0b1111, 0b1011, 0, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTDI, "vcvt", ".u16.f64\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VTOSLD : AVConv1XI<0b11101, 0b11, 0b1110, 0b1011, 1, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VTOULD : AVConv1XI<0b11101, 0b11, 0b1111, 0b1011, 1, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +// Fixed-Point to FP: + +def VSHTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 0, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VUHTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 0, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VSLTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 1, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VULTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 1, + (outs SPR:$dst), (ins SPR:$a, i32imm:$fbits), + IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def VSHTOD : AVConv1XI<0b11101, 0b11, 0b1010, 0b1011, 0, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTID, "vcvt", ".f64.s16\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VUHTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 0, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTID, "vcvt", ".f64.u16\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VSLTOD : AVConv1XI<0b11101, 0b11, 0b1010, 0b1011, 1, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +def VULTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 1, + (outs DPR:$dst), (ins DPR:$a, i32imm:$fbits), + IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", + [/* For disassembly only; pattern left blank */]>; + +} // End of 'let Constraints = "$a = $dst", isCodeGenOnly = 1 in' + +//===----------------------------------------------------------------------===// +// FP Multiply-Accumulate Operations. +// + +def VMLAD : ADbI<0b11100, 0b00, 0, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP2,UseFPVMLx]>; + +def VMLAS : ASbIn<0b11100, 0b00, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), + (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP2,UseFPVMLx]>; +def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), + (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>; + +def VMLSD : ADbI<0b11100, 0b00, 1, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP2,UseFPVMLx]>; + +def VMLSS : ASbIn<0b11100, 0b00, 1, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), + (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP2,UseFPVMLx]>; +def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), + (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; + +def VNMLAD : ADbI<0b11100, 0b01, 1, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP2,UseFPVMLx]>; + +def VNMLAS : ASbI<0b11100, 0b01, 1, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), + SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), + (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP2,UseFPVMLx]>; +def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), + (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; + +def VNMLSD : ADbI<0b11100, 0b01, 0, 0, + (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), + IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm", + [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), + (f64 DPR:$Ddin)))]>, + RegConstraint<"$Ddin = $Dd">, + Requires<[HasVFP2,UseFPVMLx]>; + +def VNMLSS : ASbI<0b11100, 0b01, 0, 0, + (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), + IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm", + [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, + RegConstraint<"$Sdin = $Sd">, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]> { + // Some single precision VFP instructions may be executed on both NEON and + // VFP pipelines on A8. + let D = VFPNeonA8Domain; +} + +def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), + (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>, + Requires<[HasVFP2,UseFPVMLx]>; +def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), + (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>, + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; + + +//===----------------------------------------------------------------------===// +// FP Conditional moves. +// + +let neverHasSideEffects = 1 in { +def VMOVDcc : ARMPseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, pred:$p), + 4, IIC_fpUNA64, + [/*(set DPR:$Dd, (ARMcmov DPR:$Dn, DPR:$Dm, imm:$cc))*/]>, + RegConstraint<"$Dn = $Dd">; + +def VMOVScc : ARMPseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, pred:$p), + 4, IIC_fpUNA32, + [/*(set SPR:$Sd, (ARMcmov SPR:$Sn, SPR:$Sm, imm:$cc))*/]>, + RegConstraint<"$Sn = $Sd">; +} // neverHasSideEffects + +//===----------------------------------------------------------------------===// +// Move from VFP System Register to ARM core register. +// + +class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm, + list<dag> pattern>: + VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> { + + // Instruction operand. + bits<4> Rt; + + let Inst{27-20} = 0b11101111; + let Inst{19-16} = opc19_16; + let Inst{15-12} = Rt; + let Inst{11-8} = 0b1010; + let Inst{7} = 0; + let Inst{6-5} = 0b00; + let Inst{4} = 1; + let Inst{3-0} = 0b0000; +} + +// APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags +// to APSR. +let Defs = [CPSR], Uses = [FPSCR], Rt = 0b1111 /* apsr_nzcv */ in +def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins), + "vmrs", "\tapsr_nzcv, fpscr", [(arm_fmstat)]>; + +// Application level FPSCR -> GPR +let hasSideEffects = 1, Uses = [FPSCR] in +def VMRS : MovFromVFP<0b0001 /* fpscr */, (outs GPR:$Rt), (ins), + "vmrs", "\t$Rt, fpscr", + [(set GPR:$Rt, (int_arm_get_fpscr))]>; + +// System level FPEXC, FPSID -> GPR +let Uses = [FPSCR] in { + def VMRS_FPEXC : MovFromVFP<0b1000 /* fpexc */, (outs GPR:$Rt), (ins), + "vmrs", "\t$Rt, fpexc", []>; + def VMRS_FPSID : MovFromVFP<0b0000 /* fpsid */, (outs GPR:$Rt), (ins), + "vmrs", "\t$Rt, fpsid", []>; +} + +//===----------------------------------------------------------------------===// +// Move from ARM core register to VFP System Register. +// + +class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm, + list<dag> pattern>: + VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> { + + // Instruction operand. + bits<4> src; + + // Encode instruction operand. + let Inst{15-12} = src; + + let Inst{27-20} = 0b11101110; + let Inst{19-16} = opc19_16; + let Inst{11-8} = 0b1010; + let Inst{7} = 0; + let Inst{4} = 1; +} + +let Defs = [FPSCR] in { + // Application level GPR -> FPSCR + def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPR:$src), + "vmsr", "\tfpscr, $src", [(int_arm_set_fpscr GPR:$src)]>; + // System level GPR -> FPEXC + def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPR:$src), + "vmsr", "\tfpexc, $src", []>; + // System level GPR -> FPSID + def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPR:$src), + "vmsr", "\tfpsid, $src", []>; +} + +//===----------------------------------------------------------------------===// +// Misc. +// + +// Materialize FP immediates. VFP3 only. +let isReMaterializable = 1 in { +def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm), + VFPMiscFrm, IIC_fpUNA64, + "vmov", ".f64\t$Dd, $imm", + [(set DPR:$Dd, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> { + bits<5> Dd; + bits<8> imm; + + let Inst{27-23} = 0b11101; + let Inst{22} = Dd{4}; + let Inst{21-20} = 0b11; + let Inst{19-16} = imm{7-4}; + let Inst{15-12} = Dd{3-0}; + let Inst{11-9} = 0b101; + let Inst{8} = 1; // Double precision. + let Inst{7-4} = 0b0000; + let Inst{3-0} = imm{3-0}; +} + +def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm), + VFPMiscFrm, IIC_fpUNA32, + "vmov", ".f32\t$Sd, $imm", + [(set SPR:$Sd, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> { + bits<5> Sd; + bits<8> imm; + + let Inst{27-23} = 0b11101; + let Inst{22} = Sd{0}; + let Inst{21-20} = 0b11; + let Inst{19-16} = imm{7-4}; + let Inst{15-12} = Sd{4-1}; + let Inst{11-9} = 0b101; + let Inst{8} = 0; // Single precision. + let Inst{7-4} = 0b0000; + let Inst{3-0} = imm{3-0}; +} +} + +//===----------------------------------------------------------------------===// +// Assembler aliases. +// + +def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>; + diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp new file mode 100644 index 0000000..45b7e48 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp @@ -0,0 +1,336 @@ +//===-- ARMJITInfo.cpp - Implement the JIT interfaces for the ARM target --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the JIT interfaces for the ARM target. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jit" +#include "ARMJITInfo.h" +#include "ARMInstrInfo.h" +#include "ARMConstantPoolValue.h" +#include "ARMRelocations.h" +#include "ARMSubtarget.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Memory.h" +#include <cstdlib> +using namespace llvm; + +void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) { + report_fatal_error("ARMJITInfo::replaceMachineCodeForFunction"); +} + +/// JITCompilerFunction - This contains the address of the JIT function used to +/// compile a function lazily. +static TargetJITInfo::JITCompilerFn JITCompilerFunction; + +// Get the ASMPREFIX for the current host. This is often '_'. +#ifndef __USER_LABEL_PREFIX__ +#define __USER_LABEL_PREFIX__ +#endif +#define GETASMPREFIX2(X) #X +#define GETASMPREFIX(X) GETASMPREFIX2(X) +#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__) + +// CompilationCallback stub - We can't use a C function with inline assembly in +// it, because the prolog/epilog inserted by GCC won't work for us. (We need +// to preserve more context and manipulate the stack directly). Instead, +// write our own wrapper, which does things our way, so we have complete +// control over register saving and restoring. +extern "C" { +#if defined(__arm__) + void ARMCompilationCallback(); + asm( + ".text\n" + ".align 2\n" + ".globl " ASMPREFIX "ARMCompilationCallback\n" + ASMPREFIX "ARMCompilationCallback:\n" + // Save caller saved registers since they may contain stuff + // for the real target function right now. We have to act as if this + // whole compilation callback doesn't exist as far as the caller is + // concerned, so we can't just preserve the callee saved regs. + "stmdb sp!, {r0, r1, r2, r3, lr}\n" +#if (defined(__VFP_FP__) && !defined(__SOFTFP__)) + "fstmfdd sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n" +#endif + // The LR contains the address of the stub function on entry. + // pass it as the argument to the C part of the callback + "mov r0, lr\n" + "sub sp, sp, #4\n" + // Call the C portion of the callback + "bl " ASMPREFIX "ARMCompilationCallbackC\n" + "add sp, sp, #4\n" + // Restoring the LR to the return address of the function that invoked + // the stub and de-allocating the stack space for it requires us to + // swap the two saved LR values on the stack, as they're backwards + // for what we need since the pop instruction has a pre-determined + // order for the registers. + // +--------+ + // 0 | LR | Original return address + // +--------+ + // 1 | LR | Stub address (start of stub) + // 2-5 | R3..R0 | Saved registers (we need to preserve all regs) + // 6-20 | D0..D7 | Saved VFP registers + // +--------+ + // +#if (defined(__VFP_FP__) && !defined(__SOFTFP__)) + // Restore VFP caller-saved registers. + "fldmfdd sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n" +#endif + // + // We need to exchange the values in slots 0 and 1 so we can + // return to the address in slot 1 with the address in slot 0 + // restored to the LR. + "ldr r0, [sp,#20]\n" + "ldr r1, [sp,#16]\n" + "str r1, [sp,#20]\n" + "str r0, [sp,#16]\n" + // Return to the (newly modified) stub to invoke the real function. + // The above twiddling of the saved return addresses allows us to + // deallocate everything, including the LR the stub saved, with two + // updating load instructions. + "ldmia sp!, {r0, r1, r2, r3, lr}\n" + "ldr pc, [sp], #4\n" + ); +#else // Not an ARM host + void ARMCompilationCallback() { + llvm_unreachable("Cannot call ARMCompilationCallback() on a non-ARM arch!"); + } +#endif +} + +/// ARMCompilationCallbackC - This is the target-specific function invoked +/// by the function stub when we did not know the real target of a call. +/// This function must locate the start of the stub or call site and pass +/// it into the JIT compiler function. +extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) { + // Get the address of the compiled code for this function. + intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)StubAddr); + + // Rewrite the call target... so that we don't end up here every time we + // execute the call. We're replacing the first two instructions of the + // stub with: + // ldr pc, [pc,#-4] + // <addr> + if (!sys::Memory::setRangeWritable((void*)StubAddr, 8)) { + llvm_unreachable("ERROR: Unable to mark stub writable"); + } + *(intptr_t *)StubAddr = 0xe51ff004; // ldr pc, [pc, #-4] + *(intptr_t *)(StubAddr+4) = NewVal; + if (!sys::Memory::setRangeExecutable((void*)StubAddr, 8)) { + llvm_unreachable("ERROR: Unable to mark stub executable"); + } +} + +TargetJITInfo::LazyResolverFn +ARMJITInfo::getLazyResolverFunction(JITCompilerFn F) { + JITCompilerFunction = F; + return ARMCompilationCallback; +} + +void *ARMJITInfo::emitGlobalValueIndirectSym(const GlobalValue *GV, void *Ptr, + JITCodeEmitter &JCE) { + uint8_t Buffer[4]; + uint8_t *Cur = Buffer; + MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)Ptr); + void *PtrAddr = JCE.allocIndirectGV( + GV, Buffer, sizeof(Buffer), /*Alignment=*/4); + addIndirectSymAddr(Ptr, (intptr_t)PtrAddr); + return PtrAddr; +} + +TargetJITInfo::StubLayout ARMJITInfo::getStubLayout() { + // The stub contains up to 3 4-byte instructions, aligned at 4 bytes, and a + // 4-byte address. See emitFunctionStub for details. + StubLayout Result = {16, 4}; + return Result; +} + +void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn, + JITCodeEmitter &JCE) { + void *Addr; + // If this is just a call to an external function, emit a branch instead of a + // call. The code is the same except for one bit of the last instruction. + if (Fn != (void*)(intptr_t)ARMCompilationCallback) { + // Branch to the corresponding function addr. + if (IsPIC) { + // The stub is 16-byte size and 4-aligned. + intptr_t LazyPtr = getIndirectSymAddr(Fn); + if (!LazyPtr) { + // In PIC mode, the function stub is loading a lazy-ptr. + LazyPtr= (intptr_t)emitGlobalValueIndirectSym((GlobalValue*)F, Fn, JCE); + DEBUG(if (F) + errs() << "JIT: Indirect symbol emitted at [" << LazyPtr + << "] for GV '" << F->getName() << "'\n"; + else + errs() << "JIT: Stub emitted at [" << LazyPtr + << "] for external function at '" << Fn << "'\n"); + } + JCE.emitAlignment(4); + Addr = (void*)JCE.getCurrentPCValue(); + if (!sys::Memory::setRangeWritable(Addr, 16)) { + llvm_unreachable("ERROR: Unable to mark stub writable"); + } + JCE.emitWordLE(0xe59fc004); // ldr ip, [pc, #+4] + JCE.emitWordLE(0xe08fc00c); // L_func$scv: add ip, pc, ip + JCE.emitWordLE(0xe59cf000); // ldr pc, [ip] + JCE.emitWordLE(LazyPtr - (intptr_t(Addr)+4+8)); // func - (L_func$scv+8) + sys::Memory::InvalidateInstructionCache(Addr, 16); + if (!sys::Memory::setRangeExecutable(Addr, 16)) { + llvm_unreachable("ERROR: Unable to mark stub executable"); + } + } else { + // The stub is 8-byte size and 4-aligned. + JCE.emitAlignment(4); + Addr = (void*)JCE.getCurrentPCValue(); + if (!sys::Memory::setRangeWritable(Addr, 8)) { + llvm_unreachable("ERROR: Unable to mark stub writable"); + } + JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4] + JCE.emitWordLE((intptr_t)Fn); // addr of function + sys::Memory::InvalidateInstructionCache(Addr, 8); + if (!sys::Memory::setRangeExecutable(Addr, 8)) { + llvm_unreachable("ERROR: Unable to mark stub executable"); + } + } + } else { + // The compilation callback will overwrite the first two words of this + // stub with indirect branch instructions targeting the compiled code. + // This stub sets the return address to restart the stub, so that + // the new branch will be invoked when we come back. + // + // Branch and link to the compilation callback. + // The stub is 16-byte size and 4-byte aligned. + JCE.emitAlignment(4); + Addr = (void*)JCE.getCurrentPCValue(); + if (!sys::Memory::setRangeWritable(Addr, 16)) { + llvm_unreachable("ERROR: Unable to mark stub writable"); + } + // Save LR so the callback can determine which stub called it. + // The compilation callback is responsible for popping this prior + // to returning. + JCE.emitWordLE(0xe92d4000); // push {lr} + // Set the return address to go back to the start of this stub. + JCE.emitWordLE(0xe24fe00c); // sub lr, pc, #12 + // Invoke the compilation callback. + JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4] + // The address of the compilation callback. + JCE.emitWordLE((intptr_t)ARMCompilationCallback); + sys::Memory::InvalidateInstructionCache(Addr, 16); + if (!sys::Memory::setRangeExecutable(Addr, 16)) { + llvm_unreachable("ERROR: Unable to mark stub executable"); + } + } + + return Addr; +} + +intptr_t ARMJITInfo::resolveRelocDestAddr(MachineRelocation *MR) const { + ARM::RelocationType RT = (ARM::RelocationType)MR->getRelocationType(); + switch (RT) { + default: + return (intptr_t)(MR->getResultPointer()); + case ARM::reloc_arm_pic_jt: + // Destination address - jump table base. + return (intptr_t)(MR->getResultPointer()) - MR->getConstantVal(); + case ARM::reloc_arm_jt_base: + // Jump table base address. + return getJumpTableBaseAddr(MR->getJumpTableIndex()); + case ARM::reloc_arm_cp_entry: + case ARM::reloc_arm_vfp_cp_entry: + // Constant pool entry address. + return getConstantPoolEntryAddr(MR->getConstantPoolIndex()); + case ARM::reloc_arm_machine_cp_entry: { + ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)MR->getConstantVal(); + assert((!ACPV->hasModifier() && !ACPV->mustAddCurrentAddress()) && + "Can't handle this machine constant pool entry yet!"); + intptr_t Addr = (intptr_t)(MR->getResultPointer()); + Addr -= getPCLabelAddr(ACPV->getLabelId()) + ACPV->getPCAdjustment(); + return Addr; + } + } +} + +/// relocate - Before the JIT can run a block of code that has been emitted, +/// it must rewrite the code to contain the actual addresses of any +/// referenced global symbols. +void ARMJITInfo::relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase) { + for (unsigned i = 0; i != NumRelocs; ++i, ++MR) { + void *RelocPos = (char*)Function + MR->getMachineCodeOffset(); + intptr_t ResultPtr = resolveRelocDestAddr(MR); + switch ((ARM::RelocationType)MR->getRelocationType()) { + case ARM::reloc_arm_cp_entry: + case ARM::reloc_arm_vfp_cp_entry: + case ARM::reloc_arm_relative: { + // It is necessary to calculate the correct PC relative value. We + // subtract the base addr from the target addr to form a byte offset. + ResultPtr = ResultPtr - (intptr_t)RelocPos - 8; + // If the result is positive, set bit U(23) to 1. + if (ResultPtr >= 0) + *((intptr_t*)RelocPos) |= 1 << ARMII::U_BitShift; + else { + // Otherwise, obtain the absolute value and set bit U(23) to 0. + *((intptr_t*)RelocPos) &= ~(1 << ARMII::U_BitShift); + ResultPtr = - ResultPtr; + } + // Set the immed value calculated. + // VFP immediate offset is multiplied by 4. + if (MR->getRelocationType() == ARM::reloc_arm_vfp_cp_entry) + ResultPtr = ResultPtr >> 2; + *((intptr_t*)RelocPos) |= ResultPtr; + // Set register Rn to PC. + *((intptr_t*)RelocPos) |= + getARMRegisterNumbering(ARM::PC) << ARMII::RegRnShift; + break; + } + case ARM::reloc_arm_pic_jt: + case ARM::reloc_arm_machine_cp_entry: + case ARM::reloc_arm_absolute: { + // These addresses have already been resolved. + *((intptr_t*)RelocPos) |= (intptr_t)ResultPtr; + break; + } + case ARM::reloc_arm_branch: { + // It is necessary to calculate the correct value of signed_immed_24 + // field. We subtract the base addr from the target addr to form a + // byte offset, which must be inside the range -33554432 and +33554428. + // Then, we set the signed_immed_24 field of the instruction to bits + // [25:2] of the byte offset. More details ARM-ARM p. A4-11. + ResultPtr = ResultPtr - (intptr_t)RelocPos - 8; + ResultPtr = (ResultPtr & 0x03FFFFFC) >> 2; + assert(ResultPtr >= -33554432 && ResultPtr <= 33554428); + *((intptr_t*)RelocPos) |= ResultPtr; + break; + } + case ARM::reloc_arm_jt_base: { + // JT base - (instruction addr + 8) + ResultPtr = ResultPtr - (intptr_t)RelocPos - 8; + *((intptr_t*)RelocPos) |= ResultPtr; + break; + } + case ARM::reloc_arm_movw: { + ResultPtr = ResultPtr & 0xFFFF; + *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF; + *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16; + break; + } + case ARM::reloc_arm_movt: { + ResultPtr = (ResultPtr >> 16) & 0xFFFF; + *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF; + *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16; + break; + } + } + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.h b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h new file mode 100644 index 0000000..2f97928 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h @@ -0,0 +1,183 @@ +//===- ARMJITInfo.h - ARM implementation of the JIT interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the ARMJITInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMJITINFO_H +#define ARMJITINFO_H + +#include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/Target/TargetJITInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" + +namespace llvm { + class ARMTargetMachine; + + class ARMJITInfo : public TargetJITInfo { + // ConstPoolId2AddrMap - A map from constant pool ids to the corresponding + // CONSTPOOL_ENTRY addresses. + SmallVector<intptr_t, 16> ConstPoolId2AddrMap; + + // JumpTableId2AddrMap - A map from inline jumptable ids to the + // corresponding inline jump table bases. + SmallVector<intptr_t, 16> JumpTableId2AddrMap; + + // PCLabelMap - A map from PC labels to addresses. + DenseMap<unsigned, intptr_t> PCLabelMap; + + // Sym2IndirectSymMap - A map from symbol (GlobalValue and ExternalSymbol) + // addresses to their indirect symbol addresses. + DenseMap<void*, intptr_t> Sym2IndirectSymMap; + + // IsPIC - True if the relocation model is PIC. This is used to determine + // how to codegen function stubs. + bool IsPIC; + + public: + explicit ARMJITInfo() : IsPIC(false) { useGOT = false; } + + /// replaceMachineCodeForFunction - Make it so that calling the function + /// whose machine code is at OLD turns into a call to NEW, perhaps by + /// overwriting OLD with a branch to NEW. This is used for self-modifying + /// code. + /// + virtual void replaceMachineCodeForFunction(void *Old, void *New); + + /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object + /// to emit an indirect symbol which contains the address of the specified + /// ptr. + virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr, + JITCodeEmitter &JCE); + + // getStubLayout - Returns the size and alignment of the largest call stub + // on ARM. + virtual StubLayout getStubLayout(); + + /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a + /// small native function that simply calls the function at the specified + /// address. + virtual void *emitFunctionStub(const Function* F, void *Fn, + JITCodeEmitter &JCE); + + /// getLazyResolverFunction - Expose the lazy resolver to the JIT. + virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn); + + /// relocate - Before the JIT can run a block of code that has been emitted, + /// it must rewrite the code to contain the actual addresses of any + /// referenced global symbols. + virtual void relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase); + + /// hasCustomConstantPool - Allows a target to specify that constant + /// pool address resolution is handled by the target. + virtual bool hasCustomConstantPool() const { return true; } + + /// hasCustomJumpTables - Allows a target to specify that jumptables + /// are emitted by the target. + virtual bool hasCustomJumpTables() const { return true; } + + /// allocateSeparateGVMemory - If true, globals should be placed in + /// separately allocated heap memory rather than in the same + /// code memory allocated by JITCodeEmitter. + virtual bool allocateSeparateGVMemory() const { +#ifdef __APPLE__ + return true; +#else + return false; +#endif + } + + /// Initialize - Initialize internal stage for the function being JITted. + /// Resize constant pool ids to CONSTPOOL_ENTRY addresses map; resize + /// jump table ids to jump table bases map; remember if codegen relocation + /// model is PIC. + void Initialize(const MachineFunction &MF, bool isPIC) { + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + ConstPoolId2AddrMap.resize(AFI->getNumPICLabels()); + JumpTableId2AddrMap.resize(AFI->getNumJumpTables()); + IsPIC = isPIC; + } + + /// getConstantPoolEntryAddr - The ARM target puts all constant + /// pool entries into constant islands. This returns the address of the + /// constant pool entry of the specified index. + intptr_t getConstantPoolEntryAddr(unsigned CPI) const { + assert(CPI < ConstPoolId2AddrMap.size()); + return ConstPoolId2AddrMap[CPI]; + } + + /// addConstantPoolEntryAddr - Map a Constant Pool Index to the address + /// where its associated value is stored. When relocations are processed, + /// this value will be used to resolve references to the constant. + void addConstantPoolEntryAddr(unsigned CPI, intptr_t Addr) { + assert(CPI < ConstPoolId2AddrMap.size()); + ConstPoolId2AddrMap[CPI] = Addr; + } + + /// getJumpTableBaseAddr - The ARM target inline all jump tables within + /// text section of the function. This returns the address of the base of + /// the jump table of the specified index. + intptr_t getJumpTableBaseAddr(unsigned JTI) const { + assert(JTI < JumpTableId2AddrMap.size()); + return JumpTableId2AddrMap[JTI]; + } + + /// addJumpTableBaseAddr - Map a jump table index to the address where + /// the corresponding inline jump table is emitted. When relocations are + /// processed, this value will be used to resolve references to the + /// jump table. + void addJumpTableBaseAddr(unsigned JTI, intptr_t Addr) { + assert(JTI < JumpTableId2AddrMap.size()); + JumpTableId2AddrMap[JTI] = Addr; + } + + /// getPCLabelAddr - Retrieve the address of the PC label of the + /// specified id. + intptr_t getPCLabelAddr(unsigned Id) const { + DenseMap<unsigned, intptr_t>::const_iterator I = PCLabelMap.find(Id); + assert(I != PCLabelMap.end()); + return I->second; + } + + /// addPCLabelAddr - Remember the address of the specified PC label. + void addPCLabelAddr(unsigned Id, intptr_t Addr) { + PCLabelMap.insert(std::make_pair(Id, Addr)); + } + + /// getIndirectSymAddr - Retrieve the address of the indirect symbol of the + /// specified symbol located at address. Returns 0 if the indirect symbol + /// has not been emitted. + intptr_t getIndirectSymAddr(void *Addr) const { + DenseMap<void*,intptr_t>::const_iterator I= Sym2IndirectSymMap.find(Addr); + if (I != Sym2IndirectSymMap.end()) + return I->second; + return 0; + } + + /// addIndirectSymAddr - Add a mapping from address of an emitted symbol to + /// its indirect symbol address. + void addIndirectSymAddr(void *SymAddr, intptr_t IndSymAddr) { + Sym2IndirectSymMap.insert(std::make_pair(SymAddr, IndSymAddr)); + } + + private: + /// resolveRelocDestAddr - Resolve the resulting address of the relocation + /// if it's not already solved. Constantpool entries must be resolved by + /// ARM target. + intptr_t resolveRelocDestAddr(MachineRelocation *MR) const; + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp new file mode 100644 index 0000000..faa8ba7 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -0,0 +1,1850 @@ +//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that performs load / store related peephole +// optimizations. This pass should be run after register allocation. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-ldst-opt" +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMRegisterInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumLDMGened , "Number of ldm instructions generated"); +STATISTIC(NumSTMGened , "Number of stm instructions generated"); +STATISTIC(NumVLDMGened, "Number of vldm instructions generated"); +STATISTIC(NumVSTMGened, "Number of vstm instructions generated"); +STATISTIC(NumLdStMoved, "Number of load / store instructions moved"); +STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation"); +STATISTIC(NumSTRDFormed,"Number of strd created before allocation"); +STATISTIC(NumLDRD2LDM, "Number of ldrd instructions turned back into ldm"); +STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm"); +STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's"); +STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's"); + +/// ARMAllocLoadStoreOpt - Post- register allocation pass the combine +/// load / store instructions to form ldm / stm instructions. + +namespace { + struct ARMLoadStoreOpt : public MachineFunctionPass { + static char ID; + ARMLoadStoreOpt() : MachineFunctionPass(ID) {} + + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + ARMFunctionInfo *AFI; + RegScavenger *RS; + bool isThumb2; + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "ARM load / store optimization pass"; + } + + private: + struct MemOpQueueEntry { + int Offset; + unsigned Reg; + bool isKill; + unsigned Position; + MachineBasicBlock::iterator MBBI; + bool Merged; + MemOpQueueEntry(int o, unsigned r, bool k, unsigned p, + MachineBasicBlock::iterator i) + : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {} + }; + typedef SmallVector<MemOpQueueEntry,8> MemOpQueue; + typedef MemOpQueue::iterator MemOpQueueIter; + + bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + int Offset, unsigned Base, bool BaseKill, int Opcode, + ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, + DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs); + void MergeOpsUpdate(MachineBasicBlock &MBB, + MemOpQueue &MemOps, + unsigned memOpsBegin, + unsigned memOpsEnd, + unsigned insertAfter, + int Offset, + unsigned Base, + bool BaseKill, + int Opcode, + ARMCC::CondCodes Pred, + unsigned PredReg, + unsigned Scratch, + DebugLoc dl, + SmallVector<MachineBasicBlock::iterator, 4> &Merges); + void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base, + int Opcode, unsigned Size, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned Scratch, MemOpQueue &MemOps, + SmallVector<MachineBasicBlock::iterator, 4> &Merges); + + void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps); + bool FixInvalidRegPairOp(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI); + bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const TargetInstrInfo *TII, + bool &Advance, + MachineBasicBlock::iterator &I); + bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool &Advance, + MachineBasicBlock::iterator &I); + bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); + bool MergeReturnIntoLDM(MachineBasicBlock &MBB); + }; + char ARMLoadStoreOpt::ID = 0; +} + +static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { + switch (Opcode) { + default: llvm_unreachable("Unhandled opcode!"); + case ARM::LDRi12: + ++NumLDMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::LDMIA; + case ARM_AM::da: return ARM::LDMDA; + case ARM_AM::db: return ARM::LDMDB; + case ARM_AM::ib: return ARM::LDMIB; + } + break; + case ARM::STRi12: + ++NumSTMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::STMIA; + case ARM_AM::da: return ARM::STMDA; + case ARM_AM::db: return ARM::STMDB; + case ARM_AM::ib: return ARM::STMIB; + } + break; + case ARM::t2LDRi8: + case ARM::t2LDRi12: + ++NumLDMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::t2LDMIA; + case ARM_AM::db: return ARM::t2LDMDB; + } + break; + case ARM::t2STRi8: + case ARM::t2STRi12: + ++NumSTMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::t2STMIA; + case ARM_AM::db: return ARM::t2STMDB; + } + break; + case ARM::VLDRS: + ++NumVLDMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VLDMSIA; + case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists. + } + break; + case ARM::VSTRS: + ++NumVSTMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VSTMSIA; + case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists. + } + break; + case ARM::VLDRD: + ++NumVLDMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VLDMDIA; + case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists. + } + break; + case ARM::VSTRD: + ++NumVSTMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VSTMDIA; + case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists. + } + break; + } + + return 0; +} + +namespace llvm { + namespace ARM_AM { + +AMSubMode getLoadStoreMultipleSubMode(int Opcode) { + switch (Opcode) { + default: llvm_unreachable("Unhandled opcode!"); + case ARM::LDMIA_RET: + case ARM::LDMIA: + case ARM::LDMIA_UPD: + case ARM::STMIA: + case ARM::STMIA_UPD: + case ARM::t2LDMIA_RET: + case ARM::t2LDMIA: + case ARM::t2LDMIA_UPD: + case ARM::t2STMIA: + case ARM::t2STMIA_UPD: + case ARM::VLDMSIA: + case ARM::VLDMSIA_UPD: + case ARM::VSTMSIA: + case ARM::VSTMSIA_UPD: + case ARM::VLDMDIA: + case ARM::VLDMDIA_UPD: + case ARM::VSTMDIA: + case ARM::VSTMDIA_UPD: + return ARM_AM::ia; + + case ARM::LDMDA: + case ARM::LDMDA_UPD: + case ARM::STMDA: + case ARM::STMDA_UPD: + return ARM_AM::da; + + case ARM::LDMDB: + case ARM::LDMDB_UPD: + case ARM::STMDB: + case ARM::STMDB_UPD: + case ARM::t2LDMDB: + case ARM::t2LDMDB_UPD: + case ARM::t2STMDB: + case ARM::t2STMDB_UPD: + case ARM::VLDMSDB_UPD: + case ARM::VSTMSDB_UPD: + case ARM::VLDMDDB_UPD: + case ARM::VSTMDDB_UPD: + return ARM_AM::db; + + case ARM::LDMIB: + case ARM::LDMIB_UPD: + case ARM::STMIB: + case ARM::STMIB_UPD: + return ARM_AM::ib; + } + + return ARM_AM::bad_am_submode; +} + + } // end namespace ARM_AM +} // end namespace llvm + +static bool isT2i32Load(unsigned Opc) { + return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8; +} + +static bool isi32Load(unsigned Opc) { + return Opc == ARM::LDRi12 || isT2i32Load(Opc); +} + +static bool isT2i32Store(unsigned Opc) { + return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8; +} + +static bool isi32Store(unsigned Opc) { + return Opc == ARM::STRi12 || isT2i32Store(Opc); +} + +/// MergeOps - Create and insert a LDM or STM with Base as base register and +/// registers in Regs as the register operands that would be loaded / stored. +/// It returns true if the transformation is done. +bool +ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + int Offset, unsigned Base, bool BaseKill, + int Opcode, ARMCC::CondCodes Pred, + unsigned PredReg, unsigned Scratch, DebugLoc dl, + SmallVector<std::pair<unsigned, bool>, 8> &Regs) { + // Only a single register to load / store. Don't bother. + unsigned NumRegs = Regs.size(); + if (NumRegs <= 1) + return false; + + ARM_AM::AMSubMode Mode = ARM_AM::ia; + // VFP and Thumb2 do not support IB or DA modes. + bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode); + bool haveIBAndDA = isNotVFP && !isThumb2; + if (Offset == 4 && haveIBAndDA) + Mode = ARM_AM::ib; + else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) + Mode = ARM_AM::da; + else if (Offset == -4 * (int)NumRegs && isNotVFP) + // VLDM/VSTM do not support DB mode without also updating the base reg. + Mode = ARM_AM::db; + else if (Offset != 0) { + // Check if this is a supported opcode before we insert instructions to + // calculate a new base register. + if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false; + + // If starting offset isn't zero, insert a MI to materialize a new base. + // But only do so if it is cost effective, i.e. merging more than two + // loads / stores. + if (NumRegs <= 2) + return false; + + unsigned NewBase; + if (isi32Load(Opcode)) + // If it is a load, then just use one of the destination register to + // use as the new base. + NewBase = Regs[NumRegs-1].first; + else { + // Use the scratch register to use as a new base. + NewBase = Scratch; + if (NewBase == 0) + return false; + } + int BaseOpc = !isThumb2 ? ARM::ADDri : ARM::t2ADDri; + if (Offset < 0) { + BaseOpc = !isThumb2 ? ARM::SUBri : ARM::t2SUBri; + Offset = - Offset; + } + int ImmedOffset = isThumb2 + ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset); + if (ImmedOffset == -1) + // FIXME: Try t2ADDri12 or t2SUBri12? + return false; // Probably not worth it then. + + BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase) + .addReg(Base, getKillRegState(BaseKill)).addImm(Offset) + .addImm(Pred).addReg(PredReg).addReg(0); + Base = NewBase; + BaseKill = true; // New base is always killed right its use. + } + + bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS || + Opcode == ARM::VLDRD); + Opcode = getLoadStoreMultipleOpcode(Opcode, Mode); + if (!Opcode) return false; + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)) + .addReg(Base, getKillRegState(BaseKill)) + .addImm(Pred).addReg(PredReg); + for (unsigned i = 0; i != NumRegs; ++i) + MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef) + | getKillRegState(Regs[i].second)); + + return true; +} + +// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on +// success. +void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, + MemOpQueue &memOps, + unsigned memOpsBegin, unsigned memOpsEnd, + unsigned insertAfter, int Offset, + unsigned Base, bool BaseKill, + int Opcode, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned Scratch, + DebugLoc dl, + SmallVector<MachineBasicBlock::iterator, 4> &Merges) { + // First calculate which of the registers should be killed by the merged + // instruction. + const unsigned insertPos = memOps[insertAfter].Position; + SmallSet<unsigned, 4> KilledRegs; + DenseMap<unsigned, unsigned> Killer; + for (unsigned i = 0, e = memOps.size(); i != e; ++i) { + if (i == memOpsBegin) { + i = memOpsEnd; + if (i == e) + break; + } + if (memOps[i].Position < insertPos && memOps[i].isKill) { + unsigned Reg = memOps[i].Reg; + KilledRegs.insert(Reg); + Killer[Reg] = i; + } + } + + SmallVector<std::pair<unsigned, bool>, 8> Regs; + for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { + unsigned Reg = memOps[i].Reg; + // If we are inserting the merged operation after an operation that + // uses the same register, make sure to transfer any kill flag. + bool isKill = memOps[i].isKill || KilledRegs.count(Reg); + Regs.push_back(std::make_pair(Reg, isKill)); + } + + // Try to do the merge. + MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI; + ++Loc; + if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode, + Pred, PredReg, Scratch, dl, Regs)) + return; + + // Merge succeeded, update records. + Merges.push_back(prior(Loc)); + for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { + // Remove kill flags from any memops that come before insertPos. + if (Regs[i-memOpsBegin].second) { + unsigned Reg = Regs[i-memOpsBegin].first; + if (KilledRegs.count(Reg)) { + unsigned j = Killer[Reg]; + int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true); + assert(Idx >= 0 && "Cannot find killing operand"); + memOps[j].MBBI->getOperand(Idx).setIsKill(false); + memOps[j].isKill = false; + } + memOps[i].isKill = true; + } + MBB.erase(memOps[i].MBBI); + // Update this memop to refer to the merged instruction. + // We may need to move kill flags again. + memOps[i].Merged = true; + memOps[i].MBBI = Merges.back(); + memOps[i].Position = insertPos; + } +} + +/// MergeLDR_STR - Merge a number of load / store instructions into one or more +/// load / store multiple instructions. +void +ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, + unsigned Base, int Opcode, unsigned Size, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned Scratch, MemOpQueue &MemOps, + SmallVector<MachineBasicBlock::iterator, 4> &Merges) { + bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode); + int Offset = MemOps[SIndex].Offset; + int SOffset = Offset; + unsigned insertAfter = SIndex; + MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI; + DebugLoc dl = Loc->getDebugLoc(); + const MachineOperand &PMO = Loc->getOperand(0); + unsigned PReg = PMO.getReg(); + unsigned PRegNum = PMO.isUndef() ? UINT_MAX + : getARMRegisterNumbering(PReg); + unsigned Count = 1; + unsigned Limit = ~0U; + + // vldm / vstm limit are 32 for S variants, 16 for D variants. + + switch (Opcode) { + default: break; + case ARM::VSTRS: + Limit = 32; + break; + case ARM::VSTRD: + Limit = 16; + break; + case ARM::VLDRD: + Limit = 16; + break; + case ARM::VLDRS: + Limit = 32; + break; + } + + for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) { + int NewOffset = MemOps[i].Offset; + const MachineOperand &MO = MemOps[i].MBBI->getOperand(0); + unsigned Reg = MO.getReg(); + unsigned RegNum = MO.isUndef() ? UINT_MAX + : getARMRegisterNumbering(Reg); + // Register numbers must be in ascending order. For VFP / NEON load and + // store multiples, the registers must also be consecutive and within the + // limit on the number of registers per instruction. + if (Reg != ARM::SP && + NewOffset == Offset + (int)Size && + ((isNotVFP && RegNum > PRegNum) || + ((Count < Limit) && RegNum == PRegNum+1))) { + Offset += Size; + PRegNum = RegNum; + ++Count; + } else { + // Can't merge this in. Try merge the earlier ones first. + MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, + Base, false, Opcode, Pred, PredReg, Scratch, dl, Merges); + MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch, + MemOps, Merges); + return; + } + + if (MemOps[i].Position > MemOps[insertAfter].Position) + insertAfter = i; + } + + bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1; + MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset, + Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges); + return; +} + +static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base, + unsigned Bytes, unsigned Limit, + ARMCC::CondCodes Pred, unsigned PredReg){ + unsigned MyPredReg = 0; + if (!MI) + return false; + if (MI->getOpcode() != ARM::t2SUBri && + MI->getOpcode() != ARM::tSUBspi && + MI->getOpcode() != ARM::SUBri) + return false; + + // Make sure the offset fits in 8 bits. + if (Bytes == 0 || (Limit && Bytes >= Limit)) + return false; + + unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME + return (MI->getOperand(0).getReg() == Base && + MI->getOperand(1).getReg() == Base && + (MI->getOperand(2).getImm()*Scale) == Bytes && + llvm::getInstrPredicate(MI, MyPredReg) == Pred && + MyPredReg == PredReg); +} + +static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base, + unsigned Bytes, unsigned Limit, + ARMCC::CondCodes Pred, unsigned PredReg){ + unsigned MyPredReg = 0; + if (!MI) + return false; + if (MI->getOpcode() != ARM::t2ADDri && + MI->getOpcode() != ARM::tADDspi && + MI->getOpcode() != ARM::ADDri) + return false; + + if (Bytes == 0 || (Limit && Bytes >= Limit)) + // Make sure the offset fits in 8 bits. + return false; + + unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME + return (MI->getOperand(0).getReg() == Base && + MI->getOperand(1).getReg() == Base && + (MI->getOperand(2).getImm()*Scale) == Bytes && + llvm::getInstrPredicate(MI, MyPredReg) == Pred && + MyPredReg == PredReg); +} + +static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return 0; + case ARM::LDRi12: + case ARM::STRi12: + case ARM::t2LDRi8: + case ARM::t2LDRi12: + case ARM::t2STRi8: + case ARM::t2STRi12: + case ARM::VLDRS: + case ARM::VSTRS: + return 4; + case ARM::VLDRD: + case ARM::VSTRD: + return 8; + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + case ARM::t2LDMIA: + case ARM::t2LDMDB: + case ARM::t2STMIA: + case ARM::t2STMDB: + case ARM::VLDMSIA: + case ARM::VSTMSIA: + return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4; + case ARM::VLDMDIA: + case ARM::VSTMDIA: + return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8; + } +} + +static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, + ARM_AM::AMSubMode Mode) { + switch (Opc) { + default: llvm_unreachable("Unhandled opcode!"); + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::LDMIA_UPD; + case ARM_AM::ib: return ARM::LDMIB_UPD; + case ARM_AM::da: return ARM::LDMDA_UPD; + case ARM_AM::db: return ARM::LDMDB_UPD; + } + break; + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::STMIA_UPD; + case ARM_AM::ib: return ARM::STMIB_UPD; + case ARM_AM::da: return ARM::STMDA_UPD; + case ARM_AM::db: return ARM::STMDB_UPD; + } + break; + case ARM::t2LDMIA: + case ARM::t2LDMDB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::t2LDMIA_UPD; + case ARM_AM::db: return ARM::t2LDMDB_UPD; + } + break; + case ARM::t2STMIA: + case ARM::t2STMDB: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::t2STMIA_UPD; + case ARM_AM::db: return ARM::t2STMDB_UPD; + } + break; + case ARM::VLDMSIA: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VLDMSIA_UPD; + case ARM_AM::db: return ARM::VLDMSDB_UPD; + } + break; + case ARM::VLDMDIA: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VLDMDIA_UPD; + case ARM_AM::db: return ARM::VLDMDDB_UPD; + } + break; + case ARM::VSTMSIA: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VSTMSIA_UPD; + case ARM_AM::db: return ARM::VSTMSDB_UPD; + } + break; + case ARM::VSTMDIA: + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::VSTMDIA_UPD; + case ARM_AM::db: return ARM::VSTMDDB_UPD; + } + break; + } + + return 0; +} + +/// MergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base +/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible: +/// +/// stmia rn, <ra, rb, rc> +/// rn := rn + 4 * 3; +/// => +/// stmia rn!, <ra, rb, rc> +/// +/// rn := rn - 4 * 3; +/// ldmia rn, <ra, rb, rc> +/// => +/// ldmdb rn!, <ra, rb, rc> +bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool &Advance, + MachineBasicBlock::iterator &I) { + MachineInstr *MI = MBBI; + unsigned Base = MI->getOperand(0).getReg(); + bool BaseKill = MI->getOperand(0).isKill(); + unsigned Bytes = getLSMultipleTransferSize(MI); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); + int Opcode = MI->getOpcode(); + DebugLoc dl = MI->getDebugLoc(); + + // Can't use an updating ld/st if the base register is also a dest + // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined. + for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i) + if (MI->getOperand(i).getReg() == Base) + return false; + + bool DoMerge = false; + ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(Opcode); + + // Try merging with the previous instruction. + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + if (MBBI != BeginMBBI) { + MachineBasicBlock::iterator PrevMBBI = prior(MBBI); + while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) + --PrevMBBI; + if (Mode == ARM_AM::ia && + isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { + Mode = ARM_AM::db; + DoMerge = true; + } else if (Mode == ARM_AM::ib && + isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { + Mode = ARM_AM::da; + DoMerge = true; + } + if (DoMerge) + MBB.erase(PrevMBBI); + } + + // Try merging with the next instruction. + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (!DoMerge && MBBI != EndMBBI) { + MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; + if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && + isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; + } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) && + isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { + DoMerge = true; + } + if (DoMerge) { + if (NextMBBI == I) { + Advance = true; + ++I; + } + MBB.erase(NextMBBI); + } + } + + if (!DoMerge) + return false; + + unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) + .addReg(Base, getDefRegState(true)) // WB base register + .addReg(Base, getKillRegState(BaseKill)) + .addImm(Pred).addReg(PredReg); + + // Transfer the rest of operands. + for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum) + MIB.addOperand(MI->getOperand(OpNum)); + + // Transfer memoperands. + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + MBB.erase(MBBI); + return true; +} + +static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc, + ARM_AM::AddrOpc Mode) { + switch (Opc) { + case ARM::LDRi12: + return ARM::LDR_PRE_IMM; + case ARM::STRi12: + return ARM::STR_PRE_IMM; + case ARM::VLDRS: + return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD; + case ARM::VLDRD: + return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD; + case ARM::VSTRS: + return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD; + case ARM::VSTRD: + return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD; + case ARM::t2LDRi8: + case ARM::t2LDRi12: + return ARM::t2LDR_PRE; + case ARM::t2STRi8: + case ARM::t2STRi12: + return ARM::t2STR_PRE; + default: llvm_unreachable("Unhandled opcode!"); + } + return 0; +} + +static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc, + ARM_AM::AddrOpc Mode) { + switch (Opc) { + case ARM::LDRi12: + return ARM::LDR_POST_IMM; + case ARM::STRi12: + return ARM::STR_POST_IMM; + case ARM::VLDRS: + return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD; + case ARM::VLDRD: + return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD; + case ARM::VSTRS: + return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD; + case ARM::VSTRD: + return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD; + case ARM::t2LDRi8: + case ARM::t2LDRi12: + return ARM::t2LDR_POST; + case ARM::t2STRi8: + case ARM::t2STRi12: + return ARM::t2STR_POST; + default: llvm_unreachable("Unhandled opcode!"); + } + return 0; +} + +/// MergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base +/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible: +bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const TargetInstrInfo *TII, + bool &Advance, + MachineBasicBlock::iterator &I) { + MachineInstr *MI = MBBI; + unsigned Base = MI->getOperand(1).getReg(); + bool BaseKill = MI->getOperand(1).isKill(); + unsigned Bytes = getLSMultipleTransferSize(MI); + int Opcode = MI->getOpcode(); + DebugLoc dl = MI->getDebugLoc(); + bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS || + Opcode == ARM::VSTRD || Opcode == ARM::VSTRS); + bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12); + if (isi32Load(Opcode) || isi32Store(Opcode)) + if (MI->getOperand(2).getImm() != 0) + return false; + if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0) + return false; + + bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD; + // Can't do the merge if the destination register is the same as the would-be + // writeback register. + if (isLd && MI->getOperand(0).getReg() == Base) + return false; + + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); + bool DoMerge = false; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + unsigned NewOpc = 0; + // AM2 - 12 bits, thumb2 - 8 bits. + unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); + + // Try merging with the previous instruction. + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + if (MBBI != BeginMBBI) { + MachineBasicBlock::iterator PrevMBBI = prior(MBBI); + while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) + --PrevMBBI; + if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) { + DoMerge = true; + AddSub = ARM_AM::sub; + } else if (!isAM5 && + isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) { + DoMerge = true; + } + if (DoMerge) { + NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub); + MBB.erase(PrevMBBI); + } + } + + // Try merging with the next instruction. + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (!DoMerge && MBBI != EndMBBI) { + MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; + if (!isAM5 && + isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) { + DoMerge = true; + AddSub = ARM_AM::sub; + } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) { + DoMerge = true; + } + if (DoMerge) { + NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub); + if (NextMBBI == I) { + Advance = true; + ++I; + } + MBB.erase(NextMBBI); + } + } + + if (!DoMerge) + return false; + + if (isAM5) { + // VLDM[SD}_UPD, VSTM[SD]_UPD + // (There are no base-updating versions of VLDR/VSTR instructions, but the + // updating load/store-multiple instructions can be used with only one + // register.) + MachineOperand &MO = MI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) + .addReg(Base, getDefRegState(true)) // WB base register + .addReg(Base, getKillRegState(isLd ? BaseKill : false)) + .addImm(Pred).addReg(PredReg) + .addReg(MO.getReg(), (isLd ? getDefRegState(true) : + getKillRegState(MO.isKill()))); + } else if (isLd) { + if (isAM2) { + // LDR_PRE, LDR_POST + if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) { + int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; + BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + .addReg(Base, RegState::Define) + .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); + } else { + int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + .addReg(Base, RegState::Define) + .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + } + } else { + int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; + // t2LDR_PRE, t2LDR_POST + BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + .addReg(Base, RegState::Define) + .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); + } + } else { + MachineOperand &MO = MI->getOperand(0); + // FIXME: post-indexed stores use am2offset_imm, which still encodes + // the vestigal zero-reg offset register. When that's fixed, this clause + // can be removed entirely. + if (isAM2 && NewOpc == ARM::STR_POST_IMM) { + int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + // STR_PRE, STR_POST + BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) + .addReg(MO.getReg(), getKillRegState(MO.isKill())) + .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + } else { + int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; + // t2STR_PRE, t2STR_POST + BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) + .addReg(MO.getReg(), getKillRegState(MO.isKill())) + .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); + } + } + MBB.erase(MBBI); + + return true; +} + +/// isMemoryOp - Returns true if instruction is a memory operation that this +/// pass is capable of operating on. +static bool isMemoryOp(const MachineInstr *MI) { + // When no memory operands are present, conservatively assume unaligned, + // volatile, unfoldable. + if (!MI->hasOneMemOperand()) + return false; + + const MachineMemOperand *MMO = *MI->memoperands_begin(); + + // Don't touch volatile memory accesses - we may be changing their order. + if (MMO->isVolatile()) + return false; + + // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is + // not. + if (MMO->getAlignment() < 4) + return false; + + // str <undef> could probably be eliminated entirely, but for now we just want + // to avoid making a mess of it. + // FIXME: Use str <undef> as a wildcard to enable better stm folding. + if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() && + MI->getOperand(0).isUndef()) + return false; + + // Likewise don't mess with references to undefined addresses. + if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() && + MI->getOperand(1).isUndef()) + return false; + + int Opcode = MI->getOpcode(); + switch (Opcode) { + default: break; + case ARM::VLDRS: + case ARM::VSTRS: + return MI->getOperand(1).isReg(); + case ARM::VLDRD: + case ARM::VSTRD: + return MI->getOperand(1).isReg(); + case ARM::LDRi12: + case ARM::STRi12: + case ARM::t2LDRi8: + case ARM::t2LDRi12: + case ARM::t2STRi8: + case ARM::t2STRi12: + return MI->getOperand(1).isReg(); + } + return false; +} + +/// AdvanceRS - Advance register scavenger to just before the earliest memory +/// op that is being merged. +void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) { + MachineBasicBlock::iterator Loc = MemOps[0].MBBI; + unsigned Position = MemOps[0].Position; + for (unsigned i = 1, e = MemOps.size(); i != e; ++i) { + if (MemOps[i].Position < Position) { + Position = MemOps[i].Position; + Loc = MemOps[i].MBBI; + } + } + + if (Loc != MBB.begin()) + RS->forward(prior(Loc)); +} + +static int getMemoryOpOffset(const MachineInstr *MI) { + int Opcode = MI->getOpcode(); + bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD; + unsigned NumOperands = MI->getDesc().getNumOperands(); + unsigned OffField = MI->getOperand(NumOperands-3).getImm(); + + if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 || + Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 || + Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 || + Opcode == ARM::LDRi12 || Opcode == ARM::STRi12) + return OffField; + + int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField) + : ARM_AM::getAM5Offset(OffField) * 4; + if (isAM3) { + if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub) + Offset = -Offset; + } else { + if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub) + Offset = -Offset; + } + return Offset; +} + +static void InsertLDR_STR(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + int Offset, bool isDef, + DebugLoc dl, unsigned NewOpc, + unsigned Reg, bool RegDeadKill, bool RegUndef, + unsigned BaseReg, bool BaseKill, bool BaseUndef, + bool OffKill, bool OffUndef, + ARMCC::CondCodes Pred, unsigned PredReg, + const TargetInstrInfo *TII, bool isT2) { + if (isDef) { + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), + TII->get(NewOpc)) + .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill)) + .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); + MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + } else { + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), + TII->get(NewOpc)) + .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef)) + .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); + MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + } +} + +bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = &*MBBI; + unsigned Opcode = MI->getOpcode(); + if (Opcode == ARM::LDRD || Opcode == ARM::STRD || + Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) { + unsigned EvenReg = MI->getOperand(0).getReg(); + unsigned OddReg = MI->getOperand(1).getReg(); + unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false); + unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false); + if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum) + return false; + + MachineBasicBlock::iterator NewBBI = MBBI; + bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8; + bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8; + bool EvenDeadKill = isLd ? + MI->getOperand(0).isDead() : MI->getOperand(0).isKill(); + bool EvenUndef = MI->getOperand(0).isUndef(); + bool OddDeadKill = isLd ? + MI->getOperand(1).isDead() : MI->getOperand(1).isKill(); + bool OddUndef = MI->getOperand(1).isUndef(); + const MachineOperand &BaseOp = MI->getOperand(2); + unsigned BaseReg = BaseOp.getReg(); + bool BaseKill = BaseOp.isKill(); + bool BaseUndef = BaseOp.isUndef(); + bool OffKill = isT2 ? false : MI->getOperand(3).isKill(); + bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef(); + int OffImm = getMemoryOpOffset(MI); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); + + if (OddRegNum > EvenRegNum && OffImm == 0) { + // Ascending register numbers and no offset. It's safe to change it to a + // ldm or stm. + unsigned NewOpc = (isLd) + ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA) + : (isT2 ? ARM::t2STMIA : ARM::STMIA); + if (isLd) { + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) + .addReg(BaseReg, getKillRegState(BaseKill)) + .addImm(Pred).addReg(PredReg) + .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill)) + .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill)); + ++NumLDRD2LDM; + } else { + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) + .addReg(BaseReg, getKillRegState(BaseKill)) + .addImm(Pred).addReg(PredReg) + .addReg(EvenReg, + getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef)) + .addReg(OddReg, + getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); + ++NumSTRD2STM; + } + NewBBI = llvm::prior(MBBI); + } else { + // Split into two instructions. + unsigned NewOpc = (isLd) + ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12) + : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12); + DebugLoc dl = MBBI->getDebugLoc(); + // If this is a load and base register is killed, it may have been + // re-defed by the load, make sure the first load does not clobber it. + if (isLd && + (BaseKill || OffKill) && + (TRI->regsOverlap(EvenReg, BaseReg))) { + assert(!TRI->regsOverlap(OddReg, BaseReg)); + InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, + OddReg, OddDeadKill, false, + BaseReg, false, BaseUndef, false, OffUndef, + Pred, PredReg, TII, isT2); + NewBBI = llvm::prior(MBBI); + InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, + EvenReg, EvenDeadKill, false, + BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, + Pred, PredReg, TII, isT2); + } else { + if (OddReg == EvenReg && EvenDeadKill) { + // If the two source operands are the same, the kill marker is + // probably on the first one. e.g. + // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0 + EvenDeadKill = false; + OddDeadKill = true; + } + InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, + EvenReg, EvenDeadKill, EvenUndef, + BaseReg, false, BaseUndef, false, OffUndef, + Pred, PredReg, TII, isT2); + NewBBI = llvm::prior(MBBI); + InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, + OddReg, OddDeadKill, OddUndef, + BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, + Pred, PredReg, TII, isT2); + } + if (isLd) + ++NumLDRD2LDR; + else + ++NumSTRD2STR; + } + + MBB.erase(MI); + MBBI = NewBBI; + return true; + } + return false; +} + +/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR +/// ops of the same base and incrementing offset into LDM / STM ops. +bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { + unsigned NumMerges = 0; + unsigned NumMemOps = 0; + MemOpQueue MemOps; + unsigned CurrBase = 0; + int CurrOpc = -1; + unsigned CurrSize = 0; + ARMCC::CondCodes CurrPred = ARMCC::AL; + unsigned CurrPredReg = 0; + unsigned Position = 0; + SmallVector<MachineBasicBlock::iterator,4> Merges; + + RS->enterBasicBlock(&MBB); + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + if (FixInvalidRegPairOp(MBB, MBBI)) + continue; + + bool Advance = false; + bool TryMerge = false; + bool Clobber = false; + + bool isMemOp = isMemoryOp(MBBI); + if (isMemOp) { + int Opcode = MBBI->getOpcode(); + unsigned Size = getLSMultipleTransferSize(MBBI); + const MachineOperand &MO = MBBI->getOperand(0); + unsigned Reg = MO.getReg(); + bool isKill = MO.isDef() ? false : MO.isKill(); + unsigned Base = MBBI->getOperand(1).getReg(); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg); + int Offset = getMemoryOpOffset(MBBI); + // Watch out for: + // r4 := ldr [r5] + // r5 := ldr [r5, #4] + // r6 := ldr [r5, #8] + // + // The second ldr has effectively broken the chain even though it + // looks like the later ldr(s) use the same base register. Try to + // merge the ldr's so far, including this one. But don't try to + // combine the following ldr(s). + Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg()); + if (CurrBase == 0 && !Clobber) { + // Start of a new chain. + CurrBase = Base; + CurrOpc = Opcode; + CurrSize = Size; + CurrPred = Pred; + CurrPredReg = PredReg; + MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI)); + ++NumMemOps; + Advance = true; + } else { + if (Clobber) { + TryMerge = true; + Advance = true; + } + + if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) { + // No need to match PredReg. + // Continue adding to the queue. + if (Offset > MemOps.back().Offset) { + MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, + Position, MBBI)); + ++NumMemOps; + Advance = true; + } else { + for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); + I != E; ++I) { + if (Offset < I->Offset) { + MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill, + Position, MBBI)); + ++NumMemOps; + Advance = true; + break; + } else if (Offset == I->Offset) { + // Collision! This can't be merged! + break; + } + } + } + } + } + } + + if (MBBI->isDebugValue()) { + ++MBBI; + if (MBBI == E) + // Reach the end of the block, try merging the memory instructions. + TryMerge = true; + } else if (Advance) { + ++Position; + ++MBBI; + if (MBBI == E) + // Reach the end of the block, try merging the memory instructions. + TryMerge = true; + } else + TryMerge = true; + + if (TryMerge) { + if (NumMemOps > 1) { + // Try to find a free register to use as a new base in case it's needed. + // First advance to the instruction just before the start of the chain. + AdvanceRS(MBB, MemOps); + // Find a scratch register. + unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass); + // Process the load / store instructions. + RS->forward(prior(MBBI)); + + // Merge ops. + Merges.clear(); + MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize, + CurrPred, CurrPredReg, Scratch, MemOps, Merges); + + // Try folding preceding/trailing base inc/dec into the generated + // LDM/STM ops. + for (unsigned i = 0, e = Merges.size(); i < e; ++i) + if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI)) + ++NumMerges; + NumMerges += Merges.size(); + + // Try folding preceding/trailing base inc/dec into those load/store + // that were not merged to form LDM/STM ops. + for (unsigned i = 0; i != NumMemOps; ++i) + if (!MemOps[i].Merged) + if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI)) + ++NumMerges; + + // RS may be pointing to an instruction that's deleted. + RS->skipTo(prior(MBBI)); + } else if (NumMemOps == 1) { + // Try folding preceding/trailing base inc/dec into the single + // load/store. + if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) { + ++NumMerges; + RS->forward(prior(MBBI)); + } + } + + CurrBase = 0; + CurrOpc = -1; + CurrSize = 0; + CurrPred = ARMCC::AL; + CurrPredReg = 0; + if (NumMemOps) { + MemOps.clear(); + NumMemOps = 0; + } + + // If iterator hasn't been advanced and this is not a memory op, skip it. + // It can't start a new chain anyway. + if (!Advance && !isMemOp && MBBI != E) { + ++Position; + ++MBBI; + } + } + } + return NumMerges > 0; +} + +/// MergeReturnIntoLDM - If this is a exit BB, try merging the return ops +/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it +/// directly restore the value of LR into pc. +/// ldmfd sp!, {..., lr} +/// bx lr +/// or +/// ldmfd sp!, {..., lr} +/// mov pc, lr +/// => +/// ldmfd sp!, {..., pc} +bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { + if (MBB.empty()) return false; + + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + if (MBBI != MBB.begin() && + (MBBI->getOpcode() == ARM::BX_RET || + MBBI->getOpcode() == ARM::tBX_RET || + MBBI->getOpcode() == ARM::MOVPCLR)) { + MachineInstr *PrevMI = prior(MBBI); + unsigned Opcode = PrevMI->getOpcode(); + if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD || + Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD || + Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) { + MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1); + if (MO.getReg() != ARM::LR) + return false; + unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET); + assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) || + Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!"); + PrevMI->setDesc(TII->get(NewOpc)); + MO.setReg(ARM::PC); + PrevMI->copyImplicitOps(&*MBBI); + MBB.erase(MBBI); + return true; + } + } + return false; +} + +bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + const TargetMachine &TM = Fn.getTarget(); + AFI = Fn.getInfo<ARMFunctionInfo>(); + TII = TM.getInstrInfo(); + TRI = TM.getRegisterInfo(); + RS = new RegScavenger(); + isThumb2 = AFI->isThumb2Function(); + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + Modified |= LoadStoreMultipleOpti(MBB); + if (TM.getSubtarget<ARMSubtarget>().hasV5TOps()) + Modified |= MergeReturnIntoLDM(MBB); + } + + delete RS; + return Modified; +} + + +/// ARMPreAllocLoadStoreOpt - Pre- register allocation pass that move +/// load / stores from consecutive locations close to make it more +/// likely they will be combined later. + +namespace { + struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{ + static char ID; + ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {} + + const TargetData *TD; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + const ARMSubtarget *STI; + MachineRegisterInfo *MRI; + MachineFunction *MF; + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "ARM pre- register allocation load / store optimization pass"; + } + + private: + bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, + unsigned &NewOpc, unsigned &EvenReg, + unsigned &OddReg, unsigned &BaseReg, + int &Offset, + unsigned &PredReg, ARMCC::CondCodes &Pred, + bool &isT2); + bool RescheduleOps(MachineBasicBlock *MBB, + SmallVector<MachineInstr*, 4> &Ops, + unsigned Base, bool isLd, + DenseMap<MachineInstr*, unsigned> &MI2LocMap); + bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB); + }; + char ARMPreAllocLoadStoreOpt::ID = 0; +} + +bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + TD = Fn.getTarget().getTargetData(); + TII = Fn.getTarget().getInstrInfo(); + TRI = Fn.getTarget().getRegisterInfo(); + STI = &Fn.getTarget().getSubtarget<ARMSubtarget>(); + MRI = &Fn.getRegInfo(); + MF = &Fn; + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) + Modified |= RescheduleLoadStoreInstrs(MFI); + + return Modified; +} + +static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator E, + SmallPtrSet<MachineInstr*, 4> &MemOps, + SmallSet<unsigned, 4> &MemRegs, + const TargetRegisterInfo *TRI) { + // Are there stores / loads / calls between them? + // FIXME: This is overly conservative. We should make use of alias information + // some day. + SmallSet<unsigned, 4> AddedRegPressure; + while (++I != E) { + if (I->isDebugValue() || MemOps.count(&*I)) + continue; + const MCInstrDesc &MCID = I->getDesc(); + if (MCID.isCall() || MCID.isTerminator() || I->hasUnmodeledSideEffects()) + return false; + if (isLd && MCID.mayStore()) + return false; + if (!isLd) { + if (MCID.mayLoad()) + return false; + // It's not safe to move the first 'str' down. + // str r1, [r0] + // strh r5, [r0] + // str r4, [r0, #+4] + if (MCID.mayStore()) + return false; + } + for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) { + MachineOperand &MO = I->getOperand(j); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (MO.isDef() && TRI->regsOverlap(Reg, Base)) + return false; + if (Reg != Base && !MemRegs.count(Reg)) + AddedRegPressure.insert(Reg); + } + } + + // Estimate register pressure increase due to the transformation. + if (MemRegs.size() <= 4) + // Ok if we are moving small number of instructions. + return true; + return AddedRegPressure.size() <= MemRegs.size() * 2; +} + +bool +ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, + DebugLoc &dl, + unsigned &NewOpc, unsigned &EvenReg, + unsigned &OddReg, unsigned &BaseReg, + int &Offset, unsigned &PredReg, + ARMCC::CondCodes &Pred, + bool &isT2) { + // Make sure we're allowed to generate LDRD/STRD. + if (!STI->hasV5TEOps()) + return false; + + // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD + unsigned Scale = 1; + unsigned Opcode = Op0->getOpcode(); + if (Opcode == ARM::LDRi12) + NewOpc = ARM::LDRD; + else if (Opcode == ARM::STRi12) + NewOpc = ARM::STRD; + else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) { + NewOpc = ARM::t2LDRDi8; + Scale = 4; + isT2 = true; + } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) { + NewOpc = ARM::t2STRDi8; + Scale = 4; + isT2 = true; + } else + return false; + + // Make sure the base address satisfies i64 ld / st alignment requirement. + if (!Op0->hasOneMemOperand() || + !(*Op0->memoperands_begin())->getValue() || + (*Op0->memoperands_begin())->isVolatile()) + return false; + + unsigned Align = (*Op0->memoperands_begin())->getAlignment(); + const Function *Func = MF->getFunction(); + unsigned ReqAlign = STI->hasV6Ops() + ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext())) + : 8; // Pre-v6 need 8-byte align + if (Align < ReqAlign) + return false; + + // Then make sure the immediate offset fits. + int OffImm = getMemoryOpOffset(Op0); + if (isT2) { + int Limit = (1 << 8) * Scale; + if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1))) + return false; + Offset = OffImm; + } else { + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (OffImm < 0) { + AddSub = ARM_AM::sub; + OffImm = - OffImm; + } + int Limit = (1 << 8) * Scale; + if (OffImm >= Limit || (OffImm & (Scale-1))) + return false; + Offset = ARM_AM::getAM3Opc(AddSub, OffImm); + } + EvenReg = Op0->getOperand(0).getReg(); + OddReg = Op1->getOperand(0).getReg(); + if (EvenReg == OddReg) + return false; + BaseReg = Op0->getOperand(1).getReg(); + Pred = llvm::getInstrPredicate(Op0, PredReg); + dl = Op0->getDebugLoc(); + return true; +} + +namespace { + struct OffsetCompare { + bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const { + int LOffset = getMemoryOpOffset(LHS); + int ROffset = getMemoryOpOffset(RHS); + assert(LHS == RHS || LOffset != ROffset); + return LOffset > ROffset; + } + }; +} + +bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, + SmallVector<MachineInstr*, 4> &Ops, + unsigned Base, bool isLd, + DenseMap<MachineInstr*, unsigned> &MI2LocMap) { + bool RetVal = false; + + // Sort by offset (in reverse order). + std::sort(Ops.begin(), Ops.end(), OffsetCompare()); + + // The loads / stores of the same base are in order. Scan them from first to + // last and check for the following: + // 1. Any def of base. + // 2. Any gaps. + while (Ops.size() > 1) { + unsigned FirstLoc = ~0U; + unsigned LastLoc = 0; + MachineInstr *FirstOp = 0; + MachineInstr *LastOp = 0; + int LastOffset = 0; + unsigned LastOpcode = 0; + unsigned LastBytes = 0; + unsigned NumMove = 0; + for (int i = Ops.size() - 1; i >= 0; --i) { + MachineInstr *Op = Ops[i]; + unsigned Loc = MI2LocMap[Op]; + if (Loc <= FirstLoc) { + FirstLoc = Loc; + FirstOp = Op; + } + if (Loc >= LastLoc) { + LastLoc = Loc; + LastOp = Op; + } + + unsigned Opcode = Op->getOpcode(); + if (LastOpcode && Opcode != LastOpcode) + break; + + int Offset = getMemoryOpOffset(Op); + unsigned Bytes = getLSMultipleTransferSize(Op); + if (LastBytes) { + if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes)) + break; + } + LastOffset = Offset; + LastBytes = Bytes; + LastOpcode = Opcode; + if (++NumMove == 8) // FIXME: Tune this limit. + break; + } + + if (NumMove <= 1) + Ops.pop_back(); + else { + SmallPtrSet<MachineInstr*, 4> MemOps; + SmallSet<unsigned, 4> MemRegs; + for (int i = NumMove-1; i >= 0; --i) { + MemOps.insert(Ops[i]); + MemRegs.insert(Ops[i]->getOperand(0).getReg()); + } + + // Be conservative, if the instructions are too far apart, don't + // move them. We want to limit the increase of register pressure. + bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this. + if (DoMove) + DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp, + MemOps, MemRegs, TRI); + if (!DoMove) { + for (unsigned i = 0; i != NumMove; ++i) + Ops.pop_back(); + } else { + // This is the new location for the loads / stores. + MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp; + while (InsertPos != MBB->end() + && (MemOps.count(InsertPos) || InsertPos->isDebugValue())) + ++InsertPos; + + // If we are moving a pair of loads / stores, see if it makes sense + // to try to allocate a pair of registers that can form register pairs. + MachineInstr *Op0 = Ops.back(); + MachineInstr *Op1 = Ops[Ops.size()-2]; + unsigned EvenReg = 0, OddReg = 0; + unsigned BaseReg = 0, PredReg = 0; + ARMCC::CondCodes Pred = ARMCC::AL; + bool isT2 = false; + unsigned NewOpc = 0; + int Offset = 0; + DebugLoc dl; + if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc, + EvenReg, OddReg, BaseReg, + Offset, PredReg, Pred, isT2)) { + Ops.pop_back(); + Ops.pop_back(); + + const MCInstrDesc &MCID = TII->get(NewOpc); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); + MRI->constrainRegClass(EvenReg, TRC); + MRI->constrainRegClass(OddReg, TRC); + + // Form the pair instruction. + if (isLd) { + MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID) + .addReg(EvenReg, RegState::Define) + .addReg(OddReg, RegState::Define) + .addReg(BaseReg); + // FIXME: We're converting from LDRi12 to an insn that still + // uses addrmode2, so we need an explicit offset reg. It should + // always by reg0 since we're transforming LDRi12s. + if (!isT2) + MIB.addReg(0); + MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + ++NumLDRDFormed; + } else { + MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID) + .addReg(EvenReg) + .addReg(OddReg) + .addReg(BaseReg); + // FIXME: We're converting from LDRi12 to an insn that still + // uses addrmode2, so we need an explicit offset reg. It should + // always by reg0 since we're transforming STRi12s. + if (!isT2) + MIB.addReg(0); + MIB.addImm(Offset).addImm(Pred).addReg(PredReg); + ++NumSTRDFormed; + } + MBB->erase(Op0); + MBB->erase(Op1); + + // Add register allocation hints to form register pairs. + MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg); + MRI->setRegAllocationHint(OddReg, ARMRI::RegPairOdd, EvenReg); + } else { + for (unsigned i = 0; i != NumMove; ++i) { + MachineInstr *Op = Ops.back(); + Ops.pop_back(); + MBB->splice(InsertPos, MBB, Op); + } + } + + NumLdStMoved += NumMove; + RetVal = true; + } + } + } + + return RetVal; +} + +bool +ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { + bool RetVal = false; + + DenseMap<MachineInstr*, unsigned> MI2LocMap; + DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap; + DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap; + SmallVector<unsigned, 4> LdBases; + SmallVector<unsigned, 4> StBases; + + unsigned Loc = 0; + MachineBasicBlock::iterator MBBI = MBB->begin(); + MachineBasicBlock::iterator E = MBB->end(); + while (MBBI != E) { + for (; MBBI != E; ++MBBI) { + MachineInstr *MI = MBBI; + const MCInstrDesc &MCID = MI->getDesc(); + if (MCID.isCall() || MCID.isTerminator()) { + // Stop at barriers. + ++MBBI; + break; + } + + if (!MI->isDebugValue()) + MI2LocMap[MI] = ++Loc; + + if (!isMemoryOp(MI)) + continue; + unsigned PredReg = 0; + if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL) + continue; + + int Opc = MI->getOpcode(); + bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD; + unsigned Base = MI->getOperand(1).getReg(); + int Offset = getMemoryOpOffset(MI); + + bool StopHere = false; + if (isLd) { + DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI = + Base2LdsMap.find(Base); + if (BI != Base2LdsMap.end()) { + for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { + if (Offset == getMemoryOpOffset(BI->second[i])) { + StopHere = true; + break; + } + } + if (!StopHere) + BI->second.push_back(MI); + } else { + SmallVector<MachineInstr*, 4> MIs; + MIs.push_back(MI); + Base2LdsMap[Base] = MIs; + LdBases.push_back(Base); + } + } else { + DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI = + Base2StsMap.find(Base); + if (BI != Base2StsMap.end()) { + for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { + if (Offset == getMemoryOpOffset(BI->second[i])) { + StopHere = true; + break; + } + } + if (!StopHere) + BI->second.push_back(MI); + } else { + SmallVector<MachineInstr*, 4> MIs; + MIs.push_back(MI); + Base2StsMap[Base] = MIs; + StBases.push_back(Base); + } + } + + if (StopHere) { + // Found a duplicate (a base+offset combination that's seen earlier). + // Backtrack. + --Loc; + break; + } + } + + // Re-schedule loads. + for (unsigned i = 0, e = LdBases.size(); i != e; ++i) { + unsigned Base = LdBases[i]; + SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base]; + if (Lds.size() > 1) + RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap); + } + + // Re-schedule stores. + for (unsigned i = 0, e = StBases.size(); i != e; ++i) { + unsigned Base = StBases[i]; + SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base]; + if (Sts.size() > 1) + RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap); + } + + if (MBBI != E) { + Base2LdsMap.clear(); + Base2StsMap.clear(); + LdBases.clear(); + StBases.clear(); + } + } + + return RetVal; +} + + +/// createARMLoadStoreOptimizationPass - returns an instance of the load / store +/// optimization pass. +FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) { + if (PreAlloc) + return new ARMPreAllocLoadStoreOpt(); + return new ARMLoadStoreOpt(); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp new file mode 100644 index 0000000..daa126d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp @@ -0,0 +1,125 @@ +//===-- ARMMCInstLower.cpp - Convert ARM MachineInstr to an MCInst --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower ARM MachineInstrs to their corresponding +// MCInst records. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMAsmPrinter.h" +#include "MCTargetDesc/ARMMCExpr.h" +#include "llvm/Constants.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Target/Mangler.h" +using namespace llvm; + + +MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO, + const MCSymbol *Symbol) { + const MCExpr *Expr; + switch (MO.getTargetFlags()) { + default: { + Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, + OutContext); + switch (MO.getTargetFlags()) { + default: + assert(0 && "Unknown target flag on symbol operand"); + case 0: + break; + case ARMII::MO_LO16: + Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, + OutContext); + Expr = ARMMCExpr::CreateLower16(Expr, OutContext); + break; + case ARMII::MO_HI16: + Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, + OutContext); + Expr = ARMMCExpr::CreateUpper16(Expr, OutContext); + break; + } + break; + } + + case ARMII::MO_PLT: + Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_ARM_PLT, + OutContext); + break; + } + + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::CreateAdd(Expr, + MCConstantExpr::Create(MO.getOffset(), + OutContext), + OutContext); + return MCOperand::CreateExpr(Expr); + +} + +bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO, + MCOperand &MCOp) { + switch (MO.getType()) { + default: + assert(0 && "unknown operand type"); + return false; + case MachineOperand::MO_Register: + // Ignore all non-CPSR implicit register operands. + if (MO.isImplicit() && MO.getReg() != ARM::CPSR) + return false; + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + MCOp = MCOperand::CreateReg(MO.getReg()); + break; + case MachineOperand::MO_Immediate: + MCOp = MCOperand::CreateImm(MO.getImm()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( + MO.getMBB()->getSymbol(), OutContext)); + break; + case MachineOperand::MO_GlobalAddress: + MCOp = GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal())); + break; + case MachineOperand::MO_ExternalSymbol: + MCOp = GetSymbolRef(MO, + GetExternalSymbolSymbol(MO.getSymbolName())); + break; + case MachineOperand::MO_JumpTableIndex: + MCOp = GetSymbolRef(MO, GetJTISymbol(MO.getIndex())); + break; + case MachineOperand::MO_ConstantPoolIndex: + MCOp = GetSymbolRef(MO, GetCPISymbol(MO.getIndex())); + break; + case MachineOperand::MO_BlockAddress: + MCOp = GetSymbolRef(MO, GetBlockAddressSymbol(MO.getBlockAddress())); + break; + case MachineOperand::MO_FPImmediate: { + APFloat Val = MO.getFPImm()->getValueAPF(); + bool ignored; + Val.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); + MCOp = MCOperand::CreateFPImm(Val.convertToDouble()); + break; + } + } + return true; +} + +void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, + ARMAsmPrinter &AP) { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + if (AP.lowerOperand(MO, MCOp)) + OutMI.addOperand(MCOp); + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h new file mode 100644 index 0000000..138f0c2 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -0,0 +1,250 @@ +//====- ARMMachineFuctionInfo.h - ARM machine function info -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares ARM-specific per-machine-function information. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMMACHINEFUNCTIONINFO_H +#define ARMMACHINEFUNCTIONINFO_H + +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/ADT/BitVector.h" + +namespace llvm { + +/// ARMFunctionInfo - This class is derived from MachineFunctionInfo and +/// contains private ARM-specific information for each MachineFunction. +class ARMFunctionInfo : public MachineFunctionInfo { + + /// isThumb - True if this function is compiled under Thumb mode. + /// Used to initialized Align, so must precede it. + bool isThumb; + + /// hasThumb2 - True if the target architecture supports Thumb2. Do not use + /// to determine if function is compiled under Thumb mode, for that use + /// 'isThumb'. + bool hasThumb2; + + /// VarArgsRegSaveSize - Size of the register save area for vararg functions. + /// + unsigned VarArgsRegSaveSize; + + /// HasStackFrame - True if this function has a stack frame. Set by + /// processFunctionBeforeCalleeSavedScan(). + bool HasStackFrame; + + /// RestoreSPFromFP - True if epilogue should restore SP from FP. Set by + /// emitPrologue. + bool RestoreSPFromFP; + + /// LRSpilledForFarJump - True if the LR register has been for spilled to + /// enable far jump. + bool LRSpilledForFarJump; + + /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer + /// spill stack offset. + unsigned FramePtrSpillOffset; + + /// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved + /// register spills areas. For Mac OS X: + /// + /// GPR callee-saved (1) : r4, r5, r6, r7, lr + /// -------------------------------------------- + /// GPR callee-saved (2) : r8, r10, r11 + /// -------------------------------------------- + /// DPR callee-saved : d8 - d15 + unsigned GPRCS1Offset; + unsigned GPRCS2Offset; + unsigned DPRCSOffset; + + /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills + /// areas. + unsigned GPRCS1Size; + unsigned GPRCS2Size; + unsigned DPRCSSize; + + /// GPRCS1Frames, GPRCS2Frames, DPRCSFrames - Keeps track of frame indices + /// which belong to these spill areas. + BitVector GPRCS1Frames; + BitVector GPRCS2Frames; + BitVector DPRCSFrames; + + /// JumpTableUId - Unique id for jumptables. + /// + unsigned JumpTableUId; + + unsigned PICLabelUId; + + /// VarArgsFrameIndex - FrameIndex for start of varargs area. + int VarArgsFrameIndex; + + /// HasITBlocks - True if IT blocks have been inserted. + bool HasITBlocks; + + /// CPEClones - Track constant pool entries clones created by Constant Island + /// pass. + DenseMap<unsigned, unsigned> CPEClones; + +public: + ARMFunctionInfo() : + isThumb(false), + hasThumb2(false), + VarArgsRegSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false), + LRSpilledForFarJump(false), + FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), + GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), + GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0), + JumpTableUId(0), PICLabelUId(0), + VarArgsFrameIndex(0), HasITBlocks(false) {} + + explicit ARMFunctionInfo(MachineFunction &MF) : + isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()), + hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()), + VarArgsRegSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false), + LRSpilledForFarJump(false), + FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), + GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), + GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32), + JumpTableUId(0), PICLabelUId(0), + VarArgsFrameIndex(0), HasITBlocks(false) {} + + bool isThumbFunction() const { return isThumb; } + bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; } + bool isThumb2Function() const { return isThumb && hasThumb2; } + + unsigned getVarArgsRegSaveSize() const { return VarArgsRegSaveSize; } + void setVarArgsRegSaveSize(unsigned s) { VarArgsRegSaveSize = s; } + + bool hasStackFrame() const { return HasStackFrame; } + void setHasStackFrame(bool s) { HasStackFrame = s; } + + bool shouldRestoreSPFromFP() const { return RestoreSPFromFP; } + void setShouldRestoreSPFromFP(bool s) { RestoreSPFromFP = s; } + + bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; } + void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; } + + unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; } + void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; } + + unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; } + unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; } + unsigned getDPRCalleeSavedAreaOffset() const { return DPRCSOffset; } + + void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; } + void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; } + void setDPRCalleeSavedAreaOffset(unsigned o) { DPRCSOffset = o; } + + unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; } + unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; } + unsigned getDPRCalleeSavedAreaSize() const { return DPRCSSize; } + + void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; } + void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; } + void setDPRCalleeSavedAreaSize(unsigned s) { DPRCSSize = s; } + + bool isGPRCalleeSavedArea1Frame(int fi) const { + if (fi < 0 || fi >= (int)GPRCS1Frames.size()) + return false; + return GPRCS1Frames[fi]; + } + bool isGPRCalleeSavedArea2Frame(int fi) const { + if (fi < 0 || fi >= (int)GPRCS2Frames.size()) + return false; + return GPRCS2Frames[fi]; + } + bool isDPRCalleeSavedAreaFrame(int fi) const { + if (fi < 0 || fi >= (int)DPRCSFrames.size()) + return false; + return DPRCSFrames[fi]; + } + + void addGPRCalleeSavedArea1Frame(int fi) { + if (fi >= 0) { + int Size = GPRCS1Frames.size(); + if (fi >= Size) { + Size *= 2; + if (fi >= Size) + Size = fi+1; + GPRCS1Frames.resize(Size); + } + GPRCS1Frames[fi] = true; + } + } + void addGPRCalleeSavedArea2Frame(int fi) { + if (fi >= 0) { + int Size = GPRCS2Frames.size(); + if (fi >= Size) { + Size *= 2; + if (fi >= Size) + Size = fi+1; + GPRCS2Frames.resize(Size); + } + GPRCS2Frames[fi] = true; + } + } + void addDPRCalleeSavedAreaFrame(int fi) { + if (fi >= 0) { + int Size = DPRCSFrames.size(); + if (fi >= Size) { + Size *= 2; + if (fi >= Size) + Size = fi+1; + DPRCSFrames.resize(Size); + } + DPRCSFrames[fi] = true; + } + } + + unsigned createJumpTableUId() { + return JumpTableUId++; + } + + unsigned getNumJumpTables() const { + return JumpTableUId; + } + + void initPICLabelUId(unsigned UId) { + PICLabelUId = UId; + } + + unsigned getNumPICLabels() const { + return PICLabelUId; + } + + unsigned createPICLabelUId() { + return PICLabelUId++; + } + + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } + void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } + + bool hasITBlocks() const { return HasITBlocks; } + void setHasITBlocks(bool h) { HasITBlocks = h; } + + void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) { + if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second) + assert(0 && "Duplicate entries!"); + } + + unsigned getOriginalCPIdx(unsigned CloneIdx) const { + DenseMap<unsigned, unsigned>::const_iterator I = CPEClones.find(CloneIdx); + if (I != CPEClones.end()) + return I->second; + else + return -1U; + } +}; +} // End llvm namespace + +#endif // ARMMACHINEFUNCTIONINFO_H diff --git a/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h new file mode 100644 index 0000000..18e1620 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h @@ -0,0 +1,6586 @@ +//===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file, which was autogenerated by llvm-PerfectShuffle, contains data +// for the optimal way to build a perfect shuffle using neon instructions. +// +//===----------------------------------------------------------------------===// + +// 31 entries have cost 0 +// 242 entries have cost 1 +// 1447 entries have cost 2 +// 3602 entries have cost 3 +// 1237 entries have cost 4 +// 2 entries have cost 5 + +// This table is 6561*4 = 26244 bytes in size. +static const unsigned PerfectShuffleTable[6561+1] = { + 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS + 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS + 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> + 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS + 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> + 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> + 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS + 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> + 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS + 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> + 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> + 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> + 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> + 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> + 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS + 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> + 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> + 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS + 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> + 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> + 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> + 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> + 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS + 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> + 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> + 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> + 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> + 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> + 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> + 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> + 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> + 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> + 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> + 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> + 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS + 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> + 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> + 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> + 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> + 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> + 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> + 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> + 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> + 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> + 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> + 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS + 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> + 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> + 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS + 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> + 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> + 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> + 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> + 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> + 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> + 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> + 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> + 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> + 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> + 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> + 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS + 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS + 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS + 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> + 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS + 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> + 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS + 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> + 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> + 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> + 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> + 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> + 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> + 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> + 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS + 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> + 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> + 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> + 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> + 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS + 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> + 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> + 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> + 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS + 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS + 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> + 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS + 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> + 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> + 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> + 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> + 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> + 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> + 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS + 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> + 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> + 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> + 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> + 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS + 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> + 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> + 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> + 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS + 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> + 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS + 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> + 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> + 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> + 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> + 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> + 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> + 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> + 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> + 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> + 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS + 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> + 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> + 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> + 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS + 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> + 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> + 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> + 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> + 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> + 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> + 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> + 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> + 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> + 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> + 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> + 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> + 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> + 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS + 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS + 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> + 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS + 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> + 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> + 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> + 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> + 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> + 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS + 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> + 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> + 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> + 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS + 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> + 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> + 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> + 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> + 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> + 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> + 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> + 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS + 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> + 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> + 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> + 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS + 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> + 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> + 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> + 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> + 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> + 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS + 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> + 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> + 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> + 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS + 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS + 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS + 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> + 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> + 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> + 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> + 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> + 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> + 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> + 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS + 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> + 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> + 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> + 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> + 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> + 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> + 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> + 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> + 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> + 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> + 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> + 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> + 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> + 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> + 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> + 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> + 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> + 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS + 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS + 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS + 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS + 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> + 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> + 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> + 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> + 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS + 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> + 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> + 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> + 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS + 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> + 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> + 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> + 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> + 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> + 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> + 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> + 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> + 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> + 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS + 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> + 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> + 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS + 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3> + 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2> + 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS + 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> + 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> + 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> + 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3> + 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> + 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> + 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> + 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> + 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> + 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> + 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> + 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> + 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> + 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> + 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> + 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS + 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> + 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> + 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS + 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> + 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> + 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> + 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> + 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> + 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> + 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> + 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> + 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> + 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> + 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> + 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> + 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> + 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> + 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> + 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> + 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> + 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> + 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> + 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> + 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> + 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> + 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> + 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> + 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> + 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> + 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS + 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> + 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> + 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS + 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3> + 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> + 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS + 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> + 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> + 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> + 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> + 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> + 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0> + 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS + 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS + 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> + 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> + 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> + 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS + 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS + 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS + 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS + 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS + 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> + 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> + 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> + 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS + 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS + 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS + 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> + 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> + 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> + 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> + 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> + 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> + 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS + 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> + 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> + 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> + 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> + 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> + 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> + 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> + 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS + 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS + 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> + 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS + 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS + 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> + 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> + 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> + 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS + 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> + 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> + 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> + 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> + 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> + 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> + 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> + 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> + 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> + 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> + 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> + 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS + 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> + 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> + 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4> + 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS + 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> + 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> + 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS + 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS + 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> + 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS + 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS + 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS + 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> + 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS + 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> + 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS + 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> + 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> + 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS + 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> + 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> + 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS + 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS + 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS + 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> + 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> + 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> + 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> + 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> + 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> + 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> + 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS + 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS + 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> + 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> + 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> + 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS + 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> + 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> + 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS + 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS + 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> + 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> + 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> + 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> + 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> + 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> + 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> + 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> + 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> + 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> + 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> + 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> + 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> + 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> + 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS + 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> + 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> + 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS + 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> + 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> + 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> + 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> + 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> + 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> + 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> + 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> + 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> + 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS + 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> + 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> + 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> + 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS + 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> + 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> + 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> + 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> + 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS + 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> + 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> + 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> + 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS + 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> + 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> + 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> + 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS + 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS + 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS + 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> + 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> + 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> + 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS + 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> + 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> + 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> + 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS + 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> + 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> + 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> + 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> + 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> + 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS + 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS + 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS + 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> + 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> + 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> + 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS + 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> + 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> + 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS + 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS + 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> + 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> + 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> + 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2> + 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> + 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> + 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS + 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS + 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> + 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> + 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> + 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> + 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> + 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> + 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> + 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> + 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> + 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS + 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> + 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> + 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> + 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS + 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> + 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS + 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS + 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> + 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> + 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> + 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> + 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> + 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> + 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS + 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> + 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> + 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> + 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> + 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> + 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> + 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> + 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> + 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> + 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> + 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> + 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> + 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> + 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> + 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> + 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> + 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> + 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> + 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS + 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> + 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> + 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> + 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> + 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS + 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS + 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> + 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS + 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> + 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> + 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> + 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> + 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> + 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> + 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS + 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> + 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> + 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> + 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> + 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> + 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> + 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> + 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> + 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> + 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS + 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> + 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> + 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> + 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS + 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> + 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> + 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> + 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> + 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> + 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> + 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> + 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> + 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> + 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> + 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> + 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> + 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS + 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> + 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> + 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> + 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> + 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS + 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> + 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> + 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS + 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> + 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> + 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> + 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> + 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> + 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> + 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> + 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> + 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> + 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> + 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> + 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> + 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> + 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS + 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> + 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> + 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> + 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> + 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> + 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> + 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> + 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> + 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS + 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> + 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> + 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> + 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> + 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> + 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> + 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> + 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> + 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> + 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS + 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7> + 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> + 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> + 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS + 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS + 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2> + 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS + 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6> + 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> + 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS + 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> + 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS + 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS + 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS + 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7> + 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS + 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS + 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> + 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS + 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6> + 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS + 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> + 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> + 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> + 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> + 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS + 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> + 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS + 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS + 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6> + 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS + 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS + 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> + 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7> + 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7> + 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS + 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> + 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS + 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS + 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6> + 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> + 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7> + 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS + 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> + 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> + 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> + 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> + 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS + 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> + 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> + 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> + 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS + 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6> + 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> + 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> + 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS + 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS + 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS + 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS + 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> + 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> + 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> + 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> + 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> + 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> + 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> + 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> + 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> + 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS + 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> + 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> + 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS + 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> + 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> + 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> + 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS + 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> + 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> + 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0> + 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> + 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> + 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> + 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> + 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> + 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> + 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> + 67944550U, // <1,0,3,2>: Cost 1 vrev LHS + 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> + 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS + 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> + 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> + 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> + 68386972U, // <1,0,3,u>: Cost 1 vrev LHS + 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> + 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> + 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> + 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> + 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> + 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS + 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> + 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> + 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS + 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> + 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS + 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> + 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> + 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> + 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> + 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS + 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS + 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> + 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> + 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> + 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> + 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> + 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> + 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> + 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> + 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> + 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> + 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> + 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> + 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> + 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> + 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> + 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> + 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> + 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> + 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> + 67985515U, // <1,0,u,2>: Cost 1 vrev LHS + 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> + 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> + 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS + 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0> + 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u> + 68427937U, // <1,0,u,u>: Cost 1 vrev LHS + 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> + 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS + 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> + 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> + 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> + 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> + 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> + 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> + 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> + 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS + 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> + 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> + 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> + 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> + 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS + 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> + 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> + 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> + 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> + 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS + 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> + 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> + 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> + 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> + 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> + 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> + 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> + 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS + 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> + 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> + 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> + 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> + 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS + 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS + 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> + 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> + 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> + 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS + 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS + 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> + 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS + 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> + 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> + 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> + 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> + 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> + 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> + 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> + 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> + 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> + 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> + 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> + 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7> + 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> + 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> + 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> + 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> + 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> + 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> + 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> + 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS + 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> + 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> + 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> + 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> + 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS + 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3> + 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS + 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7> + 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS + 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> + 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS + 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> + 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> + 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> + 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> + 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> + 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> + 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS + 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> + 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> + 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> + 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS + 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS + 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> + 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> + 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> + 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS + 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> + 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> + 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> + 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> + 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> + 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> + 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> + 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> + 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> + 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS + 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> + 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS + 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS + 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2> + 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> + 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> + 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2> + 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> + 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> + 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> + 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS + 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS + 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> + 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> + 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS + 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> + 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> + 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> + 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS + 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> + 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> + 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> + 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> + 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> + 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> + 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> + 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> + 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> + 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> + 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> + 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> + 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> + 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> + 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> + 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> + 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> + 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> + 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS + 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> + 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS + 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS + 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS + 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> + 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS + 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> + 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> + 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> + 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> + 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> + 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> + 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS + 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> + 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> + 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> + 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS + 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS + 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7> + 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> + 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> + 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS + 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> + 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> + 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> + 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS + 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> + 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> + 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> + 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS + 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3> + 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7> + 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS + 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS + 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> + 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> + 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> + 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS + 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> + 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> + 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS + 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS + 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> + 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> + 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> + 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS + 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> + 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> + 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS + 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS + 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> + 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> + 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> + 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> + 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> + 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5> + 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> + 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> + 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> + 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> + 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> + 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> + 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> + 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> + 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> + 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> + 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS + 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> + 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> + 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS + 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS + 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> + 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS + 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS + 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> + 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS + 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> + 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> + 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> + 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> + 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> + 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> + 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> + 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> + 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> + 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> + 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1> + 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS + 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> + 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS + 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2> + 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3> + 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2> + 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1> + 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4> + 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7> + 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2> + 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS + 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS + 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4> + 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3> + 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3> + 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6> + 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5> + 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6> + 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3> + 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u> + 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1> + 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4> + 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4> + 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4> + 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4> + 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS + 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6> + 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1> + 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS + 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS + 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5> + 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1> + 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2> + 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS + 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS + 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS + 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1> + 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2> + 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2> + 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS + 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7> + 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7> + 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS + 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> + 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1> + 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4> + 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4> + 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS + 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0> + 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1> + 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2> + 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4> + 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS + 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS + 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u> + 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1> + 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6> + 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1> + 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> + 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5> + 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4> + 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5> + 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1> + 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1> + 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS + 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> + 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1> + 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> + 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7> + 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> + 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1> + 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7> + 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS + 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3> + 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1> + 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> + 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> + 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1> + 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5> + 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7> + 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> + 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS + 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1> + 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2> + 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7> + 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2> + 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3> + 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6> + 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5> + 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6> + 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS + 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS + 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> + 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3> + 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4> + 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> + 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS + 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6> + 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS + 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS + 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5> + 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5> + 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3> + 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS + 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5> + 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0> + 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7> + 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS + 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1> + 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5> + 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3> + 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4> + 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6> + 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6> + 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6> + 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0> + 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0> + 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS + 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1> + 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> + 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7> + 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS + 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7> + 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0> + 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1> + 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS + 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2> + 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3> + 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1> + 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5> + 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7> + 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS + 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS + 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0> + 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS + 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6> + 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1> + 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5> + 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6> + 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6> + 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS + 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS + 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2> + 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1> + 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0> + 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3> + 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> + 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7> + 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1> + 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS + 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> + 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0> + 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2> + 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1> + 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS + 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7> + 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> + 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS + 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS + 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> + 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> + 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> + 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS + 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5> + 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> + 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> + 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> + 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> + 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> + 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> + 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> + 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS + 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS + 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS + 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS + 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> + 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> + 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> + 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5> + 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> + 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> + 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> + 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS + 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS + 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> + 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> + 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> + 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> + 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS + 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> + 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> + 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> + 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> + 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> + 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> + 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> + 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> + 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5> + 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> + 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> + 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS + 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> + 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> + 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS + 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> + 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1> + 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> + 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS + 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7> + 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> + 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> + 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS + 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> + 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1> + 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> + 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> + 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> + 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> + 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS + 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS + 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> + 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> + 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> + 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS + 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> + 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> + 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> + 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> + 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS + 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> + 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> + 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> + 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS + 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> + 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> + 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> + 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS + 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS + 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> + 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> + 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> + 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS + 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> + 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> + 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> + 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS + 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> + 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> + 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> + 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> + 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> + 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS + 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> + 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> + 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS + 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS + 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> + 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> + 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> + 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS + 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> + 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> + 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS + 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS + 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> + 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> + 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> + 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> + 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS + 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> + 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> + 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0> + 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> + 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> + 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> + 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1> + 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> + 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS + 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7> + 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> + 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> + 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> + 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS + 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS + 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> + 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> + 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS + 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> + 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> + 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> + 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS + 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> + 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS + 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> + 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> + 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> + 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1> + 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2> + 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> + 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS + 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS + 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS + 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> + 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS + 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS + 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS + 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 115726126U, // <1,u,3,2>: Cost 1 vrev LHS + 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS + 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS + 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> + 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS + 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS + 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> + 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> + 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4> + 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> + 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS + 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS + 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> + 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6> + 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS + 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS + 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> + 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS + 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS + 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS + 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> + 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0> + 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> + 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7> + 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> + 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7> + 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> + 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> + 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> + 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> + 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> + 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS + 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> + 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7> + 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> + 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> + 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS + 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS + 115767091U, // <1,u,u,2>: Cost 1 vrev LHS + 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS + 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS + 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS + 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS + 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS + 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> + 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> + 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> + 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> + 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS + 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> + 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0> + 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> + 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> + 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> + 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> + 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS + 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS + 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> + 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> + 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> + 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS + 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS + 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> + 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> + 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2> + 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS + 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3> + 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> + 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> + 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS + 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> + 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> + 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> + 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS + 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> + 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> + 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> + 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> + 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS + 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> + 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> + 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> + 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS + 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> + 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> + 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS + 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> + 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> + 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> + 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> + 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> + 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> + 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> + 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> + 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS + 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> + 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> + 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> + 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> + 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> + 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> + 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> + 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> + 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> + 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> + 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> + 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> + 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> + 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> + 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS + 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1> + 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> + 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS + 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS + 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS + 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS + 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0> + 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> + 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS + 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> + 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0> + 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1> + 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> + 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS + 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1> + 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0> + 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3> + 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS + 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7> + 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1> + 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2> + 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3> + 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS + 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1> + 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2> + 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0> + 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS + 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3> + 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7> + 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0> + 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0> + 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS + 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2> + 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3> + 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS + 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7> + 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u> + 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5> + 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6> + 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5> + 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> + 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS + 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS + 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS + 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6> + 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4> + 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS + 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> + 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1> + 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7> + 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS + 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5> + 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5> + 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS + 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7> + 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS + 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2> + 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> + 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS + 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS + 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5> + 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6> + 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1> + 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS + 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2> + 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1> + 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0> + 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7> + 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6> + 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7> + 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1> + 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u> + 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2> + 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS + 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2> + 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u> + 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS + 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1> + 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u> + 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2> + 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS + 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0> + 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2> + 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS + 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7> + 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6> + 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2> + 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> + 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> + 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1> + 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0> + 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS + 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS + 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> + 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3> + 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1> + 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS + 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS + 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2> + 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS + 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3> + 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS + 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7> + 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7> + 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS + 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1> + 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0> + 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2> + 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS + 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5> + 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5> + 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3> + 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS + 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS + 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2> + 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5> + 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5> + 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS + 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS + 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4> + 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS + 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2> + 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3> + 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7> + 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS + 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> + 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5> + 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0> + 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS + 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6> + 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3> + 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3> + 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7> + 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS + 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7> + 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6> + 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2> + 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7> + 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> + 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5> + 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2> + 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS + 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6> + 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7> + 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7> + 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7> + 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1> + 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS + 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS + 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS + 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS + 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS + 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS + 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS + 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3> + 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0> + 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS + 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS + 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> + 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0> + 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2> + 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5> + 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7> + 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1> + 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3> + 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3> + 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> + 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7> + 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3> + 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2> + 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS + 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> + 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2> + 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS + 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS + 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4> + 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS + 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS + 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4> + 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6> + 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7> + 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6> + 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5> + 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7> + 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1> + 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3> + 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3> + 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1> + 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS + 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> + 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1> + 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS + 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> + 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1> + 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS + 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4> + 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4> + 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4> + 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2> + 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1> + 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1> + 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2> + 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1> + 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> + 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS + 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4> + 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7> + 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3> + 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1> + 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS + 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4> + 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3> + 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4> + 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5> + 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4> + 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS + 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS + 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0> + 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS + 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4> + 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1> + 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4> + 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3> + 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4> + 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> + 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7> + 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2> + 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1> + 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2> + 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3> + 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4> + 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS + 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS + 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7> + 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS + 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> + 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> + 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> + 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5> + 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS + 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5> + 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS + 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> + 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> + 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> + 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS + 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6> + 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1> + 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS + 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2> + 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4> + 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4> + 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6> + 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> + 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1> + 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7> + 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS + 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2> + 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2> + 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS + 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5> + 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS + 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0> + 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> + 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> + 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> + 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5> + 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7> + 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS + 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2> + 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1> + 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> + 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> + 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> + 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7> + 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2> + 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3> + 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5> + 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2> + 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1> + 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2> + 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1> + 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5> + 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1> + 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7> + 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS + 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS + 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3> + 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5> + 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3> + 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> + 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3> + 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7> + 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u> + 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS + 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4> + 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> + 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4> + 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS + 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS + 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6> + 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS + 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS + 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7> + 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5> + 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5> + 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS + 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5> + 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6> + 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7> + 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7> + 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS + 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6> + 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3> + 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6> + 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5> + 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7> + 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6> + 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS + 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS + 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS + 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2> + 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> + 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2> + 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS + 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7> + 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2> + 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS + 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS + 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS + 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u> + 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u> + 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6> + 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS + 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7> + 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS + 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0> + 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2> + 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0> + 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6> + 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6> + 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1> + 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2> + 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS + 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> + 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1> + 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0> + 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> + 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> + 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7> + 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2> + 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS + 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1> + 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3> + 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2> + 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1> + 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6> + 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7> + 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6> + 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS + 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1> + 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2> + 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1> + 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7> + 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3> + 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6> + 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5> + 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS + 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS + 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2> + 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3> + 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0> + 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4> + 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4> + 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6> + 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS + 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS + 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2> + 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3> + 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7> + 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5> + 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> + 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5> + 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0> + 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS + 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS + 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS + 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1> + 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6> + 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3> + 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS + 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5> + 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6> + 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS + 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS + 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1> + 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> + 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2> + 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0> + 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5> + 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7> + 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2> + 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0> + 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1> + 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2> + 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3> + 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6> + 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7> + 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS + 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS + 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2> + 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2> + 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2> + 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0> + 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2> + 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7> + 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7> + 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2> + 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2> + 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1> + 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0> + 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7> + 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5> + 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7> + 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7> + 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0> + 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2> + 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7> + 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2> + 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1> + 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6> + 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7> + 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7> + 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1> + 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7> + 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS + 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7> + 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2> + 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3> + 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS + 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3> + 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3> + 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2> + 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS + 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6> + 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4> + 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7> + 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4> + 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4> + 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS + 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u> + 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0> + 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS + 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2> + 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7> + 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7> + 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5> + 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS + 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5> + 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7> + 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6> + 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS + 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS + 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7> + 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3> + 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6> + 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS + 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6> + 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6> + 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0> + 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS + 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1> + 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2> + 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7> + 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7> + 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS + 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5> + 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7> + 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7> + 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1> + 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS + 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2> + 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2> + 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u> + 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS + 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS + 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u> + 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2> + 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS + 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS + 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2> + 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2> + 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS + 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3> + 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> + 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS + 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS + 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS + 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS + 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> + 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> + 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS + 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> + 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> + 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS + 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS + 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS + 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> + 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> + 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4> + 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS + 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS + 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS + 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> + 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> + 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7> + 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS + 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> + 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7> + 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS + 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS + 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> + 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> + 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> + 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS + 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS + 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS + 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS + 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS + 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS + 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS + 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS + 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> + 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> + 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> + 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1> + 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> + 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> + 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS + 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> + 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS + 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> + 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS + 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> + 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> + 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> + 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> + 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> + 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> + 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> + 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> + 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> + 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> + 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> + 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> + 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> + 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> + 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> + 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> + 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> + 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> + 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6> + 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> + 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> + 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3> + 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS + 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6> + 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6> + 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5> + 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> + 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7> + 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS + 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> + 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> + 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> + 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> + 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1> + 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> + 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6> + 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> + 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> + 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2> + 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> + 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> + 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7> + 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6> + 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> + 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> + 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7> + 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> + 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> + 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS + 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7> + 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> + 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS + 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS + 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS + 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> + 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS + 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> + 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> + 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> + 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> + 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> + 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> + 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> + 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> + 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> + 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> + 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> + 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> + 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> + 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS + 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> + 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> + 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> + 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS + 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> + 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS + 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> + 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS + 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> + 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> + 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> + 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS + 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS + 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS + 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> + 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> + 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> + 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> + 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> + 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> + 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> + 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> + 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> + 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> + 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> + 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> + 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> + 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> + 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS + 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> + 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> + 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS + 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> + 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> + 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> + 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS + 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS + 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> + 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> + 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS + 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> + 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> + 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> + 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> + 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> + 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> + 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS + 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> + 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> + 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> + 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS + 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> + 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> + 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> + 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> + 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> + 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> + 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> + 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> + 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> + 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> + 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> + 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> + 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> + 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> + 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> + 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> + 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> + 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> + 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS + 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> + 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> + 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS + 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> + 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> + 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS + 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS + 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> + 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> + 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5> + 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7> + 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> + 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> + 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> + 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> + 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> + 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1> + 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2> + 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> + 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> + 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS + 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6> + 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> + 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> + 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7> + 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS + 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> + 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS + 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> + 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> + 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1> + 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> + 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> + 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2> + 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> + 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> + 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> + 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> + 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> + 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7> + 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> + 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> + 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> + 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> + 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> + 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS + 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> + 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> + 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> + 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> + 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS + 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> + 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> + 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> + 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS + 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> + 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> + 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> + 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> + 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS + 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> + 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> + 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS + 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> + 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> + 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> + 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS + 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS + 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> + 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> + 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> + 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> + 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> + 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS + 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> + 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> + 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS + 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> + 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> + 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> + 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS + 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> + 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0> + 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS + 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> + 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> + 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> + 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> + 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> + 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> + 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> + 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> + 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> + 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS + 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> + 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> + 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> + 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS + 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> + 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> + 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> + 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS + 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS + 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> + 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3> + 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS + 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> + 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> + 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS + 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> + 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS + 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> + 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> + 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> + 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> + 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> + 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> + 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS + 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> + 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> + 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> + 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> + 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS + 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> + 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> + 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> + 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS + 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> + 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> + 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> + 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> + 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> + 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> + 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> + 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> + 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> + 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> + 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> + 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> + 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1> + 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS + 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS + 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> + 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> + 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS + 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> + 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> + 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS + 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> + 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> + 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS + 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS + 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> + 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> + 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS + 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> + 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS + 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> + 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS + 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> + 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> + 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3> + 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> + 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> + 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> + 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> + 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2> + 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> + 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> + 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> + 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6> + 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> + 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> + 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7> + 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> + 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS + 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS + 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> + 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1> + 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS + 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS + 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS + 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> + 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS + 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> + 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS + 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> + 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> + 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> + 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> + 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> + 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS + 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS + 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> + 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> + 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> + 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> + 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> + 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> + 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> + 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3> + 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3> + 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> + 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> + 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> + 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> + 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> + 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> + 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> + 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> + 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> + 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> + 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> + 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> + 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> + 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> + 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> + 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS + 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS + 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS + 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> + 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> + 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> + 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS + 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS + 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> + 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> + 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS + 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> + 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> + 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> + 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS + 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> + 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> + 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> + 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> + 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> + 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> + 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> + 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> + 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> + 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> + 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> + 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> + 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS + 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> + 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> + 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> + 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS + 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> + 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> + 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS + 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS + 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> + 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> + 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> + 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS + 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> + 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3> + 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS + 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS + 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> + 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> + 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4> + 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> + 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> + 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> + 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> + 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> + 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> + 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> + 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> + 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> + 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> + 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> + 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> + 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> + 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS + 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> + 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> + 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0> + 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> + 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> + 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> + 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> + 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> + 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> + 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> + 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> + 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> + 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> + 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> + 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> + 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS + 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6> + 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> + 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> + 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> + 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> + 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> + 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> + 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS + 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> + 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS + 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> + 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> + 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> + 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> + 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> + 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> + 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS + 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> + 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> + 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> + 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6> + 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> + 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> + 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> + 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> + 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> + 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> + 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> + 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> + 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> + 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> + 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> + 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS + 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> + 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> + 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> + 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u> + 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> + 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> + 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> + 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> + 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1> + 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> + 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> + 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> + 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> + 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> + 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> + 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> + 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> + 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> + 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> + 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> + 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS + 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> + 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> + 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> + 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> + 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS + 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> + 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> + 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> + 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS + 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> + 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> + 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> + 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> + 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> + 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> + 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> + 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> + 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> + 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> + 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> + 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> + 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> + 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS + 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> + 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> + 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> + 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS + 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS + 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> + 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> + 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS + 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> + 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> + 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> + 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> + 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> + 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> + 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> + 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS + 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS + 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> + 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> + 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> + 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> + 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> + 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> + 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> + 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> + 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> + 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS + 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> + 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> + 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> + 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS + 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> + 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> + 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> + 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2> + 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0> + 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS + 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6> + 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS + 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7> + 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS + 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2> + 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2> + 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1> + 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1> + 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2> + 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2> + 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> + 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> + 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS + 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3> + 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS + 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> + 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> + 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3> + 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> + 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> + 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0> + 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7> + 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> + 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3> + 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0> + 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1> + 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> + 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2> + 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS + 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5> + 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7> + 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS + 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS + 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5> + 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5> + 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6> + 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6> + 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6> + 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6> + 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS + 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7> + 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> + 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS + 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS + 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7> + 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS + 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1> + 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> + 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7> + 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7> + 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0> + 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7> + 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS + 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> + 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> + 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS + 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2> + 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS + 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1> + 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2> + 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS + 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS + 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5> + 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6> + 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS + 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0> + 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS + 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> + 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> + 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> + 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> + 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> + 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> + 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> + 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> + 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> + 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS + 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> + 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> + 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS + 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> + 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> + 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> + 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> + 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> + 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> + 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> + 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> + 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> + 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> + 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> + 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS + 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> + 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> + 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS + 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> + 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> + 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> + 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS + 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> + 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> + 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> + 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS + 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS + 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> + 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> + 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS + 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS + 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS + 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> + 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> + 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS + 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> + 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> + 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> + 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS + 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS + 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> + 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS + 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> + 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS + 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> + 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> + 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> + 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS + 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> + 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS + 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS + 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> + 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> + 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> + 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> + 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> + 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS + 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS + 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS + 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> + 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS + 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS + 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> + 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> + 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS + 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> + 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> + 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> + 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> + 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> + 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> + 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS + 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> + 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> + 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> + 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> + 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> + 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> + 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> + 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> + 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> + 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS + 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> + 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> + 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> + 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS + 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> + 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> + 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> + 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> + 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS + 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> + 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> + 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> + 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS + 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> + 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> + 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> + 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> + 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> + 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> + 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS + 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> + 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS + 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> + 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> + 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS + 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> + 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> + 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> + 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> + 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3> + 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> + 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS + 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS + 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> + 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> + 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS + 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS + 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> + 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> + 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> + 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS + 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> + 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> + 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> + 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> + 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> + 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> + 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> + 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS + 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> + 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> + 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> + 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS + 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS + 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> + 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS + 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> + 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS + 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> + 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> + 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> + 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> + 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> + 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> + 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS + 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> + 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> + 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> + 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS + 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> + 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> + 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> + 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> + 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> + 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> + 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> + 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> + 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> + 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> + 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> + 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> + 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> + 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> + 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> + 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> + 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> + 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> + 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> + 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> + 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> + 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS + 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> + 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> + 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> + 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS + 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> + 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> + 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS + 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS + 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> + 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> + 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS + 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS + 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> + 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> + 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS + 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS + 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS + 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> + 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> + 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS + 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS + 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> + 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3> + 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2> + 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS + 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> + 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> + 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> + 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> + 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> + 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> + 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> + 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS + 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> + 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> + 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS + 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS + 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> + 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2> + 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS + 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> + 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> + 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> + 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> + 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> + 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> + 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> + 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> + 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> + 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> + 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> + 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> + 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> + 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> + 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> + 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> + 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> + 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> + 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS + 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> + 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> + 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> + 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS + 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> + 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> + 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> + 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> + 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> + 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> + 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> + 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> + 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> + 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> + 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> + 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> + 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> + 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> + 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> + 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> + 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> + 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> + 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> + 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> + 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS + 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> + 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> + 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> + 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS + 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> + 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> + 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> + 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS + 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS + 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> + 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> + 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> + 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS + 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> + 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> + 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> + 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> + 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> + 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> + 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> + 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> + 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> + 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> + 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> + 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> + 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> + 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS + 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> + 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> + 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> + 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS + 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> + 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> + 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> + 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> + 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> + 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> + 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> + 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> + 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> + 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> + 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS + 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> + 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> + 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> + 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> + 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> + 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> + 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> + 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> + 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> + 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> + 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> + 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> + 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> + 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> + 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> + 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> + 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> + 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4> + 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> + 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> + 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> + 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> + 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> + 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> + 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> + 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS + 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> + 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> + 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> + 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS + 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS + 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS + 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS + 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS + 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> + 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> + 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> + 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS + 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS + 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> + 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS + 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> + 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> + 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> + 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS + 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> + 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS + 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> + 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS + 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> + 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> + 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> + 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> + 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> + 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> + 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> + 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS + 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> + 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS + 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS + 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> + 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS + 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> + 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> + 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> + 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> + 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> + 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> + 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> + 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS + 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2> + 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> + 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> + 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS + 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> + 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> + 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> + 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> + 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS + 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS + 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> + 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> + 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> + 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> + 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> + 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> + 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS + 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> + 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> + 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> + 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> + 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> + 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> + 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> + 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> + 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> + 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> + 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS + 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> + 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> + 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> + 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS + 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> + 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> + 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS + 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS + 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> + 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> + 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> + 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> + 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> + 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> + 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS + 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS + 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS + 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> + 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> + 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> + 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS + 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> + 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS + 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> + 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> + 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> + 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS + 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> + 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> + 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> + 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS + 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS + 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0> + 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> + 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS + 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7> + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> + 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> + 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> + 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> + 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> + 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> + 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS + 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS + 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> + 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> + 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> + 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> + 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> + 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> + 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> + 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS + 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> + 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> + 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> + 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> + 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> + 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> + 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> + 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> + 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> + 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> + 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> + 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> + 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> + 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> + 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> + 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> + 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> + 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> + 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> + 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS + 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> + 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> + 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> + 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS + 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS + 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> + 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS + 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS + 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3> + 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> + 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> + 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> + 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> + 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> + 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS + 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS + 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS + 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> + 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> + 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> + 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> + 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> + 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> + 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS + 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS + 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> + 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> + 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> + 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> + 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> + 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> + 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> + 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS + 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS + 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1> + 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> + 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS + 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS + 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS + 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> + 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> + 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> + 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> + 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> + 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> + 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> + 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS + 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> + 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> + 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> + 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> + 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS + 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> + 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> + 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> + 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> + 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS + 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> + 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> + 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> + 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> + 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> + 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> + 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> + 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> + 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> + 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> + 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> + 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> + 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> + 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> + 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> + 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> + 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> + 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> + 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> + 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> + 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> + 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> + 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS + 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> + 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> + 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS + 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> + 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> + 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> + 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> + 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> + 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> + 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> + 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> + 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> + 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS + 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> + 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> + 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> + 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS + 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> + 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> + 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS + 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS + 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> + 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> + 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> + 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> + 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> + 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> + 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> + 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> + 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS + 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> + 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> + 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS + 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> + 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> + 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS + 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> + 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u> + 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2> + 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> + 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0> + 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> + 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> + 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS + 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> + 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> + 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> + 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3> + 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> + 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> + 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3> + 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS + 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> + 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> + 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> + 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> + 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4> + 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> + 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3> + 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> + 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> + 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> + 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> + 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> + 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> + 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7> + 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> + 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> + 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS + 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2> + 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> + 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> + 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS + 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS + 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6> + 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS + 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS + 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> + 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS + 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS + 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS + 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS + 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> + 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS + 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> + 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS + 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> + 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS + 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4> + 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> + 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> + 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS + 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> + 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS + 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS + 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> + 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS + 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> + 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> + 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> + 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> + 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> + 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> + 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> + 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> + 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> + 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS + 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> + 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> + 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS + 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> + 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> + 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> + 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> + 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> + 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> + 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> + 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> + 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> + 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> + 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> + 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> + 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> + 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> + 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> + 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> + 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> + 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> + 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> + 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS + 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> + 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> + 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS + 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS + 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5> + 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> + 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS + 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS + 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> + 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> + 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> + 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> + 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS + 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS + 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> + 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> + 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> + 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> + 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> + 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> + 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> + 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS + 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS + 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> + 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> + 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> + 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS + 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> + 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> + 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> + 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS + 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> + 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> + 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> + 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> + 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS + 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> + 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> + 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> + 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> + 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> + 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> + 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> + 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> + 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS + 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> + 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> + 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> + 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> + 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> + 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> + 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> + 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> + 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> + 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> + 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> + 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> + 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> + 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> + 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> + 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> + 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0> + 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> + 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS + 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> + 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> + 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> + 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> + 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> + 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> + 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> + 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> + 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> + 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> + 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> + 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> + 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> + 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> + 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> + 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> + 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> + 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> + 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> + 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> + 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> + 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> + 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> + 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> + 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> + 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS + 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> + 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> + 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> + 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS + 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> + 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> + 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> + 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> + 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS + 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> + 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> + 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS + 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS + 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> + 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> + 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> + 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS + 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1> + 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> + 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS + 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> + 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7> + 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1> + 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS + 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> + 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> + 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> + 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> + 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> + 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> + 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> + 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS + 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> + 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> + 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> + 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> + 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS + 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> + 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> + 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> + 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> + 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS + 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> + 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> + 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> + 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> + 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> + 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> + 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> + 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> + 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> + 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> + 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> + 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> + 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> + 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> + 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> + 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> + 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> + 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> + 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> + 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> + 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> + 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> + 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> + 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> + 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS + 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS + 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> + 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> + 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS + 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS + 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> + 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> + 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> + 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS + 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS + 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> + 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> + 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> + 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> + 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> + 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> + 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> + 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> + 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS + 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> + 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> + 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS + 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS + 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> + 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> + 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> + 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS + 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> + 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> + 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> + 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> + 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> + 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> + 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> + 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> + 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> + 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> + 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> + 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> + 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> + 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> + 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> + 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> + 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> + 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> + 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> + 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> + 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> + 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> + 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> + 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> + 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> + 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> + 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> + 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> + 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> + 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> + 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> + 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> + 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3> + 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> + 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> + 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> + 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> + 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> + 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> + 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5> + 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> + 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> + 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> + 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> + 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> + 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> + 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> + 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> + 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> + 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> + 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> + 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> + 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS + 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> + 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> + 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> + 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS + 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> + 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> + 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> + 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS + 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS + 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> + 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> + 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> + 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS + 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> + 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> + 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> + 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS + 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS + 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> + 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> + 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> + 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS + 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> + 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3> + 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> + 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS + 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS + 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> + 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> + 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> + 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS + 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> + 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3> + 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> + 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS + 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS + 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS + 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2> + 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> + 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> + 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> + 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> + 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> + 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS + 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> + 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> + 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> + 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> + 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> + 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> + 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> + 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> + 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> + 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> + 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> + 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> + 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> + 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> + 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> + 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> + 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> + 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5> + 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS + 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> + 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> + 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> + 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> + 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> + 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> + 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> + 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> + 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS + 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> + 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> + 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> + 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> + 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> + 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> + 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> + 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> + 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS + 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> + 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> + 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> + 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS + 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> + 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS + 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS + 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS + 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> + 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> + 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> + 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS + 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> + 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> + 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> + 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS + 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS + 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> + 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> + 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> + 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS + 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> + 94817590U, // <5,4,7,6>: Cost 1 vrev RHS + 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> + 94965064U, // <5,4,7,u>: Cost 1 vrev RHS + 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS + 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> + 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> + 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4> + 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS + 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> + 94825783U, // <5,4,u,6>: Cost 1 vrev RHS + 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> + 94973257U, // <5,4,u,u>: Cost 1 vrev RHS + 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> + 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> + 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> + 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> + 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> + 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> + 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS + 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS + 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> + 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> + 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> + 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> + 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> + 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> + 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> + 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> + 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> + 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS + 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> + 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> + 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> + 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> + 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> + 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> + 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> + 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> + 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> + 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> + 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> + 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> + 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> + 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> + 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> + 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> + 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> + 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> + 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> + 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> + 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> + 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS + 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> + 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> + 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> + 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> + 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> + 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> + 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS + 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> + 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> + 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS + 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS + 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> + 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> + 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> + 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> + 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> + 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> + 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> + 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> + 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS + 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> + 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> + 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> + 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS + 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> + 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> + 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS + 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS + 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> + 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> + 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS + 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7> + 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS + 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS + 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> + 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS + 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> + 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> + 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> + 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> + 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> + 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS + 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS + 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> + 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> + 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> + 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> + 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> + 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> + 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> + 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6> + 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> + 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> + 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> + 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> + 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> + 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> + 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> + 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3> + 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> + 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> + 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> + 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> + 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> + 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> + 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> + 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> + 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS + 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> + 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS + 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> + 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> + 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> + 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> + 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS + 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> + 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> + 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS + 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS + 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> + 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> + 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> + 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> + 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> + 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> + 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS + 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS + 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS + 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> + 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> + 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> + 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS + 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> + 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> + 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS + 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS + 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS + 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> + 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> + 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> + 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS + 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS + 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS + 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS + 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS + 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> + 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS + 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> + 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> + 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> + 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5> + 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> + 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> + 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> + 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> + 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> + 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> + 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> + 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS + 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> + 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> + 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> + 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> + 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> + 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> + 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> + 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> + 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> + 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> + 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> + 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> + 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> + 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> + 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> + 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> + 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> + 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> + 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> + 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> + 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> + 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> + 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS + 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> + 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> + 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> + 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS + 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> + 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS + 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS + 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> + 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> + 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> + 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS + 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> + 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> + 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS + 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS + 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0> + 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> + 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> + 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> + 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> + 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> + 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> + 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> + 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS + 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> + 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> + 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> + 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS + 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> + 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> + 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> + 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS + 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS + 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0> + 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS + 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS + 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7> + 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS + 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> + 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS + 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> + 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2> + 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1> + 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1> + 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0> + 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> + 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS + 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> + 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> + 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> + 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> + 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0> + 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> + 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> + 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> + 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3> + 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> + 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> + 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> + 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3> + 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> + 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1> + 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> + 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> + 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> + 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> + 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0> + 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5> + 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> + 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> + 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> + 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5> + 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6> + 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5> + 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS + 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> + 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS + 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> + 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3> + 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7> + 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS + 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS + 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS + 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS + 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> + 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7> + 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS + 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS + 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7> + 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS + 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> + 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS + 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS + 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 118708378U, // <5,u,7,6>: Cost 1 vrev RHS + 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS + 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS + 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS + 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS + 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS + 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS + 118716571U, // <5,u,u,6>: Cost 1 vrev RHS + 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS + 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS + 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> + 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> + 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> + 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> + 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> + 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> + 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> + 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> + 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> + 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS + 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> + 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> + 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS + 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> + 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> + 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> + 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> + 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> + 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> + 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> + 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> + 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> + 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6> + 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> + 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> + 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> + 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> + 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> + 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> + 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> + 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> + 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> + 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> + 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> + 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> + 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> + 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> + 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> + 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> + 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> + 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> + 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> + 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS + 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> + 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> + 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> + 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> + 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> + 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> + 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> + 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS + 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS + 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> + 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> + 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> + 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> + 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> + 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS + 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS + 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> + 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> + 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> + 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS + 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> + 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> + 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> + 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS + 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> + 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> + 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> + 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> + 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> + 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS + 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS + 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> + 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> + 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS + 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> + 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> + 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> + 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> + 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> + 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> + 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> + 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> + 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> + 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> + 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> + 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> + 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> + 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS + 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> + 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> + 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> + 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS + 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> + 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> + 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> + 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> + 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS + 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> + 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> + 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> + 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> + 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> + 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> + 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> + 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> + 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> + 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> + 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> + 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> + 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS + 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> + 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> + 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> + 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> + 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> + 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> + 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> + 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> + 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> + 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> + 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> + 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS + 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> + 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS + 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> + 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> + 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS + 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS + 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> + 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> + 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> + 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS + 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS + 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> + 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> + 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS + 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS + 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> + 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> + 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> + 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS + 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS + 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> + 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> + 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> + 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> + 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> + 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> + 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> + 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> + 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> + 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> + 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> + 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> + 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> + 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> + 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> + 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS + 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> + 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> + 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> + 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS + 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> + 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> + 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> + 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> + 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> + 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> + 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> + 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> + 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> + 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> + 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> + 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> + 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> + 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> + 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> + 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> + 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> + 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> + 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> + 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> + 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> + 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> + 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1> + 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> + 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> + 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> + 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> + 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> + 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> + 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> + 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> + 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> + 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> + 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> + 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> + 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> + 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> + 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> + 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> + 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> + 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> + 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> + 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> + 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> + 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> + 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> + 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> + 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS + 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> + 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> + 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS + 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS + 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> + 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> + 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS + 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2> + 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> + 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS + 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> + 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> + 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS + 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> + 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> + 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> + 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> + 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> + 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> + 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> + 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> + 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> + 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> + 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> + 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> + 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> + 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> + 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> + 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> + 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> + 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> + 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> + 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> + 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> + 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> + 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> + 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> + 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> + 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> + 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> + 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> + 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> + 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> + 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> + 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> + 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS + 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> + 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> + 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> + 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS + 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> + 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> + 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> + 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> + 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS + 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> + 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> + 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> + 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> + 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> + 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> + 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> + 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> + 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS + 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> + 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> + 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> + 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6> + 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> + 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> + 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> + 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS + 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS + 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> + 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> + 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> + 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS + 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3> + 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> + 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS + 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS + 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> + 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> + 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> + 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS + 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> + 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> + 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS + 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> + 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> + 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> + 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> + 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> + 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> + 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> + 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS + 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> + 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> + 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> + 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS + 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS + 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> + 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> + 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> + 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> + 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> + 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> + 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> + 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> + 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> + 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS + 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> + 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> + 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> + 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> + 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> + 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> + 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> + 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> + 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> + 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> + 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> + 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6> + 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS + 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> + 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> + 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> + 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> + 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> + 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS + 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> + 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> + 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> + 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS + 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> + 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> + 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> + 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> + 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS + 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> + 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> + 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> + 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS + 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS + 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> + 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> + 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> + 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS + 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> + 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> + 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> + 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS + 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> + 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> + 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> + 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS + 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> + 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS + 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> + 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> + 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> + 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> + 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> + 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> + 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> + 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> + 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> + 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> + 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS + 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> + 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7> + 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> + 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> + 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> + 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS + 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> + 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> + 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> + 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS + 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> + 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> + 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS + 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> + 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2> + 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> + 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> + 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> + 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> + 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> + 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> + 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> + 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> + 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS + 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> + 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> + 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> + 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS + 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS + 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> + 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> + 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> + 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS + 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> + 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> + 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> + 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS + 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> + 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> + 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> + 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> + 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> + 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> + 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> + 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> + 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> + 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> + 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> + 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> + 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> + 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS + 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> + 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> + 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> + 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS + 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> + 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> + 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS + 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS + 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS + 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> + 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> + 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7> + 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS + 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS + 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> + 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> + 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> + 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS + 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> + 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> + 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> + 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3> + 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> + 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS + 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS + 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> + 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> + 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> + 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> + 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> + 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> + 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> + 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS + 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> + 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> + 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> + 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> + 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> + 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> + 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> + 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> + 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> + 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> + 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> + 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> + 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> + 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> + 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> + 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> + 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS + 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> + 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS + 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> + 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> + 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> + 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS + 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> + 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> + 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> + 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS + 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> + 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> + 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> + 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> + 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> + 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> + 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS + 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS + 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS + 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> + 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> + 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> + 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS + 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> + 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS + 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> + 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS + 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS + 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> + 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> + 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> + 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS + 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> + 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> + 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS + 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS + 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS + 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> + 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS + 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS + 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS + 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS + 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS + 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS + 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> + 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> + 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> + 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS + 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> + 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> + 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> + 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> + 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> + 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> + 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> + 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> + 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> + 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> + 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> + 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS + 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> + 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS + 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> + 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> + 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> + 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> + 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> + 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> + 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> + 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> + 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> + 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> + 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> + 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS + 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS + 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1> + 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS + 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS + 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2> + 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> + 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0> + 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS + 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS + 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3> + 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS + 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS + 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> + 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3> + 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> + 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7> + 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5> + 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6> + 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6> + 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS + 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6> + 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS + 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS + 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> + 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7> + 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS + 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7> + 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS + 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5> + 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS + 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS + 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS + 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7> + 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> + 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS + 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS + 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5> + 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7> + 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS + 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS + 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS + 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS + 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS + 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS + 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS + 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS + 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS + 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> + 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> + 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0> + 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> + 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6> + 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> + 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7> + 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> + 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS + 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5> + 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS + 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7> + 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS + 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7> + 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> + 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0> + 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> + 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0> + 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1> + 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7> + 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7> + 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> + 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2> + 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> + 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0> + 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> + 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> + 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> + 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0> + 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7> + 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0> + 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4> + 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> + 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> + 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4> + 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6> + 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS + 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6> + 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5> + 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> + 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS + 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3> + 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> + 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> + 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> + 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> + 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7> + 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> + 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7> + 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7> + 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> + 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7> + 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> + 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> + 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1> + 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7> + 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2> + 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS + 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS + 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7> + 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> + 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> + 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> + 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7> + 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2> + 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> + 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> + 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS + 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u> + 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> + 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS + 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> + 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1> + 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS + 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS + 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS + 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS + 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> + 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS + 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0> + 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1> + 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> + 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1> + 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6> + 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> + 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> + 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> + 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> + 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5> + 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> + 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1> + 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3> + 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2> + 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0> + 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5> + 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3> + 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2> + 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0> + 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0> + 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0> + 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> + 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7> + 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5> + 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7> + 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> + 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3> + 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> + 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> + 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> + 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> + 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5> + 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5> + 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS + 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1> + 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0> + 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS + 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS + 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> + 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6> + 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> + 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS + 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7> + 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1> + 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1> + 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> + 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7> + 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> + 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7> + 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7> + 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> + 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7> + 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1> + 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7> + 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> + 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1> + 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3> + 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS + 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS + 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7> + 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0> + 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7> + 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS + 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS + 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> + 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0> + 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7> + 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS + 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> + 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7> + 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS + 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7> + 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> + 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> + 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0> + 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0> + 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6> + 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> + 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> + 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2> + 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7> + 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> + 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0> + 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> + 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> + 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> + 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> + 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> + 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> + 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> + 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> + 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> + 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> + 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> + 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> + 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> + 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> + 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> + 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> + 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> + 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> + 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> + 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> + 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> + 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> + 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> + 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> + 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> + 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> + 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> + 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> + 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> + 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> + 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> + 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7> + 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> + 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> + 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> + 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> + 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> + 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7> + 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> + 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> + 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> + 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS + 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> + 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> + 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> + 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS + 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> + 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> + 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> + 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> + 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> + 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> + 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> + 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS + 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS + 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7> + 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> + 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7> + 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS + 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> + 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> + 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> + 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> + 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> + 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> + 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> + 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> + 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> + 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> + 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> + 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> + 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> + 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> + 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> + 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> + 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> + 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> + 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> + 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> + 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> + 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> + 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> + 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> + 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> + 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> + 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> + 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> + 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> + 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> + 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5> + 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> + 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> + 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> + 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> + 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> + 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> + 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> + 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> + 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> + 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> + 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> + 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> + 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> + 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> + 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> + 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> + 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> + 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> + 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> + 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> + 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS + 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> + 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> + 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> + 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> + 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> + 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> + 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> + 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> + 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> + 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> + 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> + 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> + 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> + 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> + 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> + 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> + 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> + 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> + 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> + 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> + 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> + 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> + 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> + 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> + 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> + 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> + 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> + 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> + 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3> + 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> + 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> + 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> + 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> + 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> + 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> + 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> + 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> + 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> + 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> + 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> + 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> + 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> + 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> + 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> + 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> + 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> + 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> + 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> + 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> + 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> + 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> + 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> + 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> + 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> + 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> + 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> + 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> + 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> + 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> + 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> + 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> + 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> + 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> + 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> + 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> + 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> + 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> + 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> + 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> + 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> + 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> + 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5> + 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS + 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> + 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> + 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> + 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS + 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> + 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS + 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> + 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS + 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> + 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> + 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> + 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> + 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> + 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> + 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> + 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> + 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> + 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> + 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> + 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1> + 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> + 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> + 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> + 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS + 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> + 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> + 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> + 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> + 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS + 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> + 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS + 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS + 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS + 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> + 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> + 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> + 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> + 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> + 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> + 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> + 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> + 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> + 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> + 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> + 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> + 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> + 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> + 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> + 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> + 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> + 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> + 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> + 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> + 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS + 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> + 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> + 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> + 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS + 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> + 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> + 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> + 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> + 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS + 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> + 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> + 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4> + 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS + 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS + 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> + 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> + 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> + 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3> + 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> + 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> + 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> + 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> + 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> + 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> + 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> + 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> + 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> + 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> + 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> + 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> + 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS + 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> + 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> + 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> + 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS + 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> + 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> + 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS + 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> + 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> + 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS + 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> + 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> + 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> + 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> + 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> + 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> + 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> + 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> + 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> + 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> + 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> + 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS + 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> + 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> + 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> + 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> + 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS + 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> + 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> + 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> + 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> + 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> + 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> + 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> + 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> + 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> + 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> + 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> + 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> + 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> + 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> + 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> + 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> + 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> + 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> + 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> + 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> + 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> + 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> + 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> + 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> + 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS + 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> + 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> + 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS + 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS + 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> + 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> + 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> + 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> + 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> + 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> + 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> + 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> + 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> + 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> + 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> + 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> + 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> + 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> + 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> + 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> + 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> + 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> + 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> + 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> + 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> + 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> + 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> + 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> + 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> + 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> + 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> + 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> + 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS + 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> + 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> + 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> + 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> + 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> + 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> + 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> + 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> + 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> + 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> + 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> + 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> + 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> + 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> + 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> + 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS + 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> + 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> + 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> + 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> + 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> + 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> + 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> + 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> + 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> + 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> + 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> + 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3> + 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3> + 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> + 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> + 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> + 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> + 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> + 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> + 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> + 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> + 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> + 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> + 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> + 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> + 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> + 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> + 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> + 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> + 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> + 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> + 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS + 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> + 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> + 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> + 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> + 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> + 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> + 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> + 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> + 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS + 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> + 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> + 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> + 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS + 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> + 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> + 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> + 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> + 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS + 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> + 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> + 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> + 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS + 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7> + 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> + 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS + 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS + 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> + 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> + 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> + 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS + 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> + 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7> + 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS + 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2> + 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2> + 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2> + 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1> + 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1> + 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS + 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2> + 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS + 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS + 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3> + 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS + 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3> + 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> + 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0> + 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0> + 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4> + 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3> + 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0> + 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1> + 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6> + 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5> + 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7> + 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7> + 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0> + 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1> + 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1> + 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5> + 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6> + 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5> + 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6> + 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6> + 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6> + 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS + 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> + 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3> + 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7> + 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> + 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> + 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS + 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7> + 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS + 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1> + 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7> + 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> + 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7> + 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7> + 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> + 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0> + 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7> + 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1> + 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3> + 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6> + 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7> + 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5> + 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> + 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2> + 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS + 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1> + 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2> + 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS + 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0> + 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5> + 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6> + 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS + 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS + 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS + 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS + 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS + 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3> + 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0> + 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS + 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS + 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS + 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS + 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3> + 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS + 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7> + 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1> + 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2> + 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS + 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS + 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7> + 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 72589981U, // <u,0,3,2>: Cost 1 vrev LHS + 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3> + 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6> + 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0> + 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0> + 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3> + 73032403U, // <u,0,3,u>: Cost 1 vrev LHS + 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u> + 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2> + 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS + 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS + 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6> + 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS + 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS + 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0> + 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5> + 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5> + 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0> + 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0> + 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS + 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS + 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0> + 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS + 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u> + 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6> + 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1> + 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS + 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS + 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0> + 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7> + 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7> + 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS + 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u> + 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7> + 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7> + 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS + 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS + 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS + 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS + 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u> + 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS + 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1> + 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS + 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1> + 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS + 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0> + 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0> + 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS + 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS + 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2> + 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS + 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1> + 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS + 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS + 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1> + 835584U, // <u,1,2,3>: Cost 0 copy LHS + 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS + 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7> + 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2> + 835584U, // <u,1,2,u>: Cost 0 copy LHS + 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS + 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS + 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS + 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1> + 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1> + 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u> + 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u> + 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4> + 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS + 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS + 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1> + 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4> + 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4> + 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS + 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2> + 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2> + 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS + 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1> + 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1> + 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS + 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS + 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS + 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1> + 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1> + 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1> + 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS + 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7> + 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1> + 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS + 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS + 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7> + 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u> + 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7> + 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS + 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS + 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS + 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 835584U, // <u,1,u,3>: Cost 0 copy LHS + 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS + 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u> + 835584U, // <u,1,u,u>: Cost 0 copy LHS + 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2> + 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS + 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2> + 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5> + 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7> + 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u> + 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS + 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2> + 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1> + 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0> + 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS + 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7> + 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1> + 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS + 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2> + 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS + 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS + 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7> + 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS + 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS + 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS + 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS + 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS + 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4> + 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5> + 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS + 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2> + 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS + 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS + 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0> + 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5> + 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5> + 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS + 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5> + 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS + 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2> + 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2> + 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS + 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1> + 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2> + 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2> + 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7> + 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS + 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS + 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6> + 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7> + 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS + 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS + 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS + 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS + 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS + 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS + 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS + 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS + 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0> + 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0> + 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0> + 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS + 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u> + 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS + 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2> + 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS + 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u> + 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3> + 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3> + 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS + 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3> + 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS + 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS + 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4> + 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2> + 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS + 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS + 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4> + 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS + 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS + 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3> + 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5> + 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6> + 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS + 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS + 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6> + 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3> + 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u> + 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6> + 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6> + 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS + 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7> + 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7> + 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2> + 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS + 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u> + 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u> + 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS + 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS + 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> + 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS + 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS + 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> + 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS + 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS + 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0> + 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS + 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4> + 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5> + 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS + 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS + 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1> + 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4> + 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3> + 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS + 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS + 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4> + 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS + 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u> + 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2> + 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5> + 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS + 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS + 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2> + 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4> + 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3> + 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3> + 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6> + 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> + 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> + 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS + 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4> + 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4> + 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS + 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS + 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS + 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS + 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5> + 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2> + 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS + 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS + 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS + 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS + 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS + 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2> + 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2> + 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2> + 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS + 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS + 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS + 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS + 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4> + 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7> + 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7> + 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS + 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 96808489U, // <u,4,7,6>: Cost 1 vrev RHS + 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7> + 96955963U, // <u,4,7,u>: Cost 1 vrev RHS + 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS + 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS + 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u> + 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2> + 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS + 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS + 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS + 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS + 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0> + 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS + 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5> + 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5> + 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5> + 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0> + 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS + 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS + 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5> + 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0> + 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7> + 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS + 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5> + 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3> + 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS + 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7> + 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5> + 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5> + 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS + 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7> + 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS + 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5> + 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS + 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3> + 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u> + 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3> + 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6> + 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5> + 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS + 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS + 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS + 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5> + 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4> + 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5> + 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS + 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5> + 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS + 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS + 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7> + 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2> + 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2> + 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS + 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS + 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS + 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS + 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6> + 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6> + 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6> + 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS + 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 27705344U, // <u,5,6,7>: Cost 0 copy RHS + 27705344U, // <u,5,6,u>: Cost 0 copy RHS + 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS + 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7> + 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2> + 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2> + 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS + 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS + 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS + 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS + 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u> + 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2> + 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u> + 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS + 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS + 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 27705344U, // <u,5,u,7>: Cost 0 copy RHS + 27705344U, // <u,5,u,u>: Cost 0 copy RHS + 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0> + 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6> + 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0> + 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6> + 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u> + 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0> + 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS + 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS + 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1> + 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1> + 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0> + 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3> + 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6> + 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7> + 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS + 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1> + 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS + 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2> + 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2> + 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1> + 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS + 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6> + 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7> + 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2> + 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3> + 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3> + 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3> + 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6> + 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6> + 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS + 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS + 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS + 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3> + 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4> + 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u> + 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS + 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> + 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS + 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS + 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS + 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3> + 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7> + 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6> + 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5> + 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0> + 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS + 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5> + 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS + 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2> + 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3> + 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2> + 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS + 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3> + 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS + 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS + 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS + 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS + 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS + 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS + 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS + 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS + 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS + 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS + 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS + 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS + 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS + 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0> + 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0> + 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7> + 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2> + 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS + 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7> + 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS + 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7> + 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7> + 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u> + 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7> + 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7> + 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2> + 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7> + 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7> + 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7> + 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u> + 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7> + 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u> + 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u> + 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3> + 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3> + 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7> + 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7> + 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4> + 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS + 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0> + 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS + 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS + 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS + 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6> + 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7> + 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2> + 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7> + 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7> + 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7> + 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS + 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS + 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS + 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS + 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS + 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS + 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS + 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0> + 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0> + 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS + 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS + 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS + 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS + 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1> + 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS + 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS + 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2> + 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS + 835584U, // <u,u,2,3>: Cost 0 copy LHS + 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS + 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 835584U, // <u,u,2,u>: Cost 0 copy LHS + 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS + 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 120371557U, // <u,u,3,2>: Cost 1 vrev LHS + 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS + 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS + 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS + 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS + 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS + 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4> + 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4> + 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS + 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS + 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS + 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS + 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5> + 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS + 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS + 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS + 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS + 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS + 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS + 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2> + 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS + 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6> + 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS + 27705344U, // <u,u,6,7>: Cost 0 copy RHS + 27705344U, // <u,u,6,u>: Cost 0 copy RHS + 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS + 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7> + 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7> + 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS + 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS + 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 120699277U, // <u,u,7,6>: Cost 1 vrev RHS + 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS + 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS + 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS + 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS + 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS + 835584U, // <u,u,u,3>: Cost 0 copy LHS + 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS + 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS + 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS + 27705344U, // <u,u,u,7>: Cost 0 copy RHS + 835584U, // <u,u,u,u>: Cost 0 copy LHS + 0 +}; diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp new file mode 100644 index 0000000..1cba1ba --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp @@ -0,0 +1,22 @@ +//===- ARMRegisterInfo.cpp - ARM Register Information -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMRegisterInfo.h" +using namespace llvm; + +ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii, + const ARMSubtarget &sti) + : ARMBaseRegisterInfo(tii, sti) { +} diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h new file mode 100644 index 0000000..8edfb9a --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h @@ -0,0 +1,33 @@ +//===- ARMRegisterInfo.h - ARM Register Information Impl --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMREGISTERINFO_H +#define ARMREGISTERINFO_H + +#include "ARM.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "ARMBaseRegisterInfo.h" + +namespace llvm { + class ARMSubtarget; + class ARMBaseInstrInfo; + class Type; + +struct ARMRegisterInfo : public ARMBaseRegisterInfo { +public: + ARMRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td new file mode 100644 index 0000000..036822d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -0,0 +1,352 @@ +//===- ARMRegisterInfo.td - ARM Register defs --------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the ARM register file +//===----------------------------------------------------------------------===// + +// Registers are identified with 4-bit ID numbers. +class ARMReg<bits<4> num, string n, list<Register> subregs = []> : Register<n> { + field bits<4> Num; + let Namespace = "ARM"; + let SubRegs = subregs; +} + +class ARMFReg<bits<6> num, string n> : Register<n> { + field bits<6> Num; + let Namespace = "ARM"; +} + +// Subregister indices. +let Namespace = "ARM" in { +// Note: Code depends on these having consecutive numbers. +def ssub_0 : SubRegIndex; +def ssub_1 : SubRegIndex; +def ssub_2 : SubRegIndex; // In a Q reg. +def ssub_3 : SubRegIndex; + +def dsub_0 : SubRegIndex; +def dsub_1 : SubRegIndex; +def dsub_2 : SubRegIndex; +def dsub_3 : SubRegIndex; +def dsub_4 : SubRegIndex; +def dsub_5 : SubRegIndex; +def dsub_6 : SubRegIndex; +def dsub_7 : SubRegIndex; + +def qsub_0 : SubRegIndex; +def qsub_1 : SubRegIndex; +def qsub_2 : SubRegIndex; +def qsub_3 : SubRegIndex; + +def qqsub_0 : SubRegIndex; +def qqsub_1 : SubRegIndex; +} + +// Integer registers +def R0 : ARMReg< 0, "r0">, DwarfRegNum<[0]>; +def R1 : ARMReg< 1, "r1">, DwarfRegNum<[1]>; +def R2 : ARMReg< 2, "r2">, DwarfRegNum<[2]>; +def R3 : ARMReg< 3, "r3">, DwarfRegNum<[3]>; +def R4 : ARMReg< 4, "r4">, DwarfRegNum<[4]>; +def R5 : ARMReg< 5, "r5">, DwarfRegNum<[5]>; +def R6 : ARMReg< 6, "r6">, DwarfRegNum<[6]>; +def R7 : ARMReg< 7, "r7">, DwarfRegNum<[7]>; +// These require 32-bit instructions. +let CostPerUse = 1 in { +def R8 : ARMReg< 8, "r8">, DwarfRegNum<[8]>; +def R9 : ARMReg< 9, "r9">, DwarfRegNum<[9]>; +def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>; +def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>; +def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>; +def SP : ARMReg<13, "sp">, DwarfRegNum<[13]>; +def LR : ARMReg<14, "lr">, DwarfRegNum<[14]>; +def PC : ARMReg<15, "pc">, DwarfRegNum<[15]>; +} + +// Float registers +def S0 : ARMFReg< 0, "s0">; def S1 : ARMFReg< 1, "s1">; +def S2 : ARMFReg< 2, "s2">; def S3 : ARMFReg< 3, "s3">; +def S4 : ARMFReg< 4, "s4">; def S5 : ARMFReg< 5, "s5">; +def S6 : ARMFReg< 6, "s6">; def S7 : ARMFReg< 7, "s7">; +def S8 : ARMFReg< 8, "s8">; def S9 : ARMFReg< 9, "s9">; +def S10 : ARMFReg<10, "s10">; def S11 : ARMFReg<11, "s11">; +def S12 : ARMFReg<12, "s12">; def S13 : ARMFReg<13, "s13">; +def S14 : ARMFReg<14, "s14">; def S15 : ARMFReg<15, "s15">; +def S16 : ARMFReg<16, "s16">; def S17 : ARMFReg<17, "s17">; +def S18 : ARMFReg<18, "s18">; def S19 : ARMFReg<19, "s19">; +def S20 : ARMFReg<20, "s20">; def S21 : ARMFReg<21, "s21">; +def S22 : ARMFReg<22, "s22">; def S23 : ARMFReg<23, "s23">; +def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">; +def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">; +def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">; +def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">; + +// Aliases of the F* registers used to hold 64-bit fp values (doubles) +let SubRegIndices = [ssub_0, ssub_1] in { +def D0 : ARMReg< 0, "d0", [S0, S1]>, DwarfRegNum<[256]>; +def D1 : ARMReg< 1, "d1", [S2, S3]>, DwarfRegNum<[257]>; +def D2 : ARMReg< 2, "d2", [S4, S5]>, DwarfRegNum<[258]>; +def D3 : ARMReg< 3, "d3", [S6, S7]>, DwarfRegNum<[259]>; +def D4 : ARMReg< 4, "d4", [S8, S9]>, DwarfRegNum<[260]>; +def D5 : ARMReg< 5, "d5", [S10, S11]>, DwarfRegNum<[261]>; +def D6 : ARMReg< 6, "d6", [S12, S13]>, DwarfRegNum<[262]>; +def D7 : ARMReg< 7, "d7", [S14, S15]>, DwarfRegNum<[263]>; +def D8 : ARMReg< 8, "d8", [S16, S17]>, DwarfRegNum<[264]>; +def D9 : ARMReg< 9, "d9", [S18, S19]>, DwarfRegNum<[265]>; +def D10 : ARMReg<10, "d10", [S20, S21]>, DwarfRegNum<[266]>; +def D11 : ARMReg<11, "d11", [S22, S23]>, DwarfRegNum<[267]>; +def D12 : ARMReg<12, "d12", [S24, S25]>, DwarfRegNum<[268]>; +def D13 : ARMReg<13, "d13", [S26, S27]>, DwarfRegNum<[269]>; +def D14 : ARMReg<14, "d14", [S28, S29]>, DwarfRegNum<[270]>; +def D15 : ARMReg<15, "d15", [S30, S31]>, DwarfRegNum<[271]>; +} + +// VFP3 defines 16 additional double registers +def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>; +def D17 : ARMFReg<17, "d17">, DwarfRegNum<[273]>; +def D18 : ARMFReg<18, "d18">, DwarfRegNum<[274]>; +def D19 : ARMFReg<19, "d19">, DwarfRegNum<[275]>; +def D20 : ARMFReg<20, "d20">, DwarfRegNum<[276]>; +def D21 : ARMFReg<21, "d21">, DwarfRegNum<[277]>; +def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>; +def D23 : ARMFReg<23, "d23">, DwarfRegNum<[279]>; +def D24 : ARMFReg<24, "d24">, DwarfRegNum<[280]>; +def D25 : ARMFReg<25, "d25">, DwarfRegNum<[281]>; +def D26 : ARMFReg<26, "d26">, DwarfRegNum<[282]>; +def D27 : ARMFReg<27, "d27">, DwarfRegNum<[283]>; +def D28 : ARMFReg<28, "d28">, DwarfRegNum<[284]>; +def D29 : ARMFReg<29, "d29">, DwarfRegNum<[285]>; +def D30 : ARMFReg<30, "d30">, DwarfRegNum<[286]>; +def D31 : ARMFReg<31, "d31">, DwarfRegNum<[287]>; + +// Advanced SIMD (NEON) defines 16 quad-word aliases +let SubRegIndices = [dsub_0, dsub_1], + CompositeIndices = [(ssub_2 dsub_1, ssub_0), + (ssub_3 dsub_1, ssub_1)] in { +def Q0 : ARMReg< 0, "q0", [D0, D1]>; +def Q1 : ARMReg< 1, "q1", [D2, D3]>; +def Q2 : ARMReg< 2, "q2", [D4, D5]>; +def Q3 : ARMReg< 3, "q3", [D6, D7]>; +def Q4 : ARMReg< 4, "q4", [D8, D9]>; +def Q5 : ARMReg< 5, "q5", [D10, D11]>; +def Q6 : ARMReg< 6, "q6", [D12, D13]>; +def Q7 : ARMReg< 7, "q7", [D14, D15]>; +} +let SubRegIndices = [dsub_0, dsub_1] in { +def Q8 : ARMReg< 8, "q8", [D16, D17]>; +def Q9 : ARMReg< 9, "q9", [D18, D19]>; +def Q10 : ARMReg<10, "q10", [D20, D21]>; +def Q11 : ARMReg<11, "q11", [D22, D23]>; +def Q12 : ARMReg<12, "q12", [D24, D25]>; +def Q13 : ARMReg<13, "q13", [D26, D27]>; +def Q14 : ARMReg<14, "q14", [D28, D29]>; +def Q15 : ARMReg<15, "q15", [D30, D31]>; +} + +// Pseudo 256-bit registers to represent pairs of Q registers. These should +// never be present in the emitted code. +// These are used for NEON load / store instructions, e.g., vld4, vst3. +// NOTE: It's possible to define more QQ registers since technically the +// starting D register number doesn't have to be multiple of 4, e.g., +// D1, D2, D3, D4 would be a legal quad, but that would make the subregister +// stuff very messy. +let SubRegIndices = [qsub_0, qsub_1], + CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1)] in { +def QQ0 : ARMReg<0, "qq0", [Q0, Q1]>; +def QQ1 : ARMReg<1, "qq1", [Q2, Q3]>; +def QQ2 : ARMReg<2, "qq2", [Q4, Q5]>; +def QQ3 : ARMReg<3, "qq3", [Q6, Q7]>; +def QQ4 : ARMReg<4, "qq4", [Q8, Q9]>; +def QQ5 : ARMReg<5, "qq5", [Q10, Q11]>; +def QQ6 : ARMReg<6, "qq6", [Q12, Q13]>; +def QQ7 : ARMReg<7, "qq7", [Q14, Q15]>; +} + +// Pseudo 512-bit registers to represent four consecutive Q registers. +let SubRegIndices = [qqsub_0, qqsub_1], + CompositeIndices = [(qsub_2 qqsub_1, qsub_0), (qsub_3 qqsub_1, qsub_1), + (dsub_4 qqsub_1, dsub_0), (dsub_5 qqsub_1, dsub_1), + (dsub_6 qqsub_1, dsub_2), (dsub_7 qqsub_1, dsub_3)] in { +def QQQQ0 : ARMReg<0, "qqqq0", [QQ0, QQ1]>; +def QQQQ1 : ARMReg<1, "qqqq1", [QQ2, QQ3]>; +def QQQQ2 : ARMReg<2, "qqqq2", [QQ4, QQ5]>; +def QQQQ3 : ARMReg<3, "qqqq3", [QQ6, QQ7]>; +} + +// Current Program Status Register. +def CPSR : ARMReg<0, "cpsr">; +def APSR : ARMReg<1, "apsr">; +def SPSR : ARMReg<2, "spsr">; +def FPSCR : ARMReg<3, "fpscr">; +def ITSTATE : ARMReg<4, "itstate">; + +// Special Registers - only available in privileged mode. +def FPSID : ARMReg<0, "fpsid">; +def FPEXC : ARMReg<8, "fpexc">; + +// Register classes. +// +// pc == Program Counter +// lr == Link Register +// sp == Stack Pointer +// r12 == ip (scratch) +// r7 == Frame Pointer (thumb-style backtraces) +// r9 == May be reserved as Thread Register +// r11 == Frame Pointer (arm-style backtraces) +// r10 == Stack Limit +// +def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), + SP, LR, PC)> { + // Allocate LR as the first CSR since it is always saved anyway. + // For Thumb1 mode, we don't want to allocate hi regs at all, as we don't + // know how to spill them. If we make our prologue/epilogue code smarter at + // some point, we can go back to using the above allocation orders for the + // Thumb1 instructions that know how to use hi regs. + let AltOrders = [(add LR, GPR), (trunc GPR, 8)]; + let AltOrderSelect = [{ + return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only(); + }]; +} + +// GPRs without the PC. Some ARM instructions do not allow the PC in +// certain operand slots, particularly as the destination. Primarily +// useful for disassembly. +def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> { + let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)]; + let AltOrderSelect = [{ + return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only(); + }]; +} + +// GPRsp - Only the SP is legal. Used by Thumb1 instructions that want the +// implied SP argument list. +// FIXME: It would be better to not use this at all and refactor the +// instructions to not have SP an an explicit argument. That makes +// frame index resolution a bit trickier, though. +def GPRsp : RegisterClass<"ARM", [i32], 32, (add SP)>; + +// restricted GPR register class. Many Thumb2 instructions allow the full +// register range for operands, but have undefined behaviours when PC +// or SP (R13 or R15) are used. The ARM ISA refers to these operands +// via the BadReg() pseudo-code description. +def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> { + let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)]; + let AltOrderSelect = [{ + return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only(); + }]; +} + +// Thumb registers are R0-R7 normally. Some instructions can still use +// the general GPR register class above (MOV, e.g.) +def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)>; + +// The high registers in thumb mode, R8-R15. +def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>; + +// For tail calls, we can't use callee-saved registers, as they are restored +// to the saved value before the tail call, which would clobber a call address. +// Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of +// this class and the preceding one(!) This is what we want. +def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R9, R12)> { + let AltOrders = [(and tcGPR, tGPR)]; + let AltOrderSelect = [{ + return MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only(); + }]; +} + +// Scalar single precision floating point register class.. +def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)>; + +// Subset of SPR which can be used as a source of NEON scalars for 16-bit +// operations +def SPR_8 : RegisterClass<"ARM", [f32], 32, (trunc SPR, 16)>; + +// Scalar double precision floating point / generic 64-bit vector register +// class. +// ARM requires only word alignment for double. It's more performant if it +// is double-word alignment though. +def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, + (sequence "D%u", 0, 31)> { + // Allocate non-VFP2 registers D16-D31 first. + let AltOrders = [(rotl DPR, 16)]; + let AltOrderSelect = [{ return 1; }]; +} + +// Subset of DPR that are accessible with VFP2 (and so that also have +// 32-bit SPR subregs). +def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, + (trunc DPR, 16)> { + let SubRegClasses = [(SPR ssub_0, ssub_1)]; +} + +// Subset of DPR which can be used as a source of NEON scalars for 16-bit +// operations +def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, + (trunc DPR, 8)> { + let SubRegClasses = [(SPR_8 ssub_0, ssub_1)]; +} + +// Generic 128-bit vector register class. +def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, + (sequence "Q%u", 0, 15)> { + let SubRegClasses = [(DPR dsub_0, dsub_1)]; + // Allocate non-VFP2 aliases Q8-Q15 first. + let AltOrders = [(rotl QPR, 8)]; + let AltOrderSelect = [{ return 1; }]; +} + +// Subset of QPR that have 32-bit SPR subregs. +def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (trunc QPR, 8)> { + let SubRegClasses = [(SPR ssub_0, ssub_1, ssub_2, ssub_3), + (DPR_VFP2 dsub_0, dsub_1)]; +} + +// Subset of QPR that have DPR_8 and SPR_8 subregs. +def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (trunc QPR, 4)> { + let SubRegClasses = [(SPR_8 ssub_0, ssub_1, ssub_2, ssub_3), + (DPR_8 dsub_0, dsub_1)]; +} + +// Pseudo 256-bit vector register class to model pairs of Q registers +// (4 consecutive D registers). +def QQPR : RegisterClass<"ARM", [v4i64], 256, (sequence "QQ%u", 0, 7)> { + let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3), + (QPR qsub_0, qsub_1)]; + // Allocate non-VFP2 aliases first. + let AltOrders = [(rotl QQPR, 4)]; + let AltOrderSelect = [{ return 1; }]; +} + +// Subset of QQPR that have 32-bit SPR subregs. +def QQPR_VFP2 : RegisterClass<"ARM", [v4i64], 256, (trunc QQPR, 4)> { + let SubRegClasses = [(SPR ssub_0, ssub_1, ssub_2, ssub_3), + (DPR_VFP2 dsub_0, dsub_1, dsub_2, dsub_3), + (QPR_VFP2 qsub_0, qsub_1)]; + +} + +// Pseudo 512-bit vector register class to model 4 consecutive Q registers +// (8 consecutive D registers). +def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (sequence "QQQQ%u", 0, 3)> { + let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3, + dsub_4, dsub_5, dsub_6, dsub_7), + (QPR qsub_0, qsub_1, qsub_2, qsub_3)]; + // Allocate non-VFP2 aliases first. + let AltOrders = [(rotl QQQQPR, 2)]; + let AltOrderSelect = [{ return 1; }]; +} + +// Condition code registers. +def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMRelocations.h b/contrib/llvm/lib/Target/ARM/ARMRelocations.h new file mode 100644 index 0000000..86e7206 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMRelocations.h @@ -0,0 +1,62 @@ +//===- ARMRelocations.h - ARM Code Relocations ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the ARM target-specific relocation types. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMRELOCATIONS_H +#define ARMRELOCATIONS_H + +#include "llvm/CodeGen/MachineRelocation.h" + +namespace llvm { + namespace ARM { + enum RelocationType { + // reloc_arm_absolute - Absolute relocation, just add the relocated value + // to the value already in memory. + reloc_arm_absolute, + + // reloc_arm_relative - PC relative relocation, add the relocated value to + // the value already in memory, after we adjust it for where the PC is. + reloc_arm_relative, + + // reloc_arm_cp_entry - PC relative relocation for constpool_entry's whose + // addresses are kept locally in a map. + reloc_arm_cp_entry, + + // reloc_arm_vfp_cp_entry - Same as reloc_arm_cp_entry except the offset + // should be divided by 4. + reloc_arm_vfp_cp_entry, + + // reloc_arm_machine_cp_entry - Relocation of a ARM machine constantpool + // entry. + reloc_arm_machine_cp_entry, + + // reloc_arm_jt_base - PC relative relocation for jump tables whose + // addresses are kept locally in a map. + reloc_arm_jt_base, + + // reloc_arm_pic_jt - PIC jump table entry relocation: dest bb - jt base. + reloc_arm_pic_jt, + + // reloc_arm_branch - Branch address relocation. + reloc_arm_branch, + + // reloc_arm_movt - MOVT immediate relocation. + reloc_arm_movt, + + // reloc_arm_movw - MOVW immediate relocation. + reloc_arm_movw + }; + } +} + +#endif + diff --git a/contrib/llvm/lib/Target/ARM/ARMSchedule.td b/contrib/llvm/lib/Target/ARM/ARMSchedule.td new file mode 100644 index 0000000..958c5c6 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSchedule.td @@ -0,0 +1,261 @@ +//===- ARMSchedule.td - ARM Scheduling Definitions ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Instruction Itinerary classes used for ARM +// +def IIC_iALUx : InstrItinClass; +def IIC_iALUi : InstrItinClass; +def IIC_iALUr : InstrItinClass; +def IIC_iALUsi : InstrItinClass; +def IIC_iALUsir : InstrItinClass; +def IIC_iALUsr : InstrItinClass; +def IIC_iBITi : InstrItinClass; +def IIC_iBITr : InstrItinClass; +def IIC_iBITsi : InstrItinClass; +def IIC_iBITsr : InstrItinClass; +def IIC_iUNAr : InstrItinClass; +def IIC_iUNAsi : InstrItinClass; +def IIC_iEXTr : InstrItinClass; +def IIC_iEXTAr : InstrItinClass; +def IIC_iEXTAsr : InstrItinClass; +def IIC_iCMPi : InstrItinClass; +def IIC_iCMPr : InstrItinClass; +def IIC_iCMPsi : InstrItinClass; +def IIC_iCMPsr : InstrItinClass; +def IIC_iTSTi : InstrItinClass; +def IIC_iTSTr : InstrItinClass; +def IIC_iTSTsi : InstrItinClass; +def IIC_iTSTsr : InstrItinClass; +def IIC_iMOVi : InstrItinClass; +def IIC_iMOVr : InstrItinClass; +def IIC_iMOVsi : InstrItinClass; +def IIC_iMOVsr : InstrItinClass; +def IIC_iMOVix2 : InstrItinClass; +def IIC_iMOVix2addpc : InstrItinClass; +def IIC_iMOVix2ld : InstrItinClass; +def IIC_iMVNi : InstrItinClass; +def IIC_iMVNr : InstrItinClass; +def IIC_iMVNsi : InstrItinClass; +def IIC_iMVNsr : InstrItinClass; +def IIC_iCMOVi : InstrItinClass; +def IIC_iCMOVr : InstrItinClass; +def IIC_iCMOVsi : InstrItinClass; +def IIC_iCMOVsr : InstrItinClass; +def IIC_iCMOVix2 : InstrItinClass; +def IIC_iMUL16 : InstrItinClass; +def IIC_iMAC16 : InstrItinClass; +def IIC_iMUL32 : InstrItinClass; +def IIC_iMAC32 : InstrItinClass; +def IIC_iMUL64 : InstrItinClass; +def IIC_iMAC64 : InstrItinClass; +def IIC_iLoad_i : InstrItinClass; +def IIC_iLoad_r : InstrItinClass; +def IIC_iLoad_si : InstrItinClass; +def IIC_iLoad_iu : InstrItinClass; +def IIC_iLoad_ru : InstrItinClass; +def IIC_iLoad_siu : InstrItinClass; +def IIC_iLoad_bh_i : InstrItinClass; +def IIC_iLoad_bh_r : InstrItinClass; +def IIC_iLoad_bh_si : InstrItinClass; +def IIC_iLoad_bh_iu : InstrItinClass; +def IIC_iLoad_bh_ru : InstrItinClass; +def IIC_iLoad_bh_siu : InstrItinClass; +def IIC_iLoad_d_i : InstrItinClass; +def IIC_iLoad_d_r : InstrItinClass; +def IIC_iLoad_d_ru : InstrItinClass; +def IIC_iLoad_m : InstrItinClass<0>; // micro-coded +def IIC_iLoad_mu : InstrItinClass<0>; // micro-coded +def IIC_iLoad_mBr : InstrItinClass<0>; // micro-coded +def IIC_iPop : InstrItinClass<0>; // micro-coded +def IIC_iPop_Br : InstrItinClass<0>; // micro-coded +def IIC_iLoadiALU : InstrItinClass; +def IIC_iStore_i : InstrItinClass; +def IIC_iStore_r : InstrItinClass; +def IIC_iStore_si : InstrItinClass; +def IIC_iStore_iu : InstrItinClass; +def IIC_iStore_ru : InstrItinClass; +def IIC_iStore_siu : InstrItinClass; +def IIC_iStore_bh_i : InstrItinClass; +def IIC_iStore_bh_r : InstrItinClass; +def IIC_iStore_bh_si : InstrItinClass; +def IIC_iStore_bh_iu : InstrItinClass; +def IIC_iStore_bh_ru : InstrItinClass; +def IIC_iStore_bh_siu : InstrItinClass; +def IIC_iStore_d_i : InstrItinClass; +def IIC_iStore_d_r : InstrItinClass; +def IIC_iStore_d_ru : InstrItinClass; +def IIC_iStore_m : InstrItinClass<0>; // micro-coded +def IIC_iStore_mu : InstrItinClass<0>; // micro-coded +def IIC_Preload : InstrItinClass; +def IIC_Br : InstrItinClass; +def IIC_fpSTAT : InstrItinClass; +def IIC_fpUNA32 : InstrItinClass; +def IIC_fpUNA64 : InstrItinClass; +def IIC_fpCMP32 : InstrItinClass; +def IIC_fpCMP64 : InstrItinClass; +def IIC_fpCVTSD : InstrItinClass; +def IIC_fpCVTDS : InstrItinClass; +def IIC_fpCVTSH : InstrItinClass; +def IIC_fpCVTHS : InstrItinClass; +def IIC_fpCVTIS : InstrItinClass; +def IIC_fpCVTID : InstrItinClass; +def IIC_fpCVTSI : InstrItinClass; +def IIC_fpCVTDI : InstrItinClass; +def IIC_fpMOVIS : InstrItinClass; +def IIC_fpMOVID : InstrItinClass; +def IIC_fpMOVSI : InstrItinClass; +def IIC_fpMOVDI : InstrItinClass; +def IIC_fpALU32 : InstrItinClass; +def IIC_fpALU64 : InstrItinClass; +def IIC_fpMUL32 : InstrItinClass; +def IIC_fpMUL64 : InstrItinClass; +def IIC_fpMAC32 : InstrItinClass; +def IIC_fpMAC64 : InstrItinClass; +def IIC_fpDIV32 : InstrItinClass; +def IIC_fpDIV64 : InstrItinClass; +def IIC_fpSQRT32 : InstrItinClass; +def IIC_fpSQRT64 : InstrItinClass; +def IIC_fpLoad32 : InstrItinClass; +def IIC_fpLoad64 : InstrItinClass; +def IIC_fpLoad_m : InstrItinClass<0>; // micro-coded +def IIC_fpLoad_mu : InstrItinClass<0>; // micro-coded +def IIC_fpStore32 : InstrItinClass; +def IIC_fpStore64 : InstrItinClass; +def IIC_fpStore_m : InstrItinClass<0>; // micro-coded +def IIC_fpStore_mu : InstrItinClass<0>; // micro-coded +def IIC_VLD1 : InstrItinClass; +def IIC_VLD1x2 : InstrItinClass; +def IIC_VLD1x3 : InstrItinClass; +def IIC_VLD1x4 : InstrItinClass; +def IIC_VLD1u : InstrItinClass; +def IIC_VLD1x2u : InstrItinClass; +def IIC_VLD1x3u : InstrItinClass; +def IIC_VLD1x4u : InstrItinClass; +def IIC_VLD1ln : InstrItinClass; +def IIC_VLD1lnu : InstrItinClass; +def IIC_VLD1dup : InstrItinClass; +def IIC_VLD1dupu : InstrItinClass; +def IIC_VLD2 : InstrItinClass; +def IIC_VLD2x2 : InstrItinClass; +def IIC_VLD2u : InstrItinClass; +def IIC_VLD2x2u : InstrItinClass; +def IIC_VLD2ln : InstrItinClass; +def IIC_VLD2lnu : InstrItinClass; +def IIC_VLD2dup : InstrItinClass; +def IIC_VLD2dupu : InstrItinClass; +def IIC_VLD3 : InstrItinClass; +def IIC_VLD3ln : InstrItinClass; +def IIC_VLD3u : InstrItinClass; +def IIC_VLD3lnu : InstrItinClass; +def IIC_VLD3dup : InstrItinClass; +def IIC_VLD3dupu : InstrItinClass; +def IIC_VLD4 : InstrItinClass; +def IIC_VLD4ln : InstrItinClass; +def IIC_VLD4u : InstrItinClass; +def IIC_VLD4lnu : InstrItinClass; +def IIC_VLD4dup : InstrItinClass; +def IIC_VLD4dupu : InstrItinClass; +def IIC_VST1 : InstrItinClass; +def IIC_VST1x2 : InstrItinClass; +def IIC_VST1x3 : InstrItinClass; +def IIC_VST1x4 : InstrItinClass; +def IIC_VST1u : InstrItinClass; +def IIC_VST1x2u : InstrItinClass; +def IIC_VST1x3u : InstrItinClass; +def IIC_VST1x4u : InstrItinClass; +def IIC_VST1ln : InstrItinClass; +def IIC_VST1lnu : InstrItinClass; +def IIC_VST2 : InstrItinClass; +def IIC_VST2x2 : InstrItinClass; +def IIC_VST2u : InstrItinClass; +def IIC_VST2x2u : InstrItinClass; +def IIC_VST2ln : InstrItinClass; +def IIC_VST2lnu : InstrItinClass; +def IIC_VST3 : InstrItinClass; +def IIC_VST3u : InstrItinClass; +def IIC_VST3ln : InstrItinClass; +def IIC_VST3lnu : InstrItinClass; +def IIC_VST4 : InstrItinClass; +def IIC_VST4u : InstrItinClass; +def IIC_VST4ln : InstrItinClass; +def IIC_VST4lnu : InstrItinClass; +def IIC_VUNAD : InstrItinClass; +def IIC_VUNAQ : InstrItinClass; +def IIC_VBIND : InstrItinClass; +def IIC_VBINQ : InstrItinClass; +def IIC_VPBIND : InstrItinClass; +def IIC_VFMULD : InstrItinClass; +def IIC_VFMULQ : InstrItinClass; +def IIC_VMOV : InstrItinClass; +def IIC_VMOVImm : InstrItinClass; +def IIC_VMOVD : InstrItinClass; +def IIC_VMOVQ : InstrItinClass; +def IIC_VMOVIS : InstrItinClass; +def IIC_VMOVID : InstrItinClass; +def IIC_VMOVISL : InstrItinClass; +def IIC_VMOVSI : InstrItinClass; +def IIC_VMOVDI : InstrItinClass; +def IIC_VMOVN : InstrItinClass; +def IIC_VPERMD : InstrItinClass; +def IIC_VPERMQ : InstrItinClass; +def IIC_VPERMQ3 : InstrItinClass; +def IIC_VMACD : InstrItinClass; +def IIC_VMACQ : InstrItinClass; +def IIC_VRECSD : InstrItinClass; +def IIC_VRECSQ : InstrItinClass; +def IIC_VCNTiD : InstrItinClass; +def IIC_VCNTiQ : InstrItinClass; +def IIC_VUNAiD : InstrItinClass; +def IIC_VUNAiQ : InstrItinClass; +def IIC_VQUNAiD : InstrItinClass; +def IIC_VQUNAiQ : InstrItinClass; +def IIC_VBINiD : InstrItinClass; +def IIC_VBINiQ : InstrItinClass; +def IIC_VSUBiD : InstrItinClass; +def IIC_VSUBiQ : InstrItinClass; +def IIC_VBINi4D : InstrItinClass; +def IIC_VBINi4Q : InstrItinClass; +def IIC_VSUBi4D : InstrItinClass; +def IIC_VSUBi4Q : InstrItinClass; +def IIC_VABAD : InstrItinClass; +def IIC_VABAQ : InstrItinClass; +def IIC_VSHLiD : InstrItinClass; +def IIC_VSHLiQ : InstrItinClass; +def IIC_VSHLi4D : InstrItinClass; +def IIC_VSHLi4Q : InstrItinClass; +def IIC_VPALiD : InstrItinClass; +def IIC_VPALiQ : InstrItinClass; +def IIC_VMULi16D : InstrItinClass; +def IIC_VMULi32D : InstrItinClass; +def IIC_VMULi16Q : InstrItinClass; +def IIC_VMULi32Q : InstrItinClass; +def IIC_VMACi16D : InstrItinClass; +def IIC_VMACi32D : InstrItinClass; +def IIC_VMACi16Q : InstrItinClass; +def IIC_VMACi32Q : InstrItinClass; +def IIC_VEXTD : InstrItinClass; +def IIC_VEXTQ : InstrItinClass; +def IIC_VTB1 : InstrItinClass; +def IIC_VTB2 : InstrItinClass; +def IIC_VTB3 : InstrItinClass; +def IIC_VTB4 : InstrItinClass; +def IIC_VTBX1 : InstrItinClass; +def IIC_VTBX2 : InstrItinClass; +def IIC_VTBX3 : InstrItinClass; +def IIC_VTBX4 : InstrItinClass; + +//===----------------------------------------------------------------------===// +// Processor instruction itineraries. + +def GenericItineraries : ProcessorItineraries<[], [], []>; + +include "ARMScheduleV6.td" +include "ARMScheduleA8.td" +include "ARMScheduleA9.td" diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td new file mode 100644 index 0000000..8d86c01 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td @@ -0,0 +1,1034 @@ +//=- ARMScheduleA8.td - ARM Cortex-A8 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM Cortex A8 processors. +// +//===----------------------------------------------------------------------===// + +// +// Scheduling information derived from "Cortex-A8 Technical Reference Manual". +// Functional Units. +def A8_Pipe0 : FuncUnit; // pipeline 0 +def A8_Pipe1 : FuncUnit; // pipeline 1 +def A8_LSPipe : FuncUnit; // Load / store pipeline +def A8_NPipe : FuncUnit; // NEON ALU/MUL pipe +def A8_NLSPipe : FuncUnit; // NEON LS pipe +// +// Dual issue pipeline represented by A8_Pipe0 | A8_Pipe1 +// +def CortexA8Itineraries : ProcessorItineraries< + [A8_Pipe0, A8_Pipe1, A8_LSPipe, A8_NPipe, A8_NLSPipe], + [], [ + // Two fully-pipelined integer ALU pipelines + // + // No operand cycles + InstrItinData<IIC_iALUx , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>, + // + // Binary Instructions that produce a result + InstrItinData<IIC_iALUi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iALUr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>, + InstrItinData<IIC_iALUsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iALUsir,[InstrStage<1,[A8_Pipe0, A8_Pipe1]>], [2, 1, 2]>, + InstrItinData<IIC_iALUsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>, + // + // Bitwise Instructions that produce a result + InstrItinData<IIC_iBITi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iBITr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>, + InstrItinData<IIC_iBITsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iBITsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>, + // + // Unary Instructions that produce a result + InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iUNAsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + // + // Zero and sign extension instructions + InstrItinData<IIC_iEXTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iEXTAr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>],[2, 2, 1, 1]>, + // + // Compare instructions + InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iCMPsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMPsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + // + // Test instructions + InstrItinData<IIC_iTSTi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iTSTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iTSTsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iTSTsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + // + // Move instructions, unconditional + InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>, + InstrItinData<IIC_iMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3]>, + InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LSPipe]>], [5]>, + // + // Move instructions, conditional + InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + InstrItinData<IIC_iCMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3, 1]>, + // + // MVN instructions + InstrItinData<IIC_iMVNi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>, + InstrItinData<IIC_iMVNr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMVNsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMVNsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>, + + // Integer multiply pipeline + // Result written in E5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + // + InstrItinData<IIC_iMUL16 , [InstrStage<1, [A8_Pipe0]>], [5, 1, 1]>, + InstrItinData<IIC_iMAC16 , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, + InstrItinData<IIC_iMUL32 , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, + InstrItinData<IIC_iMUL64 , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, + + // Integer load pipeline + // + // Immediate offset + InstrItinData<IIC_iLoad_i , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + InstrItinData<IIC_iLoad_d_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + // + // Register offset + InstrItinData<IIC_iLoad_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + // + // Scaled register offset, issues over 2 cycles + // FIXME: lsl by 2 takes 1 cycle. + InstrItinData<IIC_iLoad_si , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>, + InstrItinData<IIC_iLoad_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iLoad_iu , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>, + InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>, + // + // Register offset with update + InstrItinData<IIC_iLoad_ru , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>, + InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>, + InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iLoad_siu , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>, + InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>, + // + // Load multiple, def is the 5th operand. Pipeline 0 only. + // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers. + InstrItinData<IIC_iLoad_m , [InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_LSPipe]>], [1, 1, 1, 1, 3]>, + // + // Load multiple + update, defs are the 1st and 5th operands. + InstrItinData<IIC_iLoad_mu , [InstrStage<3, [A8_Pipe0], 0>, + InstrStage<3, [A8_LSPipe]>], [2, 1, 1, 1, 3]>, + // + // Load multiple plus branch + InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [A8_Pipe0], 0>, + InstrStage<3, [A8_LSPipe]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], + [1, 2, 1, 1, 3]>, + // + // Pop, def is the 3rd operand. + InstrItinData<IIC_iPop , [InstrStage<3, [A8_Pipe0], 0>, + InstrStage<3, [A8_LSPipe]>], [1, 1, 3]>, + // + // Push, def is the 3th operand. + InstrItinData<IIC_iPop_Br, [InstrStage<3, [A8_Pipe0], 0>, + InstrStage<3, [A8_LSPipe]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], + [1, 1, 3]>, + + // + // iLoadi + iALUr for t2LDRpci_pic. + InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [4, 1]>, + + + // Integer store pipeline + // + // Immediate offset + InstrItinData<IIC_iStore_i , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1]>, + // + // Register offset + InstrItinData<IIC_iStore_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>, + // + // Scaled register offset, issues over 2 cycles + InstrItinData<IIC_iStore_si , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>, + InstrItinData<IIC_iStore_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iStore_iu , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>, + InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>, + // + // Register offset with update + InstrItinData<IIC_iStore_ru , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>, + InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>, + InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iStore_siu, [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>, + InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>, + // + // Store multiple. Pipeline 0 only. + // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers. + InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_LSPipe]>]>, + // + // Store multiple + update + InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_LSPipe]>], [2]>, + + // + // Preload + InstrItinData<IIC_Preload, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + + // Branch + // + // no delay slots, so the latency of a branch is unimportant + InstrItinData<IIC_Br , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>, + + // VFP + // Issue through integer pipeline, and execute in NEON unit. We assume + // RunFast mode so that NFP pipeline is used for single-precision when + // possible. + // + // FP Special Register to Integer Register File Move + InstrItinData<IIC_fpSTAT , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [20]>, + // + // Single-precision FP Unary + InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 1]>, + // + // Double-precision FP Unary + InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NPipe], 0>, + InstrStage<4, [A8_NLSPipe]>], [4, 1]>, + // + // Single-precision FP Compare + InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [1, 1]>, + // + // Double-precision FP Compare + InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NPipe], 0>, + InstrStage<4, [A8_NLSPipe]>], [4, 1]>, + // + // Single to Double FP Convert + InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<7, [A8_NPipe], 0>, + InstrStage<7, [A8_NLSPipe]>], [7, 1]>, + // + // Double to Single FP Convert + InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NPipe], 0>, + InstrStage<5, [A8_NLSPipe]>], [5, 1]>, + // + // Single-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 1]>, + // + // Double-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<8, [A8_NPipe], 0>, + InstrStage<8, [A8_NLSPipe]>], [8, 1]>, + // + // Integer to Single-Precision FP Convert + InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 1]>, + // + // Integer to Double-Precision FP Convert + InstrItinData<IIC_fpCVTID , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<8, [A8_NPipe], 0>, + InstrStage<8, [A8_NLSPipe]>], [8, 1]>, + // + // Single-precision FP ALU + InstrItinData<IIC_fpALU32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 1, 1]>, + // + // Double-precision FP ALU + InstrItinData<IIC_fpALU64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<9, [A8_NPipe], 0>, + InstrStage<9, [A8_NLSPipe]>], [9, 1, 1]>, + // + // Single-precision FP Multiply + InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 1, 1]>, + // + // Double-precision FP Multiply + InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<11, [A8_NPipe], 0>, + InstrStage<11, [A8_NLSPipe]>], [11, 1, 1]>, + // + // Single-precision FP MAC + InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>, + // + // Double-precision FP MAC + InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<19, [A8_NPipe], 0>, + InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>, + // + // Single-precision FP DIV + InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<20, [A8_NPipe], 0>, + InstrStage<20, [A8_NLSPipe]>], [20, 1, 1]>, + // + // Double-precision FP DIV + InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<29, [A8_NPipe], 0>, + InstrStage<29, [A8_NLSPipe]>], [29, 1, 1]>, + // + // Single-precision FP SQRT + InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<19, [A8_NPipe], 0>, + InstrStage<19, [A8_NLSPipe]>], [19, 1]>, + // + // Double-precision FP SQRT + InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<29, [A8_NPipe], 0>, + InstrStage<29, [A8_NLSPipe]>], [29, 1]>, + + // + // Integer to Single-precision Move + InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], + [2, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_fpMOVID, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], + [2, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], + [20, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], + [20, 20, 1]>, + + // + // Single-precision FP Load + InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [2, 1]>, + // + // Double-precision FP Load + InstrItinData<IIC_fpLoad64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [2, 1]>, + // + // FP Load Multiple + // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers. + InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 2]>, + // + // FP Load Multiple + update + InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 2]>, + // + // Single-precision FP Store + InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [1, 1]>, + // + // Double-precision FP Store + InstrItinData<IIC_fpStore64,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], + [1, 1]>, + // + // FP Store Multiple + InstrItinData<IIC_fpStore_m,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], [1, 1, 1, 1]>, + // + // FP Store Multiple + update + InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>, + InstrStage<1, [A8_NLSPipe], 0>, + InstrStage<1, [A8_LSPipe]>], [2, 1, 1, 1, 1]>, + + // NEON + // Issue through integer pipeline, and execute in NEON unit. + // + // VLD1 + InstrItinData<IIC_VLD1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1]>, + // VLD1x2 + InstrItinData<IIC_VLD1x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1]>, + // + // VLD1x3 + InstrItinData<IIC_VLD1x3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 1]>, + // + // VLD1x4 + InstrItinData<IIC_VLD1x4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 1]>, + // + // VLD1u + InstrItinData<IIC_VLD1u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1]>, + // + // VLD1x2u + InstrItinData<IIC_VLD1x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 2, 1]>, + // + // VLD1x3u + InstrItinData<IIC_VLD1x3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 2, 1]>, + // + // VLD1x4u + InstrItinData<IIC_VLD1x4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 2, 1]>, + // + // VLD1ln + InstrItinData<IIC_VLD1ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [3, 1, 1, 1]>, + // + // VLD1lnu + InstrItinData<IIC_VLD1lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [3, 2, 1, 1, 1, 1]>, + // + // VLD1dup + InstrItinData<IIC_VLD1dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1]>, + // + // VLD1dupu + InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1, 1]>, + // + // VLD2 + InstrItinData<IIC_VLD2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1]>, + // + // VLD2x2 + InstrItinData<IIC_VLD2x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 1]>, + // + // VLD2ln + InstrItinData<IIC_VLD2ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [3, 3, 1, 1, 1, 1]>, + // + // VLD2u + InstrItinData<IIC_VLD2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 2, 1, 1, 1]>, + // + // VLD2x2u + InstrItinData<IIC_VLD2x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 2, 1]>, + // + // VLD2lnu + InstrItinData<IIC_VLD2lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [3, 3, 2, 1, 1, 1, 1, 1]>, + // + // VLD2dup + InstrItinData<IIC_VLD2dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 1]>, + // + // VLD2dupu + InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 2, 2, 1, 1]>, + // + // VLD3 + InstrItinData<IIC_VLD3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 1]>, + // + // VLD3ln + InstrItinData<IIC_VLD3ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NLSPipe], 0>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 1, 1, 1, 1, 2]>, + // + // VLD3u + InstrItinData<IIC_VLD3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 2, 1]>, + // + // VLD3lnu + InstrItinData<IIC_VLD3lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NLSPipe], 0>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 2, 1, 1, 1, 1, 1, 2]>, + // + // VLD3dup + InstrItinData<IIC_VLD3dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 1]>, + // + // VLD3dupu + InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 2, 1, 1]>, + // + // VLD4 + InstrItinData<IIC_VLD4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 4, 1]>, + // + // VLD4ln + InstrItinData<IIC_VLD4ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NLSPipe], 0>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>, + // + // VLD4u + InstrItinData<IIC_VLD4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [3, 3, 4, 4, 2, 1]>, + // + // VLD4lnu + InstrItinData<IIC_VLD4lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<5, [A8_NLSPipe], 0>, + InstrStage<5, [A8_LSPipe]>], + [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VLD4dup + InstrItinData<IIC_VLD4dup, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 1]>, + // + // VLD4dupu + InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 2, 3, 3, 2, 1, 1]>, + // + // VST1 + InstrItinData<IIC_VST1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1]>, + // + // VST1x2 + InstrItinData<IIC_VST1x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1, 1]>, + // + // VST1x3 + InstrItinData<IIC_VST1x3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 1, 1, 2]>, + // + // VST1x4 + InstrItinData<IIC_VST1x4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST1u + InstrItinData<IIC_VST1u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1]>, + // + // VST1x2u + InstrItinData<IIC_VST1x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST1x3u + InstrItinData<IIC_VST1x3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST1x4u + InstrItinData<IIC_VST1x4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST1ln + InstrItinData<IIC_VST1ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1]>, + // + // VST1lnu + InstrItinData<IIC_VST1lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1]>, + // + // VST2 + InstrItinData<IIC_VST2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1, 1]>, + // + // VST2x2 + InstrItinData<IIC_VST2x2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST2u + InstrItinData<IIC_VST2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST2x2u + InstrItinData<IIC_VST2x2u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST2ln + InstrItinData<IIC_VST2ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [1, 1, 1, 1]>, + // + // VST2lnu + InstrItinData<IIC_VST2lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<2, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST3 + InstrItinData<IIC_VST3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 1, 1, 2]>, + // + // VST3u + InstrItinData<IIC_VST3u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST3ln + InstrItinData<IIC_VST3ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [1, 1, 1, 1, 2]>, + // + // VST3lnu + InstrItinData<IIC_VST3lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<3, [A8_NLSPipe], 0>, + InstrStage<3, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST4 + InstrItinData<IIC_VST4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4u + InstrItinData<IIC_VST4u, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST4ln + InstrItinData<IIC_VST4ln, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4lnu + InstrItinData<IIC_VST4lnu, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<4, [A8_NLSPipe], 0>, + InstrStage<4, [A8_LSPipe]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // Double-register FP Unary + InstrItinData<IIC_VUNAD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [5, 2]>, + // + // Quad-register FP Unary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + InstrItinData<IIC_VUNAQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [6, 2]>, + // + // Double-register FP Binary + InstrItinData<IIC_VBIND, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [5, 2, 2]>, + // + // VPADD, etc. + InstrItinData<IIC_VPBIND, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [5, 2, 2]>, + // + // Double-register FP VMUL + InstrItinData<IIC_VFMULD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [5, 2, 1]>, + + // + // Quad-register FP Binary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + InstrItinData<IIC_VBINQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [6, 2, 2]>, + // + // Quad-register FP VMUL + InstrItinData<IIC_VFMULQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [6, 2, 1]>, + // + // Move + InstrItinData<IIC_VMOV, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [1, 1]>, + // + // Move Immediate + InstrItinData<IIC_VMOVImm, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [3]>, + // + // Double-register Permute Move + InstrItinData<IIC_VMOVD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [2, 1]>, + // + // Quad-register Permute Move + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 3 for those cases + InstrItinData<IIC_VMOVQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe]>], [3, 1]>, + // + // Integer to Single-precision Move + InstrItinData<IIC_VMOVIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [2, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_VMOVID , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_VMOVSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [20, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_VMOVDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [20, 20, 1]>, + // + // Integer to Lane Move + InstrItinData<IIC_VMOVISL , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>, + // + // Vector narrow move + InstrItinData<IIC_VMOVN , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [2, 1]>, + // + // Double-register Permute + InstrItinData<IIC_VPERMD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [2, 2, 1, 1]>, + // + // Quad-register Permute + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 3 for those cases + InstrItinData<IIC_VPERMQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe]>], [3, 3, 1, 1]>, + // + // Quad-register Permute (3 cycle issue) + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 4 for those cases + InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>, + InstrStage<1, [A8_NPipe], 0>, + InstrStage<2, [A8_NLSPipe]>], [4, 4, 1, 1]>, + // + // Double-register FP Multiple-Accumulate + InstrItinData<IIC_VMACD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>, + // + // Quad-register FP Multiple-Accumulate + // Result written in N9, but that is relative to the last cycle of multicycle, + // so we use 10 for those cases + InstrItinData<IIC_VMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>, + // + // Double-register Reciprical Step + InstrItinData<IIC_VRECSD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [9, 2, 2]>, + // + // Quad-register Reciprical Step + InstrItinData<IIC_VRECSQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [10, 2, 2]>, + // + // Double-register Integer Count + InstrItinData<IIC_VCNTiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [3, 2, 2]>, + // + // Quad-register Integer Count + // Result written in N3, but that is relative to the last cycle of multicycle, + // so we use 4 for those cases + InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [4, 2, 2]>, + // + // Double-register Integer Unary + InstrItinData<IIC_VUNAiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [4, 2]>, + // + // Quad-register Integer Unary + InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [4, 2]>, + // + // Double-register Integer Q-Unary + InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [4, 1]>, + // + // Quad-register Integer CountQ-Unary + InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [4, 1]>, + // + // Double-register Integer Binary + InstrItinData<IIC_VBINiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [3, 2, 2]>, + // + // Quad-register Integer Binary + InstrItinData<IIC_VBINiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [3, 2, 2]>, + // + // Double-register Integer Binary (4 cycle) + InstrItinData<IIC_VBINi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [4, 2, 1]>, + // + // Quad-register Integer Binary (4 cycle) + InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [4, 2, 1]>, + + // + // Double-register Integer Subtract + InstrItinData<IIC_VSUBiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [3, 2, 1]>, + // + // Quad-register Integer Subtract + InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [3, 2, 1]>, + // + // Double-register Integer Subtract + InstrItinData<IIC_VSUBi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [4, 2, 1]>, + // + // Quad-register Integer Subtract + InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [4, 2, 1]>, + // + // Double-register Integer Shift + InstrItinData<IIC_VSHLiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [3, 1, 1]>, + // + // Quad-register Integer Shift + InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [4, 1, 1]>, + // + // Double-register Integer Shift (4 cycle) + InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [4, 1, 1]>, + // + // Quad-register Integer Shift (4 cycle) + InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [5, 1, 1]>, + // + // Double-register Integer Pair Add Long + InstrItinData<IIC_VPALiD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [6, 3, 1]>, + // + // Quad-register Integer Pair Add Long + InstrItinData<IIC_VPALiQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [7, 3, 1]>, + // + // Double-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [6, 3, 2, 1]>, + // + // Quad-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [6, 3, 2, 1]>, + + // + // Double-register Integer Multiply (.8, .16) + InstrItinData<IIC_VMULi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [6, 2, 2]>, + // + // Double-register Integer Multiply (.32) + InstrItinData<IIC_VMULi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [7, 2, 1]>, + // + // Quad-register Integer Multiply (.8, .16) + InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [7, 2, 2]>, + // + // Quad-register Integer Multiply (.32) + InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<3, [A8_NPipe]>], [9, 2, 1]>, + // + // Double-register Integer Multiply-Accumulate (.8, .16) + InstrItinData<IIC_VMACi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [6, 3, 2, 2]>, + // + // Double-register Integer Multiply-Accumulate (.32) + InstrItinData<IIC_VMACi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [7, 3, 2, 1]>, + // + // Quad-register Integer Multiply-Accumulate (.8, .16) + InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [7, 3, 2, 2]>, + // + // Quad-register Integer Multiply-Accumulate (.32) + InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>, + InstrStage<2, [A8_NLSPipe], 0>, + InstrStage<3, [A8_NPipe]>], [9, 3, 2, 1]>, + // + // Double-register VEXT + InstrItinData<IIC_VEXTD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>, + // + // Quad-register VEXT + InstrItinData<IIC_VEXTQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>, + // + // VTB + InstrItinData<IIC_VTB1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe]>], [3, 2, 1]>, + InstrItinData<IIC_VTB2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe]>], [3, 2, 2, 1]>, + InstrItinData<IIC_VTB3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>, + InstrStage<1, [A8_NPipe], 0>, + InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 1]>, + InstrItinData<IIC_VTB4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>, + InstrStage<1, [A8_NPipe], 0>, + InstrStage<2, [A8_NLSPipe]>],[4, 2, 2, 3, 3, 1]>, + // + // VTBX + InstrItinData<IIC_VTBX1, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 1]>, + InstrItinData<IIC_VTBX2, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 2, 1]>, + InstrItinData<IIC_VTBX3, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>, + InstrStage<1, [A8_NPipe], 0>, + InstrStage<2, [A8_NLSPipe]>],[4, 1, 2, 2, 3, 1]>, + InstrItinData<IIC_VTBX4, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NLSPipe]>, + InstrStage<1, [A8_NPipe], 0>, + InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]> +]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td new file mode 100644 index 0000000..49fedf6 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td @@ -0,0 +1,1827 @@ +//=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM Cortex A9 processors. +// +//===----------------------------------------------------------------------===// + +// +// Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical +// Reference Manual". +// +// Functional units +def A9_Issue0 : FuncUnit; // Issue 0 +def A9_Issue1 : FuncUnit; // Issue 1 +def A9_Branch : FuncUnit; // Branch +def A9_ALU0 : FuncUnit; // ALU / MUL pipeline 0 +def A9_ALU1 : FuncUnit; // ALU pipeline 1 +def A9_AGU : FuncUnit; // Address generation unit for ld / st +def A9_NPipe : FuncUnit; // NEON pipeline +def A9_MUX0 : FuncUnit; // AGU + NEON/FPU multiplexer +def A9_LSUnit : FuncUnit; // L/S Unit +def A9_DRegsVFP: FuncUnit; // FP register set, VFP side +def A9_DRegsN : FuncUnit; // FP register set, NEON side + +// Bypasses +def A9_LdBypass : Bypass; + +def CortexA9Itineraries : ProcessorItineraries< + [A9_Issue0, A9_Issue1, A9_Branch, A9_ALU0, A9_ALU1, A9_AGU, A9_NPipe, A9_MUX0, + A9_LSUnit, A9_DRegsVFP, A9_DRegsN], + [A9_LdBypass], [ + // Two fully-pipelined integer ALU pipelines + + // + // Move instructions, unconditional + InstrItinData<IIC_iMOVi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>, + InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>, + InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>, + InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [5]>, + // + // MVN instructions + InstrItinData<IIC_iMVNi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1]>, + InstrItinData<IIC_iMVNr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1], [NoBypass, A9_LdBypass]>, + InstrItinData<IIC_iMVNsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [2, 1]>, + InstrItinData<IIC_iMVNsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], + [3, 1, 1]>, + // + // No operand cycles + InstrItinData<IIC_iALUx , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>]>, + // + // Binary Instructions that produce a result + InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1], [NoBypass, A9_LdBypass]>, + InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>, + InstrItinData<IIC_iALUsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>, + InstrItinData<IIC_iALUsir,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>, + InstrItinData<IIC_iALUsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], + [3, 1, 1, 1], + [NoBypass, A9_LdBypass, NoBypass, NoBypass]>, + // + // Bitwise Instructions that produce a result + InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>, + InstrItinData<IIC_iBITsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>, + InstrItinData<IIC_iBITsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>, + // + // Unary Instructions that produce a result + + // CLZ, RBIT, etc. + InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + + // BFC, BFI, UBFX, SBFX + InstrItinData<IIC_iUNAsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1]>, + + // + // Zero and sign extension instructions + InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [2, 1]>, + InstrItinData<IIC_iEXTAr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [3, 1, 1]>, + InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>, + // + // Compare instructions + InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1], [A9_LdBypass]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [1, 1], [A9_LdBypass, A9_LdBypass]>, + InstrItinData<IIC_iCMPsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], + [1, 1], [A9_LdBypass, NoBypass]>, + InstrItinData<IIC_iCMPsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], + [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>, + // + // Test instructions + InstrItinData<IIC_iTSTi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>, + InstrItinData<IIC_iTSTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iTSTsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iTSTsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>, + // + // Move instructions, conditional + // FIXME: Correctly model the extra input dep on the destination. + InstrItinData<IIC_iCMOVi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>, + InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>, + InstrItinData<IIC_iCMOVix2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>, + InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>, + + // Integer multiply pipeline + // + InstrItinData<IIC_iMUL16 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0]>], [3, 1, 1]>, + InstrItinData<IIC_iMAC16 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0]>], + [3, 1, 1, 1]>, + InstrItinData<IIC_iMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0]>], [4, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<2, [A9_ALU0]>], + [4, 1, 1, 1]>, + InstrItinData<IIC_iMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0]>], [4, 5, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<3, [A9_ALU0]>], + [4, 5, 1, 1]>, + // Integer load pipeline + // FIXME: The timings are some rough approximations + // + // Immediate offset + InstrItinData<IIC_iLoad_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 1], [A9_LdBypass]>, + // FIXME: If address is 64-bit aligned, AGU cycles is 1. + InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 3, 1], [A9_LdBypass]>, + // + // Register offset + InstrItinData<IIC_iLoad_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 3, 1, 1], [A9_LdBypass]>, + // + // Scaled register offset + InstrItinData<IIC_iLoad_si , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit], 0>], + [4, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [5, 1, 1], [A9_LdBypass]>, + // + // Immediate offset with update + InstrItinData<IIC_iLoad_iu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 2, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 3, 1], [A9_LdBypass]>, + // + // Register offset with update + InstrItinData<IIC_iLoad_ru , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 2, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 3, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [3, 3, 1, 1], [A9_LdBypass]>, + // + // Scaled register offset with update + InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [4, 3, 1, 1], [A9_LdBypass]>, + InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [5, 4, 1, 1], [A9_LdBypass]>, + // + // Load multiple, def is the 5th operand. + // FIXME: This assumes 3 to 4 registers. + InstrItinData<IIC_iLoad_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + // + // Load multiple + update, defs are the 1st and 5th operands. + InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + // + // Load multiple plus branch + InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>, + InstrStage<1, [A9_Branch]>], + [1, 2, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + // + // Pop, def is the 3rd operand. + InstrItinData<IIC_iPop , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 3], + [NoBypass, NoBypass, A9_LdBypass]>, + // + // Pop + branch, def is the 3rd operand. + InstrItinData<IIC_iPop_Br, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<2, [A9_LSUnit]>, + InstrStage<1, [A9_Branch]>], + [1, 1, 3], + [NoBypass, NoBypass, A9_LdBypass]>, + + // + // iLoadi + iALUr for t2LDRpci_pic. + InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>, + InstrStage<1, [A9_ALU0, A9_ALU1]>], + [2, 1]>, + + // Integer store pipeline + /// + // Immediate offset + InstrItinData<IIC_iStore_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1]>, + InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1]>, + // FIXME: If address is 64-bit aligned, AGU cycles is 1. + InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1]>, + // + // Register offset + InstrItinData<IIC_iStore_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, + InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, + InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, + // + // Scaled register offset + InstrItinData<IIC_iStore_si , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, + InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iStore_iu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1]>, + InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], [3, 1, 1]>, + // + // Register offset with update + InstrItinData<IIC_iStore_ru , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1]>, + InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1, 1]>, + InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1, 1]>, + // + // Scaled register offset with update + InstrItinData<IIC_iStore_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1]>, + InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU], 1>, + InstrStage<1, [A9_LSUnit]>], + [3, 1, 1, 1]>, + // + // Store multiple + InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<2, [A9_LSUnit]>]>, + // + // Store multiple + update + InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU], 0>, + InstrStage<2, [A9_LSUnit]>], [2]>, + + // + // Preload + InstrItinData<IIC_Preload, [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>, + + // Branch + // + // no delay slots, so the latency of a branch is unimportant + InstrItinData<IIC_Br , [InstrStage<1, [A9_Issue0], 0>, + InstrStage<1, [A9_Issue1], 0>, + InstrStage<1, [A9_Branch]>]>, + + // VFP and NEON shares the same register file. This means that every VFP + // instruction should wait for full completion of the consecutive NEON + // instruction and vice-versa. We model this behavior with two artificial FUs: + // DRegsVFP and DRegsVFP. + // + // Every VFP instruction: + // - Acquires DRegsVFP resource for 1 cycle + // - Reserves DRegsN resource for the whole duration (including time to + // register file writeback!). + // Every NEON instruction does the same but with FUs swapped. + // + // Since the reserved FU cannot be acquired, this models precisely + // "cross-domain" stalls. + + // VFP + // Issue through integer pipeline, and execute in NEON unit. + + // FP Special Register to Integer Register File Move + InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1]>, + // + // Single-precision FP Unary + InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra latency cycles since wbck is 2 cycles + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, + // + // Double-precision FP Unary + InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra latency cycles since wbck is 2 cycles + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, + + // + // Single-precision FP Compare + InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra latency cycles since wbck is 4 cycles + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, + // + // Double-precision FP Compare + InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra latency cycles since wbck is 4 cycles + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, + // + // Single to Double FP Convert + InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Double to Single FP Convert + InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + + // + // Single to Half FP Convert + InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Half to Single FP Convert + InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, + + // + // Single-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Double-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Integer to Single-Precision FP Convert + InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Integer to Double-Precision FP Convert + InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Single-precision FP ALU + InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, + // + // Double-precision FP ALU + InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<5, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, + // + // Single-precision FP Multiply + InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<6, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [5, 1, 1]>, + // + // Double-precision FP Multiply + InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<7, [A9_DRegsN], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 1, 1]>, + // + // Single-precision FP MAC + InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<9, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [8, 1, 1, 1]>, + // + // Double-precision FP MAC + InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<10, [A9_DRegsN], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [9, 1, 1, 1]>, + // + // Single-precision FP DIV + InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<16, [A9_DRegsN], 0, Reserved>, + InstrStage<10, [A9_NPipe]>], + [15, 1, 1]>, + // + // Double-precision FP DIV + InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<26, [A9_DRegsN], 0, Reserved>, + InstrStage<20, [A9_NPipe]>], + [25, 1, 1]>, + // + // Single-precision FP SQRT + InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<18, [A9_DRegsN], 0, Reserved>, + InstrStage<13, [A9_NPipe]>], + [17, 1]>, + // + // Double-precision FP SQRT + InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<33, [A9_DRegsN], 0, Reserved>, + InstrStage<28, [A9_NPipe]>], + [32, 1]>, + + // + // Integer to Single-precision Move + InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra 1 latency cycle since wbck is 2 cycles + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_fpMOVID, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + // Extra 1 latency cycle since wbck is 2 cycles + InstrStage<3, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1, 1]>, + // + // Single-precision to Integer Move + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. + InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>], + [2, 1]>, + // + // Double-precision to Integer Move + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. + InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>], + [2, 1, 1]>, + // + // Single-precision FP Load + InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, + // + // Double-precision FP Load + // FIXME: Result latency is 1 if address is 64-bit aligned. + InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1]>, + // + // FP Load Multiple + // FIXME: assumes 2 doubles which requires 2 LS cycles. + InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, + // + // FP Load Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. + InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, + // + // Single-precision FP Store + InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, + // + // Double-precision FP Store + InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, + // + // FP Store Multiple + // FIXME: assumes 2 doubles which requires 2 LS cycles. + InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, + // + // FP Store Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. + InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<2, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, + // NEON + // VLD1 + InstrItinData<IIC_VLD1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, + // VLD1x2 + InstrItinData<IIC_VLD1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, + // VLD1x3 + InstrItinData<IIC_VLD1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 1]>, + // VLD1x4 + InstrItinData<IIC_VLD1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, + // VLD1u + InstrItinData<IIC_VLD1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 2, 1]>, + // VLD1x2u + InstrItinData<IIC_VLD1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 2, 1]>, + // VLD1x3u + InstrItinData<IIC_VLD1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, + // VLD1x4u + InstrItinData<IIC_VLD1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 2, 1]>, + // + // VLD1ln + InstrItinData<IIC_VLD1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 1, 1, 1]>, + // + // VLD1lnu + InstrItinData<IIC_VLD1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 2, 1, 1, 1, 1]>, + // + // VLD1dup + InstrItinData<IIC_VLD1dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1]>, + // + // VLD1dupu + InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1, 1]>, + // + // VLD2 + InstrItinData<IIC_VLD2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, + // + // VLD2x2 + InstrItinData<IIC_VLD2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 1]>, + // + // VLD2ln + InstrItinData<IIC_VLD2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 1, 1, 1, 1]>, + // + // VLD2u + InstrItinData<IIC_VLD2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1, 1]>, + // + // VLD2x2u + InstrItinData<IIC_VLD2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 2, 1]>, + // + // VLD2lnu + InstrItinData<IIC_VLD2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 2, 1, 1, 1, 1, 1]>, + // + // VLD2dup + InstrItinData<IIC_VLD2dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, + // + // VLD2dupu + InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1]>, + // + // VLD3 + InstrItinData<IIC_VLD3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, + // + // VLD3ln + InstrItinData<IIC_VLD3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<11,[A9_DRegsVFP], 0, Reserved>, + InstrStage<5, [A9_NPipe], 0>, + InstrStage<5, [A9_LSUnit]>], + [5, 5, 6, 1, 1, 1, 1, 2]>, + // + // VLD3u + InstrItinData<IIC_VLD3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1]>, + // + // VLD3lnu + InstrItinData<IIC_VLD3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<11,[A9_DRegsVFP], 0, Reserved>, + InstrStage<5, [A9_NPipe], 0>, + InstrStage<5, [A9_LSUnit]>], + [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>, + // + // VLD3dup + InstrItinData<IIC_VLD3dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, + // + // VLD3dupu + InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1, 1]>, + // + // VLD4 + InstrItinData<IIC_VLD4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 1]>, + // + // VLD4ln + InstrItinData<IIC_VLD4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>, + // + // VLD4u + InstrItinData<IIC_VLD4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 2, 1]>, + // + // VLD4lnu + InstrItinData<IIC_VLD4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VLD4dup + InstrItinData<IIC_VLD4dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 1]>, + // + // VLD4dupu + InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 2, 1, 1]>, + // + // VST1 + InstrItinData<IIC_VST1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, + // + // VST1x2 + InstrItinData<IIC_VST1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1, 1]>, + // + // VST1x3 + InstrItinData<IIC_VST1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2]>, + // + // VST1x4 + InstrItinData<IIC_VST1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST1u + InstrItinData<IIC_VST1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1]>, + // + // VST1x2u + InstrItinData<IIC_VST1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST1x3u + InstrItinData<IIC_VST1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST1x4u + InstrItinData<IIC_VST1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST1ln + InstrItinData<IIC_VST1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, + // + // VST1lnu + InstrItinData<IIC_VST1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1]>, + // + // VST2 + InstrItinData<IIC_VST2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1, 1]>, + // + // VST2x2 + InstrItinData<IIC_VST2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST2u + InstrItinData<IIC_VST2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST2x2u + InstrItinData<IIC_VST2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST2ln + InstrItinData<IIC_VST2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1, 1]>, + // + // VST2lnu + InstrItinData<IIC_VST2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1]>, + // + // VST3 + InstrItinData<IIC_VST3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2]>, + // + // VST3u + InstrItinData<IIC_VST3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST3ln + InstrItinData<IIC_VST3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [1, 1, 1, 1, 2]>, + // + // VST3lnu + InstrItinData<IIC_VST3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2]>, + // + // VST4 + InstrItinData<IIC_VST4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4u + InstrItinData<IIC_VST4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + // + // VST4ln + InstrItinData<IIC_VST4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 1, 1, 2, 2]>, + // + // VST4lnu + InstrItinData<IIC_VST4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 1, 1, 1, 1, 1, 2, 2]>, + + // + // Double-register Integer Unary + InstrItinData<IIC_VUNAiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 2]>, + // + // Quad-register Integer Unary + InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 2]>, + // + // Double-register Integer Q-Unary + InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Quad-register Integer CountQ-Unary + InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1]>, + // + // Double-register Integer Binary + InstrItinData<IIC_VBINiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 2]>, + // + // Quad-register Integer Binary + InstrItinData<IIC_VBINiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 2]>, + // + // Double-register Integer Subtract + InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 1]>, + // + // Quad-register Integer Subtract + InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 1]>, + // + // Double-register Integer Shift + InstrItinData<IIC_VSHLiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 1, 1]>, + // + // Quad-register Integer Shift + InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 1, 1]>, + // + // Double-register Integer Shift (4 cycle) + InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, + // + // Quad-register Integer Shift (4 cycle) + InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 1, 1]>, + // + // Double-register Integer Binary (4 cycle) + InstrItinData<IIC_VBINi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 2]>, + // + // Quad-register Integer Binary (4 cycle) + InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 2]>, + // + // Double-register Integer Subtract (4 cycle) + InstrItinData<IIC_VSUBi4D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 1]>, + // + // Quad-register Integer Subtract (4 cycle) + InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [4, 2, 1]>, + + // + // Double-register Integer Count + InstrItinData<IIC_VCNTiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 2, 2]>, + // + // Quad-register Integer Count + // Result written in N3, but that is relative to the last cycle of multicycle, + // so we use 4 for those cases + InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [4, 2, 2]>, + // + // Double-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [6, 3, 2, 1]>, + // + // Quad-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 2, 1]>, + // + // Double-register Integer Pair Add Long + InstrItinData<IIC_VPALiD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [6, 3, 1]>, + // + // Quad-register Integer Pair Add Long + InstrItinData<IIC_VPALiQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 1]>, + + // + // Double-register Integer Multiply (.8, .16) + InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [6, 2, 2]>, + // + // Quad-register Integer Multiply (.8, .16) + InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [7, 2, 2]>, + + // + // Double-register Integer Multiply (.32) + InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [7, 2, 1]>, + // + // Quad-register Integer Multiply (.32) + InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe]>], + [9, 2, 1]>, + // + // Double-register Integer Multiply-Accumulate (.8, .16) + InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [6, 3, 2, 2]>, + // + // Double-register Integer Multiply-Accumulate (.32) + InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [7, 3, 2, 1]>, + // + // Quad-register Integer Multiply-Accumulate (.8, .16) + InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [7, 3, 2, 2]>, + // + // Quad-register Integer Multiply-Accumulate (.32) + InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe]>], + [9, 3, 2, 1]>, + + // + // Move + InstrItinData<IIC_VMOV, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1,1]>, + // + // Move Immediate + InstrItinData<IIC_VMOVImm, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3]>, + // + // Double-register Permute Move + InstrItinData<IIC_VMOVD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, + // + // Quad-register Permute Move + InstrItinData<IIC_VMOVQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, + // + // Integer to Single-precision Move + InstrItinData<IIC_VMOVIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_VMOVID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [1, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_VMOVSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_VMOVDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<3, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 2, 1]>, + // + // Integer to Lane Move + InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + InstrStage<4, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [3, 1, 1]>, + + // + // Vector narrow move + InstrItinData<IIC_VMOVN, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [3, 1]>, + // + // Double-register FP Unary + InstrItinData<IIC_VUNAD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [5, 2]>, + // + // Quad-register FP Unary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + InstrItinData<IIC_VUNAQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 2]>, + // + // Double-register FP Binary + // FIXME: We're using this itin for many instructions and [2, 2] here is too + // optimistic. + InstrItinData<IIC_VBIND, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [5, 2, 2]>, + + // + // VPADD, etc. + InstrItinData<IIC_VPBIND, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [5, 1, 1]>, + // + // Double-register FP VMUL + InstrItinData<IIC_VFMULD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [5, 2, 1]>, + // + // Quad-register FP Binary + // Result written in N5, but that is relative to the last cycle of multicycle, + // so we use 6 for those cases + // FIXME: We're using this itin for many instructions and [2, 2] here is too + // optimistic. + InstrItinData<IIC_VBINQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 2, 2]>, + // + // Quad-register FP VMUL + InstrItinData<IIC_VFMULQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [6, 2, 1]>, + // + // Double-register FP Multiple-Accumulate + InstrItinData<IIC_VMACD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 2, 1]>, + // + // Quad-register FP Multiple-Accumulate + // Result written in N9, but that is relative to the last cycle of multicycle, + // so we use 10 for those cases + InstrItinData<IIC_VMACQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe]>], + [8, 4, 2, 1]>, + // + // Double-register Reciprical Step + InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 10 cycles + InstrStage<11, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [9, 2, 2]>, + // + // Quad-register Reciprical Step + InstrItinData<IIC_VRECSQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 11 cycles + InstrStage<12, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [10, 2, 2]>, + // + // Double-register Permute + InstrItinData<IIC_VPERMD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 2, 1, 1]>, + // + // Quad-register Permute + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 3 for those cases + InstrItinData<IIC_VPERMQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [3, 3, 1, 1]>, + // + // Quad-register Permute (3 cycle issue) + // Result written in N2, but that is relative to the last cycle of multicycle, + // so we use 4 for those cases + InstrItinData<IIC_VPERMQ3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 8 cycles + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe]>], + [4, 4, 1, 1]>, + + // + // Double-register VEXT + InstrItinData<IIC_VEXTD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 6 cycles + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [2, 1, 1]>, + // + // Quad-register VEXT + InstrItinData<IIC_VEXTQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [3, 1, 2]>, + // + // VTB + InstrItinData<IIC_VTB1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [3, 2, 1]>, + InstrItinData<IIC_VTB2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [3, 2, 2, 1]>, + InstrItinData<IIC_VTB3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 8 cycles + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe]>], + [4, 2, 2, 3, 1]>, + InstrItinData<IIC_VTB4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 8 cycles + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe]>], + [4, 2, 2, 3, 3, 1]>, + // + // VTBX + InstrItinData<IIC_VTBX1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [3, 1, 2, 1]>, + InstrItinData<IIC_VTBX2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [3, 1, 2, 2, 1]>, + InstrItinData<IIC_VTBX3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 8 cycles + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe]>], + [4, 1, 2, 2, 3, 1]>, + InstrItinData<IIC_VTBX4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 8 cycles + InstrStage<9, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [4, 1, 2, 2, 3, 3, 1]> +]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td new file mode 100644 index 0000000..c1880a7 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td @@ -0,0 +1,294 @@ +//===- ARMScheduleV6.td - ARM v6 Scheduling Definitions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM v6 processors. +// +//===----------------------------------------------------------------------===// + +// Model based on ARM1176 +// +// Functional Units +def V6_Pipe : FuncUnit; // pipeline + +// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual" +// +def ARMV6Itineraries : ProcessorItineraries< + [V6_Pipe], [], [ + // + // No operand cycles + InstrItinData<IIC_iALUx , [InstrStage<1, [V6_Pipe]>]>, + // + // Binary Instructions that produce a result + InstrItinData<IIC_iALUi , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iALUr , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, + InstrItinData<IIC_iALUsi , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iALUsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>, + // + // Bitwise Instructions that produce a result + InstrItinData<IIC_iBITi , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iBITr , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, + InstrItinData<IIC_iBITsi , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iBITsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>, + // + // Unary Instructions that produce a result + InstrItinData<IIC_iUNAr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iUNAsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + // + // Zero and sign extension instructions + InstrItinData<IIC_iEXTr , [InstrStage<1, [V6_Pipe]>], [1, 1]>, + InstrItinData<IIC_iEXTAr , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iEXTAsr , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>, + // + // Compare instructions + InstrItinData<IIC_iCMPi , [InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iCMPsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iCMPsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + // + // Test instructions + InstrItinData<IIC_iTSTi , [InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iTSTr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iTSTsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iTSTsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + // + // Move instructions, unconditional + InstrItinData<IIC_iMOVi , [InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iMOVsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iMOVsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + InstrItinData<IIC_iMOVix2 , [InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [3]>, + InstrItinData<IIC_iMOVix2ld , [InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [5]>, + // + // Move instructions, conditional + InstrItinData<IIC_iCMOVi , [InstrStage<1, [V6_Pipe]>], [3]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [V6_Pipe]>], [3, 2]>, + InstrItinData<IIC_iCMOVsi , [InstrStage<1, [V6_Pipe]>], [3, 1]>, + InstrItinData<IIC_iCMOVsr , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>, + InstrItinData<IIC_iCMOVix2 , [InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [4]>, + // + // MVN instructions + InstrItinData<IIC_iMVNi , [InstrStage<1, [V6_Pipe]>], [2]>, + InstrItinData<IIC_iMVNr , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + InstrItinData<IIC_iMVNsi , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iMVNsr , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>, + + // Integer multiply pipeline + // + InstrItinData<IIC_iMUL16 , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + InstrItinData<IIC_iMAC16 , [InstrStage<1, [V6_Pipe]>], [4, 1, 1, 2]>, + InstrItinData<IIC_iMUL32 , [InstrStage<2, [V6_Pipe]>], [5, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<2, [V6_Pipe]>], [5, 1, 1, 2]>, + InstrItinData<IIC_iMUL64 , [InstrStage<3, [V6_Pipe]>], [6, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<3, [V6_Pipe]>], [6, 1, 1, 2]>, + + // Integer load pipeline + // + // Immediate offset + InstrItinData<IIC_iLoad_i , [InstrStage<1, [V6_Pipe]>], [4, 1]>, + InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [V6_Pipe]>], [4, 1]>, + InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [V6_Pipe]>], [4, 1]>, + // + // Register offset + InstrItinData<IIC_iLoad_r , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>, + // + // Scaled register offset, issues over 2 cycles + InstrItinData<IIC_iLoad_si , [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>, + InstrItinData<IIC_iLoad_bh_si, [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iLoad_iu , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>, + InstrItinData<IIC_iLoad_bh_iu, [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>, + // + // Register offset with update + InstrItinData<IIC_iLoad_ru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>, + InstrItinData<IIC_iLoad_bh_ru, [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>, + InstrItinData<IIC_iLoad_d_ru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iLoad_siu, [InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>, + InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>, + + // + // Load multiple, def is the 5th operand. + InstrItinData<IIC_iLoad_m , [InstrStage<3, [V6_Pipe]>], [1, 1, 1, 1, 4]>, + // + // Load multiple + update, defs are the 1st and 5th operands. + InstrItinData<IIC_iLoad_mu , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 1, 4]>, + // + // Load multiple plus branch + InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [1, 2, 1, 1, 4]>, + + // + // iLoadi + iALUr for t2LDRpci_pic. + InstrItinData<IIC_iLoadiALU, [InstrStage<1, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [3, 1]>, + + // + // Pop, def is the 3rd operand. + InstrItinData<IIC_iPop , [InstrStage<3, [V6_Pipe]>], [1, 1, 4]>, + // + // Pop + branch, def is the 3rd operand. + InstrItinData<IIC_iPop_Br, [InstrStage<3, [V6_Pipe]>, + InstrStage<1, [V6_Pipe]>], [1, 2, 4]>, + + // Integer store pipeline + // + // Immediate offset + InstrItinData<IIC_iStore_i , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iStore_bh_i, [InstrStage<1, [V6_Pipe]>], [2, 1]>, + InstrItinData<IIC_iStore_d_i , [InstrStage<1, [V6_Pipe]>], [2, 1]>, + // + // Register offset + InstrItinData<IIC_iStore_r , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>, + InstrItinData<IIC_iStore_bh_r, [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>, + InstrItinData<IIC_iStore_d_r , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>, + // + // Scaled register offset, issues over 2 cycles + InstrItinData<IIC_iStore_si , [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iStore_bh_si, [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iStore_iu , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + InstrItinData<IIC_iStore_bh_iu, [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>, + // + // Register offset with update + InstrItinData<IIC_iStore_ru, [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>, + InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>, + InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>, + // + // Scaled register offset with update, issues over 2 cycles + InstrItinData<IIC_iStore_siu, [InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>, + InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>, + // + // Store multiple + InstrItinData<IIC_iStore_m , [InstrStage<3, [V6_Pipe]>]>, + // + // Store multiple + update + InstrItinData<IIC_iStore_mu , [InstrStage<3, [V6_Pipe]>], [2]>, + + // Branch + // + // no delay slots, so the latency of a branch is unimportant + InstrItinData<IIC_Br , [InstrStage<1, [V6_Pipe]>]>, + + // VFP + // Issue through integer pipeline, and execute in NEON unit. We assume + // RunFast mode so that NFP pipeline is used for single-precision when + // possible. + // + // FP Special Register to Integer Register File Move + InstrItinData<IIC_fpSTAT , [InstrStage<1, [V6_Pipe]>], [3]>, + // + // Single-precision FP Unary + InstrItinData<IIC_fpUNA32 , [InstrStage<1, [V6_Pipe]>], [5, 2]>, + // + // Double-precision FP Unary + InstrItinData<IIC_fpUNA64 , [InstrStage<1, [V6_Pipe]>], [5, 2]>, + // + // Single-precision FP Compare + InstrItinData<IIC_fpCMP32 , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + // + // Double-precision FP Compare + InstrItinData<IIC_fpCMP64 , [InstrStage<1, [V6_Pipe]>], [2, 2]>, + // + // Single to Double FP Convert + InstrItinData<IIC_fpCVTSD , [InstrStage<1, [V6_Pipe]>], [5, 2]>, + // + // Double to Single FP Convert + InstrItinData<IIC_fpCVTDS , [InstrStage<1, [V6_Pipe]>], [5, 2]>, + // + // Single-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTSI , [InstrStage<1, [V6_Pipe]>], [9, 2]>, + // + // Double-Precision FP to Integer Convert + InstrItinData<IIC_fpCVTDI , [InstrStage<1, [V6_Pipe]>], [9, 2]>, + // + // Integer to Single-Precision FP Convert + InstrItinData<IIC_fpCVTIS , [InstrStage<1, [V6_Pipe]>], [9, 2]>, + // + // Integer to Double-Precision FP Convert + InstrItinData<IIC_fpCVTID , [InstrStage<1, [V6_Pipe]>], [9, 2]>, + // + // Single-precision FP ALU + InstrItinData<IIC_fpALU32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>, + // + // Double-precision FP ALU + InstrItinData<IIC_fpALU64 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>, + // + // Single-precision FP Multiply + InstrItinData<IIC_fpMUL32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>, + // + // Double-precision FP Multiply + InstrItinData<IIC_fpMUL64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2]>, + // + // Single-precision FP MAC + InstrItinData<IIC_fpMAC32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>, + // + // Double-precision FP MAC + InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>, + // + // Single-precision FP DIV + InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>, + // + // Double-precision FP DIV + InstrItinData<IIC_fpDIV64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>, + // + // Single-precision FP SQRT + InstrItinData<IIC_fpSQRT32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>, + // + // Double-precision FP SQRT + InstrItinData<IIC_fpSQRT64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>, + // + // Integer to Single-precision Move + InstrItinData<IIC_fpMOVIS, [InstrStage<1, [V6_Pipe]>], [10, 1]>, + // + // Integer to Double-precision Move + InstrItinData<IIC_fpMOVID, [InstrStage<1, [V6_Pipe]>], [10, 1, 1]>, + // + // Single-precision to Integer Move + InstrItinData<IIC_fpMOVSI, [InstrStage<1, [V6_Pipe]>], [10, 1]>, + // + // Double-precision to Integer Move + InstrItinData<IIC_fpMOVDI, [InstrStage<1, [V6_Pipe]>], [10, 10, 1]>, + // + // Single-precision FP Load + InstrItinData<IIC_fpLoad32 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>, + // + // Double-precision FP Load + InstrItinData<IIC_fpLoad64 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>, + // + // FP Load Multiple + InstrItinData<IIC_fpLoad_m , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 5]>, + // + // FP Load Multiple + update + InstrItinData<IIC_fpLoad_mu, [InstrStage<3, [V6_Pipe]>], [3, 2, 1, 1, 5]>, + // + // Single-precision FP Store + InstrItinData<IIC_fpStore32 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, + // + // Double-precision FP Store + // use FU_Issue to enforce the 1 load/store per cycle limit + InstrItinData<IIC_fpStore64 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>, + // + // FP Store Multiple + InstrItinData<IIC_fpStore_m, [InstrStage<3, [V6_Pipe]>], [2, 2, 2, 2]>, + // + // FP Store Multiple + update + InstrItinData<IIC_fpStore_mu,[InstrStage<3, [V6_Pipe]>], [3, 2, 2, 2, 2]> +]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp new file mode 100644 index 0000000..a3a3d58 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -0,0 +1,197 @@ +//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARMSelectionDAGInfo class. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-selectiondag-info" +#include "ARMTargetMachine.h" +#include "llvm/DerivedTypes.h" +#include "llvm/CodeGen/SelectionDAG.h" +using namespace llvm; + +ARMSelectionDAGInfo::ARMSelectionDAGInfo(const TargetMachine &TM) + : TargetSelectionDAGInfo(TM), + Subtarget(&TM.getSubtarget<ARMSubtarget>()) { +} + +ARMSelectionDAGInfo::~ARMSelectionDAGInfo() { +} + +SDValue +ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const { + // Do repeated 4-byte loads and stores. To be improved. + // This requires 4-byte alignment. + if ((Align & 3) != 0) + return SDValue(); + // This requires the copy size to be a constant, preferably + // within a subtarget-specific limit. + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + if (!ConstantSize) + return SDValue(); + uint64_t SizeVal = ConstantSize->getZExtValue(); + if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold()) + return SDValue(); + + unsigned BytesLeft = SizeVal & 3; + unsigned NumMemOps = SizeVal >> 2; + unsigned EmittedNumMemOps = 0; + EVT VT = MVT::i32; + unsigned VTSize = 4; + unsigned i = 0; + const unsigned MAX_LOADS_IN_LDM = 6; + SDValue TFOps[MAX_LOADS_IN_LDM]; + SDValue Loads[MAX_LOADS_IN_LDM]; + uint64_t SrcOff = 0, DstOff = 0; + + // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the + // same number of stores. The loads and stores will get combined into + // ldm/stm later on. + while (EmittedNumMemOps < NumMemOps) { + for (i = 0; + i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { + Loads[i] = DAG.getLoad(VT, dl, Chain, + DAG.getNode(ISD::ADD, dl, MVT::i32, Src, + DAG.getConstant(SrcOff, MVT::i32)), + SrcPtrInfo.getWithOffset(SrcOff), isVolatile, + false, 0); + TFOps[i] = Loads[i].getValue(1); + SrcOff += VTSize; + } + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); + + for (i = 0; + i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { + TFOps[i] = DAG.getStore(Chain, dl, Loads[i], + DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, + DAG.getConstant(DstOff, MVT::i32)), + DstPtrInfo.getWithOffset(DstOff), + isVolatile, false, 0); + DstOff += VTSize; + } + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); + + EmittedNumMemOps += i; + } + + if (BytesLeft == 0) + return Chain; + + // Issue loads / stores for the trailing (1 - 3) bytes. + unsigned BytesLeftSave = BytesLeft; + i = 0; + while (BytesLeft) { + if (BytesLeft >= 2) { + VT = MVT::i16; + VTSize = 2; + } else { + VT = MVT::i8; + VTSize = 1; + } + + Loads[i] = DAG.getLoad(VT, dl, Chain, + DAG.getNode(ISD::ADD, dl, MVT::i32, Src, + DAG.getConstant(SrcOff, MVT::i32)), + SrcPtrInfo.getWithOffset(SrcOff), false, false, 0); + TFOps[i] = Loads[i].getValue(1); + ++i; + SrcOff += VTSize; + BytesLeft -= VTSize; + } + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); + + i = 0; + BytesLeft = BytesLeftSave; + while (BytesLeft) { + if (BytesLeft >= 2) { + VT = MVT::i16; + VTSize = 2; + } else { + VT = MVT::i8; + VTSize = 1; + } + + TFOps[i] = DAG.getStore(Chain, dl, Loads[i], + DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, + DAG.getConstant(DstOff, MVT::i32)), + DstPtrInfo.getWithOffset(DstOff), false, false, 0); + ++i; + DstOff += VTSize; + BytesLeft -= VTSize; + } + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); +} + +// Adjust parameters for memset, EABI uses format (ptr, size, value), +// GNU library uses (ptr, value, size) +// See RTABI section 4.3.4 +SDValue ARMSelectionDAGInfo:: +EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, SDValue Dst, + SDValue Src, SDValue Size, + unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { + // Use default for non AAPCS subtargets + if (!Subtarget->isAAPCS_ABI()) + return SDValue(); + + const ARMTargetLowering &TLI = + *static_cast<const ARMTargetLowering*>(DAG.getTarget().getTargetLowering()); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + + // First argument: data pointer + Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*DAG.getContext()); + Entry.Node = Dst; + Entry.Ty = IntPtrTy; + Args.push_back(Entry); + + // Second argument: buffer size + Entry.Node = Size; + Entry.Ty = IntPtrTy; + Entry.isSExt = false; + Args.push_back(Entry); + + // Extend or truncate the argument to be an i32 value for the call. + if (Src.getValueType().bitsGT(MVT::i32)) + Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); + else + Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src); + + // Third argument: value to fill + Entry.Node = Src; + Entry.Ty = Type::getInt32Ty(*DAG.getContext()); + Entry.isSExt = true; + Args.push_back(Entry); + + // Emit __eabi_memset call + std::pair<SDValue,SDValue> CallResult = + TLI.LowerCallTo(Chain, + Type::getVoidTy(*DAG.getContext()), // return type + false, // return sign ext + false, // return zero ext + false, // is var arg + false, // is in regs + 0, // number of fixed arguments + TLI.getLibcallCallingConv(RTLIB::MEMSET), // call conv + false, // is tail call + false, // is return val used + DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET), + TLI.getPointerTy()), // callee + Args, DAG, dl); // arg list, DAG and debug + + return CallResult.second; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h new file mode 100644 index 0000000..6419a73 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -0,0 +1,68 @@ +//===-- ARMSelectionDAGInfo.h - ARM SelectionDAG Info -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the ARM subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMSELECTIONDAGINFO_H +#define ARMSELECTIONDAGINFO_H + +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +namespace ARM_AM { + static inline ShiftOpc getShiftOpcForNode(unsigned Opcode) { + switch (Opcode) { + default: return ARM_AM::no_shift; + case ISD::SHL: return ARM_AM::lsl; + case ISD::SRL: return ARM_AM::lsr; + case ISD::SRA: return ARM_AM::asr; + case ISD::ROTR: return ARM_AM::ror; + //case ISD::ROTL: // Only if imm -> turn into ROTR. + // Can't handle RRX here, because it would require folding a flag into + // the addressing mode. :( This causes us to miss certain things. + //case ARMISD::RRX: return ARM_AM::rrx; + } + } +} // end namespace ARM_AM + +class ARMSelectionDAGInfo : public TargetSelectionDAGInfo { + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. + const ARMSubtarget *Subtarget; + +public: + explicit ARMSelectionDAGInfo(const TargetMachine &TM); + ~ARMSelectionDAGInfo(); + + virtual + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const; + + // Adjust parameters for memset, see RTABI section 4.3.4 + virtual + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, + SDValue Op1, SDValue Op2, + SDValue Op3, unsigned Align, + bool isVolatile, + MachinePointerInfo DstPtrInfo) const; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp new file mode 100644 index 0000000..247d6be --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -0,0 +1,219 @@ +//===-- ARMSubtarget.cpp - ARM Subtarget Information ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARM specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#include "ARMSubtarget.h" +#include "ARMBaseRegisterInfo.h" +#include "llvm/GlobalValue.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/ADT/SmallVector.h" + +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "ARMGenSubtargetInfo.inc" + +using namespace llvm; + +static cl::opt<bool> +ReserveR9("arm-reserve-r9", cl::Hidden, + cl::desc("Reserve R9, making it unavailable as GPR")); + +static cl::opt<bool> +DarwinUseMOVT("arm-darwin-use-movt", cl::init(true), cl::Hidden); + +static cl::opt<bool> +StrictAlign("arm-strict-align", cl::Hidden, + cl::desc("Disallow all unaligned memory accesses")); + +ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS) + : ARMGenSubtargetInfo(TT, CPU, FS) + , ARMProcFamily(Others) + , HasV4TOps(false) + , HasV5TOps(false) + , HasV5TEOps(false) + , HasV6Ops(false) + , HasV6T2Ops(false) + , HasV7Ops(false) + , HasVFPv2(false) + , HasVFPv3(false) + , HasNEON(false) + , UseNEONForSinglePrecisionFP(false) + , SlowFPVMLx(false) + , HasVMLxForwarding(false) + , SlowFPBrcc(false) + , InThumbMode(false) + , InNaClMode(false) + , HasThumb2(false) + , IsMClass(false) + , NoARM(false) + , PostRAScheduler(false) + , IsR9Reserved(ReserveR9) + , UseMovt(false) + , SupportsTailCall(false) + , HasFP16(false) + , HasD16(false) + , HasHardwareDivide(false) + , HasT2ExtractPack(false) + , HasDataBarrier(false) + , Pref32BitThumb(false) + , AvoidCPSRPartialUpdate(false) + , HasMPExtension(false) + , FPOnlySP(false) + , AllowsUnalignedMem(false) + , Thumb2DSP(false) + , stackAlignment(4) + , CPUString(CPU) + , TargetTriple(TT) + , TargetABI(ARM_ABI_APCS) { + // Determine default and user specified characteristics + if (CPUString.empty()) + CPUString = "generic"; + + // Insert the architecture feature derived from the target triple into the + // feature string. This is important for setting features that are implied + // based on the architecture version. + std::string ArchFS = ARM_MC::ParseARMTriple(TT); + if (!FS.empty()) { + if (!ArchFS.empty()) + ArchFS = ArchFS + "," + FS; + else + ArchFS = FS; + } + ParseSubtargetFeatures(CPUString, ArchFS); + + // Thumb2 implies at least V6T2. FIXME: Fix tests to explicitly specify a + // ARM version or CPU and then remove this. + if (!HasV6T2Ops && hasThumb2()) + HasV4TOps = HasV5TOps = HasV5TEOps = HasV6Ops = HasV6T2Ops = true; + + // Initialize scheduling itinerary for the specified CPU. + InstrItins = getInstrItineraryForCPU(CPUString); + + // After parsing Itineraries, set ItinData.IssueWidth. + computeIssueWidth(); + + if (TT.find("eabi") != std::string::npos) + TargetABI = ARM_ABI_AAPCS; + + if (isAAPCS_ABI()) + stackAlignment = 8; + + if (!isTargetDarwin()) + UseMovt = hasV6T2Ops(); + else { + IsR9Reserved = ReserveR9 | !HasV6Ops; + UseMovt = DarwinUseMOVT && hasV6T2Ops(); + const Triple &T = getTargetTriple(); + SupportsTailCall = T.getOS() == Triple::IOS && !T.isOSVersionLT(5, 0); + } + + if (!isThumb() || hasThumb2()) + PostRAScheduler = true; + + // v6+ may or may not support unaligned mem access depending on the system + // configuration. + if (!StrictAlign && hasV6Ops() && isTargetDarwin()) + AllowsUnalignedMem = true; +} + +/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol. +bool +ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV, + Reloc::Model RelocM) const { + if (RelocM == Reloc::Static) + return false; + + // Materializable GVs (in JIT lazy compilation mode) do not require an extra + // load from stub. + bool isDecl = GV->hasAvailableExternallyLinkage(); + if (GV->isDeclaration() && !GV->isMaterializable()) + isDecl = true; + + if (!isTargetDarwin()) { + // Extra load is needed for all externally visible. + if (GV->hasLocalLinkage() || GV->hasHiddenVisibility()) + return false; + return true; + } else { + if (RelocM == Reloc::PIC_) { + // If this is a strong reference to a definition, it is definitely not + // through a stub. + if (!isDecl && !GV->isWeakForLinker()) + return false; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return true; + + // If symbol visibility is hidden, we have a stub for common symbol + // references and external declarations. + if (isDecl || GV->hasCommonLinkage()) + // Hidden $non_lazy_ptr reference. + return true; + + return false; + } else { + // If this is a strong reference to a definition, it is definitely not + // through a stub. + if (!isDecl && !GV->isWeakForLinker()) + return false; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return true; + } + } + + return false; +} + +unsigned ARMSubtarget::getMispredictionPenalty() const { + // If we have a reasonable estimate of the pipeline depth, then we can + // estimate the penalty of a misprediction based on that. + if (isCortexA8()) + return 13; + else if (isCortexA9()) + return 8; + + // Otherwise, just return a sensible default. + return 10; +} + +void ARMSubtarget::computeIssueWidth() { + unsigned allStage1Units = 0; + for (const InstrItinerary *itin = InstrItins.Itineraries; + itin->FirstStage != ~0U; ++itin) { + const InstrStage *IS = InstrItins.Stages + itin->FirstStage; + allStage1Units |= IS->getUnits(); + } + InstrItins.IssueWidth = 0; + while (allStage1Units) { + ++InstrItins.IssueWidth; + // clear the lowest bit + allStage1Units ^= allStage1Units & ~(allStage1Units - 1); + } + assert(InstrItins.IssueWidth <= 2 && "itinerary bug, too many stage 1 units"); +} + +bool ARMSubtarget::enablePostRAScheduler( + CodeGenOpt::Level OptLevel, + TargetSubtargetInfo::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const { + Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; + CriticalPathRCs.clear(); + CriticalPathRCs.push_back(&ARM::GPRRegClass); + return PostRAScheduler && OptLevel >= CodeGenOpt::Default; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h new file mode 100644 index 0000000..b63e108 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h @@ -0,0 +1,270 @@ +//=====---- ARMSubtarget.h - Define Subtarget for the ARM -----*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the ARM specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMSUBTARGET_H +#define ARMSUBTARGET_H + +#include "MCTargetDesc/ARMMCTargetDesc.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/ADT/Triple.h" +#include <string> + +#define GET_SUBTARGETINFO_HEADER +#include "ARMGenSubtargetInfo.inc" + +namespace llvm { +class GlobalValue; +class StringRef; + +class ARMSubtarget : public ARMGenSubtargetInfo { +protected: + enum ARMProcFamilyEnum { + Others, CortexA8, CortexA9 + }; + + /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others. + ARMProcFamilyEnum ARMProcFamily; + + /// HasV4TOps, HasV5TOps, HasV5TEOps, HasV6Ops, HasV6T2Ops, HasV7Ops - + /// Specify whether target support specific ARM ISA variants. + bool HasV4TOps; + bool HasV5TOps; + bool HasV5TEOps; + bool HasV6Ops; + bool HasV6T2Ops; + bool HasV7Ops; + + /// HasVFPv2, HasVFPv3, HasNEON - Specify what floating point ISAs are + /// supported. + bool HasVFPv2; + bool HasVFPv3; + bool HasNEON; + + /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been + /// specified. Use the method useNEONForSinglePrecisionFP() to + /// determine if NEON should actually be used. + bool UseNEONForSinglePrecisionFP; + + /// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates + /// whether the FP VML[AS] instructions are slow (if so, don't use them). + bool SlowFPVMLx; + + /// HasVMLxForwarding - If true, NEON has special multiplier accumulator + /// forwarding to allow mul + mla being issued back to back. + bool HasVMLxForwarding; + + /// SlowFPBrcc - True if floating point compare + branch is slow. + bool SlowFPBrcc; + + /// InThumbMode - True if compiling for Thumb, false for ARM. + bool InThumbMode; + + /// InNaClMode - True if targeting Native Client + bool InNaClMode; + + /// HasThumb2 - True if Thumb2 instructions are supported. + bool HasThumb2; + + /// IsMClass - True if the subtarget belongs to the 'M' profile of CPUs - + /// v6m, v7m for example. + bool IsMClass; + + /// NoARM - True if subtarget does not support ARM mode execution. + bool NoARM; + + /// PostRAScheduler - True if using post-register-allocation scheduler. + bool PostRAScheduler; + + /// IsR9Reserved - True if R9 is a not available as general purpose register. + bool IsR9Reserved; + + /// UseMovt - True if MOVT / MOVW pairs are used for materialization of 32-bit + /// imms (including global addresses). + bool UseMovt; + + /// SupportsTailCall - True if the OS supports tail call. The dynamic linker + /// must be able to synthesize call stubs for interworking between ARM and + /// Thumb. + bool SupportsTailCall; + + /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF + /// only so far) + bool HasFP16; + + /// HasD16 - True if subtarget is limited to 16 double precision + /// FP registers for VFPv3. + bool HasD16; + + /// HasHardwareDivide - True if subtarget supports [su]div + bool HasHardwareDivide; + + /// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack + /// instructions. + bool HasT2ExtractPack; + + /// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier + /// instructions. + bool HasDataBarrier; + + /// Pref32BitThumb - If true, codegen would prefer 32-bit Thumb instructions + /// over 16-bit ones. + bool Pref32BitThumb; + + /// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions + /// that partially update CPSR and add false dependency on the previous + /// CPSR setting instruction. + bool AvoidCPSRPartialUpdate; + + /// HasMPExtension - True if the subtarget supports Multiprocessing + /// extension (ARMv7 only). + bool HasMPExtension; + + /// FPOnlySP - If true, the floating point unit only supports single + /// precision. + bool FPOnlySP; + + /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory + /// accesses for some types. For details, see + /// ARMTargetLowering::allowsUnalignedMemoryAccesses(). + bool AllowsUnalignedMem; + + /// Thumb2DSP - If true, the subtarget supports the v7 DSP (saturating arith + /// and such) instructions in Thumb2 code. + bool Thumb2DSP; + + /// stackAlignment - The minimum alignment known to hold of the stack frame on + /// entry to the function and which must be maintained by every function. + unsigned stackAlignment; + + /// CPUString - String name of used CPU. + std::string CPUString; + + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + + /// Selected instruction itineraries (one entry per itinerary class.) + InstrItineraryData InstrItins; + + public: + enum { + isELF, isDarwin + } TargetType; + + enum { + ARM_ABI_APCS, + ARM_ABI_AAPCS // ARM EABI + } TargetABI; + + /// This constructor initializes the data members to match that + /// of the specified triple. + /// + ARMSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS); + + /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size + /// that still makes it profitable to inline the call. + unsigned getMaxInlineSizeThreshold() const { + // FIXME: For now, we don't lower memcpy's to loads / stores for Thumb1. + // Change this once Thumb1 ldmia / stmia support is added. + return isThumb1Only() ? 0 : 64; + } + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + void computeIssueWidth(); + + bool hasV4TOps() const { return HasV4TOps; } + bool hasV5TOps() const { return HasV5TOps; } + bool hasV5TEOps() const { return HasV5TEOps; } + bool hasV6Ops() const { return HasV6Ops; } + bool hasV6T2Ops() const { return HasV6T2Ops; } + bool hasV7Ops() const { return HasV7Ops; } + + bool isCortexA8() const { return ARMProcFamily == CortexA8; } + bool isCortexA9() const { return ARMProcFamily == CortexA9; } + + bool hasARMOps() const { return !NoARM; } + + bool hasVFP2() const { return HasVFPv2; } + bool hasVFP3() const { return HasVFPv3; } + bool hasNEON() const { return HasNEON; } + bool useNEONForSinglePrecisionFP() const { + return hasNEON() && UseNEONForSinglePrecisionFP; } + + bool hasDivide() const { return HasHardwareDivide; } + bool hasT2ExtractPack() const { return HasT2ExtractPack; } + bool hasDataBarrier() const { return HasDataBarrier; } + bool useFPVMLx() const { return !SlowFPVMLx; } + bool hasVMLxForwarding() const { return HasVMLxForwarding; } + bool isFPBrccSlow() const { return SlowFPBrcc; } + bool isFPOnlySP() const { return FPOnlySP; } + bool prefers32BitThumb() const { return Pref32BitThumb; } + bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; } + bool hasMPExtension() const { return HasMPExtension; } + bool hasThumb2DSP() const { return Thumb2DSP; } + + bool hasFP16() const { return HasFP16; } + bool hasD16() const { return HasD16; } + + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetNaCl() const { + return TargetTriple.getOS() == Triple::NativeClient; + } + bool isTargetELF() const { return !isTargetDarwin(); } + + bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; } + bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; } + + bool isThumb() const { return InThumbMode; } + bool isThumb1Only() const { return InThumbMode && !HasThumb2; } + bool isThumb2() const { return InThumbMode && HasThumb2; } + bool hasThumb2() const { return HasThumb2; } + bool isMClass() const { return IsMClass; } + bool isARClass() const { return !IsMClass; } + + bool isR9Reserved() const { return IsR9Reserved; } + + bool useMovt() const { return UseMovt && hasV6T2Ops(); } + bool supportsTailCall() const { return SupportsTailCall; } + + bool allowsUnalignedMem() const { return AllowsUnalignedMem; } + + const std::string & getCPUString() const { return CPUString; } + + unsigned getMispredictionPenalty() const; + + /// enablePostRAScheduler - True at 'More' optimization. + bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, + TargetSubtargetInfo::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const; + + /// getInstrItins - Return the instruction itineraies based on subtarget + /// selection. + const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + + /// getStackAlignment - Returns the minimum alignment known to hold of the + /// stack frame on entry to the function and which must be maintained by every + /// function for this subtarget. + unsigned getStackAlignment() const { return stackAlignment; } + + /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect + /// symbol. + bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const; +}; +} // End llvm namespace + +#endif // ARMSUBTARGET_H diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp new file mode 100644 index 0000000..96b1e89 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -0,0 +1,160 @@ +//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "ARMTargetMachine.h" +#include "ARMFrameLowering.h" +#include "ARM.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +static cl::opt<bool> +EnableGlobalMerge("global-merge", cl::Hidden, + cl::desc("Enable global merge pass"), + cl::init(true)); + +extern "C" void LLVMInitializeARMTarget() { + // Register the target. + RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget); + RegisterTargetMachine<ThumbTargetMachine> Y(TheThumbTarget); +} + +/// TargetMachine ctor - Create an ARM architecture model. +/// +ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + Reloc::Model RM, CodeModel::Model CM) + : LLVMTargetMachine(T, TT, CPU, FS, RM, CM), + Subtarget(TT, CPU, FS), + JITInfo(), + InstrItins(Subtarget.getInstrItineraryData()) { + // Default to soft float ABI + if (FloatABIType == FloatABI::Default) + FloatABIType = FloatABI::Soft; +} + +ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + Reloc::Model RM, CodeModel::Model CM) + : ARMBaseTargetMachine(T, TT, CPU, FS, RM, CM), InstrInfo(Subtarget), + DataLayout(Subtarget.isAPCS_ABI() ? + std::string("e-p:32:32-f64:32:64-i64:32:64-" + "v128:32:128-v64:32:64-n32-S32") : + Subtarget.isAAPCS_ABI() ? + std::string("e-p:32:32-f64:64:64-i64:64:64-" + "v128:64:128-v64:64:64-n32-S64") : + std::string("e-p:32:32-f64:64:64-i64:64:64-" + "v128:64:128-v64:64:64-n32-S32")), + ELFWriterInfo(*this), + TLInfo(*this), + TSInfo(*this), + FrameLowering(Subtarget) { + if (!Subtarget.hasARMOps()) + report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " + "support ARM mode execution!"); +} + +ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + Reloc::Model RM, CodeModel::Model CM) + : ARMBaseTargetMachine(T, TT, CPU, FS, RM, CM), + InstrInfo(Subtarget.hasThumb2() + ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget)) + : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))), + DataLayout(Subtarget.isAPCS_ABI() ? + std::string("e-p:32:32-f64:32:64-i64:32:64-" + "i16:16:32-i8:8:32-i1:8:32-" + "v128:32:128-v64:32:64-a:0:32-n32-S32") : + Subtarget.isAAPCS_ABI() ? + std::string("e-p:32:32-f64:64:64-i64:64:64-" + "i16:16:32-i8:8:32-i1:8:32-" + "v128:64:128-v64:64:64-a:0:32-n32-S64") : + std::string("e-p:32:32-f64:64:64-i64:64:64-" + "i16:16:32-i8:8:32-i1:8:32-" + "v128:64:128-v64:64:64-a:0:32-n32-S32")), + ELFWriterInfo(*this), + TLInfo(*this), + TSInfo(*this), + FrameLowering(Subtarget.hasThumb2() + ? new ARMFrameLowering(Subtarget) + : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) { +} + +bool ARMBaseTargetMachine::addPreISel(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + if (OptLevel != CodeGenOpt::None && EnableGlobalMerge) + PM.add(createARMGlobalMergePass(getTargetLowering())); + + return false; +} + +bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + PM.add(createARMISelDag(*this, OptLevel)); + return false; +} + +bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + // FIXME: temporarily disabling load / store optimization pass for Thumb1. + if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) + PM.add(createARMLoadStoreOptimizationPass(true)); + if (OptLevel != CodeGenOpt::None && Subtarget.isCortexA9()) + PM.add(createMLxExpansionPass()); + return true; +} + +bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + // FIXME: temporarily disabling load / store optimization pass for Thumb1. + if (OptLevel != CodeGenOpt::None) { + if (!Subtarget.isThumb1Only()) + PM.add(createARMLoadStoreOptimizationPass()); + if (Subtarget.hasNEON()) + PM.add(createExecutionDependencyFixPass(&ARM::DPRRegClass)); + } + + // Expand some pseudo instructions into multiple instructions to allow + // proper scheduling. + PM.add(createARMExpandPseudoPass()); + + if (OptLevel != CodeGenOpt::None) { + if (!Subtarget.isThumb1Only()) + PM.add(createIfConverterPass()); + } + if (Subtarget.isThumb2()) + PM.add(createThumb2ITBlockPass()); + + return true; +} + +bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + if (Subtarget.isThumb2() && !Subtarget.prefers32BitThumb()) + PM.add(createThumb2SizeReductionPass()); + + PM.add(createARMConstantIslandPass()); + return true; +} + +bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM, + CodeGenOpt::Level OptLevel, + JITCodeEmitter &JCE) { + // Machine code emitter pass for ARM. + PM.add(createARMJITCodeEmitterPass(*this, JCE)); + return false; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h new file mode 100644 index 0000000..c8c601c --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -0,0 +1,145 @@ +//===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the ARM specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMTARGETMACHINE_H +#define ARMTARGETMACHINE_H + +#include "ARMInstrInfo.h" +#include "ARMELFWriterInfo.h" +#include "ARMFrameLowering.h" +#include "ARMJITInfo.h" +#include "ARMSubtarget.h" +#include "ARMISelLowering.h" +#include "ARMSelectionDAGInfo.h" +#include "Thumb1InstrInfo.h" +#include "Thumb1FrameLowering.h" +#include "Thumb2InstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/ADT/OwningPtr.h" + +namespace llvm { + +class ARMBaseTargetMachine : public LLVMTargetMachine { +protected: + ARMSubtarget Subtarget; +private: + ARMJITInfo JITInfo; + InstrItineraryData InstrItins; + +public: + ARMBaseTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + Reloc::Model RM, CodeModel::Model CM); + + virtual ARMJITInfo *getJITInfo() { return &JITInfo; } + virtual const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; } + virtual const InstrItineraryData *getInstrItineraryData() const { + return &InstrItins; + } + + // Pass Pipeline Configuration + virtual bool addPreISel(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreSched2(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, + JITCodeEmitter &MCE); +}; + +/// ARMTargetMachine - ARM target machine. +/// +class ARMTargetMachine : public ARMBaseTargetMachine { + ARMInstrInfo InstrInfo; + const TargetData DataLayout; // Calculates type size & alignment + ARMELFWriterInfo ELFWriterInfo; + ARMTargetLowering TLInfo; + ARMSelectionDAGInfo TSInfo; + ARMFrameLowering FrameLowering; + public: + ARMTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + Reloc::Model RM, CodeModel::Model CM); + + virtual const ARMRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + + virtual const ARMTargetLowering *getTargetLowering() const { + return &TLInfo; + } + + virtual const ARMSelectionDAGInfo* getSelectionDAGInfo() const { + return &TSInfo; + } + virtual const ARMFrameLowering *getFrameLowering() const { + return &FrameLowering; + } + + virtual const ARMInstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const TargetData *getTargetData() const { return &DataLayout; } + virtual const ARMELFWriterInfo *getELFWriterInfo() const { + return Subtarget.isTargetELF() ? &ELFWriterInfo : 0; + } +}; + +/// ThumbTargetMachine - Thumb target machine. +/// Due to the way architectures are handled, this represents both +/// Thumb-1 and Thumb-2. +/// +class ThumbTargetMachine : public ARMBaseTargetMachine { + // Either Thumb1InstrInfo or Thumb2InstrInfo. + OwningPtr<ARMBaseInstrInfo> InstrInfo; + const TargetData DataLayout; // Calculates type size & alignment + ARMELFWriterInfo ELFWriterInfo; + ARMTargetLowering TLInfo; + ARMSelectionDAGInfo TSInfo; + // Either Thumb1FrameLowering or ARMFrameLowering. + OwningPtr<ARMFrameLowering> FrameLowering; +public: + ThumbTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + Reloc::Model RM, CodeModel::Model CM); + + /// returns either Thumb1RegisterInfo or Thumb2RegisterInfo + virtual const ARMBaseRegisterInfo *getRegisterInfo() const { + return &InstrInfo->getRegisterInfo(); + } + + virtual const ARMTargetLowering *getTargetLowering() const { + return &TLInfo; + } + + virtual const ARMSelectionDAGInfo *getSelectionDAGInfo() const { + return &TSInfo; + } + + /// returns either Thumb1InstrInfo or Thumb2InstrInfo + virtual const ARMBaseInstrInfo *getInstrInfo() const { + return InstrInfo.get(); + } + /// returns either Thumb1FrameLowering or ARMFrameLowering + virtual const ARMFrameLowering *getFrameLowering() const { + return FrameLowering.get(); + } + virtual const TargetData *getTargetData() const { return &DataLayout; } + virtual const ARMELFWriterInfo *getELFWriterInfo() const { + return Subtarget.isTargetELF() ? &ELFWriterInfo : 0; + } +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp new file mode 100644 index 0000000..19defa1 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -0,0 +1,47 @@ +//===-- llvm/Target/ARMTargetObjectFile.cpp - ARM Object Info Impl --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARMTargetObjectFile.h" +#include "ARMSubtarget.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Support/ELF.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; +using namespace dwarf; + +//===----------------------------------------------------------------------===// +// ELF Target +//===----------------------------------------------------------------------===// + +void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + + if (TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI()) { + StaticCtorSection = + getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, + SectionKind::getDataRel()); + StaticDtorSection = + getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, + SectionKind::getDataRel()); + LSDASection = NULL; + } + + AttributesSection = + getContext().getELFSection(".ARM.attributes", + ELF::SHT_ARM_ATTRIBUTES, + 0, + SectionKind::getMetadata()); +} diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h new file mode 100644 index 0000000..c6a7261 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h @@ -0,0 +1,38 @@ +//===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_ARM_TARGETOBJECTFILE_H +#define LLVM_TARGET_ARM_TARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" + +namespace llvm { + +class MCContext; +class TargetMachine; + +class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF { +protected: + const MCSection *AttributesSection; +public: + ARMElfTargetObjectFile() : + TargetLoweringObjectFileELF(), + AttributesSection(NULL) + {} + + virtual void Initialize(MCContext &Ctx, const TargetMachine &TM); + + virtual const MCSection *getAttributesSection() const { + return AttributesSection; + } +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp new file mode 100644 index 0000000..14d35ba --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp @@ -0,0 +1,143 @@ +//===-- ARMAsmLexer.cpp - Tokenize ARM assembly to AsmTokens --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/ARMBaseInfo.h" + +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCTargetAsmLexer.h" + +#include "llvm/Support/TargetRegistry.h" + +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" + +#include <string> +#include <map> + +using namespace llvm; + +namespace { + +class ARMBaseAsmLexer : public MCTargetAsmLexer { + const MCAsmInfo &AsmInfo; + + const AsmToken &lexDefinite() { + return getLexer()->Lex(); + } + + AsmToken LexTokenUAL(); +protected: + typedef std::map <std::string, unsigned> rmap_ty; + + rmap_ty RegisterMap; + + void InitRegisterMap(const MCRegisterInfo *info) { + unsigned numRegs = info->getNumRegs(); + + for (unsigned i = 0; i < numRegs; ++i) { + const char *regName = info->getName(i); + if (regName) + RegisterMap[regName] = i; + } + } + + unsigned MatchRegisterName(StringRef Name) { + rmap_ty::iterator iter = RegisterMap.find(Name.str()); + if (iter != RegisterMap.end()) + return iter->second; + else + return 0; + } + + AsmToken LexToken() { + if (!Lexer) { + SetError(SMLoc(), "No MCAsmLexer installed"); + return AsmToken(AsmToken::Error, "", 0); + } + + switch (AsmInfo.getAssemblerDialect()) { + default: + SetError(SMLoc(), "Unhandled dialect"); + return AsmToken(AsmToken::Error, "", 0); + case 0: + return LexTokenUAL(); + } + } +public: + ARMBaseAsmLexer(const Target &T, const MCAsmInfo &MAI) + : MCTargetAsmLexer(T), AsmInfo(MAI) { + } +}; + +class ARMAsmLexer : public ARMBaseAsmLexer { +public: + ARMAsmLexer(const Target &T, const MCRegisterInfo &MRI, const MCAsmInfo &MAI) + : ARMBaseAsmLexer(T, MAI) { + InitRegisterMap(&MRI); + } +}; + +class ThumbAsmLexer : public ARMBaseAsmLexer { +public: + ThumbAsmLexer(const Target &T, const MCRegisterInfo &MRI,const MCAsmInfo &MAI) + : ARMBaseAsmLexer(T, MAI) { + InitRegisterMap(&MRI); + } +}; + +} // end anonymous namespace + +AsmToken ARMBaseAsmLexer::LexTokenUAL() { + const AsmToken &lexedToken = lexDefinite(); + + switch (lexedToken.getKind()) { + default: break; + case AsmToken::Error: + SetError(Lexer->getErrLoc(), Lexer->getErr()); + break; + case AsmToken::Identifier: { + std::string upperCase = lexedToken.getString().str(); + std::string lowerCase = LowercaseString(upperCase); + StringRef lowerRef(lowerCase); + + unsigned regID = MatchRegisterName(lowerRef); + // Check for register aliases. + // r13 -> sp + // r14 -> lr + // r15 -> pc + // ip -> r12 + // FIXME: Some assemblers support lots of others. Do we want them all? + if (!regID) { + regID = StringSwitch<unsigned>(lowerCase) + .Case("r13", ARM::SP) + .Case("r14", ARM::LR) + .Case("r15", ARM::PC) + .Case("ip", ARM::R12) + .Default(0); + } + + if (regID) + return AsmToken(AsmToken::Register, + lexedToken.getString(), + static_cast<int64_t>(regID)); + } + } + + return AsmToken(lexedToken); +} + +extern "C" void LLVMInitializeARMAsmLexer() { + RegisterMCAsmLexer<ARMAsmLexer> X(TheARMTarget); + RegisterMCAsmLexer<ThumbAsmLexer> Y(TheThumbTarget); +} diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp new file mode 100644 index 0000000..24f15b4 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -0,0 +1,4615 @@ +//===-- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "MCTargetDesc/ARMMCExpr.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" + +using namespace llvm; + +namespace { + +class ARMOperand; + +class ARMAsmParser : public MCTargetAsmParser { + MCSubtargetInfo &STI; + MCAsmParser &Parser; + + struct { + ARMCC::CondCodes Cond; // Condition for IT block. + unsigned Mask:4; // Condition mask for instructions. + // Starting at first 1 (from lsb). + // '1' condition as indicated in IT. + // '0' inverse of condition (else). + // Count of instructions in IT block is + // 4 - trailingzeroes(mask) + + bool FirstCond; // Explicit flag for when we're parsing the + // First instruction in the IT block. It's + // implied in the mask, so needs special + // handling. + + unsigned CurPosition; // Current position in parsing of IT + // block. In range [0,3]. Initialized + // according to count of instructions in block. + // ~0U if no active IT block. + } ITState; + bool inITBlock() { return ITState.CurPosition != ~0U;} + void forwardITPosition() { + if (!inITBlock()) return; + // Move to the next instruction in the IT block, if there is one. If not, + // mark the block as done. + unsigned TZ = CountTrailingZeros_32(ITState.Mask); + if (++ITState.CurPosition == 5 - TZ) + ITState.CurPosition = ~0U; // Done with the IT block after this. + } + + + MCAsmParser &getParser() const { return Parser; } + MCAsmLexer &getLexer() const { return Parser.getLexer(); } + + void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } + bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + + int tryParseRegister(); + bool tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &); + int tryParseShiftRegister(SmallVectorImpl<MCParsedAsmOperand*> &); + bool parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &); + bool parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &); + bool parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic); + bool parsePrefix(ARMMCExpr::VariantKind &RefKind); + bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType, + unsigned &ShiftAmount); + bool parseDirectiveWord(unsigned Size, SMLoc L); + bool parseDirectiveThumb(SMLoc L); + bool parseDirectiveThumbFunc(SMLoc L); + bool parseDirectiveCode(SMLoc L); + bool parseDirectiveSyntax(SMLoc L); + + StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode, + bool &CarrySetting, unsigned &ProcessorIMod, + StringRef &ITMask); + void getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet, + bool &CanAcceptPredicationCode); + + bool isThumb() const { + // FIXME: Can tablegen auto-generate this? + return (STI.getFeatureBits() & ARM::ModeThumb) != 0; + } + bool isThumbOne() const { + return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2) == 0; + } + bool isThumbTwo() const { + return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2); + } + bool hasV6Ops() const { + return STI.getFeatureBits() & ARM::HasV6Ops; + } + bool hasV7Ops() const { + return STI.getFeatureBits() & ARM::HasV7Ops; + } + void SwitchMode() { + unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb)); + setAvailableFeatures(FB); + } + bool isMClass() const { + return STI.getFeatureBits() & ARM::FeatureMClass; + } + + /// @name Auto-generated Match Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "ARMGenAsmMatcher.inc" + + /// } + + OperandMatchResultTy parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseCoprocNumOperand( + SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseCoprocRegOperand( + SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseCoprocOptionOperand( + SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseMemBarrierOptOperand( + SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseProcIFlagsOperand( + SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseMSRMaskOperand( + SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &O, + StringRef Op, int Low, int High); + OperandMatchResultTy parsePKHLSLImm(SmallVectorImpl<MCParsedAsmOperand*> &O) { + return parsePKHImm(O, "lsl", 0, 31); + } + OperandMatchResultTy parsePKHASRImm(SmallVectorImpl<MCParsedAsmOperand*> &O) { + return parsePKHImm(O, "asr", 1, 32); + } + OperandMatchResultTy parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseRotImm(SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseBitfield(SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseFPImm(SmallVectorImpl<MCParsedAsmOperand*>&); + + // Asm Match Converter Methods + bool cvtT2LdrdPre(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtT2StrdPre(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtLdrdPre(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtStrdPre(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool cvtThumbMultiply(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + + bool validateInstruction(MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Ops); + void processInstruction(MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Ops); + bool shouldOmitCCOutOperand(StringRef Mnemonic, + SmallVectorImpl<MCParsedAsmOperand*> &Operands); + +public: + enum ARMMatchResultTy { + Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY, + Match_RequiresNotITBlock, + Match_RequiresV6, + Match_RequiresThumb2 + }; + + ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser) + : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { + MCAsmParserExtension::Initialize(_Parser); + + // Initialize the set of available features. + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + + // Not in an ITBlock to start with. + ITState.CurPosition = ~0U; + } + + // Implementation of the MCTargetAsmParser interface: + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); + bool ParseInstruction(StringRef Name, SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands); + bool ParseDirective(AsmToken DirectiveID); + + unsigned checkTargetMatchPredicate(MCInst &Inst); + + bool MatchAndEmitInstruction(SMLoc IDLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCStreamer &Out); +}; +} // end anonymous namespace + +namespace { + +/// ARMOperand - Instances of this class represent a parsed ARM machine +/// instruction. +class ARMOperand : public MCParsedAsmOperand { + enum KindTy { + k_CondCode, + k_CCOut, + k_ITCondMask, + k_CoprocNum, + k_CoprocReg, + k_CoprocOption, + k_Immediate, + k_FPImmediate, + k_MemBarrierOpt, + k_Memory, + k_PostIndexRegister, + k_MSRMask, + k_ProcIFlags, + k_VectorIndex, + k_Register, + k_RegisterList, + k_DPRRegisterList, + k_SPRRegisterList, + k_ShiftedRegister, + k_ShiftedImmediate, + k_ShifterImmediate, + k_RotateImmediate, + k_BitfieldDescriptor, + k_Token + } Kind; + + SMLoc StartLoc, EndLoc; + SmallVector<unsigned, 8> Registers; + + union { + struct { + ARMCC::CondCodes Val; + } CC; + + struct { + unsigned Val; + } Cop; + + struct { + unsigned Val; + } CoprocOption; + + struct { + unsigned Mask:4; + } ITMask; + + struct { + ARM_MB::MemBOpt Val; + } MBOpt; + + struct { + ARM_PROC::IFlags Val; + } IFlags; + + struct { + unsigned Val; + } MMask; + + struct { + const char *Data; + unsigned Length; + } Tok; + + struct { + unsigned RegNum; + } Reg; + + struct { + unsigned Val; + } VectorIndex; + + struct { + const MCExpr *Val; + } Imm; + + struct { + unsigned Val; // encoded 8-bit representation + } FPImm; + + /// Combined record for all forms of ARM address expressions. + struct { + unsigned BaseRegNum; + // Offset is in OffsetReg or OffsetImm. If both are zero, no offset + // was specified. + const MCConstantExpr *OffsetImm; // Offset immediate value + unsigned OffsetRegNum; // Offset register num, when OffsetImm == NULL + ARM_AM::ShiftOpc ShiftType; // Shift type for OffsetReg + unsigned ShiftImm; // shift for OffsetReg. + unsigned Alignment; // 0 = no alignment specified + // n = alignment in bytes (8, 16, or 32) + unsigned isNegative : 1; // Negated OffsetReg? (~'U' bit) + } Memory; + + struct { + unsigned RegNum; + bool isAdd; + ARM_AM::ShiftOpc ShiftTy; + unsigned ShiftImm; + } PostIdxReg; + + struct { + bool isASR; + unsigned Imm; + } ShifterImm; + struct { + ARM_AM::ShiftOpc ShiftTy; + unsigned SrcReg; + unsigned ShiftReg; + unsigned ShiftImm; + } RegShiftedReg; + struct { + ARM_AM::ShiftOpc ShiftTy; + unsigned SrcReg; + unsigned ShiftImm; + } RegShiftedImm; + struct { + unsigned Imm; + } RotImm; + struct { + unsigned LSB; + unsigned Width; + } Bitfield; + }; + + ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} +public: + ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() { + Kind = o.Kind; + StartLoc = o.StartLoc; + EndLoc = o.EndLoc; + switch (Kind) { + case k_CondCode: + CC = o.CC; + break; + case k_ITCondMask: + ITMask = o.ITMask; + break; + case k_Token: + Tok = o.Tok; + break; + case k_CCOut: + case k_Register: + Reg = o.Reg; + break; + case k_RegisterList: + case k_DPRRegisterList: + case k_SPRRegisterList: + Registers = o.Registers; + break; + case k_CoprocNum: + case k_CoprocReg: + Cop = o.Cop; + break; + case k_CoprocOption: + CoprocOption = o.CoprocOption; + break; + case k_Immediate: + Imm = o.Imm; + break; + case k_FPImmediate: + FPImm = o.FPImm; + break; + case k_MemBarrierOpt: + MBOpt = o.MBOpt; + break; + case k_Memory: + Memory = o.Memory; + break; + case k_PostIndexRegister: + PostIdxReg = o.PostIdxReg; + break; + case k_MSRMask: + MMask = o.MMask; + break; + case k_ProcIFlags: + IFlags = o.IFlags; + break; + case k_ShifterImmediate: + ShifterImm = o.ShifterImm; + break; + case k_ShiftedRegister: + RegShiftedReg = o.RegShiftedReg; + break; + case k_ShiftedImmediate: + RegShiftedImm = o.RegShiftedImm; + break; + case k_RotateImmediate: + RotImm = o.RotImm; + break; + case k_BitfieldDescriptor: + Bitfield = o.Bitfield; + break; + case k_VectorIndex: + VectorIndex = o.VectorIndex; + break; + } + } + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. + SMLoc getEndLoc() const { return EndLoc; } + + ARMCC::CondCodes getCondCode() const { + assert(Kind == k_CondCode && "Invalid access!"); + return CC.Val; + } + + unsigned getCoproc() const { + assert((Kind == k_CoprocNum || Kind == k_CoprocReg) && "Invalid access!"); + return Cop.Val; + } + + StringRef getToken() const { + assert(Kind == k_Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + + unsigned getReg() const { + assert((Kind == k_Register || Kind == k_CCOut) && "Invalid access!"); + return Reg.RegNum; + } + + const SmallVectorImpl<unsigned> &getRegList() const { + assert((Kind == k_RegisterList || Kind == k_DPRRegisterList || + Kind == k_SPRRegisterList) && "Invalid access!"); + return Registers; + } + + const MCExpr *getImm() const { + assert(Kind == k_Immediate && "Invalid access!"); + return Imm.Val; + } + + unsigned getFPImm() const { + assert(Kind == k_FPImmediate && "Invalid access!"); + return FPImm.Val; + } + + unsigned getVectorIndex() const { + assert(Kind == k_VectorIndex && "Invalid access!"); + return VectorIndex.Val; + } + + ARM_MB::MemBOpt getMemBarrierOpt() const { + assert(Kind == k_MemBarrierOpt && "Invalid access!"); + return MBOpt.Val; + } + + ARM_PROC::IFlags getProcIFlags() const { + assert(Kind == k_ProcIFlags && "Invalid access!"); + return IFlags.Val; + } + + unsigned getMSRMask() const { + assert(Kind == k_MSRMask && "Invalid access!"); + return MMask.Val; + } + + bool isCoprocNum() const { return Kind == k_CoprocNum; } + bool isCoprocReg() const { return Kind == k_CoprocReg; } + bool isCoprocOption() const { return Kind == k_CoprocOption; } + bool isCondCode() const { return Kind == k_CondCode; } + bool isCCOut() const { return Kind == k_CCOut; } + bool isITMask() const { return Kind == k_ITCondMask; } + bool isITCondCode() const { return Kind == k_CondCode; } + bool isImm() const { return Kind == k_Immediate; } + bool isFPImm() const { return Kind == k_FPImmediate; } + bool isImm8s4() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020; + } + bool isImm0_1020s4() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return ((Value & 3) == 0) && Value >= 0 && Value <= 1020; + } + bool isImm0_508s4() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return ((Value & 3) == 0) && Value >= 0 && Value <= 508; + } + bool isImm0_255() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 256; + } + bool isImm0_7() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 8; + } + bool isImm0_15() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 16; + } + bool isImm0_31() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 32; + } + bool isImm1_16() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value < 17; + } + bool isImm1_32() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value < 33; + } + bool isImm0_65535() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 65536; + } + bool isImm0_65535Expr() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + // If it's not a constant expression, it'll generate a fixup and be + // handled later. + if (!CE) return true; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 65536; + } + bool isImm24bit() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value <= 0xffffff; + } + bool isImmThumbSR() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value < 33; + } + bool isPKHLSLImm() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 32; + } + bool isPKHASRImm() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value > 0 && Value <= 32; + } + bool isARMSOImm() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return ARM_AM::getSOImmVal(Value) != -1; + } + bool isT2SOImm() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return ARM_AM::getT2SOImmVal(Value) != -1; + } + bool isSetEndImm() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value == 1 || Value == 0; + } + bool isReg() const { return Kind == k_Register; } + bool isRegList() const { return Kind == k_RegisterList; } + bool isDPRRegList() const { return Kind == k_DPRRegisterList; } + bool isSPRRegList() const { return Kind == k_SPRRegisterList; } + bool isToken() const { return Kind == k_Token; } + bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; } + bool isMemory() const { return Kind == k_Memory; } + bool isShifterImm() const { return Kind == k_ShifterImmediate; } + bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; } + bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; } + bool isRotImm() const { return Kind == k_RotateImmediate; } + bool isBitfield() const { return Kind == k_BitfieldDescriptor; } + bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; } + bool isPostIdxReg() const { + return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy == ARM_AM::no_shift; + } + bool isMemNoOffset(bool alignOK = false) const { + if (!isMemory()) + return false; + // No offset of any kind. + return Memory.OffsetRegNum == 0 && Memory.OffsetImm == 0 && + (alignOK || Memory.Alignment == 0); + } + bool isAlignedMemory() const { + return isMemNoOffset(true); + } + bool isAddrMode2() const { + if (!isMemory() || Memory.Alignment != 0) return false; + // Check for register offset. + if (Memory.OffsetRegNum) return true; + // Immediate offset in range [-4095, 4095]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val > -4096 && Val < 4096; + } + bool isAM2OffsetImm() const { + if (Kind != k_Immediate) + return false; + // Immediate offset in range [-4095, 4095]. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Val = CE->getValue(); + return Val > -4096 && Val < 4096; + } + bool isAddrMode3() const { + if (!isMemory() || Memory.Alignment != 0) return false; + // No shifts are legal for AM3. + if (Memory.ShiftType != ARM_AM::no_shift) return false; + // Check for register offset. + if (Memory.OffsetRegNum) return true; + // Immediate offset in range [-255, 255]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val > -256 && Val < 256; + } + bool isAM3Offset() const { + if (Kind != k_Immediate && Kind != k_PostIndexRegister) + return false; + if (Kind == k_PostIndexRegister) + return PostIdxReg.ShiftTy == ARM_AM::no_shift; + // Immediate offset in range [-255, 255]. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Val = CE->getValue(); + // Special case, #-0 is INT32_MIN. + return (Val > -256 && Val < 256) || Val == INT32_MIN; + } + bool isAddrMode5() const { + if (!isMemory() || Memory.Alignment != 0) return false; + // Check for register offset. + if (Memory.OffsetRegNum) return false; + // Immediate offset in range [-1020, 1020] and a multiple of 4. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return (Val >= -1020 && Val <= 1020 && ((Val & 3) == 0)) || + Val == INT32_MIN; + } + bool isMemTBB() const { + if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative || + Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0) + return false; + return true; + } + bool isMemTBH() const { + if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative || + Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm != 1 || + Memory.Alignment != 0 ) + return false; + return true; + } + bool isMemRegOffset() const { + if (!isMemory() || !Memory.OffsetRegNum || Memory.Alignment != 0) + return false; + return true; + } + bool isT2MemRegOffset() const { + if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative || + Memory.Alignment != 0) + return false; + // Only lsl #{0, 1, 2, 3} allowed. + if (Memory.ShiftType == ARM_AM::no_shift) + return true; + if (Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm > 3) + return false; + return true; + } + bool isMemThumbRR() const { + // Thumb reg+reg addressing is simple. Just two registers, a base and + // an offset. No shifts, negations or any other complicating factors. + if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative || + Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0) + return false; + return isARMLowRegister(Memory.BaseRegNum) && + (!Memory.OffsetRegNum || isARMLowRegister(Memory.OffsetRegNum)); + } + bool isMemThumbRIs4() const { + if (!isMemory() || Memory.OffsetRegNum != 0 || + !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0) + return false; + // Immediate offset, multiple of 4 in range [0, 124]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val >= 0 && Val <= 124 && (Val % 4) == 0; + } + bool isMemThumbRIs2() const { + if (!isMemory() || Memory.OffsetRegNum != 0 || + !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0) + return false; + // Immediate offset, multiple of 4 in range [0, 62]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val >= 0 && Val <= 62 && (Val % 2) == 0; + } + bool isMemThumbRIs1() const { + if (!isMemory() || Memory.OffsetRegNum != 0 || + !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0) + return false; + // Immediate offset in range [0, 31]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val >= 0 && Val <= 31; + } + bool isMemThumbSPI() const { + if (!isMemory() || Memory.OffsetRegNum != 0 || + Memory.BaseRegNum != ARM::SP || Memory.Alignment != 0) + return false; + // Immediate offset, multiple of 4 in range [0, 1020]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val >= 0 && Val <= 1020 && (Val % 4) == 0; + } + bool isMemImm8s4Offset() const { + if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Immediate offset a multiple of 4 in range [-1020, 1020]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val >= -1020 && Val <= 1020 && (Val & 3) == 0; + } + bool isMemImm0_1020s4Offset() const { + if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Immediate offset a multiple of 4 in range [0, 1020]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val >= 0 && Val <= 1020 && (Val & 3) == 0; + } + bool isMemImm8Offset() const { + if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Immediate offset in range [-255, 255]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return (Val == INT32_MIN) || (Val > -256 && Val < 256); + } + bool isMemPosImm8Offset() const { + if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Immediate offset in range [0, 255]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val >= 0 && Val < 256; + } + bool isMemNegImm8Offset() const { + if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Immediate offset in range [-255, -1]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return Val > -256 && Val < 0; + } + bool isMemUImm12Offset() const { + // If we have an immediate that's not a constant, treat it as a label + // reference needing a fixup. If it is a constant, it's something else + // and we reject it. + if (Kind == k_Immediate && !isa<MCConstantExpr>(getImm())) + return true; + + if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Immediate offset in range [0, 4095]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return (Val >= 0 && Val < 4096); + } + bool isMemImm12Offset() const { + // If we have an immediate that's not a constant, treat it as a label + // reference needing a fixup. If it is a constant, it's something else + // and we reject it. + if (Kind == k_Immediate && !isa<MCConstantExpr>(getImm())) + return true; + + if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) + return false; + // Immediate offset in range [-4095, 4095]. + if (!Memory.OffsetImm) return true; + int64_t Val = Memory.OffsetImm->getValue(); + return (Val > -4096 && Val < 4096) || (Val == INT32_MIN); + } + bool isPostIdxImm8() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Val = CE->getValue(); + return (Val > -256 && Val < 256) || (Val == INT32_MIN); + } + bool isPostIdxImm8s4() const { + if (Kind != k_Immediate) + return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Val = CE->getValue(); + return ((Val & 3) == 0 && Val >= -1020 && Val <= 1020) || + (Val == INT32_MIN); + } + + bool isMSRMask() const { return Kind == k_MSRMask; } + bool isProcIFlags() const { return Kind == k_ProcIFlags; } + + bool isVectorIndex8() const { + if (Kind != k_VectorIndex) return false; + return VectorIndex.Val < 8; + } + bool isVectorIndex16() const { + if (Kind != k_VectorIndex) return false; + return VectorIndex.Val < 4; + } + bool isVectorIndex32() const { + if (Kind != k_VectorIndex) return false; + return VectorIndex.Val < 2; + } + + + + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. Null MCExpr = 0. + if (Expr == 0) + Inst.addOperand(MCOperand::CreateImm(0)); + else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(Expr)); + } + + void addCondCodeOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(unsigned(getCondCode()))); + unsigned RegNum = getCondCode() == ARMCC::AL ? 0: ARM::CPSR; + Inst.addOperand(MCOperand::CreateReg(RegNum)); + } + + void addCoprocNumOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getCoproc())); + } + + void addCoprocRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getCoproc())); + } + + void addCoprocOptionOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(CoprocOption.Val)); + } + + void addITMaskOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(ITMask.Mask)); + } + + void addITCondCodeOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(unsigned(getCondCode()))); + } + + void addCCOutOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + void addRegShiftedRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands!"); + assert(isRegShiftedReg() && "addRegShiftedRegOperands() on non RegShiftedReg!"); + Inst.addOperand(MCOperand::CreateReg(RegShiftedReg.SrcReg)); + Inst.addOperand(MCOperand::CreateReg(RegShiftedReg.ShiftReg)); + Inst.addOperand(MCOperand::CreateImm( + ARM_AM::getSORegOpc(RegShiftedReg.ShiftTy, RegShiftedReg.ShiftImm))); + } + + void addRegShiftedImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + assert(isRegShiftedImm() && "addRegShiftedImmOperands() on non RegShiftedImm!"); + Inst.addOperand(MCOperand::CreateReg(RegShiftedImm.SrcReg)); + Inst.addOperand(MCOperand::CreateImm( + ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, RegShiftedImm.ShiftImm))); + } + + void addShifterImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm((ShifterImm.isASR << 5) | + ShifterImm.Imm)); + } + + void addRegListOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const SmallVectorImpl<unsigned> &RegList = getRegList(); + for (SmallVectorImpl<unsigned>::const_iterator + I = RegList.begin(), E = RegList.end(); I != E; ++I) + Inst.addOperand(MCOperand::CreateReg(*I)); + } + + void addDPRRegListOperands(MCInst &Inst, unsigned N) const { + addRegListOperands(Inst, N); + } + + void addSPRRegListOperands(MCInst &Inst, unsigned N) const { + addRegListOperands(Inst, N); + } + + void addRotImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // Encoded as val>>3. The printer handles display as 8, 16, 24. + Inst.addOperand(MCOperand::CreateImm(RotImm.Imm >> 3)); + } + + void addBitfieldOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // Munge the lsb/width into a bitfield mask. + unsigned lsb = Bitfield.LSB; + unsigned width = Bitfield.Width; + // Make a 32-bit mask w/ the referenced bits clear and all other bits set. + uint32_t Mask = ~(((uint32_t)0xffffffff >> lsb) << (32 - width) >> + (32 - (lsb + width))); + Inst.addOperand(MCOperand::CreateImm(Mask)); + } + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addFPImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getFPImm())); + } + + void addImm8s4Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // FIXME: We really want to scale the value here, but the LDRD/STRD + // instruction don't encode operands that way yet. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + } + + void addImm0_1020s4Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate is scaled by four in the encoding and is stored + // in the MCInst as such. Lop off the low two bits here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::CreateImm(CE->getValue() / 4)); + } + + void addImm0_508s4Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate is scaled by four in the encoding and is stored + // in the MCInst as such. Lop off the low two bits here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::CreateImm(CE->getValue() / 4)); + } + + void addImm0_255Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addImm0_7Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addImm0_15Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addImm0_31Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addImm1_16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The constant encodes as the immediate-1, and we store in the instruction + // the bits as encoded, so subtract off one here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1)); + } + + void addImm1_32Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The constant encodes as the immediate-1, and we store in the instruction + // the bits as encoded, so subtract off one here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1)); + } + + void addImm0_65535Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addImm0_65535ExprOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addImm24bitOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addImmThumbSROperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The constant encodes as the immediate, except for 32, which encodes as + // zero. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + unsigned Imm = CE->getValue(); + Inst.addOperand(MCOperand::CreateImm((Imm == 32 ? 0 : Imm))); + } + + void addPKHLSLImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addPKHASRImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // An ASR value of 32 encodes as 0, so that's how we want to add it to + // the instruction as well. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + int Val = CE->getValue(); + Inst.addOperand(MCOperand::CreateImm(Val == 32 ? 0 : Val)); + } + + void addARMSOImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addT2SOImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addSetEndImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt()))); + } + + void addMemNoOffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + } + + void addAlignedMemoryOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateImm(Memory.Alignment)); + } + + void addAddrMode2Operands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands!"); + int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; + if (!Memory.OffsetRegNum) { + ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add; + // Special case for #-0 + if (Val == INT32_MIN) Val = 0; + if (Val < 0) Val = -Val; + Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift); + } else { + // For register offset, we encode the shift type and negation flag + // here. + Val = ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, + Memory.ShiftImm, Memory.ShiftType); + } + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addAM2OffsetImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + assert(CE && "non-constant AM2OffsetImm operand!"); + int32_t Val = CE->getValue(); + ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add; + // Special case for #-0 + if (Val == INT32_MIN) Val = 0; + if (Val < 0) Val = -Val; + Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift); + Inst.addOperand(MCOperand::CreateReg(0)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addAddrMode3Operands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands!"); + int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; + if (!Memory.OffsetRegNum) { + ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add; + // Special case for #-0 + if (Val == INT32_MIN) Val = 0; + if (Val < 0) Val = -Val; + Val = ARM_AM::getAM3Opc(AddSub, Val); + } else { + // For register offset, we encode the shift type and negation flag + // here. + Val = ARM_AM::getAM3Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, 0); + } + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addAM3OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + if (Kind == k_PostIndexRegister) { + int32_t Val = + ARM_AM::getAM3Opc(PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub, 0); + Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + return; + } + + // Constant offset. + const MCConstantExpr *CE = static_cast<const MCConstantExpr*>(getImm()); + int32_t Val = CE->getValue(); + ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add; + // Special case for #-0 + if (Val == INT32_MIN) Val = 0; + if (Val < 0) Val = -Val; + Val = ARM_AM::getAM3Opc(AddSub, Val); + Inst.addOperand(MCOperand::CreateReg(0)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addAddrMode5Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + // The lower two bits are always zero and as such are not encoded. + int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0; + ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add; + // Special case for #-0 + if (Val == INT32_MIN) Val = 0; + if (Val < 0) Val = -Val; + Val = ARM_AM::getAM5Opc(AddSub, Val); + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addMemImm8s4OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addMemImm0_1020s4OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + // The lower two bits are always zero and as such are not encoded. + int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0; + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addMemImm8OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addMemPosImm8OffsetOperands(MCInst &Inst, unsigned N) const { + addMemImm8OffsetOperands(Inst, N); + } + + void addMemNegImm8OffsetOperands(MCInst &Inst, unsigned N) const { + addMemImm8OffsetOperands(Inst, N); + } + + void addMemUImm12OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + // If this is an immediate, it's a label reference. + if (Kind == k_Immediate) { + addExpr(Inst, getImm()); + Inst.addOperand(MCOperand::CreateImm(0)); + return; + } + + // Otherwise, it's a normal memory reg+offset. + int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addMemImm12OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + // If this is an immediate, it's a label reference. + if (Kind == k_Immediate) { + addExpr(Inst, getImm()); + Inst.addOperand(MCOperand::CreateImm(0)); + return; + } + + // Otherwise, it's a normal memory reg+offset. + int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addMemTBBOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum)); + } + + void addMemTBHOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum)); + } + + void addMemRegOffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands!"); + unsigned Val = ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, + Memory.ShiftImm, Memory.ShiftType); + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addT2MemRegOffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum)); + Inst.addOperand(MCOperand::CreateImm(Memory.ShiftImm)); + } + + void addMemThumbRROperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum)); + } + + void addMemThumbRIs4Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0; + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addMemThumbRIs2Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 2) : 0; + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addMemThumbRIs1Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue()) : 0; + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addMemThumbSPIOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0; + Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum)); + Inst.addOperand(MCOperand::CreateImm(Val)); + } + + void addPostIdxImm8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + assert(CE && "non-constant post-idx-imm8 operand!"); + int Imm = CE->getValue(); + bool isAdd = Imm >= 0; + if (Imm == INT32_MIN) Imm = 0; + Imm = (Imm < 0 ? -Imm : Imm) | (int)isAdd << 8; + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addPostIdxImm8s4Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + assert(CE && "non-constant post-idx-imm8s4 operand!"); + int Imm = CE->getValue(); + bool isAdd = Imm >= 0; + if (Imm == INT32_MIN) Imm = 0; + // Immediate is scaled by 4. + Imm = ((Imm < 0 ? -Imm : Imm) / 4) | (int)isAdd << 8; + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addPostIdxRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum)); + Inst.addOperand(MCOperand::CreateImm(PostIdxReg.isAdd)); + } + + void addPostIdxRegShiftedOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum)); + // The sign, shift type, and shift amount are encoded in a single operand + // using the AM2 encoding helpers. + ARM_AM::AddrOpc opc = PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub; + unsigned Imm = ARM_AM::getAM2Opc(opc, PostIdxReg.ShiftImm, + PostIdxReg.ShiftTy); + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addMSRMaskOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(unsigned(getMSRMask()))); + } + + void addProcIFlagsOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(unsigned(getProcIFlags()))); + } + + void addVectorIndex8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndex16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndex32Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + virtual void print(raw_ostream &OS) const; + + static ARMOperand *CreateITMask(unsigned Mask, SMLoc S) { + ARMOperand *Op = new ARMOperand(k_ITCondMask); + Op->ITMask.Mask = Mask; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateCondCode(ARMCC::CondCodes CC, SMLoc S) { + ARMOperand *Op = new ARMOperand(k_CondCode); + Op->CC.Val = CC; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateCoprocNum(unsigned CopVal, SMLoc S) { + ARMOperand *Op = new ARMOperand(k_CoprocNum); + Op->Cop.Val = CopVal; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateCoprocReg(unsigned CopVal, SMLoc S) { + ARMOperand *Op = new ARMOperand(k_CoprocReg); + Op->Cop.Val = CopVal; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateCoprocOption(unsigned Val, SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(k_CoprocOption); + Op->Cop.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreateCCOut(unsigned RegNum, SMLoc S) { + ARMOperand *Op = new ARMOperand(k_CCOut); + Op->Reg.RegNum = RegNum; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateToken(StringRef Str, SMLoc S) { + ARMOperand *Op = new ARMOperand(k_Token); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(k_Register); + Op->Reg.RegNum = RegNum; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, + unsigned SrcReg, + unsigned ShiftReg, + unsigned ShiftImm, + SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(k_ShiftedRegister); + Op->RegShiftedReg.ShiftTy = ShTy; + Op->RegShiftedReg.SrcReg = SrcReg; + Op->RegShiftedReg.ShiftReg = ShiftReg; + Op->RegShiftedReg.ShiftImm = ShiftImm; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, + unsigned SrcReg, + unsigned ShiftImm, + SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(k_ShiftedImmediate); + Op->RegShiftedImm.ShiftTy = ShTy; + Op->RegShiftedImm.SrcReg = SrcReg; + Op->RegShiftedImm.ShiftImm = ShiftImm; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreateShifterImm(bool isASR, unsigned Imm, + SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(k_ShifterImmediate); + Op->ShifterImm.isASR = isASR; + Op->ShifterImm.Imm = Imm; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreateRotImm(unsigned Imm, SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(k_RotateImmediate); + Op->RotImm.Imm = Imm; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreateBitfield(unsigned LSB, unsigned Width, + SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(k_BitfieldDescriptor); + Op->Bitfield.LSB = LSB; + Op->Bitfield.Width = Width; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand * + CreateRegList(const SmallVectorImpl<std::pair<unsigned, SMLoc> > &Regs, + SMLoc StartLoc, SMLoc EndLoc) { + KindTy Kind = k_RegisterList; + + if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().first)) + Kind = k_DPRRegisterList; + else if (ARMMCRegisterClasses[ARM::SPRRegClassID]. + contains(Regs.front().first)) + Kind = k_SPRRegisterList; + + ARMOperand *Op = new ARMOperand(Kind); + for (SmallVectorImpl<std::pair<unsigned, SMLoc> >::const_iterator + I = Regs.begin(), E = Regs.end(); I != E; ++I) + Op->Registers.push_back(I->first); + array_pod_sort(Op->Registers.begin(), Op->Registers.end()); + Op->StartLoc = StartLoc; + Op->EndLoc = EndLoc; + return Op; + } + + static ARMOperand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, + MCContext &Ctx) { + ARMOperand *Op = new ARMOperand(k_VectorIndex); + Op->VectorIndex.Val = Idx; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(k_Immediate); + Op->Imm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) { + ARMOperand *Op = new ARMOperand(k_FPImmediate); + Op->FPImm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateMem(unsigned BaseRegNum, + const MCConstantExpr *OffsetImm, + unsigned OffsetRegNum, + ARM_AM::ShiftOpc ShiftType, + unsigned ShiftImm, + unsigned Alignment, + bool isNegative, + SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(k_Memory); + Op->Memory.BaseRegNum = BaseRegNum; + Op->Memory.OffsetImm = OffsetImm; + Op->Memory.OffsetRegNum = OffsetRegNum; + Op->Memory.ShiftType = ShiftType; + Op->Memory.ShiftImm = ShiftImm; + Op->Memory.Alignment = Alignment; + Op->Memory.isNegative = isNegative; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreatePostIdxReg(unsigned RegNum, bool isAdd, + ARM_AM::ShiftOpc ShiftTy, + unsigned ShiftImm, + SMLoc S, SMLoc E) { + ARMOperand *Op = new ARMOperand(k_PostIndexRegister); + Op->PostIdxReg.RegNum = RegNum; + Op->PostIdxReg.isAdd = isAdd; + Op->PostIdxReg.ShiftTy = ShiftTy; + Op->PostIdxReg.ShiftImm = ShiftImm; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static ARMOperand *CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, SMLoc S) { + ARMOperand *Op = new ARMOperand(k_MemBarrierOpt); + Op->MBOpt.Val = Opt; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) { + ARMOperand *Op = new ARMOperand(k_ProcIFlags); + Op->IFlags.Val = IFlags; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static ARMOperand *CreateMSRMask(unsigned MMask, SMLoc S) { + ARMOperand *Op = new ARMOperand(k_MSRMask); + Op->MMask.Val = MMask; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } +}; + +} // end anonymous namespace. + +void ARMOperand::print(raw_ostream &OS) const { + switch (Kind) { + case k_FPImmediate: + OS << "<fpimm " << getFPImm() << "(" << ARM_AM::getFPImmFloat(getFPImm()) + << ") >"; + break; + case k_CondCode: + OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">"; + break; + case k_CCOut: + OS << "<ccout " << getReg() << ">"; + break; + case k_ITCondMask: { + static char MaskStr[][6] = { "()", "(t)", "(e)", "(tt)", "(et)", "(te)", + "(ee)", "(ttt)", "(ett)", "(tet)", "(eet)", "(tte)", "(ete)", + "(tee)", "(eee)" }; + assert((ITMask.Mask & 0xf) == ITMask.Mask); + OS << "<it-mask " << MaskStr[ITMask.Mask] << ">"; + break; + } + case k_CoprocNum: + OS << "<coprocessor number: " << getCoproc() << ">"; + break; + case k_CoprocReg: + OS << "<coprocessor register: " << getCoproc() << ">"; + break; + case k_CoprocOption: + OS << "<coprocessor option: " << CoprocOption.Val << ">"; + break; + case k_MSRMask: + OS << "<mask: " << getMSRMask() << ">"; + break; + case k_Immediate: + getImm()->print(OS); + break; + case k_MemBarrierOpt: + OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt()) << ">"; + break; + case k_Memory: + OS << "<memory " + << " base:" << Memory.BaseRegNum; + OS << ">"; + break; + case k_PostIndexRegister: + OS << "post-idx register " << (PostIdxReg.isAdd ? "" : "-") + << PostIdxReg.RegNum; + if (PostIdxReg.ShiftTy != ARM_AM::no_shift) + OS << ARM_AM::getShiftOpcStr(PostIdxReg.ShiftTy) << " " + << PostIdxReg.ShiftImm; + OS << ">"; + break; + case k_ProcIFlags: { + OS << "<ARM_PROC::"; + unsigned IFlags = getProcIFlags(); + for (int i=2; i >= 0; --i) + if (IFlags & (1 << i)) + OS << ARM_PROC::IFlagsToString(1 << i); + OS << ">"; + break; + } + case k_Register: + OS << "<register " << getReg() << ">"; + break; + case k_ShifterImmediate: + OS << "<shift " << (ShifterImm.isASR ? "asr" : "lsl") + << " #" << ShifterImm.Imm << ">"; + break; + case k_ShiftedRegister: + OS << "<so_reg_reg " + << RegShiftedReg.SrcReg + << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(RegShiftedReg.ShiftImm)) + << ", " << RegShiftedReg.ShiftReg << ", " + << ARM_AM::getSORegOffset(RegShiftedReg.ShiftImm) + << ">"; + break; + case k_ShiftedImmediate: + OS << "<so_reg_imm " + << RegShiftedImm.SrcReg + << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(RegShiftedImm.ShiftImm)) + << ", " << ARM_AM::getSORegOffset(RegShiftedImm.ShiftImm) + << ">"; + break; + case k_RotateImmediate: + OS << "<ror " << " #" << (RotImm.Imm * 8) << ">"; + break; + case k_BitfieldDescriptor: + OS << "<bitfield " << "lsb: " << Bitfield.LSB + << ", width: " << Bitfield.Width << ">"; + break; + case k_RegisterList: + case k_DPRRegisterList: + case k_SPRRegisterList: { + OS << "<register_list "; + + const SmallVectorImpl<unsigned> &RegList = getRegList(); + for (SmallVectorImpl<unsigned>::const_iterator + I = RegList.begin(), E = RegList.end(); I != E; ) { + OS << *I; + if (++I < E) OS << ", "; + } + + OS << ">"; + break; + } + case k_Token: + OS << "'" << getToken() << "'"; + break; + case k_VectorIndex: + OS << "<vectorindex " << getVectorIndex() << ">"; + break; + } +} + +/// @name Auto-generated Match Functions +/// { + +static unsigned MatchRegisterName(StringRef Name); + +/// } + +bool ARMAsmParser::ParseRegister(unsigned &RegNo, + SMLoc &StartLoc, SMLoc &EndLoc) { + RegNo = tryParseRegister(); + + return (RegNo == (unsigned)-1); +} + +/// Try to parse a register name. The token must be an Identifier when called, +/// and if it is a register name the token is eaten and the register number is +/// returned. Otherwise return -1. +/// +int ARMAsmParser::tryParseRegister() { + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) return -1; + + // FIXME: Validate register for the current architecture; we have to do + // validation later, so maybe there is no need for this here. + std::string upperCase = Tok.getString().str(); + std::string lowerCase = LowercaseString(upperCase); + unsigned RegNum = MatchRegisterName(lowerCase); + if (!RegNum) { + RegNum = StringSwitch<unsigned>(lowerCase) + .Case("r13", ARM::SP) + .Case("r14", ARM::LR) + .Case("r15", ARM::PC) + .Case("ip", ARM::R12) + .Default(0); + } + if (!RegNum) return -1; + + Parser.Lex(); // Eat identifier token. + +#if 0 + // Also check for an index operand. This is only legal for vector registers, + // but that'll get caught OK in operand matching, so we don't need to + // explicitly filter everything else out here. + if (Parser.getTok().is(AsmToken::LBrac)) { + SMLoc SIdx = Parser.getTok().getLoc(); + Parser.Lex(); // Eat left bracket token. + + const MCExpr *ImmVal; + SMLoc ExprLoc = Parser.getTok().getLoc(); + if (getParser().ParseExpression(ImmVal)) + return MatchOperand_ParseFail; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); + if (!MCE) { + TokError("immediate value expected for vector index"); + return MatchOperand_ParseFail; + } + + SMLoc E = Parser.getTok().getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) { + Error(E, "']' expected"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat right bracket token. + + Operands.push_back(ARMOperand::CreateVectorIndex(MCE->getValue(), + SIdx, E, + getContext())); + } +#endif + + return RegNum; +} + +// Try to parse a shifter (e.g., "lsl <amt>"). On success, return 0. +// If a recoverable error occurs, return 1. If an irrecoverable error +// occurs, return -1. An irrecoverable error is one where tokens have been +// consumed in the process of trying to parse the shifter (i.e., when it is +// indeed a shifter operand, but malformed). +int ARMAsmParser::tryParseShiftRegister( + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + std::string upperCase = Tok.getString().str(); + std::string lowerCase = LowercaseString(upperCase); + ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase) + .Case("lsl", ARM_AM::lsl) + .Case("lsr", ARM_AM::lsr) + .Case("asr", ARM_AM::asr) + .Case("ror", ARM_AM::ror) + .Case("rrx", ARM_AM::rrx) + .Default(ARM_AM::no_shift); + + if (ShiftTy == ARM_AM::no_shift) + return 1; + + Parser.Lex(); // Eat the operator. + + // The source register for the shift has already been added to the + // operand list, so we need to pop it off and combine it into the shifted + // register operand instead. + OwningPtr<ARMOperand> PrevOp((ARMOperand*)Operands.pop_back_val()); + if (!PrevOp->isReg()) + return Error(PrevOp->getStartLoc(), "shift must be of a register"); + int SrcReg = PrevOp->getReg(); + int64_t Imm = 0; + int ShiftReg = 0; + if (ShiftTy == ARM_AM::rrx) { + // RRX Doesn't have an explicit shift amount. The encoder expects + // the shift register to be the same as the source register. Seems odd, + // but OK. + ShiftReg = SrcReg; + } else { + // Figure out if this is shifted by a constant or a register (for non-RRX). + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat hash. + SMLoc ImmLoc = Parser.getTok().getLoc(); + const MCExpr *ShiftExpr = 0; + if (getParser().ParseExpression(ShiftExpr)) { + Error(ImmLoc, "invalid immediate shift value"); + return -1; + } + // The expression must be evaluatable as an immediate. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftExpr); + if (!CE) { + Error(ImmLoc, "invalid immediate shift value"); + return -1; + } + // Range check the immediate. + // lsl, ror: 0 <= imm <= 31 + // lsr, asr: 0 <= imm <= 32 + Imm = CE->getValue(); + if (Imm < 0 || + ((ShiftTy == ARM_AM::lsl || ShiftTy == ARM_AM::ror) && Imm > 31) || + ((ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr) && Imm > 32)) { + Error(ImmLoc, "immediate shift value out of range"); + return -1; + } + } else if (Parser.getTok().is(AsmToken::Identifier)) { + ShiftReg = tryParseRegister(); + SMLoc L = Parser.getTok().getLoc(); + if (ShiftReg == -1) { + Error (L, "expected immediate or register in shift operand"); + return -1; + } + } else { + Error (Parser.getTok().getLoc(), + "expected immediate or register in shift operand"); + return -1; + } + } + + if (ShiftReg && ShiftTy != ARM_AM::rrx) + Operands.push_back(ARMOperand::CreateShiftedRegister(ShiftTy, SrcReg, + ShiftReg, Imm, + S, Parser.getTok().getLoc())); + else + Operands.push_back(ARMOperand::CreateShiftedImmediate(ShiftTy, SrcReg, Imm, + S, Parser.getTok().getLoc())); + + return 0; +} + + +/// Try to parse a register name. The token must be an Identifier when called. +/// If it's a register, an AsmOperand is created. Another AsmOperand is created +/// if there is a "writeback". 'true' if it's not a register. +/// +/// TODO this is likely to change to allow different register types and or to +/// parse for a specific register type. +bool ARMAsmParser:: +tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + int RegNo = tryParseRegister(); + if (RegNo == -1) + return true; + + Operands.push_back(ARMOperand::CreateReg(RegNo, S, Parser.getTok().getLoc())); + + const AsmToken &ExclaimTok = Parser.getTok(); + if (ExclaimTok.is(AsmToken::Exclaim)) { + Operands.push_back(ARMOperand::CreateToken(ExclaimTok.getString(), + ExclaimTok.getLoc())); + Parser.Lex(); // Eat exclaim token + return false; + } + + // Also check for an index operand. This is only legal for vector registers, + // but that'll get caught OK in operand matching, so we don't need to + // explicitly filter everything else out here. + if (Parser.getTok().is(AsmToken::LBrac)) { + SMLoc SIdx = Parser.getTok().getLoc(); + Parser.Lex(); // Eat left bracket token. + + const MCExpr *ImmVal; + SMLoc ExprLoc = Parser.getTok().getLoc(); + if (getParser().ParseExpression(ImmVal)) + return MatchOperand_ParseFail; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); + if (!MCE) { + TokError("immediate value expected for vector index"); + return MatchOperand_ParseFail; + } + + SMLoc E = Parser.getTok().getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) { + Error(E, "']' expected"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat right bracket token. + + Operands.push_back(ARMOperand::CreateVectorIndex(MCE->getValue(), + SIdx, E, + getContext())); + } + + return false; +} + +/// MatchCoprocessorOperandName - Try to parse an coprocessor related +/// instruction with a symbolic operand name. Example: "p1", "p7", "c3", +/// "c5", ... +static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) { + // Use the same layout as the tablegen'erated register name matcher. Ugly, + // but efficient. + switch (Name.size()) { + default: break; + case 2: + if (Name[0] != CoprocOp) + return -1; + switch (Name[1]) { + default: return -1; + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + } + break; + case 3: + if (Name[0] != CoprocOp || Name[1] != '1') + return -1; + switch (Name[2]) { + default: return -1; + case '0': return 10; + case '1': return 11; + case '2': return 12; + case '3': return 13; + case '4': return 14; + case '5': return 15; + } + break; + } + + return -1; +} + +/// parseITCondCode - Try to parse a condition code for an IT instruction. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (!Tok.is(AsmToken::Identifier)) + return MatchOperand_NoMatch; + unsigned CC = StringSwitch<unsigned>(Tok.getString()) + .Case("eq", ARMCC::EQ) + .Case("ne", ARMCC::NE) + .Case("hs", ARMCC::HS) + .Case("cs", ARMCC::HS) + .Case("lo", ARMCC::LO) + .Case("cc", ARMCC::LO) + .Case("mi", ARMCC::MI) + .Case("pl", ARMCC::PL) + .Case("vs", ARMCC::VS) + .Case("vc", ARMCC::VC) + .Case("hi", ARMCC::HI) + .Case("ls", ARMCC::LS) + .Case("ge", ARMCC::GE) + .Case("lt", ARMCC::LT) + .Case("gt", ARMCC::GT) + .Case("le", ARMCC::LE) + .Case("al", ARMCC::AL) + .Default(~0U); + if (CC == ~0U) + return MatchOperand_NoMatch; + Parser.Lex(); // Eat the token. + + Operands.push_back(ARMOperand::CreateCondCode(ARMCC::CondCodes(CC), S)); + + return MatchOperand_Success; +} + +/// parseCoprocNumOperand - Try to parse an coprocessor number operand. The +/// token must be an Identifier when called, and if it is a coprocessor +/// number, the token is eaten and the operand is added to the operand list. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) + return MatchOperand_NoMatch; + + int Num = MatchCoprocessorOperandName(Tok.getString(), 'p'); + if (Num == -1) + return MatchOperand_NoMatch; + + Parser.Lex(); // Eat identifier token. + Operands.push_back(ARMOperand::CreateCoprocNum(Num, S)); + return MatchOperand_Success; +} + +/// parseCoprocRegOperand - Try to parse an coprocessor register operand. The +/// token must be an Identifier when called, and if it is a coprocessor +/// number, the token is eaten and the operand is added to the operand list. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) + return MatchOperand_NoMatch; + + int Reg = MatchCoprocessorOperandName(Tok.getString(), 'c'); + if (Reg == -1) + return MatchOperand_NoMatch; + + Parser.Lex(); // Eat identifier token. + Operands.push_back(ARMOperand::CreateCoprocReg(Reg, S)); + return MatchOperand_Success; +} + +/// parseCoprocOptionOperand - Try to parse an coprocessor option operand. +/// coproc_option : '{' imm0_255 '}' +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseCoprocOptionOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + + // If this isn't a '{', this isn't a coprocessor immediate operand. + if (Parser.getTok().isNot(AsmToken::LCurly)) + return MatchOperand_NoMatch; + Parser.Lex(); // Eat the '{' + + const MCExpr *Expr; + SMLoc Loc = Parser.getTok().getLoc(); + if (getParser().ParseExpression(Expr)) { + Error(Loc, "illegal expression"); + return MatchOperand_ParseFail; + } + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr); + if (!CE || CE->getValue() < 0 || CE->getValue() > 255) { + Error(Loc, "coprocessor option must be an immediate in range [0, 255]"); + return MatchOperand_ParseFail; + } + int Val = CE->getValue(); + + // Check for and consume the closing '}' + if (Parser.getTok().isNot(AsmToken::RCurly)) + return MatchOperand_ParseFail; + SMLoc E = Parser.getTok().getLoc(); + Parser.Lex(); // Eat the '}' + + Operands.push_back(ARMOperand::CreateCoprocOption(Val, S, E)); + return MatchOperand_Success; +} + +// For register list parsing, we need to map from raw GPR register numbering +// to the enumeration values. The enumeration values aren't sorted by +// register number due to our using "sp", "lr" and "pc" as canonical names. +static unsigned getNextRegister(unsigned Reg) { + // If this is a GPR, we need to do it manually, otherwise we can rely + // on the sort ordering of the enumeration since the other reg-classes + // are sane. + if (!ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) + return Reg + 1; + switch(Reg) { + default: assert(0 && "Invalid GPR number!"); + case ARM::R0: return ARM::R1; case ARM::R1: return ARM::R2; + case ARM::R2: return ARM::R3; case ARM::R3: return ARM::R4; + case ARM::R4: return ARM::R5; case ARM::R5: return ARM::R6; + case ARM::R6: return ARM::R7; case ARM::R7: return ARM::R8; + case ARM::R8: return ARM::R9; case ARM::R9: return ARM::R10; + case ARM::R10: return ARM::R11; case ARM::R11: return ARM::R12; + case ARM::R12: return ARM::SP; case ARM::SP: return ARM::LR; + case ARM::LR: return ARM::PC; case ARM::PC: return ARM::R0; + } +} + +/// Parse a register list. +bool ARMAsmParser:: +parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + assert(Parser.getTok().is(AsmToken::LCurly) && + "Token is not a Left Curly Brace"); + SMLoc S = Parser.getTok().getLoc(); + Parser.Lex(); // Eat '{' token. + SMLoc RegLoc = Parser.getTok().getLoc(); + + // Check the first register in the list to see what register class + // this is a list of. + int Reg = tryParseRegister(); + if (Reg == -1) + return Error(RegLoc, "register expected"); + + MCRegisterClass *RC; + if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) + RC = &ARMMCRegisterClasses[ARM::GPRRegClassID]; + else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) + RC = &ARMMCRegisterClasses[ARM::DPRRegClassID]; + else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg)) + RC = &ARMMCRegisterClasses[ARM::SPRRegClassID]; + else + return Error(RegLoc, "invalid register in register list"); + + // The reglist instructions have at most 16 registers, so reserve + // space for that many. + SmallVector<std::pair<unsigned, SMLoc>, 16> Registers; + // Store the first register. + Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc)); + + // This starts immediately after the first register token in the list, + // so we can see either a comma or a minus (range separator) as a legal + // next token. + while (Parser.getTok().is(AsmToken::Comma) || + Parser.getTok().is(AsmToken::Minus)) { + if (Parser.getTok().is(AsmToken::Minus)) { + Parser.Lex(); // Eat the comma. + SMLoc EndLoc = Parser.getTok().getLoc(); + int EndReg = tryParseRegister(); + if (EndReg == -1) + return Error(EndLoc, "register expected"); + // If the register is the same as the start reg, there's nothing + // more to do. + if (Reg == EndReg) + continue; + // The register must be in the same register class as the first. + if (!RC->contains(EndReg)) + return Error(EndLoc, "invalid register in register list"); + // Ranges must go from low to high. + if (getARMRegisterNumbering(Reg) > getARMRegisterNumbering(EndReg)) + return Error(EndLoc, "bad range in register list"); + + // Add all the registers in the range to the register list. + while (Reg != EndReg) { + Reg = getNextRegister(Reg); + Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc)); + } + continue; + } + Parser.Lex(); // Eat the comma. + RegLoc = Parser.getTok().getLoc(); + int OldReg = Reg; + Reg = tryParseRegister(); + if (Reg == -1) + return Error(RegLoc, "register expected"); + // The register must be in the same register class as the first. + if (!RC->contains(Reg)) + return Error(RegLoc, "invalid register in register list"); + // List must be monotonically increasing. + if (getARMRegisterNumbering(Reg) <= getARMRegisterNumbering(OldReg)) + return Error(RegLoc, "register list not in ascending order"); + // VFP register lists must also be contiguous. + // It's OK to use the enumeration values directly here rather, as the + // VFP register classes have the enum sorted properly. + if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] && + Reg != OldReg + 1) + return Error(RegLoc, "non-contiguous register range"); + Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc)); + } + + SMLoc E = Parser.getTok().getLoc(); + if (Parser.getTok().isNot(AsmToken::RCurly)) + return Error(E, "'}' expected"); + Parser.Lex(); // Eat '}' token. + + Operands.push_back(ARMOperand::CreateRegList(Registers, S, E)); + return false; +} + +/// parseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + StringRef OptStr = Tok.getString(); + + unsigned Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size())) + .Case("sy", ARM_MB::SY) + .Case("st", ARM_MB::ST) + .Case("sh", ARM_MB::ISH) + .Case("ish", ARM_MB::ISH) + .Case("shst", ARM_MB::ISHST) + .Case("ishst", ARM_MB::ISHST) + .Case("nsh", ARM_MB::NSH) + .Case("un", ARM_MB::NSH) + .Case("nshst", ARM_MB::NSHST) + .Case("unst", ARM_MB::NSHST) + .Case("osh", ARM_MB::OSH) + .Case("oshst", ARM_MB::OSHST) + .Default(~0U); + + if (Opt == ~0U) + return MatchOperand_NoMatch; + + Parser.Lex(); // Eat identifier token. + Operands.push_back(ARMOperand::CreateMemBarrierOpt((ARM_MB::MemBOpt)Opt, S)); + return MatchOperand_Success; +} + +/// parseProcIFlagsOperand - Try to parse iflags from CPS instruction. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + StringRef IFlagsStr = Tok.getString(); + + // An iflags string of "none" is interpreted to mean that none of the AIF + // bits are set. Not a terribly useful instruction, but a valid encoding. + unsigned IFlags = 0; + if (IFlagsStr != "none") { + for (int i = 0, e = IFlagsStr.size(); i != e; ++i) { + unsigned Flag = StringSwitch<unsigned>(IFlagsStr.substr(i, 1)) + .Case("a", ARM_PROC::A) + .Case("i", ARM_PROC::I) + .Case("f", ARM_PROC::F) + .Default(~0U); + + // If some specific iflag is already set, it means that some letter is + // present more than once, this is not acceptable. + if (Flag == ~0U || (IFlags & Flag)) + return MatchOperand_NoMatch; + + IFlags |= Flag; + } + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(ARMOperand::CreateProcIFlags((ARM_PROC::IFlags)IFlags, S)); + return MatchOperand_Success; +} + +/// parseMSRMaskOperand - Try to parse mask flags from MSR instruction. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + StringRef Mask = Tok.getString(); + + if (isMClass()) { + // See ARMv6-M 10.1.1 + unsigned FlagsVal = StringSwitch<unsigned>(Mask) + .Case("apsr", 0) + .Case("iapsr", 1) + .Case("eapsr", 2) + .Case("xpsr", 3) + .Case("ipsr", 5) + .Case("epsr", 6) + .Case("iepsr", 7) + .Case("msp", 8) + .Case("psp", 9) + .Case("primask", 16) + .Case("basepri", 17) + .Case("basepri_max", 18) + .Case("faultmask", 19) + .Case("control", 20) + .Default(~0U); + + if (FlagsVal == ~0U) + return MatchOperand_NoMatch; + + if (!hasV7Ops() && FlagsVal >= 17 && FlagsVal <= 19) + // basepri, basepri_max and faultmask only valid for V7m. + return MatchOperand_NoMatch; + + Parser.Lex(); // Eat identifier token. + Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S)); + return MatchOperand_Success; + } + + // Split spec_reg from flag, example: CPSR_sxf => "CPSR" and "sxf" + size_t Start = 0, Next = Mask.find('_'); + StringRef Flags = ""; + std::string SpecReg = LowercaseString(Mask.slice(Start, Next)); + if (Next != StringRef::npos) + Flags = Mask.slice(Next+1, Mask.size()); + + // FlagsVal contains the complete mask: + // 3-0: Mask + // 4: Special Reg (cpsr, apsr => 0; spsr => 1) + unsigned FlagsVal = 0; + + if (SpecReg == "apsr") { + FlagsVal = StringSwitch<unsigned>(Flags) + .Case("nzcvq", 0x8) // same as CPSR_f + .Case("g", 0x4) // same as CPSR_s + .Case("nzcvqg", 0xc) // same as CPSR_fs + .Default(~0U); + + if (FlagsVal == ~0U) { + if (!Flags.empty()) + return MatchOperand_NoMatch; + else + FlagsVal = 8; // No flag + } + } else if (SpecReg == "cpsr" || SpecReg == "spsr") { + if (Flags == "all") // cpsr_all is an alias for cpsr_fc + Flags = "fc"; + for (int i = 0, e = Flags.size(); i != e; ++i) { + unsigned Flag = StringSwitch<unsigned>(Flags.substr(i, 1)) + .Case("c", 1) + .Case("x", 2) + .Case("s", 4) + .Case("f", 8) + .Default(~0U); + + // If some specific flag is already set, it means that some letter is + // present more than once, this is not acceptable. + if (FlagsVal == ~0U || (FlagsVal & Flag)) + return MatchOperand_NoMatch; + FlagsVal |= Flag; + } + } else // No match for special register. + return MatchOperand_NoMatch; + + // Special register without flags are equivalent to "fc" flags. + if (!FlagsVal) + FlagsVal = 0x9; + + // Bit 4: Special Reg (cpsr, apsr => 0; spsr => 1) + if (SpecReg == "spsr") + FlagsVal |= 16; + + Parser.Lex(); // Eat identifier token. + Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S)); + return MatchOperand_Success; +} + +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands, StringRef Op, + int Low, int High) { + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) { + Error(Parser.getTok().getLoc(), Op + " operand expected."); + return MatchOperand_ParseFail; + } + StringRef ShiftName = Tok.getString(); + std::string LowerOp = LowercaseString(Op); + std::string UpperOp = UppercaseString(Op); + if (ShiftName != LowerOp && ShiftName != UpperOp) { + Error(Parser.getTok().getLoc(), Op + " operand expected."); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat shift type token. + + // There must be a '#' and a shift amount. + if (Parser.getTok().isNot(AsmToken::Hash)) { + Error(Parser.getTok().getLoc(), "'#' expected"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat hash token. + + const MCExpr *ShiftAmount; + SMLoc Loc = Parser.getTok().getLoc(); + if (getParser().ParseExpression(ShiftAmount)) { + Error(Loc, "illegal expression"); + return MatchOperand_ParseFail; + } + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount); + if (!CE) { + Error(Loc, "constant expression expected"); + return MatchOperand_ParseFail; + } + int Val = CE->getValue(); + if (Val < Low || Val > High) { + Error(Loc, "immediate value out of range"); + return MatchOperand_ParseFail; + } + + Operands.push_back(ARMOperand::CreateImm(CE, Loc, Parser.getTok().getLoc())); + + return MatchOperand_Success; +} + +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + const AsmToken &Tok = Parser.getTok(); + SMLoc S = Tok.getLoc(); + if (Tok.isNot(AsmToken::Identifier)) { + Error(Tok.getLoc(), "'be' or 'le' operand expected"); + return MatchOperand_ParseFail; + } + int Val = StringSwitch<int>(Tok.getString()) + .Case("be", 1) + .Case("le", 0) + .Default(-1); + Parser.Lex(); // Eat the token. + + if (Val == -1) { + Error(Tok.getLoc(), "'be' or 'le' operand expected"); + return MatchOperand_ParseFail; + } + Operands.push_back(ARMOperand::CreateImm(MCConstantExpr::Create(Val, + getContext()), + S, Parser.getTok().getLoc())); + return MatchOperand_Success; +} + +/// parseShifterImm - Parse the shifter immediate operand for SSAT/USAT +/// instructions. Legal values are: +/// lsl #n 'n' in [0,31] +/// asr #n 'n' in [1,32] +/// n == 32 encoded as n == 0. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + const AsmToken &Tok = Parser.getTok(); + SMLoc S = Tok.getLoc(); + if (Tok.isNot(AsmToken::Identifier)) { + Error(S, "shift operator 'asr' or 'lsl' expected"); + return MatchOperand_ParseFail; + } + StringRef ShiftName = Tok.getString(); + bool isASR; + if (ShiftName == "lsl" || ShiftName == "LSL") + isASR = false; + else if (ShiftName == "asr" || ShiftName == "ASR") + isASR = true; + else { + Error(S, "shift operator 'asr' or 'lsl' expected"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat the operator. + + // A '#' and a shift amount. + if (Parser.getTok().isNot(AsmToken::Hash)) { + Error(Parser.getTok().getLoc(), "'#' expected"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat hash token. + + const MCExpr *ShiftAmount; + SMLoc E = Parser.getTok().getLoc(); + if (getParser().ParseExpression(ShiftAmount)) { + Error(E, "malformed shift expression"); + return MatchOperand_ParseFail; + } + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount); + if (!CE) { + Error(E, "shift amount must be an immediate"); + return MatchOperand_ParseFail; + } + + int64_t Val = CE->getValue(); + if (isASR) { + // Shift amount must be in [1,32] + if (Val < 1 || Val > 32) { + Error(E, "'asr' shift amount must be in range [1,32]"); + return MatchOperand_ParseFail; + } + // asr #32 encoded as asr #0, but is not allowed in Thumb2 mode. + if (isThumb() && Val == 32) { + Error(E, "'asr #32' shift amount not allowed in Thumb mode"); + return MatchOperand_ParseFail; + } + if (Val == 32) Val = 0; + } else { + // Shift amount must be in [1,32] + if (Val < 0 || Val > 31) { + Error(E, "'lsr' shift amount must be in range [0,31]"); + return MatchOperand_ParseFail; + } + } + + E = Parser.getTok().getLoc(); + Operands.push_back(ARMOperand::CreateShifterImm(isASR, Val, S, E)); + + return MatchOperand_Success; +} + +/// parseRotImm - Parse the shifter immediate operand for SXTB/UXTB family +/// of instructions. Legal values are: +/// ror #n 'n' in {0, 8, 16, 24} +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseRotImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + const AsmToken &Tok = Parser.getTok(); + SMLoc S = Tok.getLoc(); + if (Tok.isNot(AsmToken::Identifier)) + return MatchOperand_NoMatch; + StringRef ShiftName = Tok.getString(); + if (ShiftName != "ror" && ShiftName != "ROR") + return MatchOperand_NoMatch; + Parser.Lex(); // Eat the operator. + + // A '#' and a rotate amount. + if (Parser.getTok().isNot(AsmToken::Hash)) { + Error(Parser.getTok().getLoc(), "'#' expected"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat hash token. + + const MCExpr *ShiftAmount; + SMLoc E = Parser.getTok().getLoc(); + if (getParser().ParseExpression(ShiftAmount)) { + Error(E, "malformed rotate expression"); + return MatchOperand_ParseFail; + } + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount); + if (!CE) { + Error(E, "rotate amount must be an immediate"); + return MatchOperand_ParseFail; + } + + int64_t Val = CE->getValue(); + // Shift amount must be in {0, 8, 16, 24} (0 is undocumented extension) + // normally, zero is represented in asm by omitting the rotate operand + // entirely. + if (Val != 8 && Val != 16 && Val != 24 && Val != 0) { + Error(E, "'ror' rotate amount must be 8, 16, or 24"); + return MatchOperand_ParseFail; + } + + E = Parser.getTok().getLoc(); + Operands.push_back(ARMOperand::CreateRotImm(Val, S, E)); + + return MatchOperand_Success; +} + +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseBitfield(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + // The bitfield descriptor is really two operands, the LSB and the width. + if (Parser.getTok().isNot(AsmToken::Hash)) { + Error(Parser.getTok().getLoc(), "'#' expected"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat hash token. + + const MCExpr *LSBExpr; + SMLoc E = Parser.getTok().getLoc(); + if (getParser().ParseExpression(LSBExpr)) { + Error(E, "malformed immediate expression"); + return MatchOperand_ParseFail; + } + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LSBExpr); + if (!CE) { + Error(E, "'lsb' operand must be an immediate"); + return MatchOperand_ParseFail; + } + + int64_t LSB = CE->getValue(); + // The LSB must be in the range [0,31] + if (LSB < 0 || LSB > 31) { + Error(E, "'lsb' operand must be in the range [0,31]"); + return MatchOperand_ParseFail; + } + E = Parser.getTok().getLoc(); + + // Expect another immediate operand. + if (Parser.getTok().isNot(AsmToken::Comma)) { + Error(Parser.getTok().getLoc(), "too few operands"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat hash token. + if (Parser.getTok().isNot(AsmToken::Hash)) { + Error(Parser.getTok().getLoc(), "'#' expected"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat hash token. + + const MCExpr *WidthExpr; + if (getParser().ParseExpression(WidthExpr)) { + Error(E, "malformed immediate expression"); + return MatchOperand_ParseFail; + } + CE = dyn_cast<MCConstantExpr>(WidthExpr); + if (!CE) { + Error(E, "'width' operand must be an immediate"); + return MatchOperand_ParseFail; + } + + int64_t Width = CE->getValue(); + // The LSB must be in the range [1,32-lsb] + if (Width < 1 || Width > 32 - LSB) { + Error(E, "'width' operand must be in the range [1,32-lsb]"); + return MatchOperand_ParseFail; + } + E = Parser.getTok().getLoc(); + + Operands.push_back(ARMOperand::CreateBitfield(LSB, Width, S, E)); + + return MatchOperand_Success; +} + +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Check for a post-index addressing register operand. Specifically: + // postidx_reg := '+' register {, shift} + // | '-' register {, shift} + // | register {, shift} + + // This method must return MatchOperand_NoMatch without consuming any tokens + // in the case where there is no match, as other alternatives take other + // parse methods. + AsmToken Tok = Parser.getTok(); + SMLoc S = Tok.getLoc(); + bool haveEaten = false; + bool isAdd = true; + int Reg = -1; + if (Tok.is(AsmToken::Plus)) { + Parser.Lex(); // Eat the '+' token. + haveEaten = true; + } else if (Tok.is(AsmToken::Minus)) { + Parser.Lex(); // Eat the '-' token. + isAdd = false; + haveEaten = true; + } + if (Parser.getTok().is(AsmToken::Identifier)) + Reg = tryParseRegister(); + if (Reg == -1) { + if (!haveEaten) + return MatchOperand_NoMatch; + Error(Parser.getTok().getLoc(), "register expected"); + return MatchOperand_ParseFail; + } + SMLoc E = Parser.getTok().getLoc(); + + ARM_AM::ShiftOpc ShiftTy = ARM_AM::no_shift; + unsigned ShiftImm = 0; + if (Parser.getTok().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the ','. + if (parseMemRegOffsetShift(ShiftTy, ShiftImm)) + return MatchOperand_ParseFail; + } + + Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ShiftTy, + ShiftImm, S, E)); + + return MatchOperand_Success; +} + +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Check for a post-index addressing register operand. Specifically: + // am3offset := '+' register + // | '-' register + // | register + // | # imm + // | # + imm + // | # - imm + + // This method must return MatchOperand_NoMatch without consuming any tokens + // in the case where there is no match, as other alternatives take other + // parse methods. + AsmToken Tok = Parser.getTok(); + SMLoc S = Tok.getLoc(); + + // Do immediates first, as we always parse those if we have a '#'. + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat the '#'. + // Explicitly look for a '-', as we need to encode negative zero + // differently. + bool isNegative = Parser.getTok().is(AsmToken::Minus); + const MCExpr *Offset; + if (getParser().ParseExpression(Offset)) + return MatchOperand_ParseFail; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset); + if (!CE) { + Error(S, "constant expression expected"); + return MatchOperand_ParseFail; + } + SMLoc E = Tok.getLoc(); + // Negative zero is encoded as the flag value INT32_MIN. + int32_t Val = CE->getValue(); + if (isNegative && Val == 0) + Val = INT32_MIN; + + Operands.push_back( + ARMOperand::CreateImm(MCConstantExpr::Create(Val, getContext()), S, E)); + + return MatchOperand_Success; + } + + + bool haveEaten = false; + bool isAdd = true; + int Reg = -1; + if (Tok.is(AsmToken::Plus)) { + Parser.Lex(); // Eat the '+' token. + haveEaten = true; + } else if (Tok.is(AsmToken::Minus)) { + Parser.Lex(); // Eat the '-' token. + isAdd = false; + haveEaten = true; + } + if (Parser.getTok().is(AsmToken::Identifier)) + Reg = tryParseRegister(); + if (Reg == -1) { + if (!haveEaten) + return MatchOperand_NoMatch; + Error(Parser.getTok().getLoc(), "register expected"); + return MatchOperand_ParseFail; + } + SMLoc E = Parser.getTok().getLoc(); + + Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ARM_AM::no_shift, + 0, S, E)); + + return MatchOperand_Success; +} + +/// cvtT2LdrdPre - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtT2LdrdPre(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Rt, Rt2 + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1); + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateReg(0)); + // addr + ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2); + // pred + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtT2StrdPre - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtT2StrdPre(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateReg(0)); + // Rt, Rt2 + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1); + // addr + ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2); + // pred + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtLdWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + + ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2); + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtStWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2); + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + + ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3); + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtLdWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + + ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2); + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + + +/// cvtStWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2); + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3); + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3); + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtLdExtTWriteBackImm - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Rt + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + // addr + ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1); + // offset + ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1); + // pred + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtLdExtTWriteBackReg - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Rt + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + // addr + ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1); + // offset + ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2); + // pred + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtStExtTWriteBackImm - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + // Rt + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + // addr + ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1); + // offset + ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1); + // pred + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtStExtTWriteBackReg - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + // Rt + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + // addr + ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1); + // offset + ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2); + // pred + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtLdrdPre - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtLdrdPre(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Rt, Rt2 + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1); + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + // addr + ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3); + // pred + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtStrdPre - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtStrdPre(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + // Rt, Rt2 + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1); + // addr + ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3); + // pred + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1); + // Create a writeback register dummy placeholder. + Inst.addOperand(MCOperand::CreateImm(0)); + ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3); + ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2); + return true; +} + +/// cvtThumbMultiple- Convert parsed operands to MCInst. +/// Needed here because the Asm Gen Matcher can't handle properly tied operands +/// when they refer multiple MIOperands inside a single one. +bool ARMAsmParser:: +cvtThumbMultiply(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // The second source operand must be the same register as the destination + // operand. + if (Operands.size() == 6 && + (((ARMOperand*)Operands[3])->getReg() != + ((ARMOperand*)Operands[5])->getReg()) && + (((ARMOperand*)Operands[3])->getReg() != + ((ARMOperand*)Operands[4])->getReg())) { + Error(Operands[3]->getStartLoc(), + "destination register must match source register"); + return false; + } + ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1); + ((ARMOperand*)Operands[1])->addCCOutOperands(Inst, 1); + ((ARMOperand*)Operands[4])->addRegOperands(Inst, 1); + // If we have a three-operand form, use that, else the second source operand + // is just the destination operand again. + if (Operands.size() == 6) + ((ARMOperand*)Operands[5])->addRegOperands(Inst, 1); + else + Inst.addOperand(Inst.getOperand(0)); + ((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2); + + return true; +} + +/// Parse an ARM memory expression, return false if successful else return true +/// or an error. The first token must be a '[' when called. +bool ARMAsmParser:: +parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S, E; + assert(Parser.getTok().is(AsmToken::LBrac) && + "Token is not a Left Bracket"); + S = Parser.getTok().getLoc(); + Parser.Lex(); // Eat left bracket token. + + const AsmToken &BaseRegTok = Parser.getTok(); + int BaseRegNum = tryParseRegister(); + if (BaseRegNum == -1) + return Error(BaseRegTok.getLoc(), "register expected"); + + // The next token must either be a comma or a closing bracket. + const AsmToken &Tok = Parser.getTok(); + if (!Tok.is(AsmToken::Comma) && !Tok.is(AsmToken::RBrac)) + return Error(Tok.getLoc(), "malformed memory operand"); + + if (Tok.is(AsmToken::RBrac)) { + E = Tok.getLoc(); + Parser.Lex(); // Eat right bracket token. + + Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, 0, ARM_AM::no_shift, + 0, 0, false, S, E)); + + // If there's a pre-indexing writeback marker, '!', just add it as a token + // operand. It's rather odd, but syntactically valid. + if (Parser.getTok().is(AsmToken::Exclaim)) { + Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc())); + Parser.Lex(); // Eat the '!'. + } + + return false; + } + + assert(Tok.is(AsmToken::Comma) && "Lost comma in memory operand?!"); + Parser.Lex(); // Eat the comma. + + // If we have a ':', it's an alignment specifier. + if (Parser.getTok().is(AsmToken::Colon)) { + Parser.Lex(); // Eat the ':'. + E = Parser.getTok().getLoc(); + + const MCExpr *Expr; + if (getParser().ParseExpression(Expr)) + return true; + + // The expression has to be a constant. Memory references with relocations + // don't come through here, as they use the <label> forms of the relevant + // instructions. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr); + if (!CE) + return Error (E, "constant expression expected"); + + unsigned Align = 0; + switch (CE->getValue()) { + default: + return Error(E, "alignment specifier must be 64, 128, or 256 bits"); + case 64: Align = 8; break; + case 128: Align = 16; break; + case 256: Align = 32; break; + } + + // Now we should have the closing ']' + E = Parser.getTok().getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) + return Error(E, "']' expected"); + Parser.Lex(); // Eat right bracket token. + + // Don't worry about range checking the value here. That's handled by + // the is*() predicates. + Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, 0, + ARM_AM::no_shift, 0, Align, + false, S, E)); + + // If there's a pre-indexing writeback marker, '!', just add it as a token + // operand. + if (Parser.getTok().is(AsmToken::Exclaim)) { + Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc())); + Parser.Lex(); // Eat the '!'. + } + + return false; + } + + // If we have a '#', it's an immediate offset, else assume it's a register + // offset. + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat the '#'. + E = Parser.getTok().getLoc(); + + bool isNegative = getParser().getTok().is(AsmToken::Minus); + const MCExpr *Offset; + if (getParser().ParseExpression(Offset)) + return true; + + // The expression has to be a constant. Memory references with relocations + // don't come through here, as they use the <label> forms of the relevant + // instructions. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset); + if (!CE) + return Error (E, "constant expression expected"); + + // If the constant was #-0, represent it as INT32_MIN. + int32_t Val = CE->getValue(); + if (isNegative && Val == 0) + CE = MCConstantExpr::Create(INT32_MIN, getContext()); + + // Now we should have the closing ']' + E = Parser.getTok().getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) + return Error(E, "']' expected"); + Parser.Lex(); // Eat right bracket token. + + // Don't worry about range checking the value here. That's handled by + // the is*() predicates. + Operands.push_back(ARMOperand::CreateMem(BaseRegNum, CE, 0, + ARM_AM::no_shift, 0, 0, + false, S, E)); + + // If there's a pre-indexing writeback marker, '!', just add it as a token + // operand. + if (Parser.getTok().is(AsmToken::Exclaim)) { + Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc())); + Parser.Lex(); // Eat the '!'. + } + + return false; + } + + // The register offset is optionally preceded by a '+' or '-' + bool isNegative = false; + if (Parser.getTok().is(AsmToken::Minus)) { + isNegative = true; + Parser.Lex(); // Eat the '-'. + } else if (Parser.getTok().is(AsmToken::Plus)) { + // Nothing to do. + Parser.Lex(); // Eat the '+'. + } + + E = Parser.getTok().getLoc(); + int OffsetRegNum = tryParseRegister(); + if (OffsetRegNum == -1) + return Error(E, "register expected"); + + // If there's a shift operator, handle it. + ARM_AM::ShiftOpc ShiftType = ARM_AM::no_shift; + unsigned ShiftImm = 0; + if (Parser.getTok().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the ','. + if (parseMemRegOffsetShift(ShiftType, ShiftImm)) + return true; + } + + // Now we should have the closing ']' + E = Parser.getTok().getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) + return Error(E, "']' expected"); + Parser.Lex(); // Eat right bracket token. + + Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, OffsetRegNum, + ShiftType, ShiftImm, 0, isNegative, + S, E)); + + // If there's a pre-indexing writeback marker, '!', just add it as a token + // operand. + if (Parser.getTok().is(AsmToken::Exclaim)) { + Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc())); + Parser.Lex(); // Eat the '!'. + } + + return false; +} + +/// parseMemRegOffsetShift - one of these two: +/// ( lsl | lsr | asr | ror ) , # shift_amount +/// rrx +/// return true if it parses a shift otherwise it returns false. +bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St, + unsigned &Amount) { + SMLoc Loc = Parser.getTok().getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) + return true; + StringRef ShiftName = Tok.getString(); + if (ShiftName == "lsl" || ShiftName == "LSL") + St = ARM_AM::lsl; + else if (ShiftName == "lsr" || ShiftName == "LSR") + St = ARM_AM::lsr; + else if (ShiftName == "asr" || ShiftName == "ASR") + St = ARM_AM::asr; + else if (ShiftName == "ror" || ShiftName == "ROR") + St = ARM_AM::ror; + else if (ShiftName == "rrx" || ShiftName == "RRX") + St = ARM_AM::rrx; + else + return Error(Loc, "illegal shift operator"); + Parser.Lex(); // Eat shift type token. + + // rrx stands alone. + Amount = 0; + if (St != ARM_AM::rrx) { + Loc = Parser.getTok().getLoc(); + // A '#' and a shift amount. + const AsmToken &HashTok = Parser.getTok(); + if (HashTok.isNot(AsmToken::Hash)) + return Error(HashTok.getLoc(), "'#' expected"); + Parser.Lex(); // Eat hash token. + + const MCExpr *Expr; + if (getParser().ParseExpression(Expr)) + return true; + // Range check the immediate. + // lsl, ror: 0 <= imm <= 31 + // lsr, asr: 0 <= imm <= 32 + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr); + if (!CE) + return Error(Loc, "shift amount must be an immediate"); + int64_t Imm = CE->getValue(); + if (Imm < 0 || + ((St == ARM_AM::lsl || St == ARM_AM::ror) && Imm > 31) || + ((St == ARM_AM::lsr || St == ARM_AM::asr) && Imm > 32)) + return Error(Loc, "immediate shift value out of range"); + Amount = Imm; + } + + return false; +} + +/// parseFPImm - A floating point immediate expression operand. +ARMAsmParser::OperandMatchResultTy ARMAsmParser:: +parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + + if (Parser.getTok().isNot(AsmToken::Hash)) + return MatchOperand_NoMatch; + Parser.Lex(); // Eat the '#'. + + // Handle negation, as that still comes through as a separate token. + bool isNegative = false; + if (Parser.getTok().is(AsmToken::Minus)) { + isNegative = true; + Parser.Lex(); + } + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Real)) { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + // If we had a '-' in front, toggle the sign bit. + IntVal ^= (uint64_t)isNegative << 63; + int Val = ARM_AM::getFP64Imm(APInt(64, IntVal)); + Parser.Lex(); // Eat the token. + if (Val == -1) { + TokError("floating point value out of range"); + return MatchOperand_ParseFail; + } + Operands.push_back(ARMOperand::CreateFPImm(Val, S, getContext())); + return MatchOperand_Success; + } + if (Tok.is(AsmToken::Integer)) { + int64_t Val = Tok.getIntVal(); + Parser.Lex(); // Eat the token. + if (Val > 255 || Val < 0) { + TokError("encoded floating point value out of range"); + return MatchOperand_ParseFail; + } + Operands.push_back(ARMOperand::CreateFPImm(Val, S, getContext())); + return MatchOperand_Success; + } + + TokError("invalid floating point immediate"); + return MatchOperand_ParseFail; +} +/// Parse a arm instruction operand. For now this parses the operand regardless +/// of the mnemonic. +bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, + StringRef Mnemonic) { + SMLoc S, E; + + // Check if the current operand has a custom associated parser, if so, try to + // custom parse the operand, or fallback to the general approach. + OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + if (ResTy == MatchOperand_Success) + return false; + // If there wasn't a custom match, try the generic matcher below. Otherwise, + // there was a match, but an error occurred, in which case, just return that + // the operand parsing failed. + if (ResTy == MatchOperand_ParseFail) + return true; + + switch (getLexer().getKind()) { + default: + Error(Parser.getTok().getLoc(), "unexpected token in operand"); + return true; + case AsmToken::Identifier: { + // If this is VMRS, check for the apsr_nzcv operand. + if (!tryParseRegisterWithWriteBack(Operands)) + return false; + int Res = tryParseShiftRegister(Operands); + if (Res == 0) // success + return false; + else if (Res == -1) // irrecoverable error + return true; + if (Mnemonic == "vmrs" && Parser.getTok().getString() == "apsr_nzcv") { + S = Parser.getTok().getLoc(); + Parser.Lex(); + Operands.push_back(ARMOperand::CreateToken("apsr_nzcv", S)); + return false; + } + + // Fall though for the Identifier case that is not a register or a + // special name. + } + case AsmToken::Integer: // things like 1f and 2b as a branch targets + case AsmToken::Dot: { // . as a branch target + // This was not a register so parse other operands that start with an + // identifier (like labels) as expressions and create them as immediates. + const MCExpr *IdVal; + S = Parser.getTok().getLoc(); + if (getParser().ParseExpression(IdVal)) + return true; + E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(ARMOperand::CreateImm(IdVal, S, E)); + return false; + } + case AsmToken::LBrac: + return parseMemory(Operands); + case AsmToken::LCurly: + return parseRegisterList(Operands); + case AsmToken::Hash: { + // #42 -> immediate. + // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate + S = Parser.getTok().getLoc(); + Parser.Lex(); + bool isNegative = Parser.getTok().is(AsmToken::Minus); + const MCExpr *ImmVal; + if (getParser().ParseExpression(ImmVal)) + return true; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal); + if (!CE) { + Error(S, "constant expression expected"); + return MatchOperand_ParseFail; + } + int32_t Val = CE->getValue(); + if (isNegative && Val == 0) + ImmVal = MCConstantExpr::Create(INT32_MIN, getContext()); + E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E)); + return false; + } + case AsmToken::Colon: { + // ":lower16:" and ":upper16:" expression prefixes + // FIXME: Check it's an expression prefix, + // e.g. (FOO - :lower16:BAR) isn't legal. + ARMMCExpr::VariantKind RefKind; + if (parsePrefix(RefKind)) + return true; + + const MCExpr *SubExprVal; + if (getParser().ParseExpression(SubExprVal)) + return true; + + const MCExpr *ExprVal = ARMMCExpr::Create(RefKind, SubExprVal, + getContext()); + E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E)); + return false; + } + } +} + +// parsePrefix - Parse ARM 16-bit relocations expression prefix, i.e. +// :lower16: and :upper16:. +bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) { + RefKind = ARMMCExpr::VK_ARM_None; + + // :lower16: and :upper16: modifiers + assert(getLexer().is(AsmToken::Colon) && "expected a :"); + Parser.Lex(); // Eat ':' + + if (getLexer().isNot(AsmToken::Identifier)) { + Error(Parser.getTok().getLoc(), "expected prefix identifier in operand"); + return true; + } + + StringRef IDVal = Parser.getTok().getIdentifier(); + if (IDVal == "lower16") { + RefKind = ARMMCExpr::VK_ARM_LO16; + } else if (IDVal == "upper16") { + RefKind = ARMMCExpr::VK_ARM_HI16; + } else { + Error(Parser.getTok().getLoc(), "unexpected prefix in operand"); + return true; + } + Parser.Lex(); + + if (getLexer().isNot(AsmToken::Colon)) { + Error(Parser.getTok().getLoc(), "unexpected token after prefix"); + return true; + } + Parser.Lex(); // Eat the last ':' + return false; +} + +/// \brief Given a mnemonic, split out possible predication code and carry +/// setting letters to form a canonical mnemonic and flags. +// +// FIXME: Would be nice to autogen this. +// FIXME: This is a bit of a maze of special cases. +StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, + unsigned &PredicationCode, + bool &CarrySetting, + unsigned &ProcessorIMod, + StringRef &ITMask) { + PredicationCode = ARMCC::AL; + CarrySetting = false; + ProcessorIMod = 0; + + // Ignore some mnemonics we know aren't predicated forms. + // + // FIXME: Would be nice to autogen this. + if ((Mnemonic == "movs" && isThumb()) || + Mnemonic == "teq" || Mnemonic == "vceq" || Mnemonic == "svc" || + Mnemonic == "mls" || Mnemonic == "smmls" || Mnemonic == "vcls" || + Mnemonic == "vmls" || Mnemonic == "vnmls" || Mnemonic == "vacge" || + Mnemonic == "vcge" || Mnemonic == "vclt" || Mnemonic == "vacgt" || + Mnemonic == "vcgt" || Mnemonic == "vcle" || Mnemonic == "smlal" || + Mnemonic == "umaal" || Mnemonic == "umlal" || Mnemonic == "vabal" || + Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal") + return Mnemonic; + + // First, split out any predication code. Ignore mnemonics we know aren't + // predicated but do have a carry-set and so weren't caught above. + if (Mnemonic != "adcs" && Mnemonic != "bics" && Mnemonic != "movs" && + Mnemonic != "muls" && Mnemonic != "smlals" && Mnemonic != "smulls" && + Mnemonic != "umlals" && Mnemonic != "umulls" && Mnemonic != "lsls" && + Mnemonic != "sbcs" && Mnemonic != "rscs") { + unsigned CC = StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2)) + .Case("eq", ARMCC::EQ) + .Case("ne", ARMCC::NE) + .Case("hs", ARMCC::HS) + .Case("cs", ARMCC::HS) + .Case("lo", ARMCC::LO) + .Case("cc", ARMCC::LO) + .Case("mi", ARMCC::MI) + .Case("pl", ARMCC::PL) + .Case("vs", ARMCC::VS) + .Case("vc", ARMCC::VC) + .Case("hi", ARMCC::HI) + .Case("ls", ARMCC::LS) + .Case("ge", ARMCC::GE) + .Case("lt", ARMCC::LT) + .Case("gt", ARMCC::GT) + .Case("le", ARMCC::LE) + .Case("al", ARMCC::AL) + .Default(~0U); + if (CC != ~0U) { + Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 2); + PredicationCode = CC; + } + } + + // Next, determine if we have a carry setting bit. We explicitly ignore all + // the instructions we know end in 's'. + if (Mnemonic.endswith("s") && + !(Mnemonic == "cps" || Mnemonic == "mls" || + Mnemonic == "mrs" || Mnemonic == "smmls" || Mnemonic == "vabs" || + Mnemonic == "vcls" || Mnemonic == "vmls" || Mnemonic == "vmrs" || + Mnemonic == "vnmls" || Mnemonic == "vqabs" || Mnemonic == "vrecps" || + Mnemonic == "vrsqrts" || Mnemonic == "srs" || + (Mnemonic == "movs" && isThumb()))) { + Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1); + CarrySetting = true; + } + + // The "cps" instruction can have a interrupt mode operand which is glued into + // the mnemonic. Check if this is the case, split it and parse the imod op + if (Mnemonic.startswith("cps")) { + // Split out any imod code. + unsigned IMod = + StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2, 2)) + .Case("ie", ARM_PROC::IE) + .Case("id", ARM_PROC::ID) + .Default(~0U); + if (IMod != ~0U) { + Mnemonic = Mnemonic.slice(0, Mnemonic.size()-2); + ProcessorIMod = IMod; + } + } + + // The "it" instruction has the condition mask on the end of the mnemonic. + if (Mnemonic.startswith("it")) { + ITMask = Mnemonic.slice(2, Mnemonic.size()); + Mnemonic = Mnemonic.slice(0, 2); + } + + return Mnemonic; +} + +/// \brief Given a canonical mnemonic, determine if the instruction ever allows +/// inclusion of carry set or predication code operands. +// +// FIXME: It would be nice to autogen this. +void ARMAsmParser:: +getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet, + bool &CanAcceptPredicationCode) { + if (Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" || + Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" || + Mnemonic == "add" || Mnemonic == "adc" || + Mnemonic == "mul" || Mnemonic == "bic" || Mnemonic == "asr" || + Mnemonic == "orr" || Mnemonic == "mvn" || + Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" || + Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" || + (!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" || + Mnemonic == "mla" || Mnemonic == "smlal" || + Mnemonic == "umlal" || Mnemonic == "umull"))) { + CanAcceptCarrySet = true; + } else + CanAcceptCarrySet = false; + + if (Mnemonic == "cbnz" || Mnemonic == "setend" || Mnemonic == "dmb" || + Mnemonic == "cps" || Mnemonic == "mcr2" || Mnemonic == "it" || + Mnemonic == "mcrr2" || Mnemonic == "cbz" || Mnemonic == "cdp2" || + Mnemonic == "trap" || Mnemonic == "mrc2" || Mnemonic == "mrrc2" || + Mnemonic == "dsb" || Mnemonic == "isb" || Mnemonic == "setend" || + (Mnemonic == "clrex" && !isThumb()) || + (Mnemonic == "nop" && isThumbOne()) || + ((Mnemonic == "pld" || Mnemonic == "pli" || Mnemonic == "pldw" || + Mnemonic == "ldc2" || Mnemonic == "ldc2l" || + Mnemonic == "stc2" || Mnemonic == "stc2l") && !isThumb()) || + ((Mnemonic.startswith("rfe") || Mnemonic.startswith("srs")) && + !isThumb()) || + Mnemonic.startswith("cps") || (Mnemonic == "movs" && isThumbOne())) { + CanAcceptPredicationCode = false; + } else + CanAcceptPredicationCode = true; + + if (isThumb()) { + if (Mnemonic == "bkpt" || Mnemonic == "mcr" || Mnemonic == "mcrr" || + Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp") + CanAcceptPredicationCode = false; + } +} + +bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // FIXME: This is all horribly hacky. We really need a better way to deal + // with optional operands like this in the matcher table. + + // The 'mov' mnemonic is special. One variant has a cc_out operand, while + // another does not. Specifically, the MOVW instruction does not. So we + // special case it here and remove the defaulted (non-setting) cc_out + // operand if that's the instruction we're trying to match. + // + // We do this as post-processing of the explicit operands rather than just + // conditionally adding the cc_out in the first place because we need + // to check the type of the parsed immediate operand. + if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() && + !static_cast<ARMOperand*>(Operands[4])->isARMSOImm() && + static_cast<ARMOperand*>(Operands[4])->isImm0_65535Expr() && + static_cast<ARMOperand*>(Operands[1])->getReg() == 0) + return true; + + // Register-register 'add' for thumb does not have a cc_out operand + // when there are only two register operands. + if (isThumb() && Mnemonic == "add" && Operands.size() == 5 && + static_cast<ARMOperand*>(Operands[3])->isReg() && + static_cast<ARMOperand*>(Operands[4])->isReg() && + static_cast<ARMOperand*>(Operands[1])->getReg() == 0) + return true; + // Register-register 'add' for thumb does not have a cc_out operand + // when it's an ADD Rdm, SP, {Rdm|#imm0_255} instruction. We do + // have to check the immediate range here since Thumb2 has a variant + // that can handle a different range and has a cc_out operand. + if (((isThumb() && Mnemonic == "add") || + (isThumbTwo() && Mnemonic == "sub")) && + Operands.size() == 6 && + static_cast<ARMOperand*>(Operands[3])->isReg() && + static_cast<ARMOperand*>(Operands[4])->isReg() && + static_cast<ARMOperand*>(Operands[4])->getReg() == ARM::SP && + static_cast<ARMOperand*>(Operands[1])->getReg() == 0 && + (static_cast<ARMOperand*>(Operands[5])->isReg() || + static_cast<ARMOperand*>(Operands[5])->isImm0_1020s4())) + return true; + // For Thumb2, add/sub immediate does not have a cc_out operand for the + // imm0_4095 variant. That's the least-preferred variant when + // selecting via the generic "add" mnemonic, so to know that we + // should remove the cc_out operand, we have to explicitly check that + // it's not one of the other variants. Ugh. + if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") && + Operands.size() == 6 && + static_cast<ARMOperand*>(Operands[3])->isReg() && + static_cast<ARMOperand*>(Operands[4])->isReg() && + static_cast<ARMOperand*>(Operands[5])->isImm()) { + // Nest conditions rather than one big 'if' statement for readability. + // + // If either register is a high reg, it's either one of the SP + // variants (handled above) or a 32-bit encoding, so we just + // check against T3. + if ((!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) || + !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg())) && + static_cast<ARMOperand*>(Operands[5])->isT2SOImm()) + return false; + // If both registers are low, we're in an IT block, and the immediate is + // in range, we should use encoding T1 instead, which has a cc_out. + if (inITBlock() && + isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) && + isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) && + static_cast<ARMOperand*>(Operands[5])->isImm0_7()) + return false; + + // Otherwise, we use encoding T4, which does not have a cc_out + // operand. + return true; + } + + // The thumb2 multiply instruction doesn't have a CCOut register, so + // if we have a "mul" mnemonic in Thumb mode, check if we'll be able to + // use the 16-bit encoding or not. + if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 && + static_cast<ARMOperand*>(Operands[1])->getReg() == 0 && + static_cast<ARMOperand*>(Operands[3])->isReg() && + static_cast<ARMOperand*>(Operands[4])->isReg() && + static_cast<ARMOperand*>(Operands[5])->isReg() && + // If the registers aren't low regs, the destination reg isn't the + // same as one of the source regs, or the cc_out operand is zero + // outside of an IT block, we have to use the 32-bit encoding, so + // remove the cc_out operand. + (!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) || + !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) || + !inITBlock() || + (static_cast<ARMOperand*>(Operands[3])->getReg() != + static_cast<ARMOperand*>(Operands[5])->getReg() && + static_cast<ARMOperand*>(Operands[3])->getReg() != + static_cast<ARMOperand*>(Operands[4])->getReg()))) + return true; + + + + // Register-register 'add/sub' for thumb does not have a cc_out operand + // when it's an ADD/SUB SP, #imm. Be lenient on count since there's also + // the "add/sub SP, SP, #imm" version. If the follow-up operands aren't + // right, this will result in better diagnostics (which operand is off) + // anyway. + if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") && + (Operands.size() == 5 || Operands.size() == 6) && + static_cast<ARMOperand*>(Operands[3])->isReg() && + static_cast<ARMOperand*>(Operands[3])->getReg() == ARM::SP && + static_cast<ARMOperand*>(Operands[1])->getReg() == 0) + return true; + + return false; +} + +/// Parse an arm instruction mnemonic followed by its operands. +bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Create the leading tokens for the mnemonic, split by '.' characters. + size_t Start = 0, Next = Name.find('.'); + StringRef Mnemonic = Name.slice(Start, Next); + + // Split out the predication code and carry setting flag from the mnemonic. + unsigned PredicationCode; + unsigned ProcessorIMod; + bool CarrySetting; + StringRef ITMask; + Mnemonic = splitMnemonic(Mnemonic, PredicationCode, CarrySetting, + ProcessorIMod, ITMask); + + // In Thumb1, only the branch (B) instruction can be predicated. + if (isThumbOne() && PredicationCode != ARMCC::AL && Mnemonic != "b") { + Parser.EatToEndOfStatement(); + return Error(NameLoc, "conditional execution not supported in Thumb1"); + } + + Operands.push_back(ARMOperand::CreateToken(Mnemonic, NameLoc)); + + // Handle the IT instruction ITMask. Convert it to a bitmask. This + // is the mask as it will be for the IT encoding if the conditional + // encoding has a '1' as it's bit0 (i.e. 't' ==> '1'). In the case + // where the conditional bit0 is zero, the instruction post-processing + // will adjust the mask accordingly. + if (Mnemonic == "it") { + SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + 2); + if (ITMask.size() > 3) { + Parser.EatToEndOfStatement(); + return Error(Loc, "too many conditions on IT instruction"); + } + unsigned Mask = 8; + for (unsigned i = ITMask.size(); i != 0; --i) { + char pos = ITMask[i - 1]; + if (pos != 't' && pos != 'e') { + Parser.EatToEndOfStatement(); + return Error(Loc, "illegal IT block condition mask '" + ITMask + "'"); + } + Mask >>= 1; + if (ITMask[i - 1] == 't') + Mask |= 8; + } + Operands.push_back(ARMOperand::CreateITMask(Mask, Loc)); + } + + // FIXME: This is all a pretty gross hack. We should automatically handle + // optional operands like this via tblgen. + + // Next, add the CCOut and ConditionCode operands, if needed. + // + // For mnemonics which can ever incorporate a carry setting bit or predication + // code, our matching model involves us always generating CCOut and + // ConditionCode operands to match the mnemonic "as written" and then we let + // the matcher deal with finding the right instruction or generating an + // appropriate error. + bool CanAcceptCarrySet, CanAcceptPredicationCode; + getMnemonicAcceptInfo(Mnemonic, CanAcceptCarrySet, CanAcceptPredicationCode); + + // If we had a carry-set on an instruction that can't do that, issue an + // error. + if (!CanAcceptCarrySet && CarrySetting) { + Parser.EatToEndOfStatement(); + return Error(NameLoc, "instruction '" + Mnemonic + + "' can not set flags, but 's' suffix specified"); + } + // If we had a predication code on an instruction that can't do that, issue an + // error. + if (!CanAcceptPredicationCode && PredicationCode != ARMCC::AL) { + Parser.EatToEndOfStatement(); + return Error(NameLoc, "instruction '" + Mnemonic + + "' is not predicable, but condition code specified"); + } + + // Add the carry setting operand, if necessary. + if (CanAcceptCarrySet) { + SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size()); + Operands.push_back(ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0, + Loc)); + } + + // Add the predication code operand, if necessary. + if (CanAcceptPredicationCode) { + SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() + + CarrySetting); + Operands.push_back(ARMOperand::CreateCondCode( + ARMCC::CondCodes(PredicationCode), Loc)); + } + + // Add the processor imod operand, if necessary. + if (ProcessorIMod) { + Operands.push_back(ARMOperand::CreateImm( + MCConstantExpr::Create(ProcessorIMod, getContext()), + NameLoc, NameLoc)); + } + + // Add the remaining tokens in the mnemonic. + while (Next != StringRef::npos) { + Start = Next; + Next = Name.find('.', Start + 1); + StringRef ExtraToken = Name.slice(Start, Next); + + // For now, we're only parsing Thumb1 (for the most part), so + // just ignore ".n" qualifiers. We'll use them to restrict + // matching when we do Thumb2. + if (ExtraToken != ".n") { + SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start); + Operands.push_back(ARMOperand::CreateToken(ExtraToken, Loc)); + } + } + + // Read the remaining operands. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + // Read the first operand. + if (parseOperand(Operands, Mnemonic)) { + Parser.EatToEndOfStatement(); + return true; + } + + while (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + + // Parse and remember the operand. + if (parseOperand(Operands, Mnemonic)) { + Parser.EatToEndOfStatement(); + return true; + } + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + SMLoc Loc = getLexer().getLoc(); + Parser.EatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + } + + Parser.Lex(); // Consume the EndOfStatement + + // Some instructions, mostly Thumb, have forms for the same mnemonic that + // do and don't have a cc_out optional-def operand. With some spot-checks + // of the operand list, we can figure out which variant we're trying to + // parse and adjust accordingly before actually matching. We shouldn't ever + // try to remove a cc_out operand that was explicitly set on the the + // mnemonic, of course (CarrySetting == true). Reason number #317 the + // table driven matcher doesn't fit well with the ARM instruction set. + if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands)) { + ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]); + Operands.erase(Operands.begin() + 1); + delete Op; + } + + // ARM mode 'blx' need special handling, as the register operand version + // is predicable, but the label operand version is not. So, we can't rely + // on the Mnemonic based checking to correctly figure out when to put + // a k_CondCode operand in the list. If we're trying to match the label + // version, remove the k_CondCode operand here. + if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 && + static_cast<ARMOperand*>(Operands[2])->isImm()) { + ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]); + Operands.erase(Operands.begin() + 1); + delete Op; + } + + // The vector-compare-to-zero instructions have a literal token "#0" at + // the end that comes to here as an immediate operand. Convert it to a + // token to play nicely with the matcher. + if ((Mnemonic == "vceq" || Mnemonic == "vcge" || Mnemonic == "vcgt" || + Mnemonic == "vcle" || Mnemonic == "vclt") && Operands.size() == 6 && + static_cast<ARMOperand*>(Operands[5])->isImm()) { + ARMOperand *Op = static_cast<ARMOperand*>(Operands[5]); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()); + if (CE && CE->getValue() == 0) { + Operands.erase(Operands.begin() + 5); + Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc())); + delete Op; + } + } + // VCMP{E} does the same thing, but with a different operand count. + if ((Mnemonic == "vcmp" || Mnemonic == "vcmpe") && Operands.size() == 5 && + static_cast<ARMOperand*>(Operands[4])->isImm()) { + ARMOperand *Op = static_cast<ARMOperand*>(Operands[4]); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()); + if (CE && CE->getValue() == 0) { + Operands.erase(Operands.begin() + 4); + Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc())); + delete Op; + } + } + // Similarly, the Thumb1 "RSB" instruction has a literal "#0" on the + // end. Convert it to a token here. + if (Mnemonic == "rsb" && isThumb() && Operands.size() == 6 && + static_cast<ARMOperand*>(Operands[5])->isImm()) { + ARMOperand *Op = static_cast<ARMOperand*>(Operands[5]); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()); + if (CE && CE->getValue() == 0) { + Operands.erase(Operands.begin() + 5); + Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc())); + delete Op; + } + } + + return false; +} + +// Validate context-sensitive operand constraints. + +// return 'true' if register list contains non-low GPR registers, +// 'false' otherwise. If Reg is in the register list or is HiReg, set +// 'containsReg' to true. +static bool checkLowRegisterList(MCInst Inst, unsigned OpNo, unsigned Reg, + unsigned HiReg, bool &containsReg) { + containsReg = false; + for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) { + unsigned OpReg = Inst.getOperand(i).getReg(); + if (OpReg == Reg) + containsReg = true; + // Anything other than a low register isn't legal here. + if (!isARMLowRegister(OpReg) && (!HiReg || OpReg != HiReg)) + return true; + } + return false; +} + +// Check if the specified regisgter is in the register list of the inst, +// starting at the indicated operand number. +static bool listContainsReg(MCInst &Inst, unsigned OpNo, unsigned Reg) { + for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) { + unsigned OpReg = Inst.getOperand(i).getReg(); + if (OpReg == Reg) + return true; + } + return false; +} + +// FIXME: We would really prefer to have MCInstrInfo (the wrapper around +// the ARMInsts array) instead. Getting that here requires awkward +// API changes, though. Better way? +namespace llvm { +extern MCInstrDesc ARMInsts[]; +} +static MCInstrDesc &getInstDesc(unsigned Opcode) { + return ARMInsts[Opcode]; +} + +// FIXME: We would really like to be able to tablegen'erate this. +bool ARMAsmParser:: +validateInstruction(MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + MCInstrDesc &MCID = getInstDesc(Inst.getOpcode()); + SMLoc Loc = Operands[0]->getStartLoc(); + // Check the IT block state first. + // NOTE: In Thumb mode, the BKPT instruction has the interesting property of + // being allowed in IT blocks, but not being predicable. It just always + // executes. + if (inITBlock() && Inst.getOpcode() != ARM::tBKPT) { + unsigned bit = 1; + if (ITState.FirstCond) + ITState.FirstCond = false; + else + bit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1; + // The instruction must be predicable. + if (!MCID.isPredicable()) + return Error(Loc, "instructions in IT block must be predicable"); + unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm(); + unsigned ITCond = bit ? ITState.Cond : + ARMCC::getOppositeCondition(ITState.Cond); + if (Cond != ITCond) { + // Find the condition code Operand to get its SMLoc information. + SMLoc CondLoc; + for (unsigned i = 1; i < Operands.size(); ++i) + if (static_cast<ARMOperand*>(Operands[i])->isCondCode()) + CondLoc = Operands[i]->getStartLoc(); + return Error(CondLoc, "incorrect condition in IT block; got '" + + StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) + + "', but expected '" + + ARMCondCodeToString(ARMCC::CondCodes(ITCond)) + "'"); + } + // Check for non-'al' condition codes outside of the IT block. + } else if (isThumbTwo() && MCID.isPredicable() && + Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() != + ARMCC::AL && Inst.getOpcode() != ARM::tB && + Inst.getOpcode() != ARM::t2B) + return Error(Loc, "predicated instructions must be in IT block"); + + switch (Inst.getOpcode()) { + case ARM::LDRD: + case ARM::LDRD_PRE: + case ARM::LDRD_POST: + case ARM::LDREXD: { + // Rt2 must be Rt + 1. + unsigned Rt = getARMRegisterNumbering(Inst.getOperand(0).getReg()); + unsigned Rt2 = getARMRegisterNumbering(Inst.getOperand(1).getReg()); + if (Rt2 != Rt + 1) + return Error(Operands[3]->getStartLoc(), + "destination operands must be sequential"); + return false; + } + case ARM::STRD: { + // Rt2 must be Rt + 1. + unsigned Rt = getARMRegisterNumbering(Inst.getOperand(0).getReg()); + unsigned Rt2 = getARMRegisterNumbering(Inst.getOperand(1).getReg()); + if (Rt2 != Rt + 1) + return Error(Operands[3]->getStartLoc(), + "source operands must be sequential"); + return false; + } + case ARM::STRD_PRE: + case ARM::STRD_POST: + case ARM::STREXD: { + // Rt2 must be Rt + 1. + unsigned Rt = getARMRegisterNumbering(Inst.getOperand(1).getReg()); + unsigned Rt2 = getARMRegisterNumbering(Inst.getOperand(2).getReg()); + if (Rt2 != Rt + 1) + return Error(Operands[3]->getStartLoc(), + "source operands must be sequential"); + return false; + } + case ARM::SBFX: + case ARM::UBFX: { + // width must be in range [1, 32-lsb] + unsigned lsb = Inst.getOperand(2).getImm(); + unsigned widthm1 = Inst.getOperand(3).getImm(); + if (widthm1 >= 32 - lsb) + return Error(Operands[5]->getStartLoc(), + "bitfield width must be in range [1,32-lsb]"); + return false; + } + case ARM::tLDMIA: { + // If we're parsing Thumb2, the .w variant is available and handles + // most cases that are normally illegal for a Thumb1 LDM + // instruction. We'll make the transformation in processInstruction() + // if necessary. + // + // Thumb LDM instructions are writeback iff the base register is not + // in the register list. + unsigned Rn = Inst.getOperand(0).getReg(); + bool hasWritebackToken = + (static_cast<ARMOperand*>(Operands[3])->isToken() && + static_cast<ARMOperand*>(Operands[3])->getToken() == "!"); + bool listContainsBase; + if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) && !isThumbTwo()) + return Error(Operands[3 + hasWritebackToken]->getStartLoc(), + "registers must be in range r0-r7"); + // If we should have writeback, then there should be a '!' token. + if (!listContainsBase && !hasWritebackToken && !isThumbTwo()) + return Error(Operands[2]->getStartLoc(), + "writeback operator '!' expected"); + // If we should not have writeback, there must not be a '!'. This is + // true even for the 32-bit wide encodings. + if (listContainsBase && hasWritebackToken) + return Error(Operands[3]->getStartLoc(), + "writeback operator '!' not allowed when base register " + "in register list"); + + break; + } + case ARM::t2LDMIA_UPD: { + if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg())) + return Error(Operands[4]->getStartLoc(), + "writeback operator '!' not allowed when base register " + "in register list"); + break; + } + case ARM::tPOP: { + bool listContainsBase; + if (checkLowRegisterList(Inst, 3, 0, ARM::PC, listContainsBase)) + return Error(Operands[2]->getStartLoc(), + "registers must be in range r0-r7 or pc"); + break; + } + case ARM::tPUSH: { + bool listContainsBase; + if (checkLowRegisterList(Inst, 3, 0, ARM::LR, listContainsBase)) + return Error(Operands[2]->getStartLoc(), + "registers must be in range r0-r7 or lr"); + break; + } + case ARM::tSTMIA_UPD: { + bool listContainsBase; + if (checkLowRegisterList(Inst, 4, 0, 0, listContainsBase) && !isThumbTwo()) + return Error(Operands[4]->getStartLoc(), + "registers must be in range r0-r7"); + break; + } + } + + return false; +} + +void ARMAsmParser:: +processInstruction(MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + switch (Inst.getOpcode()) { + case ARM::LDMIA_UPD: + // If this is a load of a single register via a 'pop', then we should use + // a post-indexed LDR instruction instead, per the ARM ARM. + if (static_cast<ARMOperand*>(Operands[0])->getToken() == "pop" && + Inst.getNumOperands() == 5) { + MCInst TmpInst; + TmpInst.setOpcode(ARM::LDR_POST_IMM); + TmpInst.addOperand(Inst.getOperand(4)); // Rt + TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(MCOperand::CreateReg(0)); // am2offset + TmpInst.addOperand(MCOperand::CreateImm(4)); + TmpInst.addOperand(Inst.getOperand(2)); // CondCode + TmpInst.addOperand(Inst.getOperand(3)); + Inst = TmpInst; + } + break; + case ARM::STMDB_UPD: + // If this is a store of a single register via a 'push', then we should use + // a pre-indexed STR instruction instead, per the ARM ARM. + if (static_cast<ARMOperand*>(Operands[0])->getToken() == "push" && + Inst.getNumOperands() == 5) { + MCInst TmpInst; + TmpInst.setOpcode(ARM::STR_PRE_IMM); + TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb + TmpInst.addOperand(Inst.getOperand(4)); // Rt + TmpInst.addOperand(Inst.getOperand(1)); // addrmode_imm12 + TmpInst.addOperand(MCOperand::CreateImm(-4)); + TmpInst.addOperand(Inst.getOperand(2)); // CondCode + TmpInst.addOperand(Inst.getOperand(3)); + Inst = TmpInst; + } + break; + case ARM::tADDi8: + // If the immediate is in the range 0-7, we want tADDi3 iff Rd was + // explicitly specified. From the ARM ARM: "Encoding T1 is preferred + // to encoding T2 if <Rd> is specified and encoding T2 is preferred + // to encoding T1 if <Rd> is omitted." + if (Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) + Inst.setOpcode(ARM::tADDi3); + break; + case ARM::tSUBi8: + // If the immediate is in the range 0-7, we want tADDi3 iff Rd was + // explicitly specified. From the ARM ARM: "Encoding T1 is preferred + // to encoding T2 if <Rd> is specified and encoding T2 is preferred + // to encoding T1 if <Rd> is omitted." + if (Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) + Inst.setOpcode(ARM::tSUBi3); + break; + case ARM::tB: + // A Thumb conditional branch outside of an IT block is a tBcc. + if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()) + Inst.setOpcode(ARM::tBcc); + break; + case ARM::t2B: + // A Thumb2 conditional branch outside of an IT block is a t2Bcc. + if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()) + Inst.setOpcode(ARM::t2Bcc); + break; + case ARM::t2Bcc: + // If the conditional is AL or we're in an IT block, we really want t2B. + if (Inst.getOperand(1).getImm() == ARMCC::AL || inITBlock()) + Inst.setOpcode(ARM::t2B); + break; + case ARM::tBcc: + // If the conditional is AL, we really want tB. + if (Inst.getOperand(1).getImm() == ARMCC::AL) + Inst.setOpcode(ARM::tB); + break; + case ARM::tLDMIA: { + // If the register list contains any high registers, or if the writeback + // doesn't match what tLDMIA can do, we need to use the 32-bit encoding + // instead if we're in Thumb2. Otherwise, this should have generated + // an error in validateInstruction(). + unsigned Rn = Inst.getOperand(0).getReg(); + bool hasWritebackToken = + (static_cast<ARMOperand*>(Operands[3])->isToken() && + static_cast<ARMOperand*>(Operands[3])->getToken() == "!"); + bool listContainsBase; + if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) || + (!listContainsBase && !hasWritebackToken) || + (listContainsBase && hasWritebackToken)) { + // 16-bit encoding isn't sufficient. Switch to the 32-bit version. + assert (isThumbTwo()); + Inst.setOpcode(hasWritebackToken ? ARM::t2LDMIA_UPD : ARM::t2LDMIA); + // If we're switching to the updating version, we need to insert + // the writeback tied operand. + if (hasWritebackToken) + Inst.insert(Inst.begin(), + MCOperand::CreateReg(Inst.getOperand(0).getReg())); + } + break; + } + case ARM::tSTMIA_UPD: { + // If the register list contains any high registers, we need to use + // the 32-bit encoding instead if we're in Thumb2. Otherwise, this + // should have generated an error in validateInstruction(). + unsigned Rn = Inst.getOperand(0).getReg(); + bool listContainsBase; + if (checkLowRegisterList(Inst, 4, Rn, 0, listContainsBase)) { + // 16-bit encoding isn't sufficient. Switch to the 32-bit version. + assert (isThumbTwo()); + Inst.setOpcode(ARM::t2STMIA_UPD); + } + break; + } + case ARM::t2MOVi: { + // If we can use the 16-bit encoding and the user didn't explicitly + // request the 32-bit variant, transform it here. + if (isARMLowRegister(Inst.getOperand(0).getReg()) && + Inst.getOperand(1).getImm() <= 255 && + ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL && + Inst.getOperand(4).getReg() == ARM::CPSR) || + (inITBlock() && Inst.getOperand(4).getReg() == 0)) && + (!static_cast<ARMOperand*>(Operands[2])->isToken() || + static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) { + // The operands aren't in the same order for tMOVi8... + MCInst TmpInst; + TmpInst.setOpcode(ARM::tMOVi8); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(4)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + Inst = TmpInst; + } + break; + } + case ARM::t2MOVr: { + // If we can use the 16-bit encoding and the user didn't explicitly + // request the 32-bit variant, transform it here. + if (isARMLowRegister(Inst.getOperand(0).getReg()) && + isARMLowRegister(Inst.getOperand(1).getReg()) && + Inst.getOperand(2).getImm() == ARMCC::AL && + Inst.getOperand(4).getReg() == ARM::CPSR && + (!static_cast<ARMOperand*>(Operands[2])->isToken() || + static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) { + // The operands aren't the same for tMOV[S]r... (no cc_out) + MCInst TmpInst; + TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + Inst = TmpInst; + } + break; + } + case ARM::t2SXTH: + case ARM::t2SXTB: + case ARM::t2UXTH: + case ARM::t2UXTB: { + // If we can use the 16-bit encoding and the user didn't explicitly + // request the 32-bit variant, transform it here. + if (isARMLowRegister(Inst.getOperand(0).getReg()) && + isARMLowRegister(Inst.getOperand(1).getReg()) && + Inst.getOperand(2).getImm() == 0 && + (!static_cast<ARMOperand*>(Operands[2])->isToken() || + static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) { + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Illegal opcode!"); + case ARM::t2SXTH: NewOpc = ARM::tSXTH; break; + case ARM::t2SXTB: NewOpc = ARM::tSXTB; break; + case ARM::t2UXTH: NewOpc = ARM::tUXTH; break; + case ARM::t2UXTB: NewOpc = ARM::tUXTB; break; + } + // The operands aren't the same for thumb1 (no rotate operand). + MCInst TmpInst; + TmpInst.setOpcode(NewOpc); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + } + break; + } + case ARM::t2IT: { + // The mask bits for all but the first condition are represented as + // the low bit of the condition code value implies 't'. We currently + // always have 1 implies 't', so XOR toggle the bits if the low bit + // of the condition code is zero. The encoding also expects the low + // bit of the condition to be encoded as bit 4 of the mask operand, + // so mask that in if needed + MCOperand &MO = Inst.getOperand(1); + unsigned Mask = MO.getImm(); + unsigned OrigMask = Mask; + unsigned TZ = CountTrailingZeros_32(Mask); + if ((Inst.getOperand(0).getImm() & 1) == 0) { + assert(Mask && TZ <= 3 && "illegal IT mask value!"); + for (unsigned i = 3; i != TZ; --i) + Mask ^= 1 << i; + } else + Mask |= 0x10; + MO.setImm(Mask); + + // Set up the IT block state according to the IT instruction we just + // matched. + assert(!inITBlock() && "nested IT blocks?!"); + ITState.Cond = ARMCC::CondCodes(Inst.getOperand(0).getImm()); + ITState.Mask = OrigMask; // Use the original mask, not the updated one. + ITState.CurPosition = 0; + ITState.FirstCond = true; + break; + } + } +} + +unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) { + // 16-bit thumb arithmetic instructions either require or preclude the 'S' + // suffix depending on whether they're in an IT block or not. + unsigned Opc = Inst.getOpcode(); + MCInstrDesc &MCID = getInstDesc(Opc); + if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) { + assert(MCID.hasOptionalDef() && + "optionally flag setting instruction missing optional def operand"); + assert(MCID.NumOperands == Inst.getNumOperands() && + "operand count mismatch!"); + // Find the optional-def operand (cc_out). + unsigned OpNo; + for (OpNo = 0; + !MCID.OpInfo[OpNo].isOptionalDef() && OpNo < MCID.NumOperands; + ++OpNo) + ; + // If we're parsing Thumb1, reject it completely. + if (isThumbOne() && Inst.getOperand(OpNo).getReg() != ARM::CPSR) + return Match_MnemonicFail; + // If we're parsing Thumb2, which form is legal depends on whether we're + // in an IT block. + if (isThumbTwo() && Inst.getOperand(OpNo).getReg() != ARM::CPSR && + !inITBlock()) + return Match_RequiresITBlock; + if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR && + inITBlock()) + return Match_RequiresNotITBlock; + } + // Some high-register supporting Thumb1 encodings only allow both registers + // to be from r0-r7 when in Thumb2. + else if (Opc == ARM::tADDhirr && isThumbOne() && + isARMLowRegister(Inst.getOperand(1).getReg()) && + isARMLowRegister(Inst.getOperand(2).getReg())) + return Match_RequiresThumb2; + // Others only require ARMv6 or later. + else if (Opc == ARM::tMOVr && isThumbOne() && !hasV6Ops() && + isARMLowRegister(Inst.getOperand(0).getReg()) && + isARMLowRegister(Inst.getOperand(1).getReg())) + return Match_RequiresV6; + return Match_Success; +} + +bool ARMAsmParser:: +MatchAndEmitInstruction(SMLoc IDLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCStreamer &Out) { + MCInst Inst; + unsigned ErrorInfo; + unsigned MatchResult; + MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo); + switch (MatchResult) { + default: break; + case Match_Success: + // Context sensitive operand constraints aren't handled by the matcher, + // so check them here. + if (validateInstruction(Inst, Operands)) { + // Still progress the IT block, otherwise one wrong condition causes + // nasty cascading errors. + forwardITPosition(); + return true; + } + + // Some instructions need post-processing to, for example, tweak which + // encoding is selected. + processInstruction(Inst, Operands); + + // Only move forward at the very end so that everything in validate + // and process gets a consistent answer about whether we're in an IT + // block. + forwardITPosition(); + + Out.EmitInstruction(Inst); + return false; + case Match_MissingFeature: + Error(IDLoc, "instruction requires a CPU feature not currently enabled"); + return true; + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0U) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + } + + return Error(ErrorLoc, "invalid operand for instruction"); + } + case Match_MnemonicFail: + return Error(IDLoc, "invalid instruction"); + case Match_ConversionFail: + // The converter function will have already emited a diagnostic. + return true; + case Match_RequiresNotITBlock: + return Error(IDLoc, "flag setting instruction only valid outside IT block"); + case Match_RequiresITBlock: + return Error(IDLoc, "instruction only valid inside IT block"); + case Match_RequiresV6: + return Error(IDLoc, "instruction variant requires ARMv6 or later"); + case Match_RequiresThumb2: + return Error(IDLoc, "instruction variant requires Thumb2"); + } + + llvm_unreachable("Implement any new match types added!"); + return true; +} + +/// parseDirective parses the arm specific directives +bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getIdentifier(); + if (IDVal == ".word") + return parseDirectiveWord(4, DirectiveID.getLoc()); + else if (IDVal == ".thumb") + return parseDirectiveThumb(DirectiveID.getLoc()); + else if (IDVal == ".thumb_func") + return parseDirectiveThumbFunc(DirectiveID.getLoc()); + else if (IDVal == ".code") + return parseDirectiveCode(DirectiveID.getLoc()); + else if (IDVal == ".syntax") + return parseDirectiveSyntax(DirectiveID.getLoc()); + return true; +} + +/// parseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool ARMAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + if (getParser().ParseExpression(Value)) + return true; + + getParser().getStreamer().EmitValue(Value, Size, 0/*addrspace*/); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. + if (getLexer().isNot(AsmToken::Comma)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + } + } + + Parser.Lex(); + return false; +} + +/// parseDirectiveThumb +/// ::= .thumb +bool ARMAsmParser::parseDirectiveThumb(SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + + // TODO: set thumb mode + // TODO: tell the MC streamer the mode + // getParser().getStreamer().Emit???(); + return false; +} + +/// parseDirectiveThumbFunc +/// ::= .thumbfunc symbol_name +bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) { + const MCAsmInfo &MAI = getParser().getStreamer().getContext().getAsmInfo(); + bool isMachO = MAI.hasSubsectionsViaSymbols(); + StringRef Name; + + // Darwin asm has function name after .thumb_func direction + // ELF doesn't + if (isMachO) { + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String)) + return Error(L, "unexpected token in .thumb_func directive"); + Name = Tok.getString(); + Parser.Lex(); // Consume the identifier token. + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + + // FIXME: assuming function name will be the line following .thumb_func + if (!isMachO) { + Name = Parser.getTok().getString(); + } + + // Mark symbol as a thumb symbol. + MCSymbol *Func = getParser().getContext().GetOrCreateSymbol(Name); + getParser().getStreamer().EmitThumbFunc(Func); + return false; +} + +/// parseDirectiveSyntax +/// ::= .syntax unified | divided +bool ARMAsmParser::parseDirectiveSyntax(SMLoc L) { + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) + return Error(L, "unexpected token in .syntax directive"); + StringRef Mode = Tok.getString(); + if (Mode == "unified" || Mode == "UNIFIED") + Parser.Lex(); + else if (Mode == "divided" || Mode == "DIVIDED") + return Error(L, "'.syntax divided' arm asssembly not supported"); + else + return Error(L, "unrecognized syntax mode in .syntax directive"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return Error(Parser.getTok().getLoc(), "unexpected token in directive"); + Parser.Lex(); + + // TODO tell the MC streamer the mode + // getParser().getStreamer().Emit???(); + return false; +} + +/// parseDirectiveCode +/// ::= .code 16 | 32 +bool ARMAsmParser::parseDirectiveCode(SMLoc L) { + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Integer)) + return Error(L, "unexpected token in .code directive"); + int64_t Val = Parser.getTok().getIntVal(); + if (Val == 16) + Parser.Lex(); + else if (Val == 32) + Parser.Lex(); + else + return Error(L, "invalid operand to .code directive"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return Error(Parser.getTok().getLoc(), "unexpected token in directive"); + Parser.Lex(); + + if (Val == 16) { + if (!isThumb()) + SwitchMode(); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16); + } else { + if (isThumb()) + SwitchMode(); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32); + } + + return false; +} + +extern "C" void LLVMInitializeARMAsmLexer(); + +/// Force static initialization. +extern "C" void LLVMInitializeARMAsmParser() { + RegisterMCAsmParser<ARMAsmParser> X(TheARMTarget); + RegisterMCAsmParser<ARMAsmParser> Y(TheThumbTarget); + LLVMInitializeARMAsmLexer(); +} + +#define GET_REGISTER_MATCHER +#define GET_MATCHER_IMPLEMENTATION +#include "ARMGenAsmMatcher.inc" diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp new file mode 100644 index 0000000..8f2f813 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -0,0 +1,4082 @@ +//===- ARMDisassembler.cpp - Disassembler for ARM/Thumb ISA -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-disassembler" + +#include "ARM.h" +#include "ARMRegisterInfo.h" +#include "ARMSubtarget.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "MCTargetDesc/ARMMCExpr.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "llvm/MC/EDInstInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MemoryObject.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +typedef MCDisassembler::DecodeStatus DecodeStatus; + +namespace { +/// ARMDisassembler - ARM disassembler for all ARM platforms. +class ARMDisassembler : public MCDisassembler { +public: + /// Constructor - Initializes the disassembler. + /// + ARMDisassembler(const MCSubtargetInfo &STI) : + MCDisassembler(STI) { + } + + ~ARMDisassembler() { + } + + /// getInstruction - See MCDisassembler. + DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const; + + /// getEDInfo - See MCDisassembler. + EDInstInfo *getEDInfo() const; +private: +}; + +/// ThumbDisassembler - Thumb disassembler for all Thumb platforms. +class ThumbDisassembler : public MCDisassembler { +public: + /// Constructor - Initializes the disassembler. + /// + ThumbDisassembler(const MCSubtargetInfo &STI) : + MCDisassembler(STI) { + } + + ~ThumbDisassembler() { + } + + /// getInstruction - See MCDisassembler. + DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const; + + /// getEDInfo - See MCDisassembler. + EDInstInfo *getEDInfo() const; +private: + mutable std::vector<unsigned> ITBlock; + DecodeStatus AddThumbPredicate(MCInst&) const; + void UpdateThumbVFPPredicate(MCInst&) const; +}; +} + +static bool Check(DecodeStatus &Out, DecodeStatus In) { + switch (In) { + case MCDisassembler::Success: + // Out stays the same. + return true; + case MCDisassembler::SoftFail: + Out = In; + return true; + case MCDisassembler::Fail: + Out = In; + return false; + } + return false; +} + + +// Forward declare these because the autogenerated code will reference them. +// Definitions are further down. +static DecodeStatus DecodeGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeGPRnopcRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodetGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodetcGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecoderGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeDPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeDPR_8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeDPR_VFP2RegisterClass(llvm::MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); + +static DecodeStatus DecodePredicateOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeCCOutOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSOImmOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeRegListOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSPRRegListOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeDPRRegListOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); + +static DecodeStatus DecodeBitfieldMaskOperand(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeCopMemInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeAddrMode2IdxInstruction(llvm::MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeSORegMemOperand(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeAddrMode3Instruction(llvm::MCInst &Inst,unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSORegImmOperand(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSORegRegOperand(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + +static DecodeStatus DecodeMemMultipleWritebackInstruction(llvm::MCInst & Inst, + unsigned Insn, + uint64_t Adddress, + const void *Decoder); +static DecodeStatus DecodeT2MOVTWInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeArmMOVTWInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSMLAInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeCPSInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2CPSInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeAddrModeImm12Operand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeAddrMode5Operand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeAddrMode7Operand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeBranchImmInstruction(llvm::MCInst &Inst,unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVCVTImmOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeAddrMode6Operand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD1DupInstruction(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD2DupInstruction(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD3DupInstruction(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD4DupInstruction(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeNEONModImmInstruction(llvm::MCInst &Inst,unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVSHLMaxInstruction(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftRight8Imm(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftRight16Imm(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftRight32Imm(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeShiftRight64Imm(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeTBLInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodePostIdxReg(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeCoprocessor(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMemBarrierOption(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMSRMask(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeDoubleRegLoad(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeDoubleRegStore(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeLDRPreImm(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeLDRPreReg(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSTRPreImm(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSTRPreReg(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD1LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD2LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD3LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVLD4LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVST1LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVST2LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVST3LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVST4LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVMOVSRR(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVMOVRRS(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + +static DecodeStatus DecodeThumbAddSpecialReg(llvm::MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbBROperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2BROperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbCmpBROperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbAddrModeRR(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbAddrModeIS(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbAddrModePC(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbAddrModeSP(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2AddrModeSOReg(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2LoadShift(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2Imm8S4(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2AddrModeImm8s4(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2AddrModeImm0_1020s4(llvm::MCInst &Inst,unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2Imm8(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2AddrModeImm8(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbAddSPImm(llvm::MCInst &Inst, uint16_t Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbAddSPReg(llvm::MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbCPS(llvm::MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbBLXOffset(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2AddrModeImm12(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbTableBranch(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumb2BCCInstruction(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2SOImm(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbBCCTargetOperand(llvm::MCInst &Inst,unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThumbBLTargetOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeIT(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2LDRDPreInstruction(llvm::MCInst &Inst,unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2STRDPreInstruction(llvm::MCInst &Inst,unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2Adr(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2LdStPre(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeT2ShifterImmOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); + + + +#include "ARMGenDisassemblerTables.inc" +#include "ARMGenInstrInfo.inc" +#include "ARMGenEDInfo.inc" + +static MCDisassembler *createARMDisassembler(const Target &T, const MCSubtargetInfo &STI) { + return new ARMDisassembler(STI); +} + +static MCDisassembler *createThumbDisassembler(const Target &T, const MCSubtargetInfo &STI) { + return new ThumbDisassembler(STI); +} + +EDInstInfo *ARMDisassembler::getEDInfo() const { + return instInfoARM; +} + +EDInstInfo *ThumbDisassembler::getEDInfo() const { + return instInfoARM; +} + +DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, + const MemoryObject &Region, + uint64_t Address, + raw_ostream &os, + raw_ostream &cs) const { + CommentStream = &cs; + + uint8_t bytes[4]; + + assert(!(STI.getFeatureBits() & ARM::ModeThumb) && + "Asked to disassemble an ARM instruction but Subtarget is in Thumb mode!"); + + // We want to read exactly 4 bytes of data. + if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) { + Size = 0; + return MCDisassembler::Fail; + } + + // Encoded as a small-endian 32-bit word in the stream. + uint32_t insn = (bytes[3] << 24) | + (bytes[2] << 16) | + (bytes[1] << 8) | + (bytes[0] << 0); + + // Calling the auto-generated decoder function. + DecodeStatus result = decodeARMInstruction32(MI, insn, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 4; + return result; + } + + // VFP and NEON instructions, similarly, are shared between ARM + // and Thumb modes. + MI.clear(); + result = decodeVFPInstruction32(MI, insn, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 4; + return result; + } + + MI.clear(); + result = decodeNEONDataInstruction32(MI, insn, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 4; + // Add a fake predicate operand, because we share these instruction + // definitions with Thumb2 where these instructions are predicable. + if (!DecodePredicateOperand(MI, 0xE, Address, this)) + return MCDisassembler::Fail; + return result; + } + + MI.clear(); + result = decodeNEONLoadStoreInstruction32(MI, insn, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 4; + // Add a fake predicate operand, because we share these instruction + // definitions with Thumb2 where these instructions are predicable. + if (!DecodePredicateOperand(MI, 0xE, Address, this)) + return MCDisassembler::Fail; + return result; + } + + MI.clear(); + result = decodeNEONDupInstruction32(MI, insn, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 4; + // Add a fake predicate operand, because we share these instruction + // definitions with Thumb2 where these instructions are predicable. + if (!DecodePredicateOperand(MI, 0xE, Address, this)) + return MCDisassembler::Fail; + return result; + } + + MI.clear(); + + Size = 0; + return MCDisassembler::Fail; +} + +namespace llvm { +extern MCInstrDesc ARMInsts[]; +} + +/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the +/// immediate Value in the MCInst. The immediate Value has had any PC +/// adjustment made by the caller. If the instruction is a branch instruction +/// then isBranch is true, else false. If the getOpInfo() function was set as +/// part of the setupForSymbolicDisassembly() call then that function is called +/// to get any symbolic information at the Address for this instruction. If +/// that returns non-zero then the symbolic information it returns is used to +/// create an MCExpr and that is added as an operand to the MCInst. If +/// getOpInfo() returns zero and isBranch is true then a symbol look up for +/// Value is done and if a symbol is found an MCExpr is created with that, else +/// an MCExpr with Value is created. This function returns true if it adds an +/// operand to the MCInst and false otherwise. +static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value, + bool isBranch, uint64_t InstSize, + MCInst &MI, const void *Decoder) { + const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); + LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback(); + if (!getOpInfo) + return false; + + struct LLVMOpInfo1 SymbolicOp; + SymbolicOp.Value = Value; + void *DisInfo = Dis->getDisInfoBlock(); + if (!getOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) { + if (isBranch) { + LLVMSymbolLookupCallback SymbolLookUp = + Dis->getLLVMSymbolLookupCallback(); + if (SymbolLookUp) { + uint64_t ReferenceType; + ReferenceType = LLVMDisassembler_ReferenceType_In_Branch; + const char *ReferenceName; + const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address, + &ReferenceName); + if (Name) { + SymbolicOp.AddSymbol.Name = Name; + SymbolicOp.AddSymbol.Present = true; + SymbolicOp.Value = 0; + } + else { + SymbolicOp.Value = Value; + } + if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub) + (*Dis->CommentStream) << "symbol stub for: " << ReferenceName; + } + else { + return false; + } + } + else { + return false; + } + } + + MCContext *Ctx = Dis->getMCContext(); + const MCExpr *Add = NULL; + if (SymbolicOp.AddSymbol.Present) { + if (SymbolicOp.AddSymbol.Name) { + StringRef Name(SymbolicOp.AddSymbol.Name); + MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name); + Add = MCSymbolRefExpr::Create(Sym, *Ctx); + } else { + Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx); + } + } + + const MCExpr *Sub = NULL; + if (SymbolicOp.SubtractSymbol.Present) { + if (SymbolicOp.SubtractSymbol.Name) { + StringRef Name(SymbolicOp.SubtractSymbol.Name); + MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name); + Sub = MCSymbolRefExpr::Create(Sym, *Ctx); + } else { + Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx); + } + } + + const MCExpr *Off = NULL; + if (SymbolicOp.Value != 0) + Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx); + + const MCExpr *Expr; + if (Sub) { + const MCExpr *LHS; + if (Add) + LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx); + else + LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx); + if (Off != 0) + Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx); + else + Expr = LHS; + } else if (Add) { + if (Off != 0) + Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx); + else + Expr = Add; + } else { + if (Off != 0) + Expr = Off; + else + Expr = MCConstantExpr::Create(0, *Ctx); + } + + if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_HI16) + MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateUpper16(Expr, *Ctx))); + else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_LO16) + MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateLower16(Expr, *Ctx))); + else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_None) + MI.addOperand(MCOperand::CreateExpr(Expr)); + else + assert(0 && "bad SymbolicOp.VariantKind"); + + return true; +} + +/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being +/// referenced by a load instruction with the base register that is the Pc. +/// These can often be values in a literal pool near the Address of the +/// instruction. The Address of the instruction and its immediate Value are +/// used as a possible literal pool entry. The SymbolLookUp call back will +/// return the name of a symbol referenced by the the literal pool's entry if +/// the referenced address is that of a symbol. Or it will return a pointer to +/// a literal 'C' string if the referenced address of the literal pool's entry +/// is an address into a section with 'C' string literals. +static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value, + const void *Decoder) { + const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); + LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback(); + if (SymbolLookUp) { + void *DisInfo = Dis->getDisInfoBlock(); + uint64_t ReferenceType; + ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load; + const char *ReferenceName; + (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName); + if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr || + ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) + (*Dis->CommentStream) << "literal pool for: " << ReferenceName; + } +} + +// Thumb1 instructions don't have explicit S bits. Rather, they +// implicitly set CPSR. Since it's not represented in the encoding, the +// auto-generated decoder won't inject the CPSR operand. We need to fix +// that as a post-pass. +static void AddThumb1SBit(MCInst &MI, bool InITBlock) { + const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo; + unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands; + MCInst::iterator I = MI.begin(); + for (unsigned i = 0; i < NumOps; ++i, ++I) { + if (I == MI.end()) break; + if (OpInfo[i].isOptionalDef() && OpInfo[i].RegClass == ARM::CCRRegClassID) { + if (i > 0 && OpInfo[i-1].isPredicate()) continue; + MI.insert(I, MCOperand::CreateReg(InITBlock ? 0 : ARM::CPSR)); + return; + } + } + + MI.insert(I, MCOperand::CreateReg(InITBlock ? 0 : ARM::CPSR)); +} + +// Most Thumb instructions don't have explicit predicates in the +// encoding, but rather get their predicates from IT context. We need +// to fix up the predicate operands using this context information as a +// post-pass. +MCDisassembler::DecodeStatus +ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { + MCDisassembler::DecodeStatus S = Success; + + // A few instructions actually have predicates encoded in them. Don't + // try to overwrite it if we're seeing one of those. + switch (MI.getOpcode()) { + case ARM::tBcc: + case ARM::t2Bcc: + case ARM::tCBZ: + case ARM::tCBNZ: + case ARM::tCPS: + case ARM::t2CPS3p: + case ARM::t2CPS2p: + case ARM::t2CPS1p: + case ARM::tMOVSr: + case ARM::tSETEND: + // Some instructions (mostly conditional branches) are not + // allowed in IT blocks. + if (!ITBlock.empty()) + S = SoftFail; + else + return Success; + break; + case ARM::tB: + case ARM::t2B: + case ARM::t2TBB: + case ARM::t2TBH: + // Some instructions (mostly unconditional branches) can + // only appears at the end of, or outside of, an IT. + if (ITBlock.size() > 1) + S = SoftFail; + break; + default: + break; + } + + // If we're in an IT block, base the predicate on that. Otherwise, + // assume a predicate of AL. + unsigned CC; + if (!ITBlock.empty()) { + CC = ITBlock.back(); + if (CC == 0xF) + CC = ARMCC::AL; + ITBlock.pop_back(); + } else + CC = ARMCC::AL; + + const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo; + unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands; + MCInst::iterator I = MI.begin(); + for (unsigned i = 0; i < NumOps; ++i, ++I) { + if (I == MI.end()) break; + if (OpInfo[i].isPredicate()) { + I = MI.insert(I, MCOperand::CreateImm(CC)); + ++I; + if (CC == ARMCC::AL) + MI.insert(I, MCOperand::CreateReg(0)); + else + MI.insert(I, MCOperand::CreateReg(ARM::CPSR)); + return S; + } + } + + I = MI.insert(I, MCOperand::CreateImm(CC)); + ++I; + if (CC == ARMCC::AL) + MI.insert(I, MCOperand::CreateReg(0)); + else + MI.insert(I, MCOperand::CreateReg(ARM::CPSR)); + + return S; +} + +// Thumb VFP instructions are a special case. Because we share their +// encodings between ARM and Thumb modes, and they are predicable in ARM +// mode, the auto-generated decoder will give them an (incorrect) +// predicate operand. We need to rewrite these operands based on the IT +// context as a post-pass. +void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const { + unsigned CC; + if (!ITBlock.empty()) { + CC = ITBlock.back(); + ITBlock.pop_back(); + } else + CC = ARMCC::AL; + + const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo; + MCInst::iterator I = MI.begin(); + unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands; + for (unsigned i = 0; i < NumOps; ++i, ++I) { + if (OpInfo[i].isPredicate() ) { + I->setImm(CC); + ++I; + if (CC == ARMCC::AL) + I->setReg(0); + else + I->setReg(ARM::CPSR); + return; + } + } +} + +DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, + const MemoryObject &Region, + uint64_t Address, + raw_ostream &os, + raw_ostream &cs) const { + CommentStream = &cs; + + uint8_t bytes[4]; + + assert((STI.getFeatureBits() & ARM::ModeThumb) && + "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!"); + + // We want to read exactly 2 bytes of data. + if (Region.readBytes(Address, 2, (uint8_t*)bytes, NULL) == -1) { + Size = 0; + return MCDisassembler::Fail; + } + + uint16_t insn16 = (bytes[1] << 8) | bytes[0]; + DecodeStatus result = decodeThumbInstruction16(MI, insn16, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 2; + Check(result, AddThumbPredicate(MI)); + return result; + } + + MI.clear(); + result = decodeThumbSBitInstruction16(MI, insn16, Address, this, STI); + if (result) { + Size = 2; + bool InITBlock = !ITBlock.empty(); + Check(result, AddThumbPredicate(MI)); + AddThumb1SBit(MI, InITBlock); + return result; + } + + MI.clear(); + result = decodeThumb2Instruction16(MI, insn16, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 2; + + // Nested IT blocks are UNPREDICTABLE. Must be checked before we add + // the Thumb predicate. + if (MI.getOpcode() == ARM::t2IT && !ITBlock.empty()) + result = MCDisassembler::SoftFail; + + Check(result, AddThumbPredicate(MI)); + + // If we find an IT instruction, we need to parse its condition + // code and mask operands so that we can apply them correctly + // to the subsequent instructions. + if (MI.getOpcode() == ARM::t2IT) { + + // (3 - the number of trailing zeros) is the number of then / else. + unsigned firstcond = MI.getOperand(0).getImm(); + unsigned Mask = MI.getOperand(1).getImm(); + unsigned CondBit0 = Mask >> 4 & 1; + unsigned NumTZ = CountTrailingZeros_32(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { + bool T = ((Mask >> Pos) & 1) == CondBit0; + if (T) + ITBlock.insert(ITBlock.begin(), firstcond); + else + ITBlock.insert(ITBlock.begin(), firstcond ^ 1); + } + + ITBlock.push_back(firstcond); + } + + return result; + } + + // We want to read exactly 4 bytes of data. + if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) { + Size = 0; + return MCDisassembler::Fail; + } + + uint32_t insn32 = (bytes[3] << 8) | + (bytes[2] << 0) | + (bytes[1] << 24) | + (bytes[0] << 16); + MI.clear(); + result = decodeThumbInstruction32(MI, insn32, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 4; + bool InITBlock = ITBlock.size(); + Check(result, AddThumbPredicate(MI)); + AddThumb1SBit(MI, InITBlock); + return result; + } + + MI.clear(); + result = decodeThumb2Instruction32(MI, insn32, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 4; + Check(result, AddThumbPredicate(MI)); + return result; + } + + MI.clear(); + result = decodeVFPInstruction32(MI, insn32, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 4; + UpdateThumbVFPPredicate(MI); + return result; + } + + MI.clear(); + result = decodeNEONDupInstruction32(MI, insn32, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 4; + Check(result, AddThumbPredicate(MI)); + return result; + } + + if (fieldFromInstruction32(insn32, 24, 8) == 0xF9) { + MI.clear(); + uint32_t NEONLdStInsn = insn32; + NEONLdStInsn &= 0xF0FFFFFF; + NEONLdStInsn |= 0x04000000; + result = decodeNEONLoadStoreInstruction32(MI, NEONLdStInsn, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 4; + Check(result, AddThumbPredicate(MI)); + return result; + } + } + + if (fieldFromInstruction32(insn32, 24, 4) == 0xF) { + MI.clear(); + uint32_t NEONDataInsn = insn32; + NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24 + NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 + NEONDataInsn |= 0x12000000; // Set bits 28 and 25 + result = decodeNEONDataInstruction32(MI, NEONDataInsn, Address, this, STI); + if (result != MCDisassembler::Fail) { + Size = 4; + Check(result, AddThumbPredicate(MI)); + return result; + } + } + + Size = 0; + return MCDisassembler::Fail; +} + + +extern "C" void LLVMInitializeARMDisassembler() { + TargetRegistry::RegisterMCDisassembler(TheARMTarget, + createARMDisassembler); + TargetRegistry::RegisterMCDisassembler(TheThumbTarget, + createThumbDisassembler); +} + +static const unsigned GPRDecoderTable[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R4, ARM::R5, ARM::R6, ARM::R7, + ARM::R8, ARM::R9, ARM::R10, ARM::R11, + ARM::R12, ARM::SP, ARM::LR, ARM::PC +}; + +static DecodeStatus DecodeGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 15) + return MCDisassembler::Fail; + + unsigned Register = GPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus +DecodeGPRnopcRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo == 15) return MCDisassembler::Fail; + return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); +} + +static DecodeStatus DecodetGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 7) + return MCDisassembler::Fail; + return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); +} + +static DecodeStatus DecodetcGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + unsigned Register = 0; + switch (RegNo) { + case 0: + Register = ARM::R0; + break; + case 1: + Register = ARM::R1; + break; + case 2: + Register = ARM::R2; + break; + case 3: + Register = ARM::R3; + break; + case 9: + Register = ARM::R9; + break; + case 12: + Register = ARM::R12; + break; + default: + return MCDisassembler::Fail; + } + + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecoderGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo == 13 || RegNo == 15) return MCDisassembler::Fail; + return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); +} + +static const unsigned SPRDecoderTable[] = { + ARM::S0, ARM::S1, ARM::S2, ARM::S3, + ARM::S4, ARM::S5, ARM::S6, ARM::S7, + ARM::S8, ARM::S9, ARM::S10, ARM::S11, + ARM::S12, ARM::S13, ARM::S14, ARM::S15, + ARM::S16, ARM::S17, ARM::S18, ARM::S19, + ARM::S20, ARM::S21, ARM::S22, ARM::S23, + ARM::S24, ARM::S25, ARM::S26, ARM::S27, + ARM::S28, ARM::S29, ARM::S30, ARM::S31 +}; + +static DecodeStatus DecodeSPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Register = SPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static const unsigned DPRDecoderTable[] = { + ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7, + ARM::D8, ARM::D9, ARM::D10, ARM::D11, + ARM::D12, ARM::D13, ARM::D14, ARM::D15, + ARM::D16, ARM::D17, ARM::D18, ARM::D19, + ARM::D20, ARM::D21, ARM::D22, ARM::D23, + ARM::D24, ARM::D25, ARM::D26, ARM::D27, + ARM::D28, ARM::D29, ARM::D30, ARM::D31 +}; + +static DecodeStatus DecodeDPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Register = DPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeDPR_8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 7) + return MCDisassembler::Fail; + return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder); +} + +static DecodeStatus +DecodeDPR_VFP2RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 15) + return MCDisassembler::Fail; + return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder); +} + +static const unsigned QPRDecoderTable[] = { + ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3, + ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7, + ARM::Q8, ARM::Q9, ARM::Q10, ARM::Q11, + ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15 +}; + + +static DecodeStatus DecodeQPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + RegNo >>= 1; + + unsigned Register = QPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodePredicateOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (Val == 0xF) return MCDisassembler::Fail; + // AL predicate is not allowed on Thumb1 branches. + if (Inst.getOpcode() == ARM::tBcc && Val == 0xE) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(Val)); + if (Val == ARMCC::AL) { + Inst.addOperand(MCOperand::CreateReg(0)); + } else + Inst.addOperand(MCOperand::CreateReg(ARM::CPSR)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCCOutOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (Val) + Inst.addOperand(MCOperand::CreateReg(ARM::CPSR)); + else + Inst.addOperand(MCOperand::CreateReg(0)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeSOImmOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + uint32_t imm = Val & 0xFF; + uint32_t rot = (Val & 0xF00) >> 7; + uint32_t rot_imm = (imm >> rot) | (imm << ((32-rot) & 0x1F)); + Inst.addOperand(MCOperand::CreateImm(rot_imm)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeSORegImmOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rm = fieldFromInstruction32(Val, 0, 4); + unsigned type = fieldFromInstruction32(Val, 5, 2); + unsigned imm = fieldFromInstruction32(Val, 7, 5); + + // Register-immediate + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + + ARM_AM::ShiftOpc Shift = ARM_AM::lsl; + switch (type) { + case 0: + Shift = ARM_AM::lsl; + break; + case 1: + Shift = ARM_AM::lsr; + break; + case 2: + Shift = ARM_AM::asr; + break; + case 3: + Shift = ARM_AM::ror; + break; + } + + if (Shift == ARM_AM::ror && imm == 0) + Shift = ARM_AM::rrx; + + unsigned Op = Shift | (imm << 3); + Inst.addOperand(MCOperand::CreateImm(Op)); + + return S; +} + +static DecodeStatus DecodeSORegRegOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rm = fieldFromInstruction32(Val, 0, 4); + unsigned type = fieldFromInstruction32(Val, 5, 2); + unsigned Rs = fieldFromInstruction32(Val, 8, 4); + + // Register-register + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rs, Address, Decoder))) + return MCDisassembler::Fail; + + ARM_AM::ShiftOpc Shift = ARM_AM::lsl; + switch (type) { + case 0: + Shift = ARM_AM::lsl; + break; + case 1: + Shift = ARM_AM::lsr; + break; + case 2: + Shift = ARM_AM::asr; + break; + case 3: + Shift = ARM_AM::ror; + break; + } + + Inst.addOperand(MCOperand::CreateImm(Shift)); + + return S; +} + +static DecodeStatus DecodeRegListOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + bool writebackLoad = false; + unsigned writebackReg = 0; + switch (Inst.getOpcode()) { + default: + break; + case ARM::LDMIA_UPD: + case ARM::LDMDB_UPD: + case ARM::LDMIB_UPD: + case ARM::LDMDA_UPD: + case ARM::t2LDMIA_UPD: + case ARM::t2LDMDB_UPD: + writebackLoad = true; + writebackReg = Inst.getOperand(0).getReg(); + break; + } + + // Empty register lists are not allowed. + if (CountPopulation_32(Val) == 0) return MCDisassembler::Fail; + for (unsigned i = 0; i < 16; ++i) { + if (Val & (1 << i)) { + if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder))) + return MCDisassembler::Fail; + // Writeback not allowed if Rn is in the target list. + if (writebackLoad && writebackReg == Inst.end()[-1].getReg()) + Check(S, MCDisassembler::SoftFail); + } + } + + return S; +} + +static DecodeStatus DecodeSPRRegListOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Vd = fieldFromInstruction32(Val, 8, 4); + unsigned regs = Val & 0xFF; + + if (!Check(S, DecodeSPRRegisterClass(Inst, Vd, Address, Decoder))) + return MCDisassembler::Fail; + for (unsigned i = 0; i < (regs - 1); ++i) { + if (!Check(S, DecodeSPRRegisterClass(Inst, ++Vd, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus DecodeDPRRegListOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Vd = fieldFromInstruction32(Val, 8, 4); + unsigned regs = (Val & 0xFF) / 2; + + if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder))) + return MCDisassembler::Fail; + for (unsigned i = 0; i < (regs - 1); ++i) { + if (!Check(S, DecodeDPRRegisterClass(Inst, ++Vd, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus DecodeBitfieldMaskOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + // This operand encodes a mask of contiguous zeros between a specified MSB + // and LSB. To decode it, we create the mask of all bits MSB-and-lower, + // the mask of all bits LSB-and-lower, and then xor them to create + // the mask of that's all ones on [msb, lsb]. Finally we not it to + // create the final mask. + unsigned msb = fieldFromInstruction32(Val, 5, 5); + unsigned lsb = fieldFromInstruction32(Val, 0, 5); + + DecodeStatus S = MCDisassembler::Success; + if (lsb > msb) Check(S, MCDisassembler::SoftFail); + + uint32_t msb_mask = 0xFFFFFFFF; + if (msb != 31) msb_mask = (1U << (msb+1)) - 1; + uint32_t lsb_mask = (1U << lsb) - 1; + + Inst.addOperand(MCOperand::CreateImm(~(msb_mask ^ lsb_mask))); + return S; +} + +static DecodeStatus DecodeCopMemInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + unsigned CRd = fieldFromInstruction32(Insn, 12, 4); + unsigned coproc = fieldFromInstruction32(Insn, 8, 4); + unsigned imm = fieldFromInstruction32(Insn, 0, 8); + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned U = fieldFromInstruction32(Insn, 23, 1); + + switch (Inst.getOpcode()) { + case ARM::LDC_OFFSET: + case ARM::LDC_PRE: + case ARM::LDC_POST: + case ARM::LDC_OPTION: + case ARM::LDCL_OFFSET: + case ARM::LDCL_PRE: + case ARM::LDCL_POST: + case ARM::LDCL_OPTION: + case ARM::STC_OFFSET: + case ARM::STC_PRE: + case ARM::STC_POST: + case ARM::STC_OPTION: + case ARM::STCL_OFFSET: + case ARM::STCL_PRE: + case ARM::STCL_POST: + case ARM::STCL_OPTION: + case ARM::t2LDC_OFFSET: + case ARM::t2LDC_PRE: + case ARM::t2LDC_POST: + case ARM::t2LDC_OPTION: + case ARM::t2LDCL_OFFSET: + case ARM::t2LDCL_PRE: + case ARM::t2LDCL_POST: + case ARM::t2LDCL_OPTION: + case ARM::t2STC_OFFSET: + case ARM::t2STC_PRE: + case ARM::t2STC_POST: + case ARM::t2STC_OPTION: + case ARM::t2STCL_OFFSET: + case ARM::t2STCL_PRE: + case ARM::t2STCL_POST: + case ARM::t2STCL_OPTION: + if (coproc == 0xA || coproc == 0xB) + return MCDisassembler::Fail; + break; + default: + break; + } + + Inst.addOperand(MCOperand::CreateImm(coproc)); + Inst.addOperand(MCOperand::CreateImm(CRd)); + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + unsigned P = fieldFromInstruction32(Insn, 24, 1); + unsigned W = fieldFromInstruction32(Insn, 21, 1); + + bool writeback = (P == 0) || (W == 1); + unsigned idx_mode = 0; + if (P && writeback) + idx_mode = ARMII::IndexModePre; + else if (!P && writeback) + idx_mode = ARMII::IndexModePost; + + switch (Inst.getOpcode()) { + case ARM::t2LDC2_OFFSET: + case ARM::t2LDC2L_OFFSET: + case ARM::t2LDC2_PRE: + case ARM::t2LDC2L_PRE: + case ARM::t2STC2_OFFSET: + case ARM::t2STC2L_OFFSET: + case ARM::t2STC2_PRE: + case ARM::t2STC2L_PRE: + case ARM::LDC2_OFFSET: + case ARM::LDC2L_OFFSET: + case ARM::LDC2_PRE: + case ARM::LDC2L_PRE: + case ARM::STC2_OFFSET: + case ARM::STC2L_OFFSET: + case ARM::STC2_PRE: + case ARM::STC2L_PRE: + case ARM::t2LDC_OFFSET: + case ARM::t2LDCL_OFFSET: + case ARM::t2LDC_PRE: + case ARM::t2LDCL_PRE: + case ARM::t2STC_OFFSET: + case ARM::t2STCL_OFFSET: + case ARM::t2STC_PRE: + case ARM::t2STCL_PRE: + case ARM::LDC_OFFSET: + case ARM::LDCL_OFFSET: + case ARM::LDC_PRE: + case ARM::LDCL_PRE: + case ARM::STC_OFFSET: + case ARM::STCL_OFFSET: + case ARM::STC_PRE: + case ARM::STCL_PRE: + imm = ARM_AM::getAM5Opc(U ? ARM_AM::add : ARM_AM::sub, imm); + Inst.addOperand(MCOperand::CreateImm(imm)); + break; + case ARM::t2LDC2_POST: + case ARM::t2LDC2L_POST: + case ARM::t2STC2_POST: + case ARM::t2STC2L_POST: + case ARM::LDC2_POST: + case ARM::LDC2L_POST: + case ARM::STC2_POST: + case ARM::STC2L_POST: + case ARM::t2LDC_POST: + case ARM::t2LDCL_POST: + case ARM::t2STC_POST: + case ARM::t2STCL_POST: + case ARM::LDC_POST: + case ARM::LDCL_POST: + case ARM::STC_POST: + case ARM::STCL_POST: + imm |= U << 8; + // fall through. + default: + // The 'option' variant doesn't encode 'U' in the immediate since + // the immediate is unsigned [0,255]. + Inst.addOperand(MCOperand::CreateImm(imm)); + break; + } + + switch (Inst.getOpcode()) { + case ARM::LDC_OFFSET: + case ARM::LDC_PRE: + case ARM::LDC_POST: + case ARM::LDC_OPTION: + case ARM::LDCL_OFFSET: + case ARM::LDCL_PRE: + case ARM::LDCL_POST: + case ARM::LDCL_OPTION: + case ARM::STC_OFFSET: + case ARM::STC_PRE: + case ARM::STC_POST: + case ARM::STC_OPTION: + case ARM::STCL_OFFSET: + case ARM::STCL_PRE: + case ARM::STCL_POST: + case ARM::STCL_OPTION: + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + return S; +} + +static DecodeStatus +DecodeAddrMode2IdxInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned imm = fieldFromInstruction32(Insn, 0, 12); + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + unsigned reg = fieldFromInstruction32(Insn, 25, 1); + unsigned P = fieldFromInstruction32(Insn, 24, 1); + unsigned W = fieldFromInstruction32(Insn, 21, 1); + + // On stores, the writeback operand precedes Rt. + switch (Inst.getOpcode()) { + case ARM::STR_POST_IMM: + case ARM::STR_POST_REG: + case ARM::STRB_POST_IMM: + case ARM::STRB_POST_REG: + case ARM::STRT_POST_REG: + case ARM::STRT_POST_IMM: + case ARM::STRBT_POST_REG: + case ARM::STRBT_POST_IMM: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + + // On loads, the writeback operand comes after Rt. + switch (Inst.getOpcode()) { + case ARM::LDR_POST_IMM: + case ARM::LDR_POST_REG: + case ARM::LDRB_POST_IMM: + case ARM::LDRB_POST_REG: + case ARM::LDRBT_POST_REG: + case ARM::LDRBT_POST_IMM: + case ARM::LDRT_POST_REG: + case ARM::LDRT_POST_IMM: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + ARM_AM::AddrOpc Op = ARM_AM::add; + if (!fieldFromInstruction32(Insn, 23, 1)) + Op = ARM_AM::sub; + + bool writeback = (P == 0) || (W == 1); + unsigned idx_mode = 0; + if (P && writeback) + idx_mode = ARMII::IndexModePre; + else if (!P && writeback) + idx_mode = ARMII::IndexModePost; + + if (writeback && (Rn == 15 || Rn == Rt)) + S = MCDisassembler::SoftFail; // UNPREDICTABLE + + if (reg) { + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + ARM_AM::ShiftOpc Opc = ARM_AM::lsl; + switch( fieldFromInstruction32(Insn, 5, 2)) { + case 0: + Opc = ARM_AM::lsl; + break; + case 1: + Opc = ARM_AM::lsr; + break; + case 2: + Opc = ARM_AM::asr; + break; + case 3: + Opc = ARM_AM::ror; + break; + default: + return MCDisassembler::Fail; + } + unsigned amt = fieldFromInstruction32(Insn, 7, 5); + unsigned imm = ARM_AM::getAM2Opc(Op, amt, Opc, idx_mode); + + Inst.addOperand(MCOperand::CreateImm(imm)); + } else { + Inst.addOperand(MCOperand::CreateReg(0)); + unsigned tmp = ARM_AM::getAM2Opc(Op, imm, ARM_AM::lsl, idx_mode); + Inst.addOperand(MCOperand::CreateImm(tmp)); + } + + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeSORegMemOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Val, 13, 4); + unsigned Rm = fieldFromInstruction32(Val, 0, 4); + unsigned type = fieldFromInstruction32(Val, 5, 2); + unsigned imm = fieldFromInstruction32(Val, 7, 5); + unsigned U = fieldFromInstruction32(Val, 12, 1); + + ARM_AM::ShiftOpc ShOp = ARM_AM::lsl; + switch (type) { + case 0: + ShOp = ARM_AM::lsl; + break; + case 1: + ShOp = ARM_AM::lsr; + break; + case 2: + ShOp = ARM_AM::asr; + break; + case 3: + ShOp = ARM_AM::ror; + break; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + unsigned shift; + if (U) + shift = ARM_AM::getAM2Opc(ARM_AM::add, imm, ShOp); + else + shift = ARM_AM::getAM2Opc(ARM_AM::sub, imm, ShOp); + Inst.addOperand(MCOperand::CreateImm(shift)); + + return S; +} + +static DecodeStatus +DecodeAddrMode3Instruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned type = fieldFromInstruction32(Insn, 22, 1); + unsigned imm = fieldFromInstruction32(Insn, 8, 4); + unsigned U = ((~fieldFromInstruction32(Insn, 23, 1)) & 1) << 8; + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + unsigned W = fieldFromInstruction32(Insn, 21, 1); + unsigned P = fieldFromInstruction32(Insn, 24, 1); + + bool writeback = (W == 1) | (P == 0); + + // For {LD,ST}RD, Rt must be even, else undefined. + switch (Inst.getOpcode()) { + case ARM::STRD: + case ARM::STRD_PRE: + case ARM::STRD_POST: + case ARM::LDRD: + case ARM::LDRD_PRE: + case ARM::LDRD_POST: + if (Rt & 0x1) return MCDisassembler::Fail; + break; + default: + break; + } + + if (writeback) { // Writeback + if (P) + U |= ARMII::IndexModePre << 9; + else + U |= ARMII::IndexModePost << 9; + + // On stores, the writeback operand precedes Rt. + switch (Inst.getOpcode()) { + case ARM::STRD: + case ARM::STRD_PRE: + case ARM::STRD_POST: + case ARM::STRH: + case ARM::STRH_PRE: + case ARM::STRH_POST: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + switch (Inst.getOpcode()) { + case ARM::STRD: + case ARM::STRD_PRE: + case ARM::STRD_POST: + case ARM::LDRD: + case ARM::LDRD_PRE: + case ARM::LDRD_POST: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + if (writeback) { + // On loads, the writeback operand comes after Rt. + switch (Inst.getOpcode()) { + case ARM::LDRD: + case ARM::LDRD_PRE: + case ARM::LDRD_POST: + case ARM::LDRH: + case ARM::LDRH_PRE: + case ARM::LDRH_POST: + case ARM::LDRSH: + case ARM::LDRSH_PRE: + case ARM::LDRSH_POST: + case ARM::LDRSB: + case ARM::LDRSB_PRE: + case ARM::LDRSB_POST: + case ARM::LDRHTr: + case ARM::LDRSBTr: + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + if (type) { + Inst.addOperand(MCOperand::CreateReg(0)); + Inst.addOperand(MCOperand::CreateImm(U | (imm << 4) | Rm)); + } else { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(U)); + } + + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeRFEInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned mode = fieldFromInstruction32(Insn, 23, 2); + + switch (mode) { + case 0: + mode = ARM_AM::da; + break; + case 1: + mode = ARM_AM::ia; + break; + case 2: + mode = ARM_AM::db; + break; + case 3: + mode = ARM_AM::ib; + break; + } + + Inst.addOperand(MCOperand::CreateImm(mode)); + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeMemMultipleWritebackInstruction(llvm::MCInst &Inst, + unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + unsigned reglist = fieldFromInstruction32(Insn, 0, 16); + + if (pred == 0xF) { + switch (Inst.getOpcode()) { + case ARM::LDMDA: + Inst.setOpcode(ARM::RFEDA); + break; + case ARM::LDMDA_UPD: + Inst.setOpcode(ARM::RFEDA_UPD); + break; + case ARM::LDMDB: + Inst.setOpcode(ARM::RFEDB); + break; + case ARM::LDMDB_UPD: + Inst.setOpcode(ARM::RFEDB_UPD); + break; + case ARM::LDMIA: + Inst.setOpcode(ARM::RFEIA); + break; + case ARM::LDMIA_UPD: + Inst.setOpcode(ARM::RFEIA_UPD); + break; + case ARM::LDMIB: + Inst.setOpcode(ARM::RFEIB); + break; + case ARM::LDMIB_UPD: + Inst.setOpcode(ARM::RFEIB_UPD); + break; + case ARM::STMDA: + Inst.setOpcode(ARM::SRSDA); + break; + case ARM::STMDA_UPD: + Inst.setOpcode(ARM::SRSDA_UPD); + break; + case ARM::STMDB: + Inst.setOpcode(ARM::SRSDB); + break; + case ARM::STMDB_UPD: + Inst.setOpcode(ARM::SRSDB_UPD); + break; + case ARM::STMIA: + Inst.setOpcode(ARM::SRSIA); + break; + case ARM::STMIA_UPD: + Inst.setOpcode(ARM::SRSIA_UPD); + break; + case ARM::STMIB: + Inst.setOpcode(ARM::SRSIB); + break; + case ARM::STMIB_UPD: + Inst.setOpcode(ARM::SRSIB_UPD); + break; + default: + if (!Check(S, MCDisassembler::Fail)) return MCDisassembler::Fail; + } + + // For stores (which become SRS's, the only operand is the mode. + if (fieldFromInstruction32(Insn, 20, 1) == 0) { + Inst.addOperand( + MCOperand::CreateImm(fieldFromInstruction32(Insn, 0, 4))); + return S; + } + + return DecodeRFEInstruction(Inst, Insn, Address, Decoder); + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; // Tied + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeRegListOperand(Inst, reglist, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeCPSInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + unsigned imod = fieldFromInstruction32(Insn, 18, 2); + unsigned M = fieldFromInstruction32(Insn, 17, 1); + unsigned iflags = fieldFromInstruction32(Insn, 6, 3); + unsigned mode = fieldFromInstruction32(Insn, 0, 5); + + DecodeStatus S = MCDisassembler::Success; + + // imod == '01' --> UNPREDICTABLE + // NOTE: Even though this is technically UNPREDICTABLE, we choose to + // return failure here. The '01' imod value is unprintable, so there's + // nothing useful we could do even if we returned UNPREDICTABLE. + + if (imod == 1) return MCDisassembler::Fail; + + if (imod && M) { + Inst.setOpcode(ARM::CPS3p); + Inst.addOperand(MCOperand::CreateImm(imod)); + Inst.addOperand(MCOperand::CreateImm(iflags)); + Inst.addOperand(MCOperand::CreateImm(mode)); + } else if (imod && !M) { + Inst.setOpcode(ARM::CPS2p); + Inst.addOperand(MCOperand::CreateImm(imod)); + Inst.addOperand(MCOperand::CreateImm(iflags)); + if (mode) S = MCDisassembler::SoftFail; + } else if (!imod && M) { + Inst.setOpcode(ARM::CPS1p); + Inst.addOperand(MCOperand::CreateImm(mode)); + if (iflags) S = MCDisassembler::SoftFail; + } else { + // imod == '00' && M == '0' --> UNPREDICTABLE + Inst.setOpcode(ARM::CPS1p); + Inst.addOperand(MCOperand::CreateImm(mode)); + S = MCDisassembler::SoftFail; + } + + return S; +} + +static DecodeStatus DecodeT2CPSInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + unsigned imod = fieldFromInstruction32(Insn, 9, 2); + unsigned M = fieldFromInstruction32(Insn, 8, 1); + unsigned iflags = fieldFromInstruction32(Insn, 5, 3); + unsigned mode = fieldFromInstruction32(Insn, 0, 5); + + DecodeStatus S = MCDisassembler::Success; + + // imod == '01' --> UNPREDICTABLE + // NOTE: Even though this is technically UNPREDICTABLE, we choose to + // return failure here. The '01' imod value is unprintable, so there's + // nothing useful we could do even if we returned UNPREDICTABLE. + + if (imod == 1) return MCDisassembler::Fail; + + if (imod && M) { + Inst.setOpcode(ARM::t2CPS3p); + Inst.addOperand(MCOperand::CreateImm(imod)); + Inst.addOperand(MCOperand::CreateImm(iflags)); + Inst.addOperand(MCOperand::CreateImm(mode)); + } else if (imod && !M) { + Inst.setOpcode(ARM::t2CPS2p); + Inst.addOperand(MCOperand::CreateImm(imod)); + Inst.addOperand(MCOperand::CreateImm(iflags)); + if (mode) S = MCDisassembler::SoftFail; + } else if (!imod && M) { + Inst.setOpcode(ARM::t2CPS1p); + Inst.addOperand(MCOperand::CreateImm(mode)); + if (iflags) S = MCDisassembler::SoftFail; + } else { + // imod == '00' && M == '0' --> UNPREDICTABLE + Inst.setOpcode(ARM::t2CPS1p); + Inst.addOperand(MCOperand::CreateImm(mode)); + S = MCDisassembler::SoftFail; + } + + return S; +} + +static DecodeStatus DecodeT2MOVTWInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 8, 4); + unsigned imm = 0; + + imm |= (fieldFromInstruction32(Insn, 0, 8) << 0); + imm |= (fieldFromInstruction32(Insn, 12, 3) << 8); + imm |= (fieldFromInstruction32(Insn, 16, 4) << 12); + imm |= (fieldFromInstruction32(Insn, 26, 1) << 11); + + if (Inst.getOpcode() == ARM::t2MOVTi16) + if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + + if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::CreateImm(imm)); + + return S; +} + +static DecodeStatus DecodeArmMOVTWInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + unsigned imm = 0; + + imm |= (fieldFromInstruction32(Insn, 0, 12) << 0); + imm |= (fieldFromInstruction32(Insn, 16, 4) << 12); + + if (Inst.getOpcode() == ARM::MOVTi16) + if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + + if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::CreateImm(imm)); + + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeSMLAInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 16, 4); + unsigned Rn = fieldFromInstruction32(Insn, 0, 4); + unsigned Rm = fieldFromInstruction32(Insn, 8, 4); + unsigned Ra = fieldFromInstruction32(Insn, 12, 4); + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + + if (pred == 0xF) + return DecodeCPSInstruction(Inst, Insn, Address, Decoder); + + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Ra, Address, Decoder))) + return MCDisassembler::Fail; + + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeAddrModeImm12Operand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned add = fieldFromInstruction32(Val, 12, 1); + unsigned imm = fieldFromInstruction32(Val, 0, 12); + unsigned Rn = fieldFromInstruction32(Val, 13, 4); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + if (!add) imm *= -1; + if (imm == 0 && !add) imm = INT32_MIN; + Inst.addOperand(MCOperand::CreateImm(imm)); + if (Rn == 15) + tryAddingPcLoadReferenceComment(Address, Address + imm + 8, Decoder); + + return S; +} + +static DecodeStatus DecodeAddrMode5Operand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Val, 9, 4); + unsigned U = fieldFromInstruction32(Val, 8, 1); + unsigned imm = fieldFromInstruction32(Val, 0, 8); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + if (U) + Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::add, imm))); + else + Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::sub, imm))); + + return S; +} + +static DecodeStatus DecodeAddrMode7Operand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + return DecodeGPRRegisterClass(Inst, Val, Address, Decoder); +} + +static DecodeStatus +DecodeBranchImmInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + unsigned imm = fieldFromInstruction32(Insn, 0, 24) << 2; + + if (pred == 0xF) { + Inst.setOpcode(ARM::BLXi); + imm |= fieldFromInstruction32(Insn, 24, 1) << 1; + Inst.addOperand(MCOperand::CreateImm(SignExtend32<26>(imm))); + return S; + } + + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8, true, + 4, Inst, Decoder)) + Inst.addOperand(MCOperand::CreateImm(SignExtend32<26>(imm))); + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + + +static DecodeStatus DecodeVCVTImmOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(64 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeAddrMode6Operand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rm = fieldFromInstruction32(Val, 0, 4); + unsigned align = fieldFromInstruction32(Val, 4, 2); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + if (!align) + Inst.addOperand(MCOperand::CreateImm(0)); + else + Inst.addOperand(MCOperand::CreateImm(4 << align)); + + return S; +} + +static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned wb = fieldFromInstruction32(Insn, 16, 4); + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + Rn |= fieldFromInstruction32(Insn, 4, 2) << 4; + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + + // First output register + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + + // Second output register + switch (Inst.getOpcode()) { + case ARM::VLD1q8: + case ARM::VLD1q16: + case ARM::VLD1q32: + case ARM::VLD1q64: + case ARM::VLD1q8_UPD: + case ARM::VLD1q16_UPD: + case ARM::VLD1q32_UPD: + case ARM::VLD1q64_UPD: + case ARM::VLD1d8T: + case ARM::VLD1d16T: + case ARM::VLD1d32T: + case ARM::VLD1d64T: + case ARM::VLD1d8T_UPD: + case ARM::VLD1d16T_UPD: + case ARM::VLD1d32T_UPD: + case ARM::VLD1d64T_UPD: + case ARM::VLD1d8Q: + case ARM::VLD1d16Q: + case ARM::VLD1d32Q: + case ARM::VLD1d64Q: + case ARM::VLD1d8Q_UPD: + case ARM::VLD1d16Q_UPD: + case ARM::VLD1d32Q_UPD: + case ARM::VLD1d64Q_UPD: + case ARM::VLD2d8: + case ARM::VLD2d16: + case ARM::VLD2d32: + case ARM::VLD2d8_UPD: + case ARM::VLD2d16_UPD: + case ARM::VLD2d32_UPD: + case ARM::VLD2q8: + case ARM::VLD2q16: + case ARM::VLD2q32: + case ARM::VLD2q8_UPD: + case ARM::VLD2q16_UPD: + case ARM::VLD2q32_UPD: + case ARM::VLD3d8: + case ARM::VLD3d16: + case ARM::VLD3d32: + case ARM::VLD3d8_UPD: + case ARM::VLD3d16_UPD: + case ARM::VLD3d32_UPD: + case ARM::VLD4d8: + case ARM::VLD4d16: + case ARM::VLD4d32: + case ARM::VLD4d8_UPD: + case ARM::VLD4d16_UPD: + case ARM::VLD4d32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VLD2b8: + case ARM::VLD2b16: + case ARM::VLD2b32: + case ARM::VLD2b8_UPD: + case ARM::VLD2b16_UPD: + case ARM::VLD2b32_UPD: + case ARM::VLD3q8: + case ARM::VLD3q16: + case ARM::VLD3q32: + case ARM::VLD3q8_UPD: + case ARM::VLD3q16_UPD: + case ARM::VLD3q32_UPD: + case ARM::VLD4q8: + case ARM::VLD4q16: + case ARM::VLD4q32: + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder))) + return MCDisassembler::Fail; + default: + break; + } + + // Third output register + switch(Inst.getOpcode()) { + case ARM::VLD1d8T: + case ARM::VLD1d16T: + case ARM::VLD1d32T: + case ARM::VLD1d64T: + case ARM::VLD1d8T_UPD: + case ARM::VLD1d16T_UPD: + case ARM::VLD1d32T_UPD: + case ARM::VLD1d64T_UPD: + case ARM::VLD1d8Q: + case ARM::VLD1d16Q: + case ARM::VLD1d32Q: + case ARM::VLD1d64Q: + case ARM::VLD1d8Q_UPD: + case ARM::VLD1d16Q_UPD: + case ARM::VLD1d32Q_UPD: + case ARM::VLD1d64Q_UPD: + case ARM::VLD2q8: + case ARM::VLD2q16: + case ARM::VLD2q32: + case ARM::VLD2q8_UPD: + case ARM::VLD2q16_UPD: + case ARM::VLD2q32_UPD: + case ARM::VLD3d8: + case ARM::VLD3d16: + case ARM::VLD3d32: + case ARM::VLD3d8_UPD: + case ARM::VLD3d16_UPD: + case ARM::VLD3d32_UPD: + case ARM::VLD4d8: + case ARM::VLD4d16: + case ARM::VLD4d32: + case ARM::VLD4d8_UPD: + case ARM::VLD4d16_UPD: + case ARM::VLD4d32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VLD3q8: + case ARM::VLD3q16: + case ARM::VLD3q32: + case ARM::VLD3q8_UPD: + case ARM::VLD3q16_UPD: + case ARM::VLD3q32_UPD: + case ARM::VLD4q8: + case ARM::VLD4q16: + case ARM::VLD4q32: + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + // Fourth output register + switch (Inst.getOpcode()) { + case ARM::VLD1d8Q: + case ARM::VLD1d16Q: + case ARM::VLD1d32Q: + case ARM::VLD1d64Q: + case ARM::VLD1d8Q_UPD: + case ARM::VLD1d16Q_UPD: + case ARM::VLD1d32Q_UPD: + case ARM::VLD1d64Q_UPD: + case ARM::VLD2q8: + case ARM::VLD2q16: + case ARM::VLD2q32: + case ARM::VLD2q8_UPD: + case ARM::VLD2q16_UPD: + case ARM::VLD2q32_UPD: + case ARM::VLD4d8: + case ARM::VLD4d16: + case ARM::VLD4d32: + case ARM::VLD4d8_UPD: + case ARM::VLD4d16_UPD: + case ARM::VLD4d32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VLD4q8: + case ARM::VLD4q16: + case ARM::VLD4q32: + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + // Writeback operand + switch (Inst.getOpcode()) { + case ARM::VLD1d8_UPD: + case ARM::VLD1d16_UPD: + case ARM::VLD1d32_UPD: + case ARM::VLD1d64_UPD: + case ARM::VLD1q8_UPD: + case ARM::VLD1q16_UPD: + case ARM::VLD1q32_UPD: + case ARM::VLD1q64_UPD: + case ARM::VLD1d8T_UPD: + case ARM::VLD1d16T_UPD: + case ARM::VLD1d32T_UPD: + case ARM::VLD1d64T_UPD: + case ARM::VLD1d8Q_UPD: + case ARM::VLD1d16Q_UPD: + case ARM::VLD1d32Q_UPD: + case ARM::VLD1d64Q_UPD: + case ARM::VLD2d8_UPD: + case ARM::VLD2d16_UPD: + case ARM::VLD2d32_UPD: + case ARM::VLD2q8_UPD: + case ARM::VLD2q16_UPD: + case ARM::VLD2q32_UPD: + case ARM::VLD2b8_UPD: + case ARM::VLD2b16_UPD: + case ARM::VLD2b32_UPD: + case ARM::VLD3d8_UPD: + case ARM::VLD3d16_UPD: + case ARM::VLD3d32_UPD: + case ARM::VLD3q8_UPD: + case ARM::VLD3q16_UPD: + case ARM::VLD3q32_UPD: + case ARM::VLD4d8_UPD: + case ARM::VLD4d16_UPD: + case ARM::VLD4d32_UPD: + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + // AddrMode6 Base (register+alignment) + if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + // AddrMode6 Offset (register) + if (Rm == 0xD) + Inst.addOperand(MCOperand::CreateReg(0)); + else if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned wb = fieldFromInstruction32(Insn, 16, 4); + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + Rn |= fieldFromInstruction32(Insn, 4, 2) << 4; + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + + // Writeback Operand + switch (Inst.getOpcode()) { + case ARM::VST1d8_UPD: + case ARM::VST1d16_UPD: + case ARM::VST1d32_UPD: + case ARM::VST1d64_UPD: + case ARM::VST1q8_UPD: + case ARM::VST1q16_UPD: + case ARM::VST1q32_UPD: + case ARM::VST1q64_UPD: + case ARM::VST1d8T_UPD: + case ARM::VST1d16T_UPD: + case ARM::VST1d32T_UPD: + case ARM::VST1d64T_UPD: + case ARM::VST1d8Q_UPD: + case ARM::VST1d16Q_UPD: + case ARM::VST1d32Q_UPD: + case ARM::VST1d64Q_UPD: + case ARM::VST2d8_UPD: + case ARM::VST2d16_UPD: + case ARM::VST2d32_UPD: + case ARM::VST2q8_UPD: + case ARM::VST2q16_UPD: + case ARM::VST2q32_UPD: + case ARM::VST2b8_UPD: + case ARM::VST2b16_UPD: + case ARM::VST2b32_UPD: + case ARM::VST3d8_UPD: + case ARM::VST3d16_UPD: + case ARM::VST3d32_UPD: + case ARM::VST3q8_UPD: + case ARM::VST3q16_UPD: + case ARM::VST3q32_UPD: + case ARM::VST4d8_UPD: + case ARM::VST4d16_UPD: + case ARM::VST4d32_UPD: + case ARM::VST4q8_UPD: + case ARM::VST4q16_UPD: + case ARM::VST4q32_UPD: + if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + // AddrMode6 Base (register+alignment) + if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + // AddrMode6 Offset (register) + if (Rm == 0xD) + Inst.addOperand(MCOperand::CreateReg(0)); + else if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } + + // First input register + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + + // Second input register + switch (Inst.getOpcode()) { + case ARM::VST1q8: + case ARM::VST1q16: + case ARM::VST1q32: + case ARM::VST1q64: + case ARM::VST1q8_UPD: + case ARM::VST1q16_UPD: + case ARM::VST1q32_UPD: + case ARM::VST1q64_UPD: + case ARM::VST1d8T: + case ARM::VST1d16T: + case ARM::VST1d32T: + case ARM::VST1d64T: + case ARM::VST1d8T_UPD: + case ARM::VST1d16T_UPD: + case ARM::VST1d32T_UPD: + case ARM::VST1d64T_UPD: + case ARM::VST1d8Q: + case ARM::VST1d16Q: + case ARM::VST1d32Q: + case ARM::VST1d64Q: + case ARM::VST1d8Q_UPD: + case ARM::VST1d16Q_UPD: + case ARM::VST1d32Q_UPD: + case ARM::VST1d64Q_UPD: + case ARM::VST2d8: + case ARM::VST2d16: + case ARM::VST2d32: + case ARM::VST2d8_UPD: + case ARM::VST2d16_UPD: + case ARM::VST2d32_UPD: + case ARM::VST2q8: + case ARM::VST2q16: + case ARM::VST2q32: + case ARM::VST2q8_UPD: + case ARM::VST2q16_UPD: + case ARM::VST2q32_UPD: + case ARM::VST3d8: + case ARM::VST3d16: + case ARM::VST3d32: + case ARM::VST3d8_UPD: + case ARM::VST3d16_UPD: + case ARM::VST3d32_UPD: + case ARM::VST4d8: + case ARM::VST4d16: + case ARM::VST4d32: + case ARM::VST4d8_UPD: + case ARM::VST4d16_UPD: + case ARM::VST4d32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VST2b8: + case ARM::VST2b16: + case ARM::VST2b32: + case ARM::VST2b8_UPD: + case ARM::VST2b16_UPD: + case ARM::VST2b32_UPD: + case ARM::VST3q8: + case ARM::VST3q16: + case ARM::VST3q32: + case ARM::VST3q8_UPD: + case ARM::VST3q16_UPD: + case ARM::VST3q32_UPD: + case ARM::VST4q8: + case ARM::VST4q16: + case ARM::VST4q32: + case ARM::VST4q8_UPD: + case ARM::VST4q16_UPD: + case ARM::VST4q32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + // Third input register + switch (Inst.getOpcode()) { + case ARM::VST1d8T: + case ARM::VST1d16T: + case ARM::VST1d32T: + case ARM::VST1d64T: + case ARM::VST1d8T_UPD: + case ARM::VST1d16T_UPD: + case ARM::VST1d32T_UPD: + case ARM::VST1d64T_UPD: + case ARM::VST1d8Q: + case ARM::VST1d16Q: + case ARM::VST1d32Q: + case ARM::VST1d64Q: + case ARM::VST1d8Q_UPD: + case ARM::VST1d16Q_UPD: + case ARM::VST1d32Q_UPD: + case ARM::VST1d64Q_UPD: + case ARM::VST2q8: + case ARM::VST2q16: + case ARM::VST2q32: + case ARM::VST2q8_UPD: + case ARM::VST2q16_UPD: + case ARM::VST2q32_UPD: + case ARM::VST3d8: + case ARM::VST3d16: + case ARM::VST3d32: + case ARM::VST3d8_UPD: + case ARM::VST3d16_UPD: + case ARM::VST3d32_UPD: + case ARM::VST4d8: + case ARM::VST4d16: + case ARM::VST4d32: + case ARM::VST4d8_UPD: + case ARM::VST4d16_UPD: + case ARM::VST4d32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VST3q8: + case ARM::VST3q16: + case ARM::VST3q32: + case ARM::VST3q8_UPD: + case ARM::VST3q16_UPD: + case ARM::VST3q32_UPD: + case ARM::VST4q8: + case ARM::VST4q16: + case ARM::VST4q32: + case ARM::VST4q8_UPD: + case ARM::VST4q16_UPD: + case ARM::VST4q32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + // Fourth input register + switch (Inst.getOpcode()) { + case ARM::VST1d8Q: + case ARM::VST1d16Q: + case ARM::VST1d32Q: + case ARM::VST1d64Q: + case ARM::VST1d8Q_UPD: + case ARM::VST1d16Q_UPD: + case ARM::VST1d32Q_UPD: + case ARM::VST1d64Q_UPD: + case ARM::VST2q8: + case ARM::VST2q16: + case ARM::VST2q32: + case ARM::VST2q8_UPD: + case ARM::VST2q16_UPD: + case ARM::VST2q32_UPD: + case ARM::VST4d8: + case ARM::VST4d16: + case ARM::VST4d32: + case ARM::VST4d8_UPD: + case ARM::VST4d16_UPD: + case ARM::VST4d32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VST4q8: + case ARM::VST4q16: + case ARM::VST4q32: + case ARM::VST4q8_UPD: + case ARM::VST4q16_UPD: + case ARM::VST4q32_UPD: + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + return S; +} + +static DecodeStatus DecodeVLD1DupInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned align = fieldFromInstruction32(Insn, 4, 1); + unsigned size = fieldFromInstruction32(Insn, 6, 2); + unsigned regs = fieldFromInstruction32(Insn, 5, 1) + 1; + + align *= (1 << size); + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (regs == 2) { + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder))) + return MCDisassembler::Fail; + } + if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(align)); + + if (Rm == 0xD) + Inst.addOperand(MCOperand::CreateReg(0)); + else if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus DecodeVLD2DupInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned align = fieldFromInstruction32(Insn, 4, 1); + unsigned size = 1 << fieldFromInstruction32(Insn, 6, 2); + unsigned inc = fieldFromInstruction32(Insn, 5, 1) + 1; + align *= 2*size; + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder))) + return MCDisassembler::Fail; + if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(align)); + + if (Rm == 0xD) + Inst.addOperand(MCOperand::CreateReg(0)); + else if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus DecodeVLD3DupInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned inc = fieldFromInstruction32(Insn, 5, 1) + 1; + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder))) + return MCDisassembler::Fail; + if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(0)); + + if (Rm == 0xD) + Inst.addOperand(MCOperand::CreateReg(0)); + else if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus DecodeVLD4DupInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned size = fieldFromInstruction32(Insn, 6, 2); + unsigned inc = fieldFromInstruction32(Insn, 5, 1) + 1; + unsigned align = fieldFromInstruction32(Insn, 4, 1); + + if (size == 0x3) { + size = 4; + align = 16; + } else { + if (size == 2) { + size = 1 << size; + align *= 8; + } else { + size = 1 << size; + align *= 4*size; + } + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3*inc)%32, Address, Decoder))) + return MCDisassembler::Fail; + if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(align)); + + if (Rm == 0xD) + Inst.addOperand(MCOperand::CreateReg(0)); + else if (Rm != 0xF) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus +DecodeNEONModImmInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned imm = fieldFromInstruction32(Insn, 0, 4); + imm |= fieldFromInstruction32(Insn, 16, 3) << 4; + imm |= fieldFromInstruction32(Insn, 24, 1) << 7; + imm |= fieldFromInstruction32(Insn, 8, 4) << 8; + imm |= fieldFromInstruction32(Insn, 5, 1) << 12; + unsigned Q = fieldFromInstruction32(Insn, 6, 1); + + if (Q) { + if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + } else { + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + } + + Inst.addOperand(MCOperand::CreateImm(imm)); + + switch (Inst.getOpcode()) { + case ARM::VORRiv4i16: + case ARM::VORRiv2i32: + case ARM::VBICiv4i16: + case ARM::VBICiv2i32: + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VORRiv8i16: + case ARM::VORRiv4i32: + case ARM::VBICiv8i16: + case ARM::VBICiv4i32: + if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + break; + } + + return S; +} + +static DecodeStatus DecodeVSHLMaxInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + Rm |= fieldFromInstruction32(Insn, 5, 1) << 4; + unsigned size = fieldFromInstruction32(Insn, 18, 2); + + if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(8 << size)); + + return S; +} + +static DecodeStatus DecodeShiftRight8Imm(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(8 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftRight16Imm(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(16 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftRight32Imm(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(32 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeShiftRight64Imm(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(64 - Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeTBLInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + Rn |= fieldFromInstruction32(Insn, 7, 1) << 4; + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + Rm |= fieldFromInstruction32(Insn, 5, 1) << 4; + unsigned op = fieldFromInstruction32(Insn, 6, 1); + unsigned length = fieldFromInstruction32(Insn, 8, 2) + 1; + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (op) { + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; // Writeback + } + + for (unsigned i = 0; i < length; ++i) { + if (!Check(S, DecodeDPRRegisterClass(Inst, (Rn+i)%32, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeThumbAddSpecialReg(llvm::MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned dst = fieldFromInstruction16(Insn, 8, 3); + unsigned imm = fieldFromInstruction16(Insn, 0, 8); + + if (!Check(S, DecodetGPRRegisterClass(Inst, dst, Address, Decoder))) + return MCDisassembler::Fail; + + switch(Inst.getOpcode()) { + default: + return MCDisassembler::Fail; + case ARM::tADR: + break; // tADR does not explicitly represent the PC as an operand. + case ARM::tADDrSPi: + Inst.addOperand(MCOperand::CreateReg(ARM::SP)); + break; + } + + Inst.addOperand(MCOperand::CreateImm(imm)); + return S; +} + +static DecodeStatus DecodeThumbBROperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(SignExtend32<12>(Val << 1))); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeT2BROperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(SignExtend32<21>(Val))); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeThumbCmpBROperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(SignExtend32<7>(Val << 1))); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeThumbAddrModeRR(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Val, 0, 3); + unsigned Rm = fieldFromInstruction32(Val, 3, 3); + + if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodetGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeThumbAddrModeIS(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Val, 0, 3); + unsigned imm = fieldFromInstruction32(Val, 3, 5); + + if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(imm)); + + return S; +} + +static DecodeStatus DecodeThumbAddrModePC(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + unsigned imm = Val << 2; + + Inst.addOperand(MCOperand::CreateImm(imm)); + tryAddingPcLoadReferenceComment(Address, (Address & ~2u) + imm + 4, Decoder); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeThumbAddrModeSP(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateReg(ARM::SP)); + Inst.addOperand(MCOperand::CreateImm(Val)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeT2AddrModeSOReg(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Val, 6, 4); + unsigned Rm = fieldFromInstruction32(Val, 2, 4); + unsigned imm = fieldFromInstruction32(Val, 0, 2); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(imm)); + + return S; +} + +static DecodeStatus DecodeT2LoadShift(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + switch (Inst.getOpcode()) { + case ARM::t2PLDs: + case ARM::t2PLDWs: + case ARM::t2PLIs: + break; + default: { + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + } + } + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + if (Rn == 0xF) { + switch (Inst.getOpcode()) { + case ARM::t2LDRBs: + Inst.setOpcode(ARM::t2LDRBpci); + break; + case ARM::t2LDRHs: + Inst.setOpcode(ARM::t2LDRHpci); + break; + case ARM::t2LDRSHs: + Inst.setOpcode(ARM::t2LDRSHpci); + break; + case ARM::t2LDRSBs: + Inst.setOpcode(ARM::t2LDRSBpci); + break; + case ARM::t2PLDs: + Inst.setOpcode(ARM::t2PLDi12); + Inst.addOperand(MCOperand::CreateReg(ARM::PC)); + break; + default: + return MCDisassembler::Fail; + } + + int imm = fieldFromInstruction32(Insn, 0, 12); + if (!fieldFromInstruction32(Insn, 23, 1)) imm *= -1; + Inst.addOperand(MCOperand::CreateImm(imm)); + + return S; + } + + unsigned addrmode = fieldFromInstruction32(Insn, 4, 2); + addrmode |= fieldFromInstruction32(Insn, 0, 4) << 2; + addrmode |= fieldFromInstruction32(Insn, 16, 4) << 6; + if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeT2Imm8S4(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + int imm = Val & 0xFF; + if (!(Val & 0x100)) imm *= -1; + Inst.addOperand(MCOperand::CreateImm(imm << 2)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeT2AddrModeImm8s4(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Val, 9, 4); + unsigned imm = fieldFromInstruction32(Val, 0, 9); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeT2Imm8S4(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeT2AddrModeImm0_1020s4(llvm::MCInst &Inst,unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Val, 8, 4); + unsigned imm = fieldFromInstruction32(Val, 0, 8); + + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(imm)); + + return S; +} + +static DecodeStatus DecodeT2Imm8(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + int imm = Val & 0xFF; + if (Val == 0) + imm = INT32_MIN; + else if (!(Val & 0x100)) + imm *= -1; + Inst.addOperand(MCOperand::CreateImm(imm)); + + return MCDisassembler::Success; +} + + +static DecodeStatus DecodeT2AddrModeImm8(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Val, 9, 4); + unsigned imm = fieldFromInstruction32(Val, 0, 9); + + // Some instructions always use an additive offset. + switch (Inst.getOpcode()) { + case ARM::t2LDRT: + case ARM::t2LDRBT: + case ARM::t2LDRHT: + case ARM::t2LDRSBT: + case ARM::t2LDRSHT: + case ARM::t2STRT: + case ARM::t2STRBT: + case ARM::t2STRHT: + imm |= 0x100; + break; + default: + break; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeT2Imm8(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeT2LdStPre(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned addr = fieldFromInstruction32(Insn, 0, 8); + addr |= fieldFromInstruction32(Insn, 9, 1) << 8; + addr |= Rn << 9; + unsigned load = fieldFromInstruction32(Insn, 20, 1); + + if (!load) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + + if (load) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + + if (!Check(S, DecodeT2AddrModeImm8(Inst, addr, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeT2AddrModeImm12(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Val, 13, 4); + unsigned imm = fieldFromInstruction32(Val, 0, 12); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(imm)); + + return S; +} + + +static DecodeStatus DecodeThumbAddSPImm(llvm::MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder) { + unsigned imm = fieldFromInstruction16(Insn, 0, 7); + + Inst.addOperand(MCOperand::CreateReg(ARM::SP)); + Inst.addOperand(MCOperand::CreateReg(ARM::SP)); + Inst.addOperand(MCOperand::CreateImm(imm)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeThumbAddSPReg(llvm::MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (Inst.getOpcode() == ARM::tADDrSP) { + unsigned Rdm = fieldFromInstruction16(Insn, 0, 3); + Rdm |= fieldFromInstruction16(Insn, 7, 1) << 3; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateReg(ARM::SP)); + } else if (Inst.getOpcode() == ARM::tADDspr) { + unsigned Rm = fieldFromInstruction16(Insn, 3, 4); + + Inst.addOperand(MCOperand::CreateReg(ARM::SP)); + Inst.addOperand(MCOperand::CreateReg(ARM::SP)); + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } + + return S; +} + +static DecodeStatus DecodeThumbCPS(llvm::MCInst &Inst, uint16_t Insn, + uint64_t Address, const void *Decoder) { + unsigned imod = fieldFromInstruction16(Insn, 4, 1) | 0x2; + unsigned flags = fieldFromInstruction16(Insn, 0, 3); + + Inst.addOperand(MCOperand::CreateImm(imod)); + Inst.addOperand(MCOperand::CreateImm(flags)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodePostIdxReg(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned add = fieldFromInstruction32(Insn, 4, 1); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(add)); + + return S; +} + +static DecodeStatus DecodeThumbBLXOffset(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (!tryAddingSymbolicOperand(Address, + (Address & ~2u) + SignExtend32<22>(Val << 1) + 4, + true, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::CreateImm(SignExtend32<22>(Val << 1))); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCoprocessor(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (Val == 0xA || Val == 0xB) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus +DecodeThumbTableBranch(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + + if (Rn == ARM::SP) S = MCDisassembler::SoftFail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + return S; +} + +static DecodeStatus +DecodeThumb2BCCInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned pred = fieldFromInstruction32(Insn, 22, 4); + if (pred == 0xE || pred == 0xF) { + unsigned opc = fieldFromInstruction32(Insn, 4, 28); + switch (opc) { + default: + return MCDisassembler::Fail; + case 0xf3bf8f4: + Inst.setOpcode(ARM::t2DSB); + break; + case 0xf3bf8f5: + Inst.setOpcode(ARM::t2DMB); + break; + case 0xf3bf8f6: + Inst.setOpcode(ARM::t2ISB); + break; + } + + unsigned imm = fieldFromInstruction32(Insn, 0, 4); + return DecodeMemBarrierOption(Inst, imm, Address, Decoder); + } + + unsigned brtarget = fieldFromInstruction32(Insn, 0, 11) << 1; + brtarget |= fieldFromInstruction32(Insn, 11, 1) << 19; + brtarget |= fieldFromInstruction32(Insn, 13, 1) << 18; + brtarget |= fieldFromInstruction32(Insn, 16, 6) << 12; + brtarget |= fieldFromInstruction32(Insn, 26, 1) << 20; + + if (!Check(S, DecodeT2BROperand(Inst, brtarget, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +// Decode a shifted immediate operand. These basically consist +// of an 8-bit value, and a 4-bit directive that specifies either +// a splat operation or a rotation. +static DecodeStatus DecodeT2SOImm(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + unsigned ctrl = fieldFromInstruction32(Val, 10, 2); + if (ctrl == 0) { + unsigned byte = fieldFromInstruction32(Val, 8, 2); + unsigned imm = fieldFromInstruction32(Val, 0, 8); + switch (byte) { + case 0: + Inst.addOperand(MCOperand::CreateImm(imm)); + break; + case 1: + Inst.addOperand(MCOperand::CreateImm((imm << 16) | imm)); + break; + case 2: + Inst.addOperand(MCOperand::CreateImm((imm << 24) | (imm << 8))); + break; + case 3: + Inst.addOperand(MCOperand::CreateImm((imm << 24) | (imm << 16) | + (imm << 8) | imm)); + break; + } + } else { + unsigned unrot = fieldFromInstruction32(Val, 0, 7) | 0x80; + unsigned rot = fieldFromInstruction32(Val, 7, 5); + unsigned imm = (unrot >> rot) | (unrot << ((32-rot)&31)); + Inst.addOperand(MCOperand::CreateImm(imm)); + } + + return MCDisassembler::Success; +} + +static DecodeStatus +DecodeThumbBCCTargetOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder){ + Inst.addOperand(MCOperand::CreateImm(Val << 1)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeThumbBLTargetOperand(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder){ + Inst.addOperand(MCOperand::CreateImm(SignExtend32<22>(Val << 1))); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeMemBarrierOption(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + switch (Val) { + default: + return MCDisassembler::Fail; + case 0xF: // SY + case 0xE: // ST + case 0xB: // ISH + case 0xA: // ISHST + case 0x7: // NSH + case 0x6: // NSHST + case 0x3: // OSH + case 0x2: // OSHST + break; + } + + Inst.addOperand(MCOperand::CreateImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeMSRMask(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (!Val) return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(Val)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeDoubleRegLoad(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + + if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + + +static DecodeStatus DecodeDoubleRegStore(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder){ + DecodeStatus S = MCDisassembler::Success; + + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + unsigned Rt = fieldFromInstruction32(Insn, 0, 4); + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + + if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + + if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail; + if (Rd == Rn || Rd == Rt || Rd == Rt+1) return MCDisassembler::Fail; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeLDRPreImm(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + unsigned imm = fieldFromInstruction32(Insn, 0, 12); + imm |= fieldFromInstruction32(Insn, 16, 4) << 13; + imm |= fieldFromInstruction32(Insn, 23, 1) << 12; + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + + if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeLDRPreReg(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + unsigned imm = fieldFromInstruction32(Insn, 0, 12); + imm |= fieldFromInstruction32(Insn, 16, 4) << 13; + imm |= fieldFromInstruction32(Insn, 23, 1) << 12; + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + + if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail; + if (Rm == 0xF) S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + + +static DecodeStatus DecodeSTRPreImm(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + unsigned imm = fieldFromInstruction32(Insn, 0, 12); + imm |= fieldFromInstruction32(Insn, 16, 4) << 13; + imm |= fieldFromInstruction32(Insn, 23, 1) << 12; + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + + if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeSTRPreReg(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + unsigned imm = fieldFromInstruction32(Insn, 0, 12); + imm |= fieldFromInstruction32(Insn, 16, 4) << 13; + imm |= fieldFromInstruction32(Insn, 23, 1) << 12; + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + + if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeVLD1LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned size = fieldFromInstruction32(Insn, 10, 2); + + unsigned align = 0; + unsigned index = 0; + switch (size) { + default: + return MCDisassembler::Fail; + case 0: + if (fieldFromInstruction32(Insn, 4, 1)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 5, 3); + break; + case 1: + if (fieldFromInstruction32(Insn, 5, 1)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 6, 2); + if (fieldFromInstruction32(Insn, 4, 1)) + align = 2; + break; + case 2: + if (fieldFromInstruction32(Insn, 6, 1)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 7, 1); + if (fieldFromInstruction32(Insn, 4, 2) != 0) + align = 4; + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (Rm != 0xF) { // Writeback + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(align)); + if (Rm != 0xF) { + if (Rm != 0xD) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } else + Inst.addOperand(MCOperand::CreateReg(0)); + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(index)); + + return S; +} + +static DecodeStatus DecodeVST1LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned size = fieldFromInstruction32(Insn, 10, 2); + + unsigned align = 0; + unsigned index = 0; + switch (size) { + default: + return MCDisassembler::Fail; + case 0: + if (fieldFromInstruction32(Insn, 4, 1)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 5, 3); + break; + case 1: + if (fieldFromInstruction32(Insn, 5, 1)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 6, 2); + if (fieldFromInstruction32(Insn, 4, 1)) + align = 2; + break; + case 2: + if (fieldFromInstruction32(Insn, 6, 1)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 7, 1); + if (fieldFromInstruction32(Insn, 4, 2) != 0) + align = 4; + } + + if (Rm != 0xF) { // Writeback + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(align)); + if (Rm != 0xF) { + if (Rm != 0xD) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } else + Inst.addOperand(MCOperand::CreateReg(0)); + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(index)); + + return S; +} + + +static DecodeStatus DecodeVLD2LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned size = fieldFromInstruction32(Insn, 10, 2); + + unsigned align = 0; + unsigned index = 0; + unsigned inc = 1; + switch (size) { + default: + return MCDisassembler::Fail; + case 0: + index = fieldFromInstruction32(Insn, 5, 3); + if (fieldFromInstruction32(Insn, 4, 1)) + align = 2; + break; + case 1: + index = fieldFromInstruction32(Insn, 6, 2); + if (fieldFromInstruction32(Insn, 4, 1)) + align = 4; + if (fieldFromInstruction32(Insn, 5, 1)) + inc = 2; + break; + case 2: + if (fieldFromInstruction32(Insn, 5, 1)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 7, 1); + if (fieldFromInstruction32(Insn, 4, 1) != 0) + align = 8; + if (fieldFromInstruction32(Insn, 6, 1)) + inc = 2; + break; + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder))) + return MCDisassembler::Fail; + if (Rm != 0xF) { // Writeback + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(align)); + if (Rm != 0xF) { + if (Rm != 0xD) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } else + Inst.addOperand(MCOperand::CreateReg(0)); + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(index)); + + return S; +} + +static DecodeStatus DecodeVST2LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned size = fieldFromInstruction32(Insn, 10, 2); + + unsigned align = 0; + unsigned index = 0; + unsigned inc = 1; + switch (size) { + default: + return MCDisassembler::Fail; + case 0: + index = fieldFromInstruction32(Insn, 5, 3); + if (fieldFromInstruction32(Insn, 4, 1)) + align = 2; + break; + case 1: + index = fieldFromInstruction32(Insn, 6, 2); + if (fieldFromInstruction32(Insn, 4, 1)) + align = 4; + if (fieldFromInstruction32(Insn, 5, 1)) + inc = 2; + break; + case 2: + if (fieldFromInstruction32(Insn, 5, 1)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 7, 1); + if (fieldFromInstruction32(Insn, 4, 1) != 0) + align = 8; + if (fieldFromInstruction32(Insn, 6, 1)) + inc = 2; + break; + } + + if (Rm != 0xF) { // Writeback + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(align)); + if (Rm != 0xF) { + if (Rm != 0xD) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } else + Inst.addOperand(MCOperand::CreateReg(0)); + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(index)); + + return S; +} + + +static DecodeStatus DecodeVLD3LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned size = fieldFromInstruction32(Insn, 10, 2); + + unsigned align = 0; + unsigned index = 0; + unsigned inc = 1; + switch (size) { + default: + return MCDisassembler::Fail; + case 0: + if (fieldFromInstruction32(Insn, 4, 1)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 5, 3); + break; + case 1: + if (fieldFromInstruction32(Insn, 4, 1)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 6, 2); + if (fieldFromInstruction32(Insn, 5, 1)) + inc = 2; + break; + case 2: + if (fieldFromInstruction32(Insn, 4, 2)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 7, 1); + if (fieldFromInstruction32(Insn, 6, 1)) + inc = 2; + break; + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder))) + return MCDisassembler::Fail; + + if (Rm != 0xF) { // Writeback + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(align)); + if (Rm != 0xF) { + if (Rm != 0xD) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } else + Inst.addOperand(MCOperand::CreateReg(0)); + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(index)); + + return S; +} + +static DecodeStatus DecodeVST3LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned size = fieldFromInstruction32(Insn, 10, 2); + + unsigned align = 0; + unsigned index = 0; + unsigned inc = 1; + switch (size) { + default: + return MCDisassembler::Fail; + case 0: + if (fieldFromInstruction32(Insn, 4, 1)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 5, 3); + break; + case 1: + if (fieldFromInstruction32(Insn, 4, 1)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 6, 2); + if (fieldFromInstruction32(Insn, 5, 1)) + inc = 2; + break; + case 2: + if (fieldFromInstruction32(Insn, 4, 2)) + return MCDisassembler::Fail; // UNDEFINED + index = fieldFromInstruction32(Insn, 7, 1); + if (fieldFromInstruction32(Insn, 6, 1)) + inc = 2; + break; + } + + if (Rm != 0xF) { // Writeback + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(align)); + if (Rm != 0xF) { + if (Rm != 0xD) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } else + Inst.addOperand(MCOperand::CreateReg(0)); + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(index)); + + return S; +} + + +static DecodeStatus DecodeVLD4LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned size = fieldFromInstruction32(Insn, 10, 2); + + unsigned align = 0; + unsigned index = 0; + unsigned inc = 1; + switch (size) { + default: + return MCDisassembler::Fail; + case 0: + if (fieldFromInstruction32(Insn, 4, 1)) + align = 4; + index = fieldFromInstruction32(Insn, 5, 3); + break; + case 1: + if (fieldFromInstruction32(Insn, 4, 1)) + align = 8; + index = fieldFromInstruction32(Insn, 6, 2); + if (fieldFromInstruction32(Insn, 5, 1)) + inc = 2; + break; + case 2: + if (fieldFromInstruction32(Insn, 4, 2)) + align = 4 << fieldFromInstruction32(Insn, 4, 2); + index = fieldFromInstruction32(Insn, 7, 1); + if (fieldFromInstruction32(Insn, 6, 1)) + inc = 2; + break; + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder))) + return MCDisassembler::Fail; + + if (Rm != 0xF) { // Writeback + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(align)); + if (Rm != 0xF) { + if (Rm != 0xD) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } else + Inst.addOperand(MCOperand::CreateReg(0)); + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(index)); + + return S; +} + +static DecodeStatus DecodeVST4LN(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned Rd = fieldFromInstruction32(Insn, 12, 4); + Rd |= fieldFromInstruction32(Insn, 22, 1) << 4; + unsigned size = fieldFromInstruction32(Insn, 10, 2); + + unsigned align = 0; + unsigned index = 0; + unsigned inc = 1; + switch (size) { + default: + return MCDisassembler::Fail; + case 0: + if (fieldFromInstruction32(Insn, 4, 1)) + align = 4; + index = fieldFromInstruction32(Insn, 5, 3); + break; + case 1: + if (fieldFromInstruction32(Insn, 4, 1)) + align = 8; + index = fieldFromInstruction32(Insn, 6, 2); + if (fieldFromInstruction32(Insn, 5, 1)) + inc = 2; + break; + case 2: + if (fieldFromInstruction32(Insn, 4, 2)) + align = 4 << fieldFromInstruction32(Insn, 4, 2); + index = fieldFromInstruction32(Insn, 7, 1); + if (fieldFromInstruction32(Insn, 6, 1)) + inc = 2; + break; + } + + if (Rm != 0xF) { // Writeback + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + } + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(align)); + if (Rm != 0xF) { + if (Rm != 0xD) { + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + } else + Inst.addOperand(MCOperand::CreateReg(0)); + } + + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(index)); + + return S; +} + +static DecodeStatus DecodeVMOVSRR(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + unsigned Rt2 = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + Rm |= fieldFromInstruction32(Insn, 5, 1) << 4; + + if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F) + S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeSPRRegisterClass(Inst, Rm , Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt , Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeVMOVRRS(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + unsigned Rt2 = fieldFromInstruction32(Insn, 16, 4); + unsigned Rm = fieldFromInstruction32(Insn, 0, 4); + unsigned pred = fieldFromInstruction32(Insn, 28, 4); + Rm |= fieldFromInstruction32(Insn, 5, 1) << 4; + + if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F) + S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt , Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeSPRRegisterClass(Inst, Rm , Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeIT(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + unsigned pred = fieldFromInstruction16(Insn, 4, 4); + // The InstPrinter needs to have the low bit of the predicate in + // the mask operand to be able to print it properly. + unsigned mask = fieldFromInstruction16(Insn, 0, 5); + + if (pred == 0xF) { + pred = 0xE; + S = MCDisassembler::SoftFail; + } + + if ((mask & 0xF) == 0) { + // Preserve the high bit of the mask, which is the low bit of + // the predicate. + mask &= 0x10; + mask |= 0x8; + S = MCDisassembler::SoftFail; + } + + Inst.addOperand(MCOperand::CreateImm(pred)); + Inst.addOperand(MCOperand::CreateImm(mask)); + return S; +} + +static DecodeStatus +DecodeT2LDRDPreInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + unsigned Rt2 = fieldFromInstruction32(Insn, 8, 4); + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned addr = fieldFromInstruction32(Insn, 0, 8); + unsigned W = fieldFromInstruction32(Insn, 21, 1); + unsigned U = fieldFromInstruction32(Insn, 23, 1); + unsigned P = fieldFromInstruction32(Insn, 24, 1); + bool writeback = (W == 1) | (P == 0); + + addr |= (U << 8) | (Rn << 9); + + if (writeback && (Rn == Rt || Rn == Rt2)) + Check(S, MCDisassembler::SoftFail); + if (Rt == Rt2) + Check(S, MCDisassembler::SoftFail); + + // Rt + if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + // Rt2 + if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder))) + return MCDisassembler::Fail; + // Writeback operand + if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + // addr + if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus +DecodeT2STRDPreInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rt = fieldFromInstruction32(Insn, 12, 4); + unsigned Rt2 = fieldFromInstruction32(Insn, 8, 4); + unsigned Rn = fieldFromInstruction32(Insn, 16, 4); + unsigned addr = fieldFromInstruction32(Insn, 0, 8); + unsigned W = fieldFromInstruction32(Insn, 21, 1); + unsigned U = fieldFromInstruction32(Insn, 23, 1); + unsigned P = fieldFromInstruction32(Insn, 24, 1); + bool writeback = (W == 1) | (P == 0); + + addr |= (U << 8) | (Rn << 9); + + if (writeback && (Rn == Rt || Rn == Rt2)) + Check(S, MCDisassembler::SoftFail); + + // Writeback operand + if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + // Rt + if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + // Rt2 + if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder))) + return MCDisassembler::Fail; + // addr + if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeT2Adr(llvm::MCInst &Inst, uint32_t Insn, + uint64_t Address, const void *Decoder) { + unsigned sign1 = fieldFromInstruction32(Insn, 21, 1); + unsigned sign2 = fieldFromInstruction32(Insn, 23, 1); + if (sign1 != sign2) return MCDisassembler::Fail; + + unsigned Val = fieldFromInstruction32(Insn, 0, 8); + Val |= fieldFromInstruction32(Insn, 12, 3) << 8; + Val |= fieldFromInstruction32(Insn, 26, 1) << 11; + Val |= sign1 << 12; + Inst.addOperand(MCOperand::CreateImm(SignExtend32<13>(Val))); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeT2ShifterImmOperand(llvm::MCInst &Inst, uint32_t Val, + uint64_t Address, + const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + // Shift of "asr #32" is not allowed in Thumb2 mode. + if (Val == 0x20) S = MCDisassembler::SoftFail; + Inst.addOperand(MCOperand::CreateImm(Val)); + return S; +} + diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp new file mode 100644 index 0000000..ccdac3e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -0,0 +1,992 @@ +//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an ARM MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "ARMInstPrinter.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define GET_INSTRUCTION_NAME +#include "ARMGenAsmWriter.inc" + +/// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing. +/// +/// getSORegOffset returns an integer from 0-31, representing '32' as 0. +static unsigned translateShiftImm(unsigned imm) { + if (imm == 0) + return 32; + return imm; +} + + +ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI, + const MCSubtargetInfo &STI) : + MCInstPrinter(MAI) { + // Initialize the set of available features. + setAvailableFeatures(STI.getFeatureBits()); +} + +StringRef ARMInstPrinter::getOpcodeName(unsigned Opcode) const { + return getInstructionName(Opcode); +} + +void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << getRegisterName(RegNo); +} + +void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + unsigned Opcode = MI->getOpcode(); + + // Check for MOVs and print canonical forms, instead. + if (Opcode == ARM::MOVsr) { + // FIXME: Thumb variants? + const MCOperand &Dst = MI->getOperand(0); + const MCOperand &MO1 = MI->getOperand(1); + const MCOperand &MO2 = MI->getOperand(2); + const MCOperand &MO3 = MI->getOperand(3); + + O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm())); + printSBitModifierOperand(MI, 6, O); + printPredicateOperand(MI, 4, O); + + O << '\t' << getRegisterName(Dst.getReg()) + << ", " << getRegisterName(MO1.getReg()); + + O << ", " << getRegisterName(MO2.getReg()); + assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0); + printAnnotation(O, Annot); + return; + } + + if (Opcode == ARM::MOVsi) { + // FIXME: Thumb variants? + const MCOperand &Dst = MI->getOperand(0); + const MCOperand &MO1 = MI->getOperand(1); + const MCOperand &MO2 = MI->getOperand(2); + + O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm())); + printSBitModifierOperand(MI, 5, O); + printPredicateOperand(MI, 3, O); + + O << '\t' << getRegisterName(Dst.getReg()) + << ", " << getRegisterName(MO1.getReg()); + + if (ARM_AM::getSORegShOp(MO2.getImm()) == ARM_AM::rrx) { + printAnnotation(O, Annot); + return; + } + + O << ", #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm())); + printAnnotation(O, Annot); + return; + } + + + // A8.6.123 PUSH + if ((Opcode == ARM::STMDB_UPD || Opcode == ARM::t2STMDB_UPD) && + MI->getOperand(0).getReg() == ARM::SP) { + O << '\t' << "push"; + printPredicateOperand(MI, 2, O); + if (Opcode == ARM::t2STMDB_UPD) + O << ".w"; + O << '\t'; + printRegisterList(MI, 4, O); + printAnnotation(O, Annot); + return; + } + if (Opcode == ARM::STR_PRE_IMM && MI->getOperand(2).getReg() == ARM::SP && + MI->getOperand(3).getImm() == -4) { + O << '\t' << "push"; + printPredicateOperand(MI, 4, O); + O << "\t{" << getRegisterName(MI->getOperand(1).getReg()) << "}"; + printAnnotation(O, Annot); + return; + } + + // A8.6.122 POP + if ((Opcode == ARM::LDMIA_UPD || Opcode == ARM::t2LDMIA_UPD) && + MI->getOperand(0).getReg() == ARM::SP) { + O << '\t' << "pop"; + printPredicateOperand(MI, 2, O); + if (Opcode == ARM::t2LDMIA_UPD) + O << ".w"; + O << '\t'; + printRegisterList(MI, 4, O); + printAnnotation(O, Annot); + return; + } + if (Opcode == ARM::LDR_POST_IMM && MI->getOperand(2).getReg() == ARM::SP && + MI->getOperand(4).getImm() == 4) { + O << '\t' << "pop"; + printPredicateOperand(MI, 5, O); + O << "\t{" << getRegisterName(MI->getOperand(0).getReg()) << "}"; + printAnnotation(O, Annot); + return; + } + + + // A8.6.355 VPUSH + if ((Opcode == ARM::VSTMSDB_UPD || Opcode == ARM::VSTMDDB_UPD) && + MI->getOperand(0).getReg() == ARM::SP) { + O << '\t' << "vpush"; + printPredicateOperand(MI, 2, O); + O << '\t'; + printRegisterList(MI, 4, O); + printAnnotation(O, Annot); + return; + } + + // A8.6.354 VPOP + if ((Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMDIA_UPD) && + MI->getOperand(0).getReg() == ARM::SP) { + O << '\t' << "vpop"; + printPredicateOperand(MI, 2, O); + O << '\t'; + printRegisterList(MI, 4, O); + printAnnotation(O, Annot); + return; + } + + if (Opcode == ARM::tLDMIA) { + bool Writeback = true; + unsigned BaseReg = MI->getOperand(0).getReg(); + for (unsigned i = 3; i < MI->getNumOperands(); ++i) { + if (MI->getOperand(i).getReg() == BaseReg) + Writeback = false; + } + + O << "\tldm"; + + printPredicateOperand(MI, 1, O); + O << '\t' << getRegisterName(BaseReg); + if (Writeback) O << "!"; + O << ", "; + printRegisterList(MI, 3, O); + printAnnotation(O, Annot); + return; + } + + // Thumb1 NOP + if (Opcode == ARM::tMOVr && MI->getOperand(0).getReg() == ARM::R8 && + MI->getOperand(1).getReg() == ARM::R8) { + O << "\tnop"; + printPredicateOperand(MI, 2, O); + printAnnotation(O, Annot); + return; + } + + printInstruction(MI, O); + printAnnotation(O, Annot); +} + +void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg); + } else if (Op.isImm()) { + O << '#' << Op.getImm(); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + // If a symbolic branch target was added as a constant expression then print + // that address in hex. + const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { + O << "0x"; + O.write_hex(Address); + } + else { + // Otherwise, just print the expression. + O << *Op.getExpr(); + } + } +} + +void ARMInstPrinter::printT2LdrLabelOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + if (MO1.isExpr()) + O << *MO1.getExpr(); + else if (MO1.isImm()) + O << "[pc, #" << MO1.getImm() << "]"; + else + llvm_unreachable("Unknown LDR label operand?"); +} + +// so_reg is a 4-operand unit corresponding to register forms of the A5.1 +// "Addressing Mode 1 - Data-processing operands" forms. This includes: +// REG 0 0 - e.g. R5 +// REG REG 0,SH_OPC - e.g. R5, ROR R3 +// REG 0 IMM,SH_OPC - e.g. R5, LSL #3 +void ARMInstPrinter::printSORegRegOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + const MCOperand &MO3 = MI->getOperand(OpNum+2); + + O << getRegisterName(MO1.getReg()); + + // Print the shift opc. + ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm()); + O << ", " << ARM_AM::getShiftOpcStr(ShOpc); + if (ShOpc == ARM_AM::rrx) + return; + + O << ' ' << getRegisterName(MO2.getReg()); + assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0); +} + +void ARMInstPrinter::printSORegImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + O << getRegisterName(MO1.getReg()); + + // Print the shift opc. + ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm()); + O << ", " << ARM_AM::getShiftOpcStr(ShOpc); + if (ShOpc == ARM_AM::rrx) + return; + O << " #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm())); +} + + +//===--------------------------------------------------------------------===// +// Addressing Mode #2 +//===--------------------------------------------------------------------===// + +void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO3 = MI->getOperand(Op+2); + + O << "[" << getRegisterName(MO1.getReg()); + + if (!MO2.getReg()) { + if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0. + O << ", #" + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << ARM_AM::getAM2Offset(MO3.getImm()); + O << "]"; + return; + } + + O << ", " + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << getRegisterName(MO2.getReg()); + + if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm())) + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm())) + << " #" << ShImm; + O << "]"; +} + +void ARMInstPrinter::printAM2PostIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO3 = MI->getOperand(Op+2); + + O << "[" << getRegisterName(MO1.getReg()) << "], "; + + if (!MO2.getReg()) { + unsigned ImmOffs = ARM_AM::getAM2Offset(MO3.getImm()); + O << '#' + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << ImmOffs; + return; + } + + O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << getRegisterName(MO2.getReg()); + + if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm())) + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm())) + << " #" << ShImm; +} + +void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + O << "[" << getRegisterName(MO1.getReg()) << ", " + << getRegisterName(MO2.getReg()) << "]"; +} + +void ARMInstPrinter::printAddrModeTBH(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + O << "[" << getRegisterName(MO1.getReg()) << ", " + << getRegisterName(MO2.getReg()) << ", lsl #1]"; +} + +void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, O); + return; + } + + const MCOperand &MO3 = MI->getOperand(Op+2); + unsigned IdxMode = ARM_AM::getAM2IdxMode(MO3.getImm()); + + if (IdxMode == ARMII::IndexModePost) { + printAM2PostIndexOp(MI, Op, O); + return; + } + printAM2PreOrOffsetIndexOp(MI, Op, O); +} + +void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + if (!MO1.getReg()) { + unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm()); + O << '#' + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) + << ImmOffs; + return; + } + + O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) + << getRegisterName(MO1.getReg()); + + if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm())) + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm())) + << " #" << ShImm; +} + +//===--------------------------------------------------------------------===// +// Addressing Mode #3 +//===--------------------------------------------------------------------===// + +void ARMInstPrinter::printAM3PostIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO3 = MI->getOperand(Op+2); + + O << "[" << getRegisterName(MO1.getReg()) << "], "; + + if (MO2.getReg()) { + O << (char)ARM_AM::getAM3Op(MO3.getImm()) + << getRegisterName(MO2.getReg()); + return; + } + + unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()); + O << '#' + << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())) + << ImmOffs; +} + +void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO3 = MI->getOperand(Op+2); + + O << '[' << getRegisterName(MO1.getReg()); + + if (MO2.getReg()) { + O << ", " << getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())) + << getRegisterName(MO2.getReg()) << ']'; + return; + } + + if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm())) + O << ", #" + << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())) + << ImmOffs; + O << ']'; +} + +void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO3 = MI->getOperand(Op+2); + unsigned IdxMode = ARM_AM::getAM3IdxMode(MO3.getImm()); + + if (IdxMode == ARMII::IndexModePost) { + printAM3PostIndexOp(MI, Op, O); + return; + } + printAM3PreOrOffsetIndexOp(MI, Op, O); +} + +void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + if (MO1.getReg()) { + O << getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) + << getRegisterName(MO1.getReg()); + return; + } + + unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm()); + O << '#' + << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) + << ImmOffs; +} + +void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + unsigned Imm = MO.getImm(); + O << '#' << ((Imm & 256) ? "" : "-") << (Imm & 0xff); +} + +void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + O << (MO2.getImm() ? "" : "-") << getRegisterName(MO1.getReg()); +} + +void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + unsigned Imm = MO.getImm(); + O << '#' << ((Imm & 256) ? "" : "-") << ((Imm & 0xff) << 2); +} + + +void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(OpNum) + .getImm()); + O << ARM_AM::getAMSubModeStr(Mode); +} + +void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, OpNum, O); + return; + } + + O << "[" << getRegisterName(MO1.getReg()); + + unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm()); + unsigned Op = ARM_AM::getAM5Op(MO2.getImm()); + if (ImmOffs || Op == ARM_AM::sub) { + O << ", #" + << ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm())) + << ImmOffs * 4; + } + O << "]"; +} + +void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + O << "[" << getRegisterName(MO1.getReg()); + if (MO2.getImm()) { + // FIXME: Both darwin as and GNU as violate ARM docs here. + O << ", :" << (MO2.getImm() << 3); + } + O << "]"; +} + +void ARMInstPrinter::printAddrMode7Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + O << "[" << getRegisterName(MO1.getReg()) << "]"; +} + +void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + if (MO.getReg() == 0) + O << "!"; + else + O << ", " << getRegisterName(MO.getReg()); +} + +void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + uint32_t v = ~MO.getImm(); + int32_t lsb = CountTrailingZeros_32(v); + int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb; + assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!"); + O << '#' << lsb << ", #" << width; +} + +void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned val = MI->getOperand(OpNum).getImm(); + O << ARM_MB::MemBOptToString(val); +} + +void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned ShiftOp = MI->getOperand(OpNum).getImm(); + bool isASR = (ShiftOp & (1 << 5)) != 0; + unsigned Amt = ShiftOp & 0x1f; + if (isASR) + O << ", asr #" << (Amt == 0 ? 32 : Amt); + else if (Amt) + O << ", lsl #" << Amt; +} + +void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + if (Imm == 0) + return; + assert(Imm > 0 && Imm < 32 && "Invalid PKH shift immediate value!"); + O << ", lsl #" << Imm; +} + +void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + // A shift amount of 32 is encoded as 0. + if (Imm == 0) + Imm = 32; + assert(Imm > 0 && Imm <= 32 && "Invalid PKH shift immediate value!"); + O << ", asr #" << Imm; +} + +void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "{"; + for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) { + if (i != OpNum) O << ", "; + O << getRegisterName(MI->getOperand(i).getReg()); + } + O << "}"; +} + +void ARMInstPrinter::printSetendOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + if (Op.getImm()) + O << "be"; + else + O << "le"; +} + +void ARMInstPrinter::printCPSIMod(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + O << ARM_PROC::IModToString(Op.getImm()); +} + +void ARMInstPrinter::printCPSIFlag(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + unsigned IFlags = Op.getImm(); + for (int i=2; i >= 0; --i) + if (IFlags & (1 << i)) + O << ARM_PROC::IFlagsToString(1 << i); + + if (IFlags == 0) + O << "none"; +} + +void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + unsigned SpecRegRBit = Op.getImm() >> 4; + unsigned Mask = Op.getImm() & 0xf; + + if (getAvailableFeatures() & ARM::FeatureMClass) { + switch (Op.getImm()) { + default: assert(0 && "Unexpected mask value!"); + case 0: O << "apsr"; return; + case 1: O << "iapsr"; return; + case 2: O << "eapsr"; return; + case 3: O << "xpsr"; return; + case 5: O << "ipsr"; return; + case 6: O << "epsr"; return; + case 7: O << "iepsr"; return; + case 8: O << "msp"; return; + case 9: O << "psp"; return; + case 16: O << "primask"; return; + case 17: O << "basepri"; return; + case 18: O << "basepri_max"; return; + case 19: O << "faultmask"; return; + case 20: O << "control"; return; + } + } + + // As special cases, CPSR_f, CPSR_s and CPSR_fs prefer printing as + // APSR_nzcvq, APSR_g and APSRnzcvqg, respectively. + if (!SpecRegRBit && (Mask == 8 || Mask == 4 || Mask == 12)) { + O << "APSR_"; + switch (Mask) { + default: assert(0); + case 4: O << "g"; return; + case 8: O << "nzcvq"; return; + case 12: O << "nzcvqg"; return; + } + llvm_unreachable("Unexpected mask value!"); + } + + if (SpecRegRBit) + O << "SPSR"; + else + O << "CPSR"; + + if (Mask) { + O << '_'; + if (Mask & 8) O << 'f'; + if (Mask & 4) O << 's'; + if (Mask & 2) O << 'x'; + if (Mask & 1) O << 'c'; + } +} + +void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); + if (CC != ARMCC::AL) + O << ARMCondCodeToString(CC); +} + +void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); + O << ARMCondCodeToString(CC); +} + +void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + if (MI->getOperand(OpNum).getReg()) { + assert(MI->getOperand(OpNum).getReg() == ARM::CPSR && + "Expect ARM CPSR register!"); + O << 's'; + } +} + +void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << MI->getOperand(OpNum).getImm(); +} + +void ARMInstPrinter::printPImmediate(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "p" << MI->getOperand(OpNum).getImm(); +} + +void ARMInstPrinter::printCImmediate(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "c" << MI->getOperand(OpNum).getImm(); +} + +void ARMInstPrinter::printCoprocOptionImm(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "{" << MI->getOperand(OpNum).getImm() << "}"; +} + +void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + llvm_unreachable("Unhandled PC-relative pseudo-instruction!"); +} + +void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "#" << MI->getOperand(OpNum).getImm() * 4; +} + +void ARMInstPrinter::printThumbSRImm(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + O << "#" << (Imm == 0 ? 32 : Imm); +} + +void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + // (3 - the number of trailing zeros) is the number of then / else. + unsigned Mask = MI->getOperand(OpNum).getImm(); + unsigned CondBit0 = Mask >> 4 & 1; + unsigned NumTZ = CountTrailingZeros_32(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) { + bool T = ((Mask >> Pos) & 1) == CondBit0; + if (T) + O << 't'; + else + O << 'e'; + } +} + +void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op + 1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, O); + return; + } + + O << "[" << getRegisterName(MO1.getReg()); + if (unsigned RegNum = MO2.getReg()) + O << ", " << getRegisterName(RegNum); + O << "]"; +} + +void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI, + unsigned Op, + raw_ostream &O, + unsigned Scale) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op + 1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, O); + return; + } + + O << "[" << getRegisterName(MO1.getReg()); + if (unsigned ImmOffs = MO2.getImm()) + O << ", #" << ImmOffs * Scale; + O << "]"; +} + +void ARMInstPrinter::printThumbAddrModeImm5S1Operand(const MCInst *MI, + unsigned Op, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, O, 1); +} + +void ARMInstPrinter::printThumbAddrModeImm5S2Operand(const MCInst *MI, + unsigned Op, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, O, 2); +} + +void ARMInstPrinter::printThumbAddrModeImm5S4Operand(const MCInst *MI, + unsigned Op, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, O, 4); +} + +void ARMInstPrinter::printThumbAddrModeSPOperand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + printThumbAddrModeImm5SOperand(MI, Op, O, 4); +} + +// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2 +// register with shift forms. +// REG 0 0 - e.g. R5 +// REG IMM, SH_OPC - e.g. R5, LSL #3 +void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + unsigned Reg = MO1.getReg(); + O << getRegisterName(Reg); + + // Print the shift opc. + assert(MO2.isImm() && "Not a valid t2_so_reg value!"); + ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm()); + O << ", " << ARM_AM::getShiftOpcStr(ShOpc); + if (ShOpc != ARM_AM::rrx) + O << " #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm())); +} + +void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, OpNum, O); + return; + } + + O << "[" << getRegisterName(MO1.getReg()); + + int32_t OffImm = (int32_t)MO2.getImm(); + bool isSub = OffImm < 0; + // Special value for #-0. All others are normal. + if (OffImm == INT32_MIN) + OffImm = 0; + if (isSub) + O << ", #-" << -OffImm; + else if (OffImm > 0) + O << ", #" << OffImm; + O << "]"; +} + +void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + O << "[" << getRegisterName(MO1.getReg()); + + int32_t OffImm = (int32_t)MO2.getImm(); + // Don't print +0. + if (OffImm == INT32_MIN) + O << ", #-0"; + else if (OffImm < 0) + O << ", #-" << -OffImm; + else if (OffImm > 0) + O << ", #" << OffImm; + O << "]"; +} + +void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + O << "[" << getRegisterName(MO1.getReg()); + + int32_t OffImm = (int32_t)MO2.getImm() / 4; + // Don't print +0. + if (OffImm < 0) + O << ", #-" << -OffImm * 4; + else if (OffImm > 0) + O << ", #" << OffImm * 4; + O << "]"; +} + +void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + + O << "[" << getRegisterName(MO1.getReg()); + if (MO2.getImm()) + O << ", #" << MO2.getImm() * 4; + O << "]"; +} + +void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + int32_t OffImm = (int32_t)MO1.getImm(); + // Don't print +0. + if (OffImm < 0) + O << ", #-" << -OffImm; + else + O << ", #" << OffImm; +} + +void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + int32_t OffImm = (int32_t)MO1.getImm() / 4; + // Don't print +0. + if (OffImm != 0) { + O << ", "; + if (OffImm < 0) + O << "#-" << -OffImm * 4; + else if (OffImm > 0) + O << "#" << OffImm * 4; + } +} + +void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + const MCOperand &MO2 = MI->getOperand(OpNum+1); + const MCOperand &MO3 = MI->getOperand(OpNum+2); + + O << "[" << getRegisterName(MO1.getReg()); + + assert(MO2.getReg() && "Invalid so_reg load / store address!"); + O << ", " << getRegisterName(MO2.getReg()); + + unsigned ShAmt = MO3.getImm(); + if (ShAmt) { + assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!"); + O << ", lsl #" << ShAmt; + } + O << "]"; +} + +void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + O << '#' << ARM_AM::getFPImmFloat(MO.getImm()); +} + +void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned EncodedImm = MI->getOperand(OpNum).getImm(); + unsigned EltBits; + uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); + O << "#0x" << utohexstr(Val); +} + +void ARMInstPrinter::printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + O << "#" << Imm + 1; +} + +void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + if (Imm == 0) + return; + O << ", ror #"; + switch (Imm) { + default: assert (0 && "illegal ror immediate!"); + case 1: O << "8"; break; + case 2: O << "16"; break; + case 3: O << "24"; break; + } +} + +void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << "[" << MI->getOperand(OpNum).getImm() << "]"; +} diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h new file mode 100644 index 0000000..5c2173f --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -0,0 +1,136 @@ +//===-- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an ARM MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMINSTPRINTER_H +#define ARMINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCSubtargetInfo.h" + +namespace llvm { + +class MCOperand; + +class ARMInstPrinter : public MCInstPrinter { +public: + ARMInstPrinter(const MCAsmInfo &MAI, const MCSubtargetInfo &STI); + + virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + virtual StringRef getOpcodeName(unsigned Opcode) const; + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; + + static const char *getInstructionName(unsigned Opcode); + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + void printSORegRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printSORegImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + void printAddrModeTBB(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrModeTBH(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAM2PostIndexOp(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printAM3PostIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O); + void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,raw_ostream &O); + void printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode6Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode7Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printMemBOption(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printShiftImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printThumbSRImm(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printThumbITMask(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printThumbAddrModeImm5SOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, unsigned Scale); + void printThumbAddrModeImm5S1Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printThumbAddrModeImm5S2Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printThumbAddrModeImm5S4Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printT2SOOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printT2AddrModeImm0_1020s4Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printSetendOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printCPSIMod(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printCPSIFlag(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMSRMaskOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printSBitModifierOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + void printRegisterList(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printNoHashImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printPImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printCImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printCoprocOptionImm(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printRotImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printT2LdrLabelOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h new file mode 100644 index 0000000..9982fa6 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h @@ -0,0 +1,667 @@ +//===- ARMAddressingModes.h - ARM Addressing Modes --------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM addressing mode implementation stuff. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H +#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/Support/MathExtras.h" +#include <cassert> + +namespace llvm { + +/// ARM_AM - ARM Addressing Mode Stuff +namespace ARM_AM { + enum ShiftOpc { + no_shift = 0, + asr, + lsl, + lsr, + ror, + rrx + }; + + enum AddrOpc { + sub = 0, + add + }; + + static inline const char *getAddrOpcStr(AddrOpc Op) { + return Op == sub ? "-" : ""; + } + + static inline const char *getShiftOpcStr(ShiftOpc Op) { + switch (Op) { + default: assert(0 && "Unknown shift opc!"); + case ARM_AM::asr: return "asr"; + case ARM_AM::lsl: return "lsl"; + case ARM_AM::lsr: return "lsr"; + case ARM_AM::ror: return "ror"; + case ARM_AM::rrx: return "rrx"; + } + } + + static inline unsigned getShiftOpcEncoding(ShiftOpc Op) { + switch (Op) { + default: assert(0 && "Unknown shift opc!"); + case ARM_AM::asr: return 2; + case ARM_AM::lsl: return 0; + case ARM_AM::lsr: return 1; + case ARM_AM::ror: return 3; + } + } + + enum AMSubMode { + bad_am_submode = 0, + ia, + ib, + da, + db + }; + + static inline const char *getAMSubModeStr(AMSubMode Mode) { + switch (Mode) { + default: assert(0 && "Unknown addressing sub-mode!"); + case ARM_AM::ia: return "ia"; + case ARM_AM::ib: return "ib"; + case ARM_AM::da: return "da"; + case ARM_AM::db: return "db"; + } + } + + /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits. + /// + static inline unsigned rotr32(unsigned Val, unsigned Amt) { + assert(Amt < 32 && "Invalid rotate amount"); + return (Val >> Amt) | (Val << ((32-Amt)&31)); + } + + /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits. + /// + static inline unsigned rotl32(unsigned Val, unsigned Amt) { + assert(Amt < 32 && "Invalid rotate amount"); + return (Val << Amt) | (Val >> ((32-Amt)&31)); + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #1: shift_operand with registers + //===--------------------------------------------------------------------===// + // + // This 'addressing mode' is used for arithmetic instructions. It can + // represent things like: + // reg + // reg [asr|lsl|lsr|ror|rrx] reg + // reg [asr|lsl|lsr|ror|rrx] imm + // + // This is stored three operands [rega, regb, opc]. The first is the base + // reg, the second is the shift amount (or reg0 if not present or imm). The + // third operand encodes the shift opcode and the imm if a reg isn't present. + // + static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) { + return ShOp | (Imm << 3); + } + static inline unsigned getSORegOffset(unsigned Op) { + return Op >> 3; + } + static inline ShiftOpc getSORegShOp(unsigned Op) { + return (ShiftOpc)(Op & 7); + } + + /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return + /// the 8-bit imm value. + static inline unsigned getSOImmValImm(unsigned Imm) { + return Imm & 0xFF; + } + /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return + /// the rotate amount. + static inline unsigned getSOImmValRot(unsigned Imm) { + return (Imm >> 8) * 2; + } + + /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand, + /// computing the rotate amount to use. If this immediate value cannot be + /// handled with a single shifter-op, determine a good rotate amount that will + /// take a maximal chunk of bits out of the immediate. + static inline unsigned getSOImmValRotate(unsigned Imm) { + // 8-bit (or less) immediates are trivially shifter_operands with a rotate + // of zero. + if ((Imm & ~255U) == 0) return 0; + + // Use CTZ to compute the rotate amount. + unsigned TZ = CountTrailingZeros_32(Imm); + + // Rotate amount must be even. Something like 0x200 must be rotated 8 bits, + // not 9. + unsigned RotAmt = TZ & ~1; + + // If we can handle this spread, return it. + if ((rotr32(Imm, RotAmt) & ~255U) == 0) + return (32-RotAmt)&31; // HW rotates right, not left. + + // For values like 0xF000000F, we should ignore the low 6 bits, then + // retry the hunt. + if (Imm & 63U) { + unsigned TZ2 = CountTrailingZeros_32(Imm & ~63U); + unsigned RotAmt2 = TZ2 & ~1; + if ((rotr32(Imm, RotAmt2) & ~255U) == 0) + return (32-RotAmt2)&31; // HW rotates right, not left. + } + + // Otherwise, we have no way to cover this span of bits with a single + // shifter_op immediate. Return a chunk of bits that will be useful to + // handle. + return (32-RotAmt)&31; // HW rotates right, not left. + } + + /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit + /// into an shifter_operand immediate operand, return the 12-bit encoding for + /// it. If not, return -1. + static inline int getSOImmVal(unsigned Arg) { + // 8-bit (or less) immediates are trivially shifter_operands with a rotate + // of zero. + if ((Arg & ~255U) == 0) return Arg; + + unsigned RotAmt = getSOImmValRotate(Arg); + + // If this cannot be handled with a single shifter_op, bail out. + if (rotr32(~255U, RotAmt) & Arg) + return -1; + + // Encode this correctly. + return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8); + } + + /// isSOImmTwoPartVal - Return true if the specified value can be obtained by + /// or'ing together two SOImmVal's. + static inline bool isSOImmTwoPartVal(unsigned V) { + // If this can be handled with a single shifter_op, bail out. + V = rotr32(~255U, getSOImmValRotate(V)) & V; + if (V == 0) + return false; + + // If this can be handled with two shifter_op's, accept. + V = rotr32(~255U, getSOImmValRotate(V)) & V; + return V == 0; + } + + /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal, + /// return the first chunk of it. + static inline unsigned getSOImmTwoPartFirst(unsigned V) { + return rotr32(255U, getSOImmValRotate(V)) & V; + } + + /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal, + /// return the second chunk of it. + static inline unsigned getSOImmTwoPartSecond(unsigned V) { + // Mask out the first hunk. + V = rotr32(~255U, getSOImmValRotate(V)) & V; + + // Take what's left. + assert(V == (rotr32(255U, getSOImmValRotate(V)) & V)); + return V; + } + + /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed + /// by a left shift. Returns the shift amount to use. + static inline unsigned getThumbImmValShift(unsigned Imm) { + // 8-bit (or less) immediates are trivially immediate operand with a shift + // of zero. + if ((Imm & ~255U) == 0) return 0; + + // Use CTZ to compute the shift amount. + return CountTrailingZeros_32(Imm); + } + + /// isThumbImmShiftedVal - Return true if the specified value can be obtained + /// by left shifting a 8-bit immediate. + static inline bool isThumbImmShiftedVal(unsigned V) { + // If this can be handled with + V = (~255U << getThumbImmValShift(V)) & V; + return V == 0; + } + + /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed + /// by a left shift. Returns the shift amount to use. + static inline unsigned getThumbImm16ValShift(unsigned Imm) { + // 16-bit (or less) immediates are trivially immediate operand with a shift + // of zero. + if ((Imm & ~65535U) == 0) return 0; + + // Use CTZ to compute the shift amount. + return CountTrailingZeros_32(Imm); + } + + /// isThumbImm16ShiftedVal - Return true if the specified value can be + /// obtained by left shifting a 16-bit immediate. + static inline bool isThumbImm16ShiftedVal(unsigned V) { + // If this can be handled with + V = (~65535U << getThumbImm16ValShift(V)) & V; + return V == 0; + } + + /// getThumbImmNonShiftedVal - If V is a value that satisfies + /// isThumbImmShiftedVal, return the non-shiftd value. + static inline unsigned getThumbImmNonShiftedVal(unsigned V) { + return V >> getThumbImmValShift(V); + } + + + /// getT2SOImmValSplat - Return the 12-bit encoded representation + /// if the specified value can be obtained by splatting the low 8 bits + /// into every other byte or every byte of a 32-bit value. i.e., + /// 00000000 00000000 00000000 abcdefgh control = 0 + /// 00000000 abcdefgh 00000000 abcdefgh control = 1 + /// abcdefgh 00000000 abcdefgh 00000000 control = 2 + /// abcdefgh abcdefgh abcdefgh abcdefgh control = 3 + /// Return -1 if none of the above apply. + /// See ARM Reference Manual A6.3.2. + static inline int getT2SOImmValSplatVal(unsigned V) { + unsigned u, Vs, Imm; + // control = 0 + if ((V & 0xffffff00) == 0) + return V; + + // If the value is zeroes in the first byte, just shift those off + Vs = ((V & 0xff) == 0) ? V >> 8 : V; + // Any passing value only has 8 bits of payload, splatted across the word + Imm = Vs & 0xff; + // Likewise, any passing values have the payload splatted into the 3rd byte + u = Imm | (Imm << 16); + + // control = 1 or 2 + if (Vs == u) + return (((Vs == V) ? 1 : 2) << 8) | Imm; + + // control = 3 + if (Vs == (u | (u << 8))) + return (3 << 8) | Imm; + + return -1; + } + + /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the + /// specified value is a rotated 8-bit value. Return -1 if no rotation + /// encoding is possible. + /// See ARM Reference Manual A6.3.2. + static inline int getT2SOImmValRotateVal(unsigned V) { + unsigned RotAmt = CountLeadingZeros_32(V); + if (RotAmt >= 24) + return -1; + + // If 'Arg' can be handled with a single shifter_op return the value. + if ((rotr32(0xff000000U, RotAmt) & V) == V) + return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7); + + return -1; + } + + /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit + /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit + /// encoding for it. If not, return -1. + /// See ARM Reference Manual A6.3.2. + static inline int getT2SOImmVal(unsigned Arg) { + // If 'Arg' is an 8-bit splat, then get the encoded value. + int Splat = getT2SOImmValSplatVal(Arg); + if (Splat != -1) + return Splat; + + // If 'Arg' can be handled with a single shifter_op return the value. + int Rot = getT2SOImmValRotateVal(Arg); + if (Rot != -1) + return Rot; + + return -1; + } + + static inline unsigned getT2SOImmValRotate(unsigned V) { + if ((V & ~255U) == 0) return 0; + // Use CTZ to compute the rotate amount. + unsigned RotAmt = CountTrailingZeros_32(V); + return (32 - RotAmt) & 31; + } + + static inline bool isT2SOImmTwoPartVal (unsigned Imm) { + unsigned V = Imm; + // Passing values can be any combination of splat values and shifter + // values. If this can be handled with a single shifter or splat, bail + // out. Those should be handled directly, not with a two-part val. + if (getT2SOImmValSplatVal(V) != -1) + return false; + V = rotr32 (~255U, getT2SOImmValRotate(V)) & V; + if (V == 0) + return false; + + // If this can be handled as an immediate, accept. + if (getT2SOImmVal(V) != -1) return true; + + // Likewise, try masking out a splat value first. + V = Imm; + if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1) + V &= ~0xff00ff00U; + else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1) + V &= ~0x00ff00ffU; + // If what's left can be handled as an immediate, accept. + if (getT2SOImmVal(V) != -1) return true; + + // Otherwise, do not accept. + return false; + } + + static inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) { + assert (isT2SOImmTwoPartVal(Imm) && + "Immedate cannot be encoded as two part immediate!"); + // Try a shifter operand as one part + unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm; + // If the rest is encodable as an immediate, then return it. + if (getT2SOImmVal(V) != -1) return V; + + // Try masking out a splat value first. + if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1) + return Imm & 0xff00ff00U; + + // The other splat is all that's left as an option. + assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1); + return Imm & 0x00ff00ffU; + } + + static inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) { + // Mask out the first hunk + Imm ^= getT2SOImmTwoPartFirst(Imm); + // Return what's left + assert (getT2SOImmVal(Imm) != -1 && + "Unable to encode second part of T2 two part SO immediate"); + return Imm; + } + + + //===--------------------------------------------------------------------===// + // Addressing Mode #2 + //===--------------------------------------------------------------------===// + // + // This is used for most simple load/store instructions. + // + // addrmode2 := reg +/- reg shop imm + // addrmode2 := reg +/- imm12 + // + // The first operand is always a Reg. The second operand is a reg if in + // reg/reg form, otherwise it's reg#0. The third field encodes the operation + // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The + // fourth operand 16-17 encodes the index mode. + // + // If this addressing mode is a frame index (before prolog/epilog insertion + // and code rewriting), this operand will have the form: FI#, reg0, <offs> + // with no shift amount for the frame offset. + // + static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO, + unsigned IdxMode = 0) { + assert(Imm12 < (1 << 12) && "Imm too large!"); + bool isSub = Opc == sub; + return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ; + } + static inline unsigned getAM2Offset(unsigned AM2Opc) { + return AM2Opc & ((1 << 12)-1); + } + static inline AddrOpc getAM2Op(unsigned AM2Opc) { + return ((AM2Opc >> 12) & 1) ? sub : add; + } + static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) { + return (ShiftOpc)((AM2Opc >> 13) & 7); + } + static inline unsigned getAM2IdxMode(unsigned AM2Opc) { + return (AM2Opc >> 16); + } + + + //===--------------------------------------------------------------------===// + // Addressing Mode #3 + //===--------------------------------------------------------------------===// + // + // This is used for sign-extending loads, and load/store-pair instructions. + // + // addrmode3 := reg +/- reg + // addrmode3 := reg +/- imm8 + // + // The first operand is always a Reg. The second operand is a reg if in + // reg/reg form, otherwise it's reg#0. The third field encodes the operation + // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the + // index mode. + + /// getAM3Opc - This function encodes the addrmode3 opc field. + static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset, + unsigned IdxMode = 0) { + bool isSub = Opc == sub; + return ((int)isSub << 8) | Offset | (IdxMode << 9); + } + static inline unsigned char getAM3Offset(unsigned AM3Opc) { + return AM3Opc & 0xFF; + } + static inline AddrOpc getAM3Op(unsigned AM3Opc) { + return ((AM3Opc >> 8) & 1) ? sub : add; + } + static inline unsigned getAM3IdxMode(unsigned AM3Opc) { + return (AM3Opc >> 9); + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #4 + //===--------------------------------------------------------------------===// + // + // This is used for load / store multiple instructions. + // + // addrmode4 := reg, <mode> + // + // The four modes are: + // IA - Increment after + // IB - Increment before + // DA - Decrement after + // DB - Decrement before + // For VFP instructions, only the IA and DB modes are valid. + + static inline AMSubMode getAM4SubMode(unsigned Mode) { + return (AMSubMode)(Mode & 0x7); + } + + static inline unsigned getAM4ModeImm(AMSubMode SubMode) { + return (int)SubMode; + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #5 + //===--------------------------------------------------------------------===// + // + // This is used for coprocessor instructions, such as FP load/stores. + // + // addrmode5 := reg +/- imm8*4 + // + // The first operand is always a Reg. The second operand encodes the + // operation in bit 8 and the immediate in bits 0-7. + + /// getAM5Opc - This function encodes the addrmode5 opc field. + static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) { + bool isSub = Opc == sub; + return ((int)isSub << 8) | Offset; + } + static inline unsigned char getAM5Offset(unsigned AM5Opc) { + return AM5Opc & 0xFF; + } + static inline AddrOpc getAM5Op(unsigned AM5Opc) { + return ((AM5Opc >> 8) & 1) ? sub : add; + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #6 + //===--------------------------------------------------------------------===// + // + // This is used for NEON load / store instructions. + // + // addrmode6 := reg with optional alignment + // + // This is stored in two operands [regaddr, align]. The first is the + // address register. The second operand is the value of the alignment + // specifier in bytes or zero if no explicit alignment. + // Valid alignments depend on the specific instruction. + + //===--------------------------------------------------------------------===// + // NEON Modified Immediates + //===--------------------------------------------------------------------===// + // + // Several NEON instructions (e.g., VMOV) take a "modified immediate" + // vector operand, where a small immediate encoded in the instruction + // specifies a full NEON vector value. These modified immediates are + // represented here as encoded integers. The low 8 bits hold the immediate + // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold + // the "Cmode" field of the instruction. The interfaces below treat the + // Op and Cmode values as a single 5-bit value. + + static inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) { + return (OpCmode << 8) | Val; + } + static inline unsigned getNEONModImmOpCmode(unsigned ModImm) { + return (ModImm >> 8) & 0x1f; + } + static inline unsigned getNEONModImmVal(unsigned ModImm) { + return ModImm & 0xff; + } + + /// decodeNEONModImm - Decode a NEON modified immediate value into the + /// element value and the element size in bits. (If the element size is + /// smaller than the vector, it is splatted into all the elements.) + static inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) { + unsigned OpCmode = getNEONModImmOpCmode(ModImm); + unsigned Imm8 = getNEONModImmVal(ModImm); + uint64_t Val = 0; + + if (OpCmode == 0xe) { + // 8-bit vector elements + Val = Imm8; + EltBits = 8; + } else if ((OpCmode & 0xc) == 0x8) { + // 16-bit vector elements + unsigned ByteNum = (OpCmode & 0x6) >> 1; + Val = Imm8 << (8 * ByteNum); + EltBits = 16; + } else if ((OpCmode & 0x8) == 0) { + // 32-bit vector elements, zero with one byte set + unsigned ByteNum = (OpCmode & 0x6) >> 1; + Val = Imm8 << (8 * ByteNum); + EltBits = 32; + } else if ((OpCmode & 0xe) == 0xc) { + // 32-bit vector elements, one byte with low bits set + unsigned ByteNum = 1 + (OpCmode & 0x1); + Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum))); + EltBits = 32; + } else if (OpCmode == 0x1e) { + // 64-bit vector elements + for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { + if ((ModImm >> ByteNum) & 1) + Val |= (uint64_t)0xff << (8 * ByteNum); + } + EltBits = 64; + } else { + assert(false && "Unsupported NEON immediate"); + } + return Val; + } + + AMSubMode getLoadStoreMultipleSubMode(int Opcode); + + //===--------------------------------------------------------------------===// + // Floating-point Immediates + // + static inline float getFPImmFloat(unsigned Imm) { + // We expect an 8-bit binary encoding of a floating-point number here. + union { + uint32_t I; + float F; + } FPUnion; + + uint8_t Sign = (Imm >> 7) & 0x1; + uint8_t Exp = (Imm >> 4) & 0x7; + uint8_t Mantissa = Imm & 0xf; + + // 8-bit FP iEEEE Float Encoding + // abcd efgh aBbbbbbc defgh000 00000000 00000000 + // + // where B = NOT(b); + + FPUnion.I = 0; + FPUnion.I |= Sign << 31; + FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; + FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; + FPUnion.I |= (Exp & 0x3) << 23; + FPUnion.I |= Mantissa << 19; + return FPUnion.F; + } + + /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit + /// floating-point value. If the value cannot be represented as an 8-bit + /// floating-point value, then return -1. + static inline int getFP32Imm(const APInt &Imm) { + uint32_t Sign = Imm.lshr(31).getZExtValue() & 1; + int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127 + int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0x7ffff) + return -1; + Mantissa >>= 19; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; + } + + static inline int getFP32Imm(const APFloat &FPImm) { + return getFP32Imm(FPImm.bitcastToAPInt()); + } + + /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit + /// floating-point value. If the value cannot be represented as an 8-bit + /// floating-point value, then return -1. + static inline int getFP64Imm(const APInt &Imm) { + uint64_t Sign = Imm.lshr(63).getZExtValue() & 1; + int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023 + uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL; + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0xffffffffffffULL) + return -1; + Mantissa >>= 48; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; + } + + static inline int getFP64Imm(const APFloat &FPImm) { + return getFP64Imm(FPImm.bitcastToAPInt()); + } + +} // end namespace ARM_AM +} // end namespace llvm + +#endif + diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp new file mode 100644 index 0000000..c31c5e6 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -0,0 +1,531 @@ +//===-- ARMAsmBackend.cpp - ARM Assembler Backend -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/ARMMCTargetDesc.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMFixupKinds.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Object/MachOFormat.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +namespace { +class ARMELFObjectWriter : public MCELFObjectTargetWriter { +public: + ARMELFObjectWriter(Triple::OSType OSType) + : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSType, ELF::EM_ARM, + /*HasRelocationAddend*/ false) {} +}; + +class ARMAsmBackend : public MCAsmBackend { + const MCSubtargetInfo* STI; + bool isThumbMode; // Currently emitting Thumb code. +public: + ARMAsmBackend(const Target &T, const StringRef TT) + : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")), + isThumbMode(TT.startswith("thumb")) {} + + ~ARMAsmBackend() { + delete STI; + } + + unsigned getNumFixupKinds() const { return ARM::NumTargetFixupKinds; } + + bool hasNOP() const { + return (STI->getFeatureBits() & ARM::HasV6T2Ops) != 0; + } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const { + const static MCFixupKindInfo Infos[ARM::NumTargetFixupKinds] = { +// This table *must* be in the order that the fixup_* kinds are defined in +// ARMFixupKinds.h. +// +// Name Offset (bits) Size (bits) Flags +{ "fixup_arm_ldst_pcrel_12", 1, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, +{ "fixup_arm_pcrel_10", 1, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, +{ "fixup_thumb_adr_pcrel_10",0, 8, MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, +{ "fixup_arm_adr_pcrel_12", 1, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel | + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, +{ "fixup_arm_condbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_uncondbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_cp", 0, 8, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_thumb_bcc", 0, 8, MCFixupKindInfo::FKF_IsPCRel }, +// movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19. +{ "fixup_arm_movt_hi16", 0, 20, 0 }, +{ "fixup_arm_movw_lo16", 0, 20, 0 }, +{ "fixup_t2_movt_hi16", 0, 20, 0 }, +{ "fixup_t2_movw_lo16", 0, 20, 0 }, +{ "fixup_arm_movt_hi16_pcrel", 0, 20, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_movw_lo16_pcrel", 0, 20, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_movt_hi16_pcrel", 0, 20, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_t2_movw_lo16_pcrel", 0, 20, MCFixupKindInfo::FKF_IsPCRel }, + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return Infos[Kind - FirstTargetFixupKind]; + } + + bool MayNeedRelaxation(const MCInst &Inst) const; + + void RelaxInstruction(const MCInst &Inst, MCInst &Res) const; + + bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const; + + void HandleAssemblerFlag(MCAssemblerFlag Flag) { + switch (Flag) { + default: break; + case MCAF_Code16: + setIsThumb(true); + break; + case MCAF_Code32: + setIsThumb(false); + break; + } + } + + unsigned getPointerSize() const { return 4; } + bool isThumb() const { return isThumbMode; } + void setIsThumb(bool it) { isThumbMode = it; } +}; +} // end anonymous namespace + +bool ARMAsmBackend::MayNeedRelaxation(const MCInst &Inst) const { + // FIXME: Thumb targets, different move constant targets.. + return false; +} + +void ARMAsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const { + assert(0 && "ARMAsmBackend::RelaxInstruction() unimplemented"); + return; +} + +bool ARMAsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { + const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8 + const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP + const uint32_t ARMv4_NopEncoding = 0xe1a0000; // using MOV r0,r0 + const uint32_t ARMv6T2_NopEncoding = 0xe3207800; // NOP + if (isThumb()) { + const uint16_t nopEncoding = hasNOP() ? Thumb2_16bitNopEncoding + : Thumb1_16bitNopEncoding; + uint64_t NumNops = Count / 2; + for (uint64_t i = 0; i != NumNops; ++i) + OW->Write16(nopEncoding); + if (Count & 1) + OW->Write8(0); + return true; + } + // ARM mode + const uint32_t nopEncoding = hasNOP() ? ARMv6T2_NopEncoding + : ARMv4_NopEncoding; + uint64_t NumNops = Count / 4; + for (uint64_t i = 0; i != NumNops; ++i) + OW->Write32(nopEncoding); + // FIXME: should this function return false when unable to write exactly + // 'Count' bytes with NOP encodings? + switch (Count % 4) { + default: break; // No leftover bytes to write + case 1: OW->Write8(0); break; + case 2: OW->Write16(0); break; + case 3: OW->Write16(0); OW->Write8(0xa0); break; + } + + return true; +} + +static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: + return Value; + case ARM::fixup_arm_movt_hi16: + Value >>= 16; + // Fallthrough + case ARM::fixup_arm_movw_lo16: + case ARM::fixup_arm_movt_hi16_pcrel: + case ARM::fixup_arm_movw_lo16_pcrel: { + unsigned Hi4 = (Value & 0xF000) >> 12; + unsigned Lo12 = Value & 0x0FFF; + // inst{19-16} = Hi4; + // inst{11-0} = Lo12; + Value = (Hi4 << 16) | (Lo12); + return Value; + } + case ARM::fixup_t2_movt_hi16: + Value >>= 16; + // Fallthrough + case ARM::fixup_t2_movw_lo16: + case ARM::fixup_t2_movt_hi16_pcrel: //FIXME: Shouldn't this be shifted like + // the other hi16 fixup? + case ARM::fixup_t2_movw_lo16_pcrel: { + unsigned Hi4 = (Value & 0xF000) >> 12; + unsigned i = (Value & 0x800) >> 11; + unsigned Mid3 = (Value & 0x700) >> 8; + unsigned Lo8 = Value & 0x0FF; + // inst{19-16} = Hi4; + // inst{26} = i; + // inst{14-12} = Mid3; + // inst{7-0} = Lo8; + Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8); + uint64_t swapped = (Value & 0xFFFF0000) >> 16; + swapped |= (Value & 0x0000FFFF) << 16; + return swapped; + } + case ARM::fixup_arm_ldst_pcrel_12: + // ARM PC-relative values are offset by 8. + Value -= 4; + // FALLTHROUGH + case ARM::fixup_t2_ldst_pcrel_12: { + // Offset by 4, adjusted by two due to the half-word ordering of thumb. + Value -= 4; + bool isAdd = true; + if ((int64_t)Value < 0) { + Value = -Value; + isAdd = false; + } + assert ((Value < 4096) && "Out of range pc-relative fixup value!"); + Value |= isAdd << 23; + + // Same addressing mode as fixup_arm_pcrel_10, + // but with 16-bit halfwords swapped. + if (Kind == ARM::fixup_t2_ldst_pcrel_12) { + uint64_t swapped = (Value & 0xFFFF0000) >> 16; + swapped |= (Value & 0x0000FFFF) << 16; + return swapped; + } + + return Value; + } + case ARM::fixup_thumb_adr_pcrel_10: + return ((Value - 4) >> 2) & 0xff; + case ARM::fixup_arm_adr_pcrel_12: { + // ARM PC-relative values are offset by 8. + Value -= 8; + unsigned opc = 4; // bits {24-21}. Default to add: 0b0100 + if ((int64_t)Value < 0) { + Value = -Value; + opc = 2; // 0b0010 + } + assert(ARM_AM::getSOImmVal(Value) != -1 && + "Out of range pc-relative fixup value!"); + // Encode the immediate and shift the opcode into place. + return ARM_AM::getSOImmVal(Value) | (opc << 21); + } + + case ARM::fixup_t2_adr_pcrel_12: { + Value -= 4; + unsigned opc = 0; + if ((int64_t)Value < 0) { + Value = -Value; + opc = 5; + } + + uint32_t out = (opc << 21); + out |= (Value & 0x800) << 15; + out |= (Value & 0x700) << 4; + out |= (Value & 0x0FF); + + uint64_t swapped = (out & 0xFFFF0000) >> 16; + swapped |= (out & 0x0000FFFF) << 16; + return swapped; + } + + case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: + // These values don't encode the low two bits since they're always zero. + // Offset by 8 just as above. + return 0xffffff & ((Value - 8) >> 2); + case ARM::fixup_t2_uncondbranch: { + Value = Value - 4; + Value >>= 1; // Low bit is not encoded. + + uint32_t out = 0; + bool I = Value & 0x800000; + bool J1 = Value & 0x400000; + bool J2 = Value & 0x200000; + J1 ^= I; + J2 ^= I; + + out |= I << 26; // S bit + out |= !J1 << 13; // J1 bit + out |= !J2 << 11; // J2 bit + out |= (Value & 0x1FF800) << 5; // imm6 field + out |= (Value & 0x0007FF); // imm11 field + + uint64_t swapped = (out & 0xFFFF0000) >> 16; + swapped |= (out & 0x0000FFFF) << 16; + return swapped; + } + case ARM::fixup_t2_condbranch: { + Value = Value - 4; + Value >>= 1; // Low bit is not encoded. + + uint64_t out = 0; + out |= (Value & 0x80000) << 7; // S bit + out |= (Value & 0x40000) >> 7; // J2 bit + out |= (Value & 0x20000) >> 4; // J1 bit + out |= (Value & 0x1F800) << 5; // imm6 field + out |= (Value & 0x007FF); // imm11 field + + uint32_t swapped = (out & 0xFFFF0000) >> 16; + swapped |= (out & 0x0000FFFF) << 16; + return swapped; + } + case ARM::fixup_arm_thumb_bl: { + // The value doesn't encode the low bit (always zero) and is offset by + // four. The value is encoded into disjoint bit positions in the destination + // opcode. x = unchanged, I = immediate value bit, S = sign extension bit + // + // BL: xxxxxSIIIIIIIIII xxxxxIIIIIIIIIII + // + // Note that the halfwords are stored high first, low second; so we need + // to transpose the fixup value here to map properly. + unsigned isNeg = (int64_t(Value - 4) < 0) ? 1 : 0; + uint32_t Binary = 0; + Value = 0x3fffff & ((Value - 4) >> 1); + Binary = (Value & 0x7ff) << 16; // Low imm11 value. + Binary |= (Value & 0x1ffc00) >> 11; // High imm10 value. + Binary |= isNeg << 10; // Sign bit. + return Binary; + } + case ARM::fixup_arm_thumb_blx: { + // The value doesn't encode the low two bits (always zero) and is offset by + // four (see fixup_arm_thumb_cp). The value is encoded into disjoint bit + // positions in the destination opcode. x = unchanged, I = immediate value + // bit, S = sign extension bit, 0 = zero. + // + // BLX: xxxxxSIIIIIIIIII xxxxxIIIIIIIIII0 + // + // Note that the halfwords are stored high first, low second; so we need + // to transpose the fixup value here to map properly. + unsigned isNeg = (int64_t(Value-4) < 0) ? 1 : 0; + uint32_t Binary = 0; + Value = 0xfffff & ((Value - 2) >> 2); + Binary = (Value & 0x3ff) << 17; // Low imm10L value. + Binary |= (Value & 0xffc00) >> 10; // High imm10H value. + Binary |= isNeg << 10; // Sign bit. + return Binary; + } + case ARM::fixup_arm_thumb_cp: + // Offset by 4, and don't encode the low two bits. Two bytes of that + // 'off by 4' is implicitly handled by the half-word ordering of the + // Thumb encoding, so we only need to adjust by 2 here. + return ((Value - 2) >> 2) & 0xff; + case ARM::fixup_arm_thumb_cb: { + // Offset by 4 and don't encode the lower bit, which is always 0. + uint32_t Binary = (Value - 4) >> 1; + return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3); + } + case ARM::fixup_arm_thumb_br: + // Offset by 4 and don't encode the lower bit, which is always 0. + return ((Value - 4) >> 1) & 0x7ff; + case ARM::fixup_arm_thumb_bcc: + // Offset by 4 and don't encode the lower bit, which is always 0. + return ((Value - 4) >> 1) & 0xff; + case ARM::fixup_arm_pcrel_10: + Value = Value - 4; // ARM fixups offset by an additional word and don't + // need to adjust for the half-word ordering. + // Fall through. + case ARM::fixup_t2_pcrel_10: { + // Offset by 4, adjusted by two due to the half-word ordering of thumb. + Value = Value - 4; + bool isAdd = true; + if ((int64_t)Value < 0) { + Value = -Value; + isAdd = false; + } + // These values don't encode the low two bits since they're always zero. + Value >>= 2; + assert ((Value < 256) && "Out of range pc-relative fixup value!"); + Value |= isAdd << 23; + + // Same addressing mode as fixup_arm_pcrel_10, + // but with 16-bit halfwords swapped. + if (Kind == ARM::fixup_t2_pcrel_10) { + uint32_t swapped = (Value & 0xFFFF0000) >> 16; + swapped |= (Value & 0x0000FFFF) << 16; + return swapped; + } + + return Value; + } + } +} + +namespace { + +// FIXME: This should be in a separate file. +// ELF is an ELF of course... +class ELFARMAsmBackend : public ARMAsmBackend { +public: + Triple::OSType OSType; + ELFARMAsmBackend(const Target &T, const StringRef TT, + Triple::OSType _OSType) + : ARMAsmBackend(T, TT), OSType(_OSType) { } + + void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value) const; + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createELFObjectWriter(new ARMELFObjectWriter(OSType), OS, + /*IsLittleEndian*/ true); + } +}; + +// FIXME: Raise this to share code between Darwin and ELF. +void ELFARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value) const { + unsigned NumBytes = 4; // FIXME: 2 for Thumb + Value = adjustFixupValue(Fixup.getKind(), Value); + if (!Value) return; // Doesn't change encoding. + + unsigned Offset = Fixup.getOffset(); + + // For each byte of the fragment that the fixup touches, mask in the bits from + // the fixup value. The Value has been "split up" into the appropriate + // bitfields above. + for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); +} + +// FIXME: This should be in a separate file. +class DarwinARMAsmBackend : public ARMAsmBackend { +public: + const object::mach::CPUSubtypeARM Subtype; + DarwinARMAsmBackend(const Target &T, const StringRef TT, + object::mach::CPUSubtypeARM st) + : ARMAsmBackend(T, TT), Subtype(st) { } + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createARMMachObjectWriter(OS, /*Is64Bit=*/false, + object::mach::CTM_ARM, + Subtype); + } + + void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value) const; + + virtual bool doesSectionRequireSymbols(const MCSection &Section) const { + return false; + } +}; + +/// getFixupKindNumBytes - The number of bytes the fixup may change. +static unsigned getFixupKindNumBytes(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + + case FK_Data_1: + case ARM::fixup_arm_thumb_bcc: + case ARM::fixup_arm_thumb_cp: + case ARM::fixup_thumb_adr_pcrel_10: + return 1; + + case FK_Data_2: + case ARM::fixup_arm_thumb_br: + case ARM::fixup_arm_thumb_cb: + return 2; + + case ARM::fixup_arm_ldst_pcrel_12: + case ARM::fixup_arm_pcrel_10: + case ARM::fixup_arm_adr_pcrel_12: + case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: + return 3; + + case FK_Data_4: + case ARM::fixup_t2_ldst_pcrel_12: + case ARM::fixup_t2_condbranch: + case ARM::fixup_t2_uncondbranch: + case ARM::fixup_t2_pcrel_10: + case ARM::fixup_t2_adr_pcrel_12: + case ARM::fixup_arm_thumb_bl: + case ARM::fixup_arm_thumb_blx: + case ARM::fixup_arm_movt_hi16: + case ARM::fixup_arm_movw_lo16: + case ARM::fixup_arm_movt_hi16_pcrel: + case ARM::fixup_arm_movw_lo16_pcrel: + case ARM::fixup_t2_movt_hi16: + case ARM::fixup_t2_movw_lo16: + case ARM::fixup_t2_movt_hi16_pcrel: + case ARM::fixup_t2_movw_lo16_pcrel: + return 4; + } +} + +void DarwinARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value) const { + unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); + Value = adjustFixupValue(Fixup.getKind(), Value); + if (!Value) return; // Doesn't change encoding. + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the + // bits from the fixup value. + for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); +} + +} // end anonymous namespace + +MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) { + if (TheTriple.getArchName() == "armv4t" || + TheTriple.getArchName() == "thumbv4t") + return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V4T); + else if (TheTriple.getArchName() == "armv5e" || + TheTriple.getArchName() == "thumbv5e") + return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V5TEJ); + else if (TheTriple.getArchName() == "armv6" || + TheTriple.getArchName() == "thumbv6") + return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V6); + return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7); + } + + if (TheTriple.isOSWindows()) + assert(0 && "Windows not supported on ARM"); + + return new ELFARMAsmBackend(T, TT, Triple(TT).getOS()); +} diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h new file mode 100644 index 0000000..ec4b6ff --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -0,0 +1,449 @@ +//===-- ARMBaseInfo.h - Top level definitions for ARM -------- --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the ARM target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMBASEINFO_H +#define ARMBASEINFO_H + +#include "ARMMCTargetDesc.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + +// Enums corresponding to ARM condition codes +namespace ARMCC { + // The CondCodes constants map directly to the 4-bit encoding of the + // condition field for predicated instructions. + enum CondCodes { // Meaning (integer) Meaning (floating-point) + EQ, // Equal Equal + NE, // Not equal Not equal, or unordered + HS, // Carry set >, ==, or unordered + LO, // Carry clear Less than + MI, // Minus, negative Less than + PL, // Plus, positive or zero >, ==, or unordered + VS, // Overflow Unordered + VC, // No overflow Not unordered + HI, // Unsigned higher Greater than, or unordered + LS, // Unsigned lower or same Less than or equal + GE, // Greater than or equal Greater than or equal + LT, // Less than Less than, or unordered + GT, // Greater than Greater than + LE, // Less than or equal <, ==, or unordered + AL // Always (unconditional) Always (unconditional) + }; + + inline static CondCodes getOppositeCondition(CondCodes CC) { + switch (CC) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return NE; + case NE: return EQ; + case HS: return LO; + case LO: return HS; + case MI: return PL; + case PL: return MI; + case VS: return VC; + case VC: return VS; + case HI: return LS; + case LS: return HI; + case GE: return LT; + case LT: return GE; + case GT: return LE; + case LE: return GT; + } + } +} // namespace ARMCC + +inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { + switch (CC) { + default: llvm_unreachable("Unknown condition code"); + case ARMCC::EQ: return "eq"; + case ARMCC::NE: return "ne"; + case ARMCC::HS: return "hs"; + case ARMCC::LO: return "lo"; + case ARMCC::MI: return "mi"; + case ARMCC::PL: return "pl"; + case ARMCC::VS: return "vs"; + case ARMCC::VC: return "vc"; + case ARMCC::HI: return "hi"; + case ARMCC::LS: return "ls"; + case ARMCC::GE: return "ge"; + case ARMCC::LT: return "lt"; + case ARMCC::GT: return "gt"; + case ARMCC::LE: return "le"; + case ARMCC::AL: return "al"; + } +} + +namespace ARM_PROC { + enum IMod { + IE = 2, + ID = 3 + }; + + enum IFlags { + F = 1, + I = 2, + A = 4 + }; + + inline static const char *IFlagsToString(unsigned val) { + switch (val) { + default: llvm_unreachable("Unknown iflags operand"); + case F: return "f"; + case I: return "i"; + case A: return "a"; + } + } + + inline static const char *IModToString(unsigned val) { + switch (val) { + default: llvm_unreachable("Unknown imod operand"); + case IE: return "ie"; + case ID: return "id"; + } + } +} + +namespace ARM_MB { + // The Memory Barrier Option constants map directly to the 4-bit encoding of + // the option field for memory barrier operations. + enum MemBOpt { + SY = 15, + ST = 14, + ISH = 11, + ISHST = 10, + NSH = 7, + NSHST = 6, + OSH = 3, + OSHST = 2 + }; + + inline static const char *MemBOptToString(unsigned val) { + switch (val) { + default: llvm_unreachable("Unknown memory operation"); + case SY: return "sy"; + case ST: return "st"; + case ISH: return "ish"; + case ISHST: return "ishst"; + case NSH: return "nsh"; + case NSHST: return "nshst"; + case OSH: return "osh"; + case OSHST: return "oshst"; + } + } +} // namespace ARM_MB + +/// getARMRegisterNumbering - Given the enum value for some register, e.g. +/// ARM::LR, return the number that it corresponds to (e.g. 14). +inline static unsigned getARMRegisterNumbering(unsigned Reg) { + using namespace ARM; + switch (Reg) { + default: + llvm_unreachable("Unknown ARM register!"); + case R0: case S0: case D0: case Q0: return 0; + case R1: case S1: case D1: case Q1: return 1; + case R2: case S2: case D2: case Q2: return 2; + case R3: case S3: case D3: case Q3: return 3; + case R4: case S4: case D4: case Q4: return 4; + case R5: case S5: case D5: case Q5: return 5; + case R6: case S6: case D6: case Q6: return 6; + case R7: case S7: case D7: case Q7: return 7; + case R8: case S8: case D8: case Q8: return 8; + case R9: case S9: case D9: case Q9: return 9; + case R10: case S10: case D10: case Q10: return 10; + case R11: case S11: case D11: case Q11: return 11; + case R12: case S12: case D12: case Q12: return 12; + case SP: case S13: case D13: case Q13: return 13; + case LR: case S14: case D14: case Q14: return 14; + case PC: case S15: case D15: case Q15: return 15; + + case S16: case D16: return 16; + case S17: case D17: return 17; + case S18: case D18: return 18; + case S19: case D19: return 19; + case S20: case D20: return 20; + case S21: case D21: return 21; + case S22: case D22: return 22; + case S23: case D23: return 23; + case S24: case D24: return 24; + case S25: case D25: return 25; + case S26: case D26: return 26; + case S27: case D27: return 27; + case S28: case D28: return 28; + case S29: case D29: return 29; + case S30: case D30: return 30; + case S31: case D31: return 31; + } +} + +/// isARMLowRegister - Returns true if the register is a low register (r0-r7). +/// +static inline bool isARMLowRegister(unsigned Reg) { + using namespace ARM; + switch (Reg) { + case R0: case R1: case R2: case R3: + case R4: case R5: case R6: case R7: + return true; + default: + return false; + } +} + +/// ARMII - This namespace holds all of the target specific flags that +/// instruction info tracks. +/// +namespace ARMII { + + /// ARM Index Modes + enum IndexMode { + IndexModeNone = 0, + IndexModePre = 1, + IndexModePost = 2, + IndexModeUpd = 3 + }; + + /// ARM Addressing Modes + enum AddrMode { + AddrModeNone = 0, + AddrMode1 = 1, + AddrMode2 = 2, + AddrMode3 = 3, + AddrMode4 = 4, + AddrMode5 = 5, + AddrMode6 = 6, + AddrModeT1_1 = 7, + AddrModeT1_2 = 8, + AddrModeT1_4 = 9, + AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data + AddrModeT2_i12 = 11, + AddrModeT2_i8 = 12, + AddrModeT2_so = 13, + AddrModeT2_pc = 14, // +/- i12 for pc relative data + AddrModeT2_i8s4 = 15, // i8 * 4 + AddrMode_i12 = 16 + }; + + inline static const char *AddrModeToString(AddrMode addrmode) { + switch (addrmode) { + default: llvm_unreachable("Unknown memory operation"); + case AddrModeNone: return "AddrModeNone"; + case AddrMode1: return "AddrMode1"; + case AddrMode2: return "AddrMode2"; + case AddrMode3: return "AddrMode3"; + case AddrMode4: return "AddrMode4"; + case AddrMode5: return "AddrMode5"; + case AddrMode6: return "AddrMode6"; + case AddrModeT1_1: return "AddrModeT1_1"; + case AddrModeT1_2: return "AddrModeT1_2"; + case AddrModeT1_4: return "AddrModeT1_4"; + case AddrModeT1_s: return "AddrModeT1_s"; + case AddrModeT2_i12: return "AddrModeT2_i12"; + case AddrModeT2_i8: return "AddrModeT2_i8"; + case AddrModeT2_so: return "AddrModeT2_so"; + case AddrModeT2_pc: return "AddrModeT2_pc"; + case AddrModeT2_i8s4: return "AddrModeT2_i8s4"; + case AddrMode_i12: return "AddrMode_i12"; + } + } + + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // ARM Specific MachineOperand flags. + + MO_NO_FLAG, + + /// MO_LO16 - On a symbol operand, this represents a relocation containing + /// lower 16 bit of the address. Used only via movw instruction. + MO_LO16, + + /// MO_HI16 - On a symbol operand, this represents a relocation containing + /// higher 16 bit of the address. Used only via movt instruction. + MO_HI16, + + /// MO_LO16_NONLAZY - On a symbol operand "FOO", this represents a + /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol, + /// i.e. "FOO$non_lazy_ptr". + /// Used only via movw instruction. + MO_LO16_NONLAZY, + + /// MO_HI16_NONLAZY - On a symbol operand "FOO", this represents a + /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol, + /// i.e. "FOO$non_lazy_ptr". Used only via movt instruction. + MO_HI16_NONLAZY, + + /// MO_LO16_NONLAZY_PIC - On a symbol operand "FOO", this represents a + /// relocation containing lower 16 bit of the PC relative address of the + /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL". + /// Used only via movw instruction. + MO_LO16_NONLAZY_PIC, + + /// MO_HI16_NONLAZY_PIC - On a symbol operand "FOO", this represents a + /// relocation containing lower 16 bit of the PC relative address of the + /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL". + /// Used only via movt instruction. + MO_HI16_NONLAZY_PIC, + + /// MO_PLT - On a symbol operand, this represents an ELF PLT reference on a + /// call operand. + MO_PLT + }; + + enum { + //===------------------------------------------------------------------===// + // Instruction Flags. + + //===------------------------------------------------------------------===// + // This four-bit field describes the addressing mode used. + AddrModeMask = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h + + // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load + // and store ops only. Generic "updating" flag is used for ld/st multiple. + // The index mode enums are declared in ARMBaseInfo.h + IndexModeShift = 5, + IndexModeMask = 3 << IndexModeShift, + + //===------------------------------------------------------------------===// + // Instruction encoding formats. + // + FormShift = 7, + FormMask = 0x3f << FormShift, + + // Pseudo instructions + Pseudo = 0 << FormShift, + + // Multiply instructions + MulFrm = 1 << FormShift, + + // Branch instructions + BrFrm = 2 << FormShift, + BrMiscFrm = 3 << FormShift, + + // Data Processing instructions + DPFrm = 4 << FormShift, + DPSoRegFrm = 5 << FormShift, + + // Load and Store + LdFrm = 6 << FormShift, + StFrm = 7 << FormShift, + LdMiscFrm = 8 << FormShift, + StMiscFrm = 9 << FormShift, + LdStMulFrm = 10 << FormShift, + + LdStExFrm = 11 << FormShift, + + // Miscellaneous arithmetic instructions + ArithMiscFrm = 12 << FormShift, + SatFrm = 13 << FormShift, + + // Extend instructions + ExtFrm = 14 << FormShift, + + // VFP formats + VFPUnaryFrm = 15 << FormShift, + VFPBinaryFrm = 16 << FormShift, + VFPConv1Frm = 17 << FormShift, + VFPConv2Frm = 18 << FormShift, + VFPConv3Frm = 19 << FormShift, + VFPConv4Frm = 20 << FormShift, + VFPConv5Frm = 21 << FormShift, + VFPLdStFrm = 22 << FormShift, + VFPLdStMulFrm = 23 << FormShift, + VFPMiscFrm = 24 << FormShift, + + // Thumb format + ThumbFrm = 25 << FormShift, + + // Miscelleaneous format + MiscFrm = 26 << FormShift, + + // NEON formats + NGetLnFrm = 27 << FormShift, + NSetLnFrm = 28 << FormShift, + NDupFrm = 29 << FormShift, + NLdStFrm = 30 << FormShift, + N1RegModImmFrm= 31 << FormShift, + N2RegFrm = 32 << FormShift, + NVCVTFrm = 33 << FormShift, + NVDupLnFrm = 34 << FormShift, + N2RegVShLFrm = 35 << FormShift, + N2RegVShRFrm = 36 << FormShift, + N3RegFrm = 37 << FormShift, + N3RegVShFrm = 38 << FormShift, + NVExtFrm = 39 << FormShift, + NVMulSLFrm = 40 << FormShift, + NVTBLFrm = 41 << FormShift, + + //===------------------------------------------------------------------===// + // Misc flags. + + // UnaryDP - Indicates this is a unary data processing instruction, i.e. + // it doesn't have a Rn operand. + UnaryDP = 1 << 13, + + // Xform16Bit - Indicates this Thumb2 instruction may be transformed into + // a 16-bit Thumb instruction if certain conditions are met. + Xform16Bit = 1 << 14, + + // ThumbArithFlagSetting - The instruction is a 16-bit flag setting Thumb + // instruction. Used by the parser to determine whether to require the 'S' + // suffix on the mnemonic (when not in an IT block) or preclude it (when + // in an IT block). + ThumbArithFlagSetting = 1 << 18, + + //===------------------------------------------------------------------===// + // Code domain. + DomainShift = 15, + DomainMask = 7 << DomainShift, + DomainGeneral = 0 << DomainShift, + DomainVFP = 1 << DomainShift, + DomainNEON = 2 << DomainShift, + DomainNEONA8 = 4 << DomainShift, + + //===------------------------------------------------------------------===// + // Field shifts - such shifts are used to set field while generating + // machine instructions. + // + // FIXME: This list will need adjusting/fixing as the MC code emitter + // takes shape and the ARMCodeEmitter.cpp bits go away. + ShiftTypeShift = 4, + + M_BitShift = 5, + ShiftImmShift = 5, + ShiftShift = 7, + N_BitShift = 7, + ImmHiShift = 8, + SoRotImmShift = 8, + RegRsShift = 8, + ExtRotImmShift = 10, + RegRdLoShift = 12, + RegRdShift = 12, + RegRdHiShift = 16, + RegRnShift = 16, + S_BitShift = 20, + W_BitShift = 21, + AM3_I_BitShift = 22, + D_BitShift = 22, + U_BitShift = 23, + P_BitShift = 24, + I_BitShift = 25, + CondShift = 28 + }; + +} // end namespace ARMII + +} // end namespace llvm; + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h new file mode 100644 index 0000000..350c92d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h @@ -0,0 +1,97 @@ +//===-- ARM/ARMFixupKinds.h - ARM Specific Fixup Entries --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ARM_ARMFIXUPKINDS_H +#define LLVM_ARM_ARMFIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace ARM { +enum Fixups { + // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol + // addresses + fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind, + + // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with + // the 16-bit halfwords reordered. + fixup_t2_ldst_pcrel_12, + + // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses + // used in VFP instructions where the lower 2 bits are not encoded + // (so it's encoded as an 8-bit immediate). + fixup_arm_pcrel_10, + // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for + // the short-swapped encoding of Thumb2 instructions. + fixup_t2_pcrel_10, + // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol + // addresses where the lower 2 bits are not encoded (so it's encoded as an + // 8-bit immediate). + fixup_thumb_adr_pcrel_10, + // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR + // instruction. + fixup_arm_adr_pcrel_12, + // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR + // instruction. + fixup_t2_adr_pcrel_12, + // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch + // instructions. + fixup_arm_condbranch, + // fixup_arm_uncondbranch - 24-bit PC relative relocation for + // branch instructions. (unconditional) + fixup_arm_uncondbranch, + // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct + // uconditional branch instructions. + fixup_t2_condbranch, + // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct + // branch unconditional branch instructions. + fixup_t2_uncondbranch, + + // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions. + fixup_arm_thumb_br, + + // fixup_arm_thumb_bl - Fixup for Thumb BL instructions. + fixup_arm_thumb_bl, + + // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions. + fixup_arm_thumb_blx, + + // fixup_arm_thumb_cb - Fixup for Thumb branch instructions. + fixup_arm_thumb_cb, + + // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs. + fixup_arm_thumb_cp, + + // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions. + fixup_arm_thumb_bcc, + + // The next two are for the movt/movw pair + // the 16bit imm field are split into imm{15-12} and imm{11-0} + fixup_arm_movt_hi16, // :upper16: + fixup_arm_movw_lo16, // :lower16: + fixup_t2_movt_hi16, // :upper16: + fixup_t2_movw_lo16, // :lower16: + + // It is possible to create an "immediate" that happens to be pcrel. + // movw r0, :lower16:Foo-(Bar+8) and movt r0, :upper16:Foo-(Bar+8) + // result in different reloc tags than the above two. + // Needed to support ELF::R_ARM_MOVT_PREL and ELF::R_ARM_MOVW_PREL_NC + fixup_arm_movt_hi16_pcrel, // :upper16: + fixup_arm_movw_lo16_pcrel, // :lower16: + fixup_t2_movt_hi16_pcrel, // :upper16: + fixup_t2_movw_lo16_pcrel, // :lower16: + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp new file mode 100644 index 0000000..1c109e0 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -0,0 +1,83 @@ +//===-- ARMMCAsmInfo.cpp - ARM asm properties -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the ARMMCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "ARMMCAsmInfo.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +cl::opt<bool> +EnableARMEHABI("arm-enable-ehabi", cl::Hidden, + cl::desc("Generate ARM EHABI tables"), + cl::init(false)); + + +static const char *const arm_asm_table[] = { + "{r0}", "r0", + "{r1}", "r1", + "{r2}", "r2", + "{r3}", "r3", + "{r4}", "r4", + "{r5}", "r5", + "{r6}", "r6", + "{r7}", "r7", + "{r8}", "r8", + "{r9}", "r9", + "{r10}", "r10", + "{r11}", "r11", + "{r12}", "r12", + "{r13}", "r13", + "{r14}", "r14", + "{lr}", "lr", + "{sp}", "sp", + "{ip}", "ip", + "{fp}", "fp", + "{sl}", "sl", + "{memory}", "memory", + "{cc}", "cc", + 0,0 +}; + +ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() { + AsmTransCBE = arm_asm_table; + Data64bitsDirective = 0; + CommentString = "@"; + Code16Directive = ".code\t16"; + Code32Directive = ".code\t32"; + + SupportsDebugInformation = true; + + // Exceptions handling + ExceptionsType = ExceptionHandling::SjLj; +} + +ARMELFMCAsmInfo::ARMELFMCAsmInfo() { + // ".comm align is in bytes but .align is pow-2." + AlignmentIsInBytes = false; + + Data64bitsDirective = 0; + CommentString = "@"; + PrivateGlobalPrefix = ".L"; + Code16Directive = ".code\t16"; + Code32Directive = ".code\t32"; + + WeakRefDirective = "\t.weak\t"; + LCOMMDirectiveType = LCOMM::NoAlignment; + + HasLEB128 = true; + SupportsDebugInformation = true; + + // Exceptions handling + if (EnableARMEHABI) + ExceptionsType = ExceptionHandling::ARM; +} diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h new file mode 100644 index 0000000..90f7822 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h @@ -0,0 +1,31 @@ +//=====-- ARMMCAsmInfo.h - ARM asm properties -------------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the ARMMCAsmInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ARMTARGETASMINFO_H +#define LLVM_ARMTARGETASMINFO_H + +#include "llvm/MC/MCAsmInfoDarwin.h" + +namespace llvm { + + struct ARMMCAsmInfoDarwin : public MCAsmInfoDarwin { + explicit ARMMCAsmInfoDarwin(); + }; + + struct ARMELFMCAsmInfo : public MCAsmInfo { + explicit ARMELFMCAsmInfo(); + }; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp new file mode 100644 index 0000000..865c3e2 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -0,0 +1,1468 @@ +//===-- ARM/ARMMCCodeEmitter.cpp - Convert ARM code to machine code -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARMMCCodeEmitter class. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mccodeemitter" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMFixupKinds.h" +#include "MCTargetDesc/ARMMCExpr.h" +#include "MCTargetDesc/ARMMCTargetDesc.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); +STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created."); + +namespace { +class ARMMCCodeEmitter : public MCCodeEmitter { + ARMMCCodeEmitter(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT + void operator=(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT + const MCInstrInfo &MCII; + const MCSubtargetInfo &STI; + +public: + ARMMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, + MCContext &ctx) + : MCII(mcii), STI(sti) { + } + + ~ARMMCCodeEmitter() {} + + bool isThumb() const { + // FIXME: Can tablegen auto-generate this? + return (STI.getFeatureBits() & ARM::ModeThumb) != 0; + } + bool isThumb2() const { + return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2) != 0; + } + bool isTargetDarwin() const { + Triple TT(STI.getTargetTriple()); + Triple::OSType OS = TT.getOS(); + return OS == Triple::Darwin || OS == Triple::MacOSX || OS == Triple::IOS; + } + + unsigned getMachineSoImmOpValue(unsigned SoImm) const; + + // getBinaryCodeForInstr - TableGen'erated function for getting the + // binary encoding for an instruction. + unsigned getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getHiLo16ImmOpValue - Return the encoding for the hi / low 16-bit of + /// the specified operand. This is used for operands with :lower16: and + /// :upper16: prefixes. + uint32_t getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + bool EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, + unsigned &Reg, unsigned &Imm, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getThumbBLTargetOpValue - Return encoding info for Thumb immediate + /// BL branch target. + uint32_t getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate + /// BLX branch target. + uint32_t getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getThumbBRTargetOpValue - Return encoding info for Thumb branch target. + uint32_t getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target. + uint32_t getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getThumbCBTargetOpValue - Return encoding info for Thumb branch target. + uint32_t getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getBranchTargetOpValue - Return encoding info for 24-bit immediate + /// branch target. + uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit + /// immediate Thumb2 direct branch target. + uint32_t getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate + /// branch target. + uint32_t getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + uint32_t getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAdrLabelOpValue - Return encoding info for 12-bit immediate + /// ADR label target. + uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + uint32_t getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + uint32_t getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + + /// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' + /// operand. + uint32_t getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getThumbAddrModeRegRegOpValue - Return encoding for 'reg + reg' operand. + uint32_t getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups)const; + + /// getT2AddrModeImm8s4OpValue - Return encoding info for 'reg +/- imm8<<2' + /// operand. + uint32_t getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getT2AddrModeImm0_1020s4OpValue - Return encoding info for 'reg + imm8<<2' + /// operand. + uint32_t getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getT2Imm8s4OpValue - Return encoding info for '+/- imm8<<2' + /// operand. + uint32_t getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + + /// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm' + /// operand as needed by load/store instructions. + uint32_t getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getLdStmModeOpValue - Return encoding for load/store multiple mode. + uint32_t getLdStmModeOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + ARM_AM::AMSubMode Mode = (ARM_AM::AMSubMode)MI.getOperand(OpIdx).getImm(); + switch (Mode) { + default: assert(0 && "Unknown addressing sub-mode!"); + case ARM_AM::da: return 0; + case ARM_AM::ia: return 1; + case ARM_AM::db: return 2; + case ARM_AM::ib: return 3; + } + } + /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value. + /// + unsigned getShiftOp(ARM_AM::ShiftOpc ShOpc) const { + switch (ShOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::no_shift: + case ARM_AM::lsl: return 0; + case ARM_AM::lsr: return 1; + case ARM_AM::asr: return 2; + case ARM_AM::ror: + case ARM_AM::rrx: return 3; + } + return 0; + } + + /// getAddrMode2OpValue - Return encoding for addrmode2 operands. + uint32_t getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrMode2OffsetOpValue - Return encoding for am2offset operands. + uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getPostIdxRegOpValue - Return encoding for postidx_reg operands. + uint32_t getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrMode3OffsetOpValue - Return encoding for am3offset operands. + uint32_t getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrMode3OpValue - Return encoding for addrmode3 operands. + uint32_t getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrModeThumbSPOpValue - Return encoding info for 'reg +/- imm12' + /// operand. + uint32_t getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrModeISOpValue - Encode the t_addrmode_is# operands. + uint32_t getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands. + uint32_t getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getAddrMode5OpValue - Return encoding info for 'reg +/- imm8' operand. + uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getCCOutOpValue - Return encoding of the 's' bit. + unsigned getCCOutOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + // The operand is either reg0 or CPSR. The 's' bit is encoded as '0' or + // '1' respectively. + return MI.getOperand(Op).getReg() == ARM::CPSR; + } + + /// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value. + unsigned getSOImmOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + unsigned SoImm = MI.getOperand(Op).getImm(); + int SoImmVal = ARM_AM::getSOImmVal(SoImm); + assert(SoImmVal != -1 && "Not a valid so_imm value!"); + + // Encode rotate_imm. + unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1) + << ARMII::SoRotImmShift; + + // Encode immed_8. + Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal); + return Binary; + } + + /// getT2SOImmOpValue - Return an encoded 12-bit shifted-immediate value. + unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + unsigned SoImm = MI.getOperand(Op).getImm(); + unsigned Encoded = ARM_AM::getT2SOImmVal(SoImm); + assert(Encoded != ~0U && "Not a Thumb2 so_imm value?"); + return Encoded; + } + + unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getSORegOpValue - Return an encoded so_reg shifted register value. + unsigned getSORegRegOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getSORegImmOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getT2SORegOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned getNEONVcvtImm32OpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 64 - MI.getOperand(Op).getImm(); + } + + unsigned getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned getShiftRight8Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRight16Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRight32Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRight64Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned NEONThumb2DataIPostEncoder(const MCInst &MI, + unsigned EncodedValue) const; + unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI, + unsigned EncodedValue) const; + unsigned NEONThumb2DupPostEncoder(const MCInst &MI, + unsigned EncodedValue) const; + + unsigned VFPThumb2PostEncoder(const MCInst &MI, + unsigned EncodedValue) const; + + void EmitByte(unsigned char C, raw_ostream &OS) const { + OS << (char)C; + } + + void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const { + // Output the constant in little endian byte order. + for (unsigned i = 0; i != Size; ++i) { + EmitByte(Val & 255, OS); + Val >>= 8; + } + } + + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const; +}; + +} // end anonymous namespace + +MCCodeEmitter *llvm::createARMMCCodeEmitter(const MCInstrInfo &MCII, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new ARMMCCodeEmitter(MCII, STI, Ctx); +} + +/// NEONThumb2DataIPostEncoder - Post-process encoded NEON data-processing +/// instructions, and rewrite them to their Thumb2 form if we are currently in +/// Thumb2 mode. +unsigned ARMMCCodeEmitter::NEONThumb2DataIPostEncoder(const MCInst &MI, + unsigned EncodedValue) const { + if (isThumb2()) { + // NEON Thumb2 data-processsing encodings are very simple: bit 24 is moved + // to bit 12 of the high half-word (i.e. bit 28), and bits 27-24 are + // set to 1111. + unsigned Bit24 = EncodedValue & 0x01000000; + unsigned Bit28 = Bit24 << 4; + EncodedValue &= 0xEFFFFFFF; + EncodedValue |= Bit28; + EncodedValue |= 0x0F000000; + } + + return EncodedValue; +} + +/// NEONThumb2LoadStorePostEncoder - Post-process encoded NEON load/store +/// instructions, and rewrite them to their Thumb2 form if we are currently in +/// Thumb2 mode. +unsigned ARMMCCodeEmitter::NEONThumb2LoadStorePostEncoder(const MCInst &MI, + unsigned EncodedValue) const { + if (isThumb2()) { + EncodedValue &= 0xF0FFFFFF; + EncodedValue |= 0x09000000; + } + + return EncodedValue; +} + +/// NEONThumb2DupPostEncoder - Post-process encoded NEON vdup +/// instructions, and rewrite them to their Thumb2 form if we are currently in +/// Thumb2 mode. +unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI, + unsigned EncodedValue) const { + if (isThumb2()) { + EncodedValue &= 0x00FFFFFF; + EncodedValue |= 0xEE000000; + } + + return EncodedValue; +} + +/// VFPThumb2PostEncoder - Post-process encoded VFP instructions and rewrite +/// them to their Thumb2 form if we are currently in Thumb2 mode. +unsigned ARMMCCodeEmitter:: +VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue) const { + if (isThumb2()) { + EncodedValue &= 0x0FFFFFFF; + EncodedValue |= 0xE0000000; + } + return EncodedValue; +} + +/// getMachineOpValue - Return binary encoding of operand. If the machine +/// operand requires relocation, record the relocation and return zero. +unsigned ARMMCCodeEmitter:: +getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups) const { + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + unsigned RegNo = getARMRegisterNumbering(Reg); + + // Q registers are encoded as 2x their register number. + switch (Reg) { + default: + return RegNo; + case ARM::Q0: case ARM::Q1: case ARM::Q2: case ARM::Q3: + case ARM::Q4: case ARM::Q5: case ARM::Q6: case ARM::Q7: + case ARM::Q8: case ARM::Q9: case ARM::Q10: case ARM::Q11: + case ARM::Q12: case ARM::Q13: case ARM::Q14: case ARM::Q15: + return 2 * RegNo; + } + } else if (MO.isImm()) { + return static_cast<unsigned>(MO.getImm()); + } else if (MO.isFPImm()) { + return static_cast<unsigned>(APFloat(MO.getFPImm()) + .bitcastToAPInt().getHiBits(32).getLimitedValue()); + } + + llvm_unreachable("Unable to encode MCOperand!"); + return 0; +} + +/// getAddrModeImmOpValue - Return encoding info for 'reg +/- imm' operand. +bool ARMMCCodeEmitter:: +EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg, + unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + + Reg = getARMRegisterNumbering(MO.getReg()); + + int32_t SImm = MO1.getImm(); + bool isAdd = true; + + // Special value for #-0 + if (SImm == INT32_MIN) { + SImm = 0; + isAdd = false; + } + + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (SImm < 0) { + SImm = -SImm; + isAdd = false; + } + + Imm = SImm; + return isAdd; +} + +/// getBranchTargetOpValue - Helper function to get the branch target operand, +/// which is either an immediate or requires a fixup. +static uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + unsigned FixupKind, + SmallVectorImpl<MCFixup> &Fixups) { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) return MO.getImm(); + assert(MO.isExpr() && "Unexpected branch target type!"); + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = MCFixupKind(FixupKind); + Fixups.push_back(MCFixup::Create(0, Expr, Kind)); + + // All of the information is in the fixup. + return 0; +} + +// Thumb BL and BLX use a strange offset encoding where bits 22 and 21 are +// determined by negating them and XOR'ing them with bit 23. +static int32_t encodeThumbBLOffset(int32_t offset) { + offset >>= 1; + uint32_t S = (offset & 0x800000) >> 23; + uint32_t J1 = (offset & 0x400000) >> 22; + uint32_t J2 = (offset & 0x200000) >> 21; + J1 = (~J1 & 0x1); + J2 = (~J2 & 0x1); + J1 ^= S; + J2 ^= S; + + offset &= ~0x600000; + offset |= J1 << 22; + offset |= J2 << 21; + + return offset; +} + +/// getThumbBLTargetOpValue - Return encoding info for immediate branch target. +uint32_t ARMMCCodeEmitter:: +getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl, + Fixups); + return encodeThumbBLOffset(MO.getImm()); +} + +/// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate +/// BLX branch target. +uint32_t ARMMCCodeEmitter:: +getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx, + Fixups); + return encodeThumbBLOffset(MO.getImm()); +} + +/// getThumbBRTargetOpValue - Return encoding info for Thumb branch target. +uint32_t ARMMCCodeEmitter:: +getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br, + Fixups); + return (MO.getImm() >> 1); +} + +/// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target. +uint32_t ARMMCCodeEmitter:: +getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc, + Fixups); + return (MO.getImm() >> 1); +} + +/// getThumbCBTargetOpValue - Return encoding info for Thumb branch target. +uint32_t ARMMCCodeEmitter:: +getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups); + return (MO.getImm() >> 1); +} + +/// Return true if this branch has a non-always predication +static bool HasConditionalBranch(const MCInst &MI) { + int NumOp = MI.getNumOperands(); + if (NumOp >= 2) { + for (int i = 0; i < NumOp-1; ++i) { + const MCOperand &MCOp1 = MI.getOperand(i); + const MCOperand &MCOp2 = MI.getOperand(i + 1); + if (MCOp1.isImm() && MCOp2.isReg() && + (MCOp2.getReg() == 0 || MCOp2.getReg() == ARM::CPSR)) { + if (ARMCC::CondCodes(MCOp1.getImm()) != ARMCC::AL) + return true; + } + } + } + return false; +} + +/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch +/// target. +uint32_t ARMMCCodeEmitter:: +getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // FIXME: This really, really shouldn't use TargetMachine. We don't want + // coupling between MC and TM anywhere we can help it. + if (isThumb2()) + return + ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_condbranch, Fixups); + return getARMBranchTargetOpValue(MI, OpIdx, Fixups); +} + +/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch +/// target. +uint32_t ARMMCCodeEmitter:: +getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) { + if (HasConditionalBranch(MI)) + return ::getBranchTargetOpValue(MI, OpIdx, + ARM::fixup_arm_condbranch, Fixups); + return ::getBranchTargetOpValue(MI, OpIdx, + ARM::fixup_arm_uncondbranch, Fixups); + } + + return MO.getImm() >> 2; +} + +uint32_t ARMMCCodeEmitter:: +getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) { + if (HasConditionalBranch(MI)) + return ::getBranchTargetOpValue(MI, OpIdx, + ARM::fixup_arm_condbranch, Fixups); + return ::getBranchTargetOpValue(MI, OpIdx, + ARM::fixup_arm_uncondbranch, Fixups); + } + + return MO.getImm() >> 1; +} + +/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit +/// immediate branch target. +uint32_t ARMMCCodeEmitter:: +getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + unsigned Val = + ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups); + bool I = (Val & 0x800000); + bool J1 = (Val & 0x400000); + bool J2 = (Val & 0x200000); + if (I ^ J1) + Val &= ~0x400000; + else + Val |= 0x400000; + + if (I ^ J2) + Val &= ~0x200000; + else + Val |= 0x200000; + + return Val; +} + +/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label +/// target. +uint32_t ARMMCCodeEmitter:: +getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12, + Fixups); + int32_t offset = MO.getImm(); + uint32_t Val = 0x2000; + if (offset < 0) { + Val = 0x1000; + offset *= -1; + } + Val |= offset; + return Val; +} + +/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label +/// target. +uint32_t ARMMCCodeEmitter:: +getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12, + Fixups); + int32_t Val = MO.getImm(); + if (Val < 0) { + Val *= -1; + Val |= 0x1000; + } + return Val; +} + +/// getAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label +/// target. +uint32_t ARMMCCodeEmitter:: +getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10, + Fixups); + return MO.getImm(); +} + +/// getThumbAddrModeRegRegOpValue - Return encoding info for 'reg + reg' +/// operand. +uint32_t ARMMCCodeEmitter:: +getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &) const { + // [Rn, Rm] + // {5-3} = Rm + // {2-0} = Rn + const MCOperand &MO1 = MI.getOperand(OpIdx); + const MCOperand &MO2 = MI.getOperand(OpIdx + 1); + unsigned Rn = getARMRegisterNumbering(MO1.getReg()); + unsigned Rm = getARMRegisterNumbering(MO2.getReg()); + return (Rm << 3) | Rn; +} + +/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand. +uint32_t ARMMCCodeEmitter:: +getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {17-13} = reg + // {12} = (U)nsigned (add == '1', sub == '0') + // {11-0} = imm12 + unsigned Reg, Imm12; + bool isAdd = true; + // If The first operand isn't a register, we have a label reference. + const MCOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) { + Reg = getARMRegisterNumbering(ARM::PC); // Rn is PC. + Imm12 = 0; + isAdd = false ; // 'U' bit is set as part of the fixup. + + if (MO.isExpr()) { + const MCExpr *Expr = MO.getExpr(); + + MCFixupKind Kind; + if (isThumb2()) + Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12); + else + Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12); + Fixups.push_back(MCFixup::Create(0, Expr, Kind)); + + ++MCNumCPRelocations; + } else { + Reg = ARM::PC; + int32_t Offset = MO.getImm(); + if (Offset < 0) { + Offset *= -1; + isAdd = false; + } + Imm12 = Offset; + } + } else + isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups); + + uint32_t Binary = Imm12 & 0xfff; + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (isAdd) + Binary |= (1 << 12); + Binary |= (Reg << 13); + return Binary; +} + +/// getT2Imm8s4OpValue - Return encoding info for +/// '+/- imm8<<2' operand. +uint32_t ARMMCCodeEmitter:: +getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // FIXME: The immediate operand should have already been encoded like this + // before ever getting here. The encoder method should just need to combine + // the MI operands for the register and the offset into a single + // representation for the complex operand in the .td file. This isn't just + // style, unfortunately. As-is, we can't represent the distinct encoding + // for #-0. + + // {8} = (U)nsigned (add == '1', sub == '0') + // {7-0} = imm8 + int32_t Imm8 = MI.getOperand(OpIdx).getImm(); + bool isAdd = Imm8 >= 0; + + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (Imm8 < 0) + Imm8 = -Imm8; + + // Scaled by 4. + Imm8 /= 4; + + uint32_t Binary = Imm8 & 0xff; + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (isAdd) + Binary |= (1 << 8); + return Binary; +} + +/// getT2AddrModeImm8s4OpValue - Return encoding info for +/// 'reg +/- imm8<<2' operand. +uint32_t ARMMCCodeEmitter:: +getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {12-9} = reg + // {8} = (U)nsigned (add == '1', sub == '0') + // {7-0} = imm8 + unsigned Reg, Imm8; + bool isAdd = true; + // If The first operand isn't a register, we have a label reference. + const MCOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) { + Reg = getARMRegisterNumbering(ARM::PC); // Rn is PC. + Imm8 = 0; + isAdd = false ; // 'U' bit is set as part of the fixup. + + assert(MO.isExpr() && "Unexpected machine operand type!"); + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_pcrel_10); + Fixups.push_back(MCFixup::Create(0, Expr, Kind)); + + ++MCNumCPRelocations; + } else + isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups); + + // FIXME: The immediate operand should have already been encoded like this + // before ever getting here. The encoder method should just need to combine + // the MI operands for the register and the offset into a single + // representation for the complex operand in the .td file. This isn't just + // style, unfortunately. As-is, we can't represent the distinct encoding + // for #-0. + uint32_t Binary = (Imm8 >> 2) & 0xff; + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (isAdd) + Binary |= (1 << 8); + Binary |= (Reg << 9); + return Binary; +} + +/// getT2AddrModeImm0_1020s4OpValue - Return encoding info for +/// 'reg + imm8<<2' operand. +uint32_t ARMMCCodeEmitter:: +getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {11-8} = reg + // {7-0} = imm8 + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + unsigned Reg = getARMRegisterNumbering(MO.getReg()); + unsigned Imm8 = MO1.getImm(); + return (Reg << 8) | Imm8; +} + +// FIXME: This routine assumes that a binary +// expression will always result in a PCRel expression +// In reality, its only true if one or more subexpressions +// is itself a PCRel (i.e. "." in asm or some other pcrel construct) +// but this is good enough for now. +static bool EvaluateAsPCRel(const MCExpr *Expr) { + switch (Expr->getKind()) { + default: assert(0 && "Unexpected expression type"); + case MCExpr::SymbolRef: return false; + case MCExpr::Binary: return true; + } +} + +uint32_t +ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {20-16} = imm{15-12} + // {11-0} = imm{11-0} + const MCOperand &MO = MI.getOperand(OpIdx); + if (MO.isImm()) + // Hi / lo 16 bits already extracted during earlier passes. + return static_cast<unsigned>(MO.getImm()); + + // Handle :upper16: and :lower16: assembly prefixes. + const MCExpr *E = MO.getExpr(); + if (E->getKind() == MCExpr::Target) { + const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E); + E = ARM16Expr->getSubExpr(); + + MCFixupKind Kind; + switch (ARM16Expr->getKind()) { + default: assert(0 && "Unsupported ARMFixup"); + case ARMMCExpr::VK_ARM_HI16: + if (!isTargetDarwin() && EvaluateAsPCRel(E)) + Kind = MCFixupKind(isThumb2() + ? ARM::fixup_t2_movt_hi16_pcrel + : ARM::fixup_arm_movt_hi16_pcrel); + else + Kind = MCFixupKind(isThumb2() + ? ARM::fixup_t2_movt_hi16 + : ARM::fixup_arm_movt_hi16); + break; + case ARMMCExpr::VK_ARM_LO16: + if (!isTargetDarwin() && EvaluateAsPCRel(E)) + Kind = MCFixupKind(isThumb2() + ? ARM::fixup_t2_movw_lo16_pcrel + : ARM::fixup_arm_movw_lo16_pcrel); + else + Kind = MCFixupKind(isThumb2() + ? ARM::fixup_t2_movw_lo16 + : ARM::fixup_arm_movw_lo16); + break; + } + Fixups.push_back(MCFixup::Create(0, E, Kind)); + return 0; + }; + + llvm_unreachable("Unsupported MCExpr type in MCOperand!"); + return 0; +} + +uint32_t ARMMCCodeEmitter:: +getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + const MCOperand &MO2 = MI.getOperand(OpIdx+2); + unsigned Rn = getARMRegisterNumbering(MO.getReg()); + unsigned Rm = getARMRegisterNumbering(MO1.getReg()); + unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()); + bool isAdd = ARM_AM::getAM2Op(MO2.getImm()) == ARM_AM::add; + ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm()); + unsigned SBits = getShiftOp(ShOp); + + // {16-13} = Rn + // {12} = isAdd + // {11-0} = shifter + // {3-0} = Rm + // {4} = 0 + // {6-5} = type + // {11-7} = imm + uint32_t Binary = Rm; + Binary |= Rn << 13; + Binary |= SBits << 5; + Binary |= ShImm << 7; + if (isAdd) + Binary |= 1 << 12; + return Binary; +} + +uint32_t ARMMCCodeEmitter:: +getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {17-14} Rn + // {13} 1 == imm12, 0 == Rm + // {12} isAdd + // {11-0} imm12/Rm + const MCOperand &MO = MI.getOperand(OpIdx); + unsigned Rn = getARMRegisterNumbering(MO.getReg()); + uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups); + Binary |= Rn << 14; + return Binary; +} + +uint32_t ARMMCCodeEmitter:: +getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {13} 1 == imm12, 0 == Rm + // {12} isAdd + // {11-0} imm12/Rm + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + unsigned Imm = MO1.getImm(); + bool isAdd = ARM_AM::getAM2Op(Imm) == ARM_AM::add; + bool isReg = MO.getReg() != 0; + uint32_t Binary = ARM_AM::getAM2Offset(Imm); + // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm12 + if (isReg) { + ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(Imm); + Binary <<= 7; // Shift amount is bits [11:7] + Binary |= getShiftOp(ShOp) << 5; // Shift type is bits [6:5] + Binary |= getARMRegisterNumbering(MO.getReg()); // Rm is bits [3:0] + } + return Binary | (isAdd << 12) | (isReg << 13); +} + +uint32_t ARMMCCodeEmitter:: +getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {4} isAdd + // {3-0} Rm + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + bool isAdd = MO1.getImm() != 0; + return getARMRegisterNumbering(MO.getReg()) | (isAdd << 4); +} + +uint32_t ARMMCCodeEmitter:: +getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {9} 1 == imm8, 0 == Rm + // {8} isAdd + // {7-4} imm7_4/zero + // {3-0} imm3_0/Rm + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + unsigned Imm = MO1.getImm(); + bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add; + bool isImm = MO.getReg() == 0; + uint32_t Imm8 = ARM_AM::getAM3Offset(Imm); + // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8 + if (!isImm) + Imm8 = getARMRegisterNumbering(MO.getReg()); + return Imm8 | (isAdd << 8) | (isImm << 9); +} + +uint32_t ARMMCCodeEmitter:: +getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {13} 1 == imm8, 0 == Rm + // {12-9} Rn + // {8} isAdd + // {7-4} imm7_4/zero + // {3-0} imm3_0/Rm + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx+1); + const MCOperand &MO2 = MI.getOperand(OpIdx+2); + unsigned Rn = getARMRegisterNumbering(MO.getReg()); + unsigned Imm = MO2.getImm(); + bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add; + bool isImm = MO1.getReg() == 0; + uint32_t Imm8 = ARM_AM::getAM3Offset(Imm); + // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8 + if (!isImm) + Imm8 = getARMRegisterNumbering(MO1.getReg()); + return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13); +} + +/// getAddrModeThumbSPOpValue - Encode the t_addrmode_sp operands. +uint32_t ARMMCCodeEmitter:: +getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // [SP, #imm] + // {7-0} = imm8 + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + assert(MI.getOperand(OpIdx).getReg() == ARM::SP && + "Unexpected base register!"); + + // The immediate is already shifted for the implicit zeroes, so no change + // here. + return MO1.getImm() & 0xff; +} + +/// getAddrModeISOpValue - Encode the t_addrmode_is# operands. +uint32_t ARMMCCodeEmitter:: +getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // [Rn, #imm] + // {7-3} = imm5 + // {2-0} = Rn + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + unsigned Rn = getARMRegisterNumbering(MO.getReg()); + unsigned Imm5 = MO1.getImm(); + return ((Imm5 & 0x1f) << 3) | Rn; +} + +/// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands. +uint32_t ARMMCCodeEmitter:: +getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand MO = MI.getOperand(OpIdx); + if (MO.isExpr()) + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups); + return (MO.getImm() >> 2); +} + +/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm10' operand. +uint32_t ARMMCCodeEmitter:: +getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // {12-9} = reg + // {8} = (U)nsigned (add == '1', sub == '0') + // {7-0} = imm8 + unsigned Reg, Imm8; + bool isAdd; + // If The first operand isn't a register, we have a label reference. + const MCOperand &MO = MI.getOperand(OpIdx); + if (!MO.isReg()) { + Reg = getARMRegisterNumbering(ARM::PC); // Rn is PC. + Imm8 = 0; + isAdd = false; // 'U' bit is handled as part of the fixup. + + assert(MO.isExpr() && "Unexpected machine operand type!"); + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind; + if (isThumb2()) + Kind = MCFixupKind(ARM::fixup_t2_pcrel_10); + else + Kind = MCFixupKind(ARM::fixup_arm_pcrel_10); + Fixups.push_back(MCFixup::Create(0, Expr, Kind)); + + ++MCNumCPRelocations; + } else { + EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups); + isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add; + } + + uint32_t Binary = ARM_AM::getAM5Offset(Imm8); + // Immediate is always encoded as positive. The 'U' bit controls add vs sub. + if (isAdd) + Binary |= (1 << 8); + Binary |= (Reg << 9); + return Binary; +} + +unsigned ARMMCCodeEmitter:: +getSORegRegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // Sub-operands are [reg, reg, imm]. The first register is Rm, the reg to be + // shifted. The second is Rs, the amount to shift by, and the third specifies + // the type of the shift. + // + // {3-0} = Rm. + // {4} = 1 + // {6-5} = type + // {11-8} = Rs + // {7} = 0 + + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + const MCOperand &MO2 = MI.getOperand(OpIdx + 2); + ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm()); + + // Encode Rm. + unsigned Binary = getARMRegisterNumbering(MO.getReg()); + + // Encode the shift opcode. + unsigned SBits = 0; + unsigned Rs = MO1.getReg(); + if (Rs) { + // Set shift operand (bit[7:4]). + // LSL - 0001 + // LSR - 0011 + // ASR - 0101 + // ROR - 0111 + switch (SOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::lsl: SBits = 0x1; break; + case ARM_AM::lsr: SBits = 0x3; break; + case ARM_AM::asr: SBits = 0x5; break; + case ARM_AM::ror: SBits = 0x7; break; + } + } + + Binary |= SBits << 4; + + // Encode the shift operation Rs. + // Encode Rs bit[11:8]. + assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0); + return Binary | (getARMRegisterNumbering(Rs) << ARMII::RegRsShift); +} + +unsigned ARMMCCodeEmitter:: +getSORegImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // Sub-operands are [reg, imm]. The first register is Rm, the reg to be + // shifted. The second is the amount to shift by. + // + // {3-0} = Rm. + // {4} = 0 + // {6-5} = type + // {11-7} = imm + + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm()); + + // Encode Rm. + unsigned Binary = getARMRegisterNumbering(MO.getReg()); + + // Encode the shift opcode. + unsigned SBits = 0; + + // Set shift operand (bit[6:4]). + // LSL - 000 + // LSR - 010 + // ASR - 100 + // ROR - 110 + // RRX - 110 and bit[11:8] clear. + switch (SOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::lsl: SBits = 0x0; break; + case ARM_AM::lsr: SBits = 0x2; break; + case ARM_AM::asr: SBits = 0x4; break; + case ARM_AM::ror: SBits = 0x6; break; + case ARM_AM::rrx: + Binary |= 0x60; + return Binary; + } + + // Encode shift_imm bit[11:7]. + Binary |= SBits << 4; + unsigned Offset = ARM_AM::getSORegOffset(MO1.getImm()); + assert(Offset && "Offset must be in range 1-32!"); + if (Offset == 32) Offset = 0; + return Binary | (Offset << 7); +} + + +unsigned ARMMCCodeEmitter:: +getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO1 = MI.getOperand(OpNum); + const MCOperand &MO2 = MI.getOperand(OpNum+1); + const MCOperand &MO3 = MI.getOperand(OpNum+2); + + // Encoded as [Rn, Rm, imm]. + // FIXME: Needs fixup support. + unsigned Value = getARMRegisterNumbering(MO1.getReg()); + Value <<= 4; + Value |= getARMRegisterNumbering(MO2.getReg()); + Value <<= 2; + Value |= MO3.getImm(); + + return Value; +} + +unsigned ARMMCCodeEmitter:: +getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO1 = MI.getOperand(OpNum); + const MCOperand &MO2 = MI.getOperand(OpNum+1); + + // FIXME: Needs fixup support. + unsigned Value = getARMRegisterNumbering(MO1.getReg()); + + // Even though the immediate is 8 bits long, we need 9 bits in order + // to represent the (inverse of the) sign bit. + Value <<= 9; + int32_t tmp = (int32_t)MO2.getImm(); + if (tmp < 0) + tmp = abs(tmp); + else + Value |= 256; // Set the ADD bit + Value |= tmp & 255; + return Value; +} + +unsigned ARMMCCodeEmitter:: +getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO1 = MI.getOperand(OpNum); + + // FIXME: Needs fixup support. + unsigned Value = 0; + int32_t tmp = (int32_t)MO1.getImm(); + if (tmp < 0) + tmp = abs(tmp); + else + Value |= 256; // Set the ADD bit + Value |= tmp & 255; + return Value; +} + +unsigned ARMMCCodeEmitter:: +getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO1 = MI.getOperand(OpNum); + + // FIXME: Needs fixup support. + unsigned Value = 0; + int32_t tmp = (int32_t)MO1.getImm(); + if (tmp < 0) + tmp = abs(tmp); + else + Value |= 4096; // Set the ADD bit + Value |= tmp & 4095; + return Value; +} + +unsigned ARMMCCodeEmitter:: +getT2SORegOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + // Sub-operands are [reg, imm]. The first register is Rm, the reg to be + // shifted. The second is the amount to shift by. + // + // {3-0} = Rm. + // {4} = 0 + // {6-5} = type + // {11-7} = imm + + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm()); + + // Encode Rm. + unsigned Binary = getARMRegisterNumbering(MO.getReg()); + + // Encode the shift opcode. + unsigned SBits = 0; + // Set shift operand (bit[6:4]). + // LSL - 000 + // LSR - 010 + // ASR - 100 + // ROR - 110 + switch (SOpc) { + default: llvm_unreachable("Unknown shift opc!"); + case ARM_AM::lsl: SBits = 0x0; break; + case ARM_AM::lsr: SBits = 0x2; break; + case ARM_AM::asr: SBits = 0x4; break; + case ARM_AM::rrx: // FALLTHROUGH + case ARM_AM::ror: SBits = 0x6; break; + } + + Binary |= SBits << 4; + if (SOpc == ARM_AM::rrx) + return Binary; + + // Encode shift_imm bit[11:7]. + return Binary | ARM_AM::getSORegOffset(MO1.getImm()) << 7; +} + +unsigned ARMMCCodeEmitter:: +getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + // 10 bits. lower 5 bits are are the lsb of the mask, high five bits are the + // msb of the mask. + const MCOperand &MO = MI.getOperand(Op); + uint32_t v = ~MO.getImm(); + uint32_t lsb = CountTrailingZeros_32(v); + uint32_t msb = (32 - CountLeadingZeros_32 (v)) - 1; + assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!"); + return lsb | (msb << 5); +} + +unsigned ARMMCCodeEmitter:: +getRegisterListOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + // VLDM/VSTM: + // {12-8} = Vd + // {7-0} = Number of registers + // + // LDM/STM: + // {15-0} = Bitfield of GPRs. + unsigned Reg = MI.getOperand(Op).getReg(); + bool SPRRegs = llvm::ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg); + bool DPRRegs = llvm::ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg); + + unsigned Binary = 0; + + if (SPRRegs || DPRRegs) { + // VLDM/VSTM + unsigned RegNo = getARMRegisterNumbering(Reg); + unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff; + Binary |= (RegNo & 0x1f) << 8; + if (SPRRegs) + Binary |= NumRegs; + else + Binary |= NumRegs * 2; + } else { + for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) { + unsigned RegNo = getARMRegisterNumbering(MI.getOperand(I).getReg()); + Binary |= 1 << RegNo; + } + } + + return Binary; +} + +/// getAddrMode6AddressOpValue - Encode an addrmode6 register number along +/// with the alignment operand. +unsigned ARMMCCodeEmitter:: +getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &Reg = MI.getOperand(Op); + const MCOperand &Imm = MI.getOperand(Op + 1); + + unsigned RegNo = getARMRegisterNumbering(Reg.getReg()); + unsigned Align = 0; + + switch (Imm.getImm()) { + default: break; + case 2: + case 4: + case 8: Align = 0x01; break; + case 16: Align = 0x02; break; + case 32: Align = 0x03; break; + } + + return RegNo | (Align << 4); +} + +/// getAddrMode6OneLane32AddressOpValue - Encode an addrmode6 register number +/// along with the alignment operand for use in VST1 and VLD1 with size 32. +unsigned ARMMCCodeEmitter:: +getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &Reg = MI.getOperand(Op); + const MCOperand &Imm = MI.getOperand(Op + 1); + + unsigned RegNo = getARMRegisterNumbering(Reg.getReg()); + unsigned Align = 0; + + switch (Imm.getImm()) { + default: break; + case 2: + case 4: + case 8: + case 16: Align = 0x00; break; + case 32: Align = 0x03; break; + } + + return RegNo | (Align << 4); +} + + +/// getAddrMode6DupAddressOpValue - Encode an addrmode6 register number and +/// alignment operand for use in VLD-dup instructions. This is the same as +/// getAddrMode6AddressOpValue except for the alignment encoding, which is +/// different for VLD4-dup. +unsigned ARMMCCodeEmitter:: +getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &Reg = MI.getOperand(Op); + const MCOperand &Imm = MI.getOperand(Op + 1); + + unsigned RegNo = getARMRegisterNumbering(Reg.getReg()); + unsigned Align = 0; + + switch (Imm.getImm()) { + default: break; + case 2: + case 4: + case 8: Align = 0x01; break; + case 16: Align = 0x03; break; + } + + return RegNo | (Align << 4); +} + +unsigned ARMMCCodeEmitter:: +getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO = MI.getOperand(Op); + if (MO.getReg() == 0) return 0x0D; + return MO.getReg(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight8Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 8 - MI.getOperand(Op).getImm(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight16Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 16 - MI.getOperand(Op).getImm(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight32Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 32 - MI.getOperand(Op).getImm(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight64Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 64 - MI.getOperand(Op).getImm(); +} + +void ARMMCCodeEmitter:: +EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const { + // Pseudo instructions don't get encoded. + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + if ((TSFlags & ARMII::FormMask) == ARMII::Pseudo) + return; + + int Size; + if (Desc.getSize() == 2 || Desc.getSize() == 4) + Size = Desc.getSize(); + else + llvm_unreachable("Unexpected instruction size!"); + + uint32_t Binary = getBinaryCodeForInstr(MI, Fixups); + // Thumb 32-bit wide instructions need to emit the high order halfword + // first. + if (isThumb() && Size == 4) { + EmitConstant(Binary >> 16, 2, OS); + EmitConstant(Binary & 0xffff, 2, OS); + } else + EmitConstant(Binary, Size, OS); + ++MCNumEmitted; // Keep track of the # of mi's emitted. +} + +#include "ARMGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp new file mode 100644 index 0000000..2727ba8 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp @@ -0,0 +1,73 @@ +//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "armmcexpr" +#include "ARMMCExpr.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCAssembler.h" +using namespace llvm; + +const ARMMCExpr* +ARMMCExpr::Create(VariantKind Kind, const MCExpr *Expr, + MCContext &Ctx) { + return new (Ctx) ARMMCExpr(Kind, Expr); +} + +void ARMMCExpr::PrintImpl(raw_ostream &OS) const { + switch (Kind) { + default: assert(0 && "Invalid kind!"); + case VK_ARM_HI16: OS << ":upper16:"; break; + case VK_ARM_LO16: OS << ":lower16:"; break; + } + + const MCExpr *Expr = getSubExpr(); + if (Expr->getKind() != MCExpr::SymbolRef) + OS << '('; + Expr->print(OS); + if (Expr->getKind() != MCExpr::SymbolRef) + OS << ')'; +} + +bool +ARMMCExpr::EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const { + return false; +} + +// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps +// that method should be made public? +static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) { + switch (Value->getKind()) { + case MCExpr::Target: + assert(0 && "Can't handle nested target expr!"); + break; + + case MCExpr::Constant: + break; + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value); + AddValueSymbols_(BE->getLHS(), Asm); + AddValueSymbols_(BE->getRHS(), Asm); + break; + } + + case MCExpr::SymbolRef: + Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol()); + break; + + case MCExpr::Unary: + AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm); + break; + } +} + +void ARMMCExpr::AddValueSymbols(MCAssembler *Asm) const { + AddValueSymbols_(getSubExpr(), Asm); +} diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h new file mode 100644 index 0000000..0a2e883 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h @@ -0,0 +1,76 @@ +//===-- ARMMCExpr.h - ARM specific MC expression classes ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMMCEXPR_H +#define ARMMCEXPR_H + +#include "llvm/MC/MCExpr.h" + +namespace llvm { + +class ARMMCExpr : public MCTargetExpr { +public: + enum VariantKind { + VK_ARM_None, + VK_ARM_HI16, // The R_ARM_MOVT_ABS relocation (:upper16: in the .s file) + VK_ARM_LO16 // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file) + }; + +private: + const VariantKind Kind; + const MCExpr *Expr; + + explicit ARMMCExpr(VariantKind _Kind, const MCExpr *_Expr) + : Kind(_Kind), Expr(_Expr) {} + +public: + /// @name Construction + /// @{ + + static const ARMMCExpr *Create(VariantKind Kind, const MCExpr *Expr, + MCContext &Ctx); + + static const ARMMCExpr *CreateUpper16(const MCExpr *Expr, MCContext &Ctx) { + return Create(VK_ARM_HI16, Expr, Ctx); + } + + static const ARMMCExpr *CreateLower16(const MCExpr *Expr, MCContext &Ctx) { + return Create(VK_ARM_LO16, Expr, Ctx); + } + + /// @} + /// @name Accessors + /// @{ + + /// getOpcode - Get the kind of this expression. + VariantKind getKind() const { return Kind; } + + /// getSubExpr - Get the child of this expression. + const MCExpr *getSubExpr() const { return Expr; } + + /// @} + + void PrintImpl(raw_ostream &OS) const; + bool EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const; + void AddValueSymbols(MCAssembler *) const; + const MCSection *FindAssociatedSection() const { + return getSubExpr()->FindAssociatedSection(); + } + + static bool classof(const MCExpr *E) { + return E->getKind() == MCExpr::Target; + } + + static bool classof(const ARMMCExpr *) { return true; } + +}; +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp new file mode 100644 index 0000000..a55c410 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -0,0 +1,263 @@ +//===-- ARMMCTargetDesc.cpp - ARM Target Descriptions -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides ARM specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "ARMMCTargetDesc.h" +#include "ARMMCAsmInfo.h" +#include "ARMBaseInfo.h" +#include "InstPrinter/ARMInstPrinter.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +#define GET_REGINFO_MC_DESC +#include "ARMGenRegisterInfo.inc" + +#define GET_INSTRINFO_MC_DESC +#include "ARMGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "ARMGenSubtargetInfo.inc" + +using namespace llvm; + +std::string ARM_MC::ParseARMTriple(StringRef TT) { + // Set the boolean corresponding to the current target triple, or the default + // if one cannot be determined, to true. + unsigned Len = TT.size(); + unsigned Idx = 0; + + // FIXME: Enhance Triple helper class to extract ARM version. + bool isThumb = false; + if (Len >= 5 && TT.substr(0, 4) == "armv") + Idx = 4; + else if (Len >= 6 && TT.substr(0, 5) == "thumb") { + isThumb = true; + if (Len >= 7 && TT[5] == 'v') + Idx = 6; + } + + std::string ARMArchFeature; + if (Idx) { + unsigned SubVer = TT[Idx]; + if (SubVer >= '7' && SubVer <= '9') { + if (Len >= Idx+2 && TT[Idx+1] == 'm') { + // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass + ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass"; + } else if (Len >= Idx+3 && TT[Idx+1] == 'e'&& TT[Idx+2] == 'm') { + // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2, + // FeatureT2XtPk, FeatureMClass + ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass"; + } else + // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk + ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk"; + } else if (SubVer == '6') { + if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2') + ARMArchFeature = "+v6t2"; + else if (Len >= Idx+2 && TT[Idx+1] == 'm') + // v6m: FeatureNoARM, FeatureMClass + ARMArchFeature = "+v6t2,+noarm,+mclass"; + else + ARMArchFeature = "+v6"; + } else if (SubVer == '5') { + if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e') + ARMArchFeature = "+v5te"; + else + ARMArchFeature = "+v5t"; + } else if (SubVer == '4' && Len >= Idx+2 && TT[Idx+1] == 't') + ARMArchFeature = "+v4t"; + } + + if (isThumb) { + if (ARMArchFeature.empty()) + ARMArchFeature = "+thumb-mode"; + else + ARMArchFeature += ",+thumb-mode"; + } + + Triple TheTriple(TT); + if (TheTriple.getOS() == Triple::NativeClient) { + if (ARMArchFeature.empty()) + ARMArchFeature = "+nacl-mode"; + else + ARMArchFeature += ",+nacl-mode"; + } + + return ARMArchFeature; +} + +MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS) { + std::string ArchFS = ARM_MC::ParseARMTriple(TT); + if (!FS.empty()) { + if (!ArchFS.empty()) + ArchFS = ArchFS + "," + FS.str(); + else + ArchFS = FS; + } + + MCSubtargetInfo *X = new MCSubtargetInfo(); + InitARMMCSubtargetInfo(X, TT, CPU, ArchFS); + return X; +} + +static MCInstrInfo *createARMMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitARMMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createARMMCRegisterInfo(StringRef Triple) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitARMMCRegisterInfo(X, ARM::LR); + return X; +} + +static MCAsmInfo *createARMMCAsmInfo(const Target &T, StringRef TT) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) + return new ARMMCAsmInfoDarwin(); + + return new ARMELFMCAsmInfo(); +} + +static MCCodeGenInfo *createARMMCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + if (RM == Reloc::Default) { + Triple TheTriple(TT); + // Default relocation model on Darwin is PIC, not DynamicNoPIC. + RM = TheTriple.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC; + } + X->InitMCCodeGenInfo(RM, CM); + return X; +} + +// This is duplicated code. Refactor this. +static MCStreamer *createMCStreamer(const Target &T, StringRef TT, + MCContext &Ctx, MCAsmBackend &MAB, + raw_ostream &OS, + MCCodeEmitter *Emitter, + bool RelaxAll, + bool NoExecStack) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) + return createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll); + + if (TheTriple.isOSWindows()) { + llvm_unreachable("ARM does not support Windows COFF format"); + return NULL; + } + + return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack); +} + +static MCInstPrinter *createARMMCInstPrinter(const Target &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCSubtargetInfo &STI) { + if (SyntaxVariant == 0) + return new ARMInstPrinter(MAI, STI); + return 0; +} + +namespace { + +class ARMMCInstrAnalysis : public MCInstrAnalysis { +public: + ARMMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {} + + virtual bool isUnconditionalBranch(const MCInst &Inst) const { + // BCCs with the "always" predicate are unconditional branches. + if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL) + return true; + return MCInstrAnalysis::isUnconditionalBranch(Inst); + } + + virtual bool isConditionalBranch(const MCInst &Inst) const { + // BCCs with the "always" predicate are unconditional branches. + if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL) + return false; + return MCInstrAnalysis::isConditionalBranch(Inst); + } + + uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr, + uint64_t Size) const { + // We only handle PCRel branches for now. + if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL) + return -1ULL; + + int64_t Imm = Inst.getOperand(0).getImm(); + // FIXME: This is not right for thumb. + return Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes. + } +}; + +} + +static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) { + return new ARMMCInstrAnalysis(Info); +} + +// Force static initialization. +extern "C" void LLVMInitializeARMTargetMC() { + // Register the MC asm info. + RegisterMCAsmInfoFn A(TheARMTarget, createARMMCAsmInfo); + RegisterMCAsmInfoFn B(TheThumbTarget, createARMMCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(TheARMTarget, createARMMCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheThumbTarget, createARMMCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(TheARMTarget, createARMMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheThumbTarget, createARMMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(TheARMTarget, createARMMCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheThumbTarget, createARMMCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(TheARMTarget, + ARM_MC::createARMMCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheThumbTarget, + ARM_MC::createARMMCSubtargetInfo); + + // Register the MC instruction analyzer. + TargetRegistry::RegisterMCInstrAnalysis(TheARMTarget, + createARMMCInstrAnalysis); + TargetRegistry::RegisterMCInstrAnalysis(TheThumbTarget, + createARMMCInstrAnalysis); + + // Register the MC Code Emitter + TargetRegistry::RegisterMCCodeEmitter(TheARMTarget, createARMMCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheThumbTarget, createARMMCCodeEmitter); + + // Register the asm backend. + TargetRegistry::RegisterMCAsmBackend(TheARMTarget, createARMAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheThumbTarget, createARMAsmBackend); + + // Register the object streamer. + TargetRegistry::RegisterMCObjectStreamer(TheARMTarget, createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheThumbTarget, createMCStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter); +} diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h new file mode 100644 index 0000000..9b3d3bd --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -0,0 +1,71 @@ +//===-- ARMMCTargetDesc.h - ARM Target Descriptions -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides ARM specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMMCTARGETDESC_H +#define ARMMCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" +#include <string> + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCObjectWriter; +class MCSubtargetInfo; +class StringRef; +class Target; +class raw_ostream; + +extern Target TheARMTarget, TheThumbTarget; + +namespace ARM_MC { + std::string ParseARMTriple(StringRef TT); + + /// createARMMCSubtargetInfo - Create a ARM MCSubtargetInfo instance. + /// This is exposed so Asm parser, etc. do not need to go through + /// TargetRegistry. + MCSubtargetInfo *createARMMCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS); +} + +MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII, + const MCSubtargetInfo &STI, + MCContext &Ctx); + +MCAsmBackend *createARMAsmBackend(const Target &T, StringRef TT); + +/// createARMMachObjectWriter - Construct an ARM Mach-O object writer. +MCObjectWriter *createARMMachObjectWriter(raw_ostream &OS, + bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype); + +} // End llvm namespace + +// Defines symbolic names for ARM registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "ARMGenRegisterInfo.inc" + +// Defines symbolic names for the ARM instructions. +// +#define GET_INSTRINFO_ENUM +#include "ARMGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "ARMGenSubtargetInfo.inc" + +#endif diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp new file mode 100644 index 0000000..352c73e --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -0,0 +1,388 @@ +//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMFixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Object/MachOFormat.h" +#include "llvm/Support/ErrorHandling.h" +using namespace llvm; +using namespace llvm::object; + +namespace { +class ARMMachObjectWriter : public MCMachObjectTargetWriter { + void RecordARMScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + unsigned Log2Size, + uint64_t &FixedValue); + void RecordARMMovwMovtRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue); + +public: + ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType, + uint32_t CPUSubtype) + : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype, + /*UseAggressiveSymbolFolding=*/true) {} + + void RecordRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue); +}; +} + +static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType, + unsigned &Log2Size) { + RelocType = unsigned(macho::RIT_Vanilla); + Log2Size = ~0U; + + switch (Kind) { + default: + return false; + + case FK_Data_1: + Log2Size = llvm::Log2_32(1); + return true; + case FK_Data_2: + Log2Size = llvm::Log2_32(2); + return true; + case FK_Data_4: + Log2Size = llvm::Log2_32(4); + return true; + case FK_Data_8: + Log2Size = llvm::Log2_32(8); + return true; + + // Handle 24-bit branch kinds. + case ARM::fixup_arm_ldst_pcrel_12: + case ARM::fixup_arm_pcrel_10: + case ARM::fixup_arm_adr_pcrel_12: + case ARM::fixup_arm_condbranch: + case ARM::fixup_arm_uncondbranch: + RelocType = unsigned(macho::RIT_ARM_Branch24Bit); + // Report as 'long', even though that is not quite accurate. + Log2Size = llvm::Log2_32(4); + return true; + + // Handle Thumb branches. + case ARM::fixup_arm_thumb_br: + RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit); + Log2Size = llvm::Log2_32(2); + return true; + + case ARM::fixup_t2_uncondbranch: + case ARM::fixup_arm_thumb_bl: + case ARM::fixup_arm_thumb_blx: + RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit); + Log2Size = llvm::Log2_32(4); + return true; + + case ARM::fixup_arm_movt_hi16: + case ARM::fixup_arm_movt_hi16_pcrel: + case ARM::fixup_t2_movt_hi16: + case ARM::fixup_t2_movt_hi16_pcrel: + RelocType = unsigned(macho::RIT_ARM_HalfDifference); + // Report as 'long', even though that is not quite accurate. + Log2Size = llvm::Log2_32(4); + return true; + + case ARM::fixup_arm_movw_lo16: + case ARM::fixup_arm_movw_lo16_pcrel: + case ARM::fixup_t2_movw_lo16: + case ARM::fixup_t2_movw_lo16_pcrel: + RelocType = unsigned(macho::RIT_ARM_Half); + // Report as 'long', even though that is not quite accurate. + Log2Size = llvm::Log2_32(4); + return true; + } +} + +void ARMMachObjectWriter:: +RecordARMMovwMovtRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned Type = macho::RIT_ARM_Half; + + // See <reloc.h>. + const MCSymbol *A = &Target.getSymA()->getSymbol(); + MCSymbolData *A_SD = &Asm.getSymbolData(*A); + + if (!A_SD->getFragment()) + report_fatal_error("symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + + uint32_t Value = Writer->getSymbolAddress(A_SD, Layout); + uint32_t Value2 = 0; + uint64_t SecAddr = + Writer->getSectionAddress(A_SD->getFragment()->getParent()); + FixedValue += SecAddr; + + if (const MCSymbolRefExpr *B = Target.getSymB()) { + MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol()); + + if (!B_SD->getFragment()) + report_fatal_error("symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + + // Select the appropriate difference relocation type. + Type = macho::RIT_ARM_HalfDifference; + Value2 = Writer->getSymbolAddress(B_SD, Layout); + FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent()); + } + + // Relocations are written out in reverse order, so the PAIR comes first. + // ARM_RELOC_HALF and ARM_RELOC_HALF_SECTDIFF abuse the r_length field: + // + // For these two r_type relocations they always have a pair following them and + // the r_length bits are used differently. The encoding of the r_length is as + // follows: + // low bit of r_length: + // 0 - :lower16: for movw instructions + // 1 - :upper16: for movt instructions + // high bit of r_length: + // 0 - arm instructions + // 1 - thumb instructions + // the other half of the relocated expression is in the following pair + // relocation entry in the the low 16 bits of r_address field. + unsigned ThumbBit = 0; + unsigned MovtBit = 0; + switch ((unsigned)Fixup.getKind()) { + default: break; + case ARM::fixup_arm_movt_hi16: + case ARM::fixup_arm_movt_hi16_pcrel: + MovtBit = 1; + break; + case ARM::fixup_t2_movt_hi16: + case ARM::fixup_t2_movt_hi16_pcrel: + MovtBit = 1; + // Fallthrough + case ARM::fixup_t2_movw_lo16: + case ARM::fixup_t2_movw_lo16_pcrel: + ThumbBit = 1; + break; + } + + + if (Type == macho::RIT_ARM_HalfDifference) { + uint32_t OtherHalf = MovtBit + ? (FixedValue & 0xffff) : ((FixedValue & 0xffff0000) >> 16); + + macho::RelocationEntry MRE; + MRE.Word0 = ((OtherHalf << 0) | + (macho::RIT_Pair << 24) | + (MovtBit << 28) | + (ThumbBit << 29) | + (IsPCRel << 30) | + macho::RF_Scattered); + MRE.Word1 = Value2; + Writer->addRelocation(Fragment->getParent(), MRE); + } + + macho::RelocationEntry MRE; + MRE.Word0 = ((FixupOffset << 0) | + (Type << 24) | + (MovtBit << 28) | + (ThumbBit << 29) | + (IsPCRel << 30) | + macho::RF_Scattered); + MRE.Word1 = Value; + Writer->addRelocation(Fragment->getParent(), MRE); +} + +void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + unsigned Log2Size, + uint64_t &FixedValue) { + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned Type = macho::RIT_Vanilla; + + // See <reloc.h>. + const MCSymbol *A = &Target.getSymA()->getSymbol(); + MCSymbolData *A_SD = &Asm.getSymbolData(*A); + + if (!A_SD->getFragment()) + report_fatal_error("symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + + uint32_t Value = Writer->getSymbolAddress(A_SD, Layout); + uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent()); + FixedValue += SecAddr; + uint32_t Value2 = 0; + + if (const MCSymbolRefExpr *B = Target.getSymB()) { + MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol()); + + if (!B_SD->getFragment()) + report_fatal_error("symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + + // Select the appropriate difference relocation type. + Type = macho::RIT_Difference; + Value2 = Writer->getSymbolAddress(B_SD, Layout); + FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent()); + } + + // Relocations are written out in reverse order, so the PAIR comes first. + if (Type == macho::RIT_Difference || + Type == macho::RIT_Generic_LocalDifference) { + macho::RelocationEntry MRE; + MRE.Word0 = ((0 << 0) | + (macho::RIT_Pair << 24) | + (Log2Size << 28) | + (IsPCRel << 30) | + macho::RF_Scattered); + MRE.Word1 = Value2; + Writer->addRelocation(Fragment->getParent(), MRE); + } + + macho::RelocationEntry MRE; + MRE.Word0 = ((FixupOffset << 0) | + (Type << 24) | + (Log2Size << 28) | + (IsPCRel << 30) | + macho::RF_Scattered); + MRE.Word1 = Value; + Writer->addRelocation(Fragment->getParent(), MRE); +} + +void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned Log2Size; + unsigned RelocType = macho::RIT_Vanilla; + if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) { + report_fatal_error("unknown ARM fixup kind!"); + return; + } + + // If this is a difference or a defined symbol plus an offset, then we need a + // scattered relocation entry. Differences always require scattered + // relocations. + if (Target.getSymB()) { + if (RelocType == macho::RIT_ARM_Half || + RelocType == macho::RIT_ARM_HalfDifference) + return RecordARMMovwMovtRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, FixedValue); + return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, Log2Size, FixedValue); + } + + // Get the symbol data, if any. + MCSymbolData *SD = 0; + if (Target.getSymA()) + SD = &Asm.getSymbolData(Target.getSymA()->getSymbol()); + + // FIXME: For other platforms, we need to use scattered relocations for + // internal relocations with offsets. If this is an internal relocation with + // an offset, it also needs a scattered relocation entry. + // + // Is this right for ARM? + uint32_t Offset = Target.getConstant(); + if (IsPCRel && RelocType == macho::RIT_Vanilla) + Offset += 1 << Log2Size; + if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD)) + return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, Log2Size, FixedValue); + + // See <reloc.h>. + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned Index = 0; + unsigned IsExtern = 0; + unsigned Type = 0; + + if (Target.isAbsolute()) { // constant + // FIXME! + report_fatal_error("FIXME: relocations to absolute targets " + "not yet implemented"); + } else { + // Resolve constant variables. + if (SD->getSymbol().isVariable()) { + int64_t Res; + if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + } + + // Check whether we need an external or internal relocation. + if (Writer->doesSymbolRequireExternRelocation(SD)) { + IsExtern = 1; + Index = SD->getIndex(); + + // For external relocations, make sure to offset the fixup value to + // compensate for the addend of the symbol address, if it was + // undefined. This occurs with weak definitions, for example. + if (!SD->Symbol->isUndefined()) + FixedValue -= Layout.getSymbolOffset(SD); + } else { + // The index is the section ordinal (1-based). + const MCSectionData &SymSD = Asm.getSectionData( + SD->getSymbol().getSection()); + Index = SymSD.getOrdinal() + 1; + FixedValue += Writer->getSectionAddress(&SymSD); + } + if (IsPCRel) + FixedValue -= Writer->getSectionAddress(Fragment->getParent()); + + // The type is determined by the fixup kind. + Type = RelocType; + } + + // struct relocation_info (8 bytes) + macho::RelocationEntry MRE; + MRE.Word0 = FixupOffset; + MRE.Word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (IsExtern << 27) | + (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); +} + +MCObjectWriter *llvm::createARMMachObjectWriter(raw_ostream &OS, + bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter(new ARMMachObjectWriter(Is64Bit, + CPUType, + CPUSubtype), + OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp new file mode 100644 index 0000000..2df0053 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp @@ -0,0 +1,331 @@ +//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ----------=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of +// multiple and add / sub instructions) when special VMLx hazards are detected. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mlx-expansion" +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +static cl::opt<bool> +ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden); +static cl::opt<unsigned> +ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden); + +STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded"); + +namespace { + struct MLxExpansion : public MachineFunctionPass { + static char ID; + MLxExpansion() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "ARM MLA / MLS expansion pass"; + } + + private: + const ARMBaseInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + + bool isA9; + unsigned MIIdx; + MachineInstr* LastMIs[4]; + SmallPtrSet<MachineInstr*, 4> IgnoreStall; + + void clearStack(); + void pushStack(MachineInstr *MI); + MachineInstr *getAccDefMI(MachineInstr *MI) const; + unsigned getDefReg(MachineInstr *MI) const; + bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const; + bool FindMLxHazard(MachineInstr *MI); + void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, + unsigned MulOpc, unsigned AddSubOpc, + bool NegAcc, bool HasLane); + bool ExpandFPMLxInstructions(MachineBasicBlock &MBB); + }; + char MLxExpansion::ID = 0; +} + +void MLxExpansion::clearStack() { + std::fill(LastMIs, LastMIs + 4, (MachineInstr*)0); + MIIdx = 0; +} + +void MLxExpansion::pushStack(MachineInstr *MI) { + LastMIs[MIIdx] = MI; + if (++MIIdx == 4) + MIIdx = 0; +} + +MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { + // Look past COPY and INSERT_SUBREG instructions to find the + // real definition MI. This is important for _sfp instructions. + unsigned Reg = MI->getOperand(1).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return 0; + + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *DefMI = MRI->getVRegDef(Reg); + while (true) { + if (DefMI->getParent() != MBB) + break; + if (DefMI->isCopyLike()) { + Reg = DefMI->getOperand(1).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + DefMI = MRI->getVRegDef(Reg); + continue; + } + } else if (DefMI->isInsertSubreg()) { + Reg = DefMI->getOperand(2).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + DefMI = MRI->getVRegDef(Reg); + continue; + } + } + break; + } + return DefMI; +} + +unsigned MLxExpansion::getDefReg(MachineInstr *MI) const { + unsigned Reg = MI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || + !MRI->hasOneNonDBGUse(Reg)) + return Reg; + + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *UseMI = &*MRI->use_nodbg_begin(Reg); + if (UseMI->getParent() != MBB) + return Reg; + + while (UseMI->isCopy() || UseMI->isInsertSubreg()) { + Reg = UseMI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || + !MRI->hasOneNonDBGUse(Reg)) + return Reg; + UseMI = &*MRI->use_nodbg_begin(Reg); + if (UseMI->getParent() != MBB) + return Reg; + } + + return Reg; +} + +bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const { + // FIXME: Detect integer instructions properly. + const MCInstrDesc &MCID = MI->getDesc(); + unsigned Domain = MCID.TSFlags & ARMII::DomainMask; + if (MCID.mayStore()) + return false; + unsigned Opcode = MCID.getOpcode(); + if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD) + return false; + if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON)) + return MI->readsRegister(Reg, TRI); + return false; +} + + +bool MLxExpansion::FindMLxHazard(MachineInstr *MI) { + if (NumExpand >= ExpandLimit) + return false; + + if (ForceExapnd) + return true; + + MachineInstr *DefMI = getAccDefMI(MI); + if (TII->isFpMLxInstruction(DefMI->getOpcode())) { + // r0 = vmla + // r3 = vmla r0, r1, r2 + // takes 16 - 17 cycles + // + // r0 = vmla + // r4 = vmul r1, r2 + // r3 = vadd r0, r4 + // takes about 14 - 15 cycles even with vmul stalling for 4 cycles. + IgnoreStall.insert(DefMI); + return true; + } + + if (IgnoreStall.count(MI)) + return false; + + // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the + // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall + // preserves the in-order retirement of the instructions. + // Look at the next few instructions, if *most* of them can cause hazards, + // then the scheduler can't *fix* this, we'd better break up the VMLA. + unsigned Limit1 = isA9 ? 1 : 4; + unsigned Limit2 = isA9 ? 1 : 4; + for (unsigned i = 1; i <= 4; ++i) { + int Idx = ((int)MIIdx - i + 4) % 4; + MachineInstr *NextMI = LastMIs[Idx]; + if (!NextMI) + continue; + + if (TII->canCauseFpMLxStall(NextMI->getOpcode())) { + if (i <= Limit1) + return true; + } + + // Look for VMLx RAW hazard. + if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI)) + return true; + } + + return false; +} + +/// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair +/// of MUL + ADD / SUB instructions. +void +MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, + unsigned MulOpc, unsigned AddSubOpc, + bool NegAcc, bool HasLane) { + unsigned DstReg = MI->getOperand(0).getReg(); + bool DstDead = MI->getOperand(0).isDead(); + unsigned AccReg = MI->getOperand(1).getReg(); + unsigned Src1Reg = MI->getOperand(2).getReg(); + unsigned Src2Reg = MI->getOperand(3).getReg(); + bool Src1Kill = MI->getOperand(2).isKill(); + bool Src2Kill = MI->getOperand(3).isKill(); + unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0; + unsigned NextOp = HasLane ? 5 : 4; + ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm(); + unsigned PredReg = MI->getOperand(++NextOp).getReg(); + + const MCInstrDesc &MCID1 = TII->get(MulOpc); + const MCInstrDesc &MCID2 = TII->get(AddSubOpc); + unsigned TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI)); + + MachineInstrBuilder MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), MCID1, TmpReg) + .addReg(Src1Reg, getKillRegState(Src1Kill)) + .addReg(Src2Reg, getKillRegState(Src2Kill)); + if (HasLane) + MIB.addImm(LaneImm); + MIB.addImm(Pred).addReg(PredReg); + + MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), MCID2) + .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead)); + + if (NegAcc) { + bool AccKill = MRI->hasOneNonDBGUse(AccReg); + MIB.addReg(TmpReg, getKillRegState(true)) + .addReg(AccReg, getKillRegState(AccKill)); + } else { + MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true)); + } + MIB.addImm(Pred).addReg(PredReg); + + DEBUG({ + dbgs() << "Expanding: " << *MI; + dbgs() << " to:\n"; + MachineBasicBlock::iterator MII = MI; + MII = llvm::prior(MII); + MachineInstr &MI2 = *MII; + MII = llvm::prior(MII); + MachineInstr &MI1 = *MII; + dbgs() << " " << MI1; + dbgs() << " " << MI2; + }); + + MI->eraseFromParent(); + ++NumExpand; +} + +bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) { + bool Changed = false; + + clearStack(); + IgnoreStall.clear(); + + unsigned Skip = 0; + MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend(); + while (MII != E) { + MachineInstr *MI = &*MII; + + if (MI->isLabel() || MI->isImplicitDef() || MI->isCopy()) { + ++MII; + continue; + } + + const MCInstrDesc &MCID = MI->getDesc(); + if (MCID.isBarrier()) { + clearStack(); + Skip = 0; + ++MII; + continue; + } + + unsigned Domain = MCID.TSFlags & ARMII::DomainMask; + if (Domain == ARMII::DomainGeneral) { + if (++Skip == 2) + // Assume dual issues of non-VFP / NEON instructions. + pushStack(0); + } else { + Skip = 0; + + unsigned MulOpc, AddSubOpc; + bool NegAcc, HasLane; + if (!TII->isFpMLxInstruction(MCID.getOpcode(), + MulOpc, AddSubOpc, NegAcc, HasLane) || + !FindMLxHazard(MI)) + pushStack(MI); + else { + ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane); + E = MBB.rend(); // May have changed if MI was the 1st instruction. + Changed = true; + continue; + } + } + + ++MII; + } + + return Changed; +} + +bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) { + TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo()); + TRI = Fn.getTarget().getRegisterInfo(); + MRI = &Fn.getRegInfo(); + const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>(); + isA9 = STI->isCortexA9(); + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + Modified |= ExpandFPMLxInstructions(MBB); + } + + return Modified; +} + +FunctionPass *llvm::createMLxExpansionPass() { + return new MLxExpansion(); +} diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp new file mode 100644 index 0000000..500e3de --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp @@ -0,0 +1,23 @@ +//===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "llvm/Module.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +Target llvm::TheARMTarget, llvm::TheThumbTarget; + +extern "C" void LLVMInitializeARMTargetInfo() { + RegisterTarget<Triple::arm, /*HasJIT=*/true> + X(TheARMTarget, "arm", "ARM"); + + RegisterTarget<Triple::thumb, /*HasJIT=*/true> + Y(TheThumbTarget, "thumb", "Thumb"); +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp new file mode 100644 index 0000000..d848177 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -0,0 +1,366 @@ +//======- Thumb1FrameLowering.cpp - Thumb1 Frame Information ---*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb1 implementation of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#include "Thumb1FrameLowering.h" +#include "ARMBaseInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{ + const MachineFrameInfo *FFI = MF.getFrameInfo(); + unsigned CFSize = FFI->getMaxCallFrameSize(); + // It's not always a good idea to include the call frame as part of the + // stack frame. ARM (especially Thumb) has small immediate offset to + // address the stack frame. So a large call frame can cause poor codegen + // and may even makes it impossible to scavenge a register. + if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4 + return false; + + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +static void +emitSPUpdate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetInstrInfo &TII, DebugLoc dl, + const Thumb1RegisterInfo &MRI, + int NumBytes, unsigned MIFlags = MachineInstr::NoFlags) { + emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII, + MRI, MIFlags); +} + +void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const Thumb1RegisterInfo *RegInfo = + static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo()); + const Thumb1InstrInfo &TII = + *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo()); + + unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + unsigned NumBytes = MFI->getStackSize(); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned BasePtr = RegInfo->getBaseRegister(); + + // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4. + NumBytes = (NumBytes + 3) & ~3; + MFI->setStackSize(NumBytes); + + // Determine the sizes of each callee-save spill areas and record which frame + // belongs to which callee-save spill areas. + unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; + int FramePtrSpillFI = 0; + + if (VARegSaveSize) + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -VARegSaveSize, + MachineInstr::FrameSetup); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes, + MachineInstr::FrameSetup); + return; + } + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + int FI = CSI[i].getFrameIdx(); + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + break; + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + if (STI.isTargetDarwin()) { + AFI->addGPRCalleeSavedArea2Frame(FI); + GPRCS2Size += 4; + } else { + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + } + break; + default: + AFI->addDPRCalleeSavedAreaFrame(FI); + DPRCSSize += 8; + } + } + + if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) { + ++MBBI; + if (MBBI != MBB.end()) + dl = MBBI->getDebugLoc(); + } + + // Determine starting offsets of spill areas. + unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); + unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; + unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; + AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes); + AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); + AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); + AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); + NumBytes = DPRCSOffset; + + // Adjust FP so it point to the stack slot that contains the previous FP. + if (hasFP(MF)) { + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) + .addFrameIndex(FramePtrSpillFI).addImm(0) + .setMIFlags(MachineInstr::FrameSetup)); + if (NumBytes > 508) + // If offset is > 508 then sp cannot be adjusted in a single instruction, + // try restoring from fp instead. + AFI->setShouldRestoreSPFromFP(true); + } + + if (NumBytes) + // Insert it after all the callee-save spills. + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes, + MachineInstr::FrameSetup); + + if (STI.isTargetELF() && hasFP(MF)) + MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - + AFI->getFramePtrSpillOffset()); + + AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); + AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); + AFI->setDPRCalleeSavedAreaSize(DPRCSSize); + + // Thumb1 does not currently support dynamic stack realignment. Report a + // fatal error rather then silently generate bad code. + if (RegInfo->needsStackRealignment(MF)) + report_fatal_error("Dynamic stack realignment not supported for thumb1."); + + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + if (RegInfo->hasBasePointer(MF)) + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), BasePtr) + .addReg(ARM::SP)); + + // If the frame has variable sized objects then the epilogue must restore + // the sp from fp. We can assume there's an FP here since hasFP already + // checks for hasVarSizedObjects. + if (MFI->hasVarSizedObjects()) + AFI->setShouldRestoreSPFromFP(true); +} + +static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { + for (unsigned i = 0; CSRegs[i]; ++i) + if (Reg == CSRegs[i]) + return true; + return false; +} + +static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) { + if (MI->getOpcode() == ARM::tLDRspi && + MI->getOperand(1).isFI() && + isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs)) + return true; + else if (MI->getOpcode() == ARM::tPOP) { + // The first two operands are predicates. The last two are + // imp-def and imp-use of SP. Check everything in between. + for (int i = 2, e = MI->getNumOperands() - 2; i != e; ++i) + if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs)) + return false; + return true; + } + return false; +} + +void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert((MBBI->getOpcode() == ARM::tBX_RET || + MBBI->getOpcode() == ARM::tPOP_RET) && + "Can only insert epilog into returning blocks"); + DebugLoc dl = MBBI->getDebugLoc(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const Thumb1RegisterInfo *RegInfo = + static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo()); + const Thumb1InstrInfo &TII = + *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo()); + + unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + int NumBytes = (int)MFI->getStackSize(); + const unsigned *CSRegs = RegInfo->getCalleeSavedRegs(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes); + } else { + // Unwind MBBI to point to first LDR / VLDRD. + if (MBBI != MBB.begin()) { + do + --MBBI; + while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs)); + if (!isCSRestore(MBBI, CSRegs)) + ++MBBI; + } + + // Move SP to start of FP callee save spill area. + NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + + AFI->getGPRCalleeSavedArea2Size() + + AFI->getDPRCalleeSavedAreaSize()); + + if (AFI->shouldRestoreSPFromFP()) { + NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; + // Reset SP based on frame pointer only if the stack frame extends beyond + // frame pointer stack slot, the target is ELF and the function has FP, or + // the target uses var sized objects. + if (NumBytes) { + assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) && + "No scratch register to restore SP from FP!"); + emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, + TII, *RegInfo); + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), + ARM::SP) + .addReg(ARM::R4)); + } else + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), + ARM::SP) + .addReg(FramePtr)); + } else { + if (MBBI->getOpcode() == ARM::tBX_RET && + &MBB.front() != MBBI && + prior(MBBI)->getOpcode() == ARM::tPOP) { + MachineBasicBlock::iterator PMBBI = prior(MBBI); + emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes); + } else + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes); + } + } + + if (VARegSaveSize) { + // Unlike T2 and ARM mode, the T1 pop instruction cannot restore + // to LR, and we can't pop the value directly to the PC since + // we need to update the SP after popping the value. Therefore, we + // pop the old LR into R3 as a temporary. + + // Move back past the callee-saved register restoration + while (MBBI != MBB.end() && isCSRestore(MBBI, CSRegs)) + ++MBBI; + // Epilogue for vararg functions: pop LR to R3 and branch off it. + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) + .addReg(ARM::R3, RegState::Define); + + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, VARegSaveSize); + + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg)) + .addReg(ARM::R3, RegState::Kill)); + // erase the old tBX_RET instruction + MBB.erase(MBBI); + } +} + +bool Thumb1FrameLowering:: +spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + DebugLoc DL; + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + if (MI != MBB.end()) DL = MI->getDebugLoc(); + + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)); + AddDefaultPred(MIB); + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + bool isKill = true; + + // Add the callee-saved register as live-in unless it's LR and + // @llvm.returnaddress is called. If LR is returned for @llvm.returnaddress + // then it's already added to the function and entry block live-in sets. + if (Reg == ARM::LR) { + MachineFunction &MF = *MBB.getParent(); + if (MF.getFrameInfo()->isReturnAddressTaken() && + MF.getRegInfo().isLiveIn(Reg)) + isKill = false; + } + + if (isKill) + MBB.addLiveIn(Reg); + + MIB.addReg(Reg, getKillRegState(isKill)); + } + MIB.setMIFlags(MachineInstr::FrameSetup); + return true; +} + +bool Thumb1FrameLowering:: +restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + bool isVarArg = AFI->getVarArgsRegSaveSize() > 0; + DebugLoc DL = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP)); + AddDefaultPred(MIB); + + bool NumRegs = false; + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (Reg == ARM::LR) { + // Special epilogue for vararg functions. See emitEpilogue + if (isVarArg) + continue; + Reg = ARM::PC; + (*MIB).setDesc(TII.get(ARM::tPOP_RET)); + MI = MBB.erase(MI); + } + MIB.addReg(Reg, getDefRegState(true)); + NumRegs = true; + } + + // It's illegal to emit pop instruction without operands. + if (NumRegs) + MBB.insert(MI, &*MIB); + else + MF.DeleteMachineInstr(MIB); + + return true; +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h new file mode 100644 index 0000000..bcfc516 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h @@ -0,0 +1,52 @@ +//===-- Thumb1FrameLowering.h - Thumb1-specific frame info stuff --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef __THUMB_FRAMEINFO_H_ +#define __THUMB_FRAMEINFO_H_ + +#include "ARM.h" +#include "ARMFrameLowering.h" +#include "ARMSubtarget.h" +#include "Thumb1InstrInfo.h" +#include "Thumb1RegisterInfo.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + class ARMSubtarget; + +class Thumb1FrameLowering : public ARMFrameLowering { +public: + explicit Thumb1FrameLowering(const ARMSubtarget &sti) + : ARMFrameLowering(sti) { + } + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool hasReservedCallFrame(const MachineFunction &MF) const; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp new file mode 100644 index 0000000..218311d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -0,0 +1,100 @@ +//===- Thumb1InstrInfo.cpp - Thumb-1 Instruction Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-1 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "Thumb1InstrInfo.h" +#include "ARM.h" +#include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/ADT/SmallVector.h" +#include "Thumb1InstrInfo.h" + +using namespace llvm; + +Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI), RI(*this, STI) { +} + +unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const { + return 0; +} + +void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc))); + assert(ARM::GPRRegClass.contains(DestReg, SrcReg) && + "Thumb1 can only copy GPR registers"); +} + +void Thumb1InstrInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + assert((RC == ARM::tGPRRegisterClass || + (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + isARMLowRegister(SrcReg))) && "Unknown regclass!"); + + if (RC == ARM::tGPRRegisterClass || + (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + isARMLowRegister(SrcReg))) { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand( + MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOStore, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSTRspi)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } +} + +void Thumb1InstrInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + assert((RC == ARM::tGPRRegisterClass || + (TargetRegisterInfo::isPhysicalRegister(DestReg) && + isARMLowRegister(DestReg))) && "Unknown regclass!"); + + if (RC == ARM::tGPRRegisterClass || + (TargetRegisterInfo::isPhysicalRegister(DestReg) && + isARMLowRegister(DestReg))) { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand( + MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + } +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h new file mode 100644 index 0000000..17ef2f7 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -0,0 +1,59 @@ +//===- Thumb1InstrInfo.h - Thumb-1 Instruction Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-1 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef THUMB1INSTRUCTIONINFO_H +#define THUMB1INSTRUCTIONINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "ARM.h" +#include "ARMInstrInfo.h" +#include "Thumb1RegisterInfo.h" + +namespace llvm { + class ARMSubtarget; + +class Thumb1InstrInfo : public ARMBaseInstrInfo { + Thumb1RegisterInfo RI; +public: + explicit Thumb1InstrInfo(const ARMSubtarget &STI); + + // Return the non-pre/post incrementing version of 'Opc'. Return 0 + // if there is not such an opcode. + unsigned getUnindexedOpcode(unsigned Opc) const; + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + const Thumb1RegisterInfo &getRegisterInfo() const { return RI; } + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + +}; +} + +#endif // THUMB1INSTRUCTIONINFO_H diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp new file mode 100644 index 0000000..e8ed482 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -0,0 +1,706 @@ +//===- Thumb1RegisterInfo.cpp - Thumb-1 Register Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-1 implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMSubtarget.h" +#include "Thumb1InstrInfo.h" +#include "Thumb1RegisterInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/LLVMContext.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +extern cl::opt<bool> ReuseFrameIndexVals; +} + +using namespace llvm; + +Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, + const ARMSubtarget &sti) + : ARMBaseRegisterInfo(tii, sti) { +} + +const TargetRegisterClass* +Thumb1RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) + const { + if (ARM::tGPRRegClass.hasSubClassEq(RC)) + return ARM::tGPRRegisterClass; + return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC); +} + +const TargetRegisterClass * +Thumb1RegisterInfo::getPointerRegClass(unsigned Kind) const { + return ARM::tGPRRegisterClass; +} + +/// emitLoadConstPool - Emits a load from constpool to materialize the +/// specified immediate. +void +Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned MIFlags) const { + MachineFunction &MF = *MBB.getParent(); + MachineConstantPool *ConstantPool = MF.getConstantPool(); + const Constant *C = ConstantInt::get( + Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg) + .setMIFlags(MIFlags); +} + + +/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize +/// a destreg = basereg + immediate in Thumb code. Materialize the immediate +/// in a register using mov / mvn sequences or load the immediate from a +/// constpool entry. +static +void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned BaseReg, + int NumBytes, bool CanChangeCC, + const TargetInstrInfo &TII, + const ARMBaseRegisterInfo& MRI, + unsigned MIFlags = MachineInstr::NoFlags) { + MachineFunction &MF = *MBB.getParent(); + bool isHigh = !isARMLowRegister(DestReg) || + (BaseReg != 0 && !isARMLowRegister(BaseReg)); + bool isSub = false; + // Subtract doesn't have high register version. Load the negative value + // if either base or dest register is a high register. Also, if do not + // issue sub as part of the sequence if condition register is to be + // preserved. + if (NumBytes < 0 && !isHigh && CanChangeCC) { + isSub = true; + NumBytes = -NumBytes; + } + unsigned LdReg = DestReg; + if (DestReg == ARM::SP) { + assert(BaseReg == ARM::SP && "Unexpected!"); + LdReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); + } + + if (NumBytes <= 255 && NumBytes >= 0) + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)) + .addImm(NumBytes).setMIFlags(MIFlags); + else if (NumBytes < 0 && NumBytes >= -255) { + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)) + .addImm(NumBytes).setMIFlags(MIFlags); + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg)) + .addReg(LdReg, RegState::Kill).setMIFlags(MIFlags); + } else + MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes, + ARMCC::AL, 0, MIFlags); + + // Emit add / sub. + int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr); + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); + if (Opc != ARM::tADDhirr) + MIB = AddDefaultT1CC(MIB); + if (DestReg == ARM::SP || isSub) + MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill); + else + MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill); + AddDefaultPred(MIB); +} + +/// calcNumMI - Returns the number of instructions required to materialize +/// the specific add / sub r, c instruction. +static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes, + unsigned NumBits, unsigned Scale) { + unsigned NumMIs = 0; + unsigned Chunk = ((1 << NumBits) - 1) * Scale; + + if (Opc == ARM::tADDrSPi) { + unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; + Bytes -= ThisVal; + NumMIs++; + NumBits = 8; + Scale = 1; // Followed by a number of tADDi8. + Chunk = ((1 << NumBits) - 1) * Scale; + } + + NumMIs += Bytes / Chunk; + if ((Bytes % Chunk) != 0) + NumMIs++; + if (ExtraOpc) + NumMIs++; + return NumMIs; +} + +/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize +/// a destreg = basereg + immediate in Thumb code. +void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned BaseReg, + int NumBytes, const TargetInstrInfo &TII, + const ARMBaseRegisterInfo& MRI, + unsigned MIFlags) { + bool isSub = NumBytes < 0; + unsigned Bytes = (unsigned)NumBytes; + if (isSub) Bytes = -NumBytes; + bool isMul4 = (Bytes & 3) == 0; + bool isTwoAddr = false; + bool DstNotEqBase = false; + unsigned NumBits = 1; + unsigned Scale = 1; + int Opc = 0; + int ExtraOpc = 0; + bool NeedCC = false; + + if (DestReg == BaseReg && BaseReg == ARM::SP) { + assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!"); + NumBits = 7; + Scale = 4; + Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; + isTwoAddr = true; + } else if (!isSub && BaseReg == ARM::SP) { + // r1 = add sp, 403 + // => + // r1 = add sp, 100 * 4 + // r1 = add r1, 3 + if (!isMul4) { + Bytes &= ~3; + ExtraOpc = ARM::tADDi3; + } + NumBits = 8; + Scale = 4; + Opc = ARM::tADDrSPi; + } else { + // sp = sub sp, c + // r1 = sub sp, c + // r8 = sub sp, c + if (DestReg != BaseReg) + DstNotEqBase = true; + NumBits = 8; + if (DestReg == ARM::SP) { + Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; + assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!"); + NumBits = 7; + Scale = 4; + } else { + Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8; + NumBits = 8; + NeedCC = true; + } + isTwoAddr = true; + } + + unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale); + unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2; + if (NumMIs > Threshold) { + // This will expand into too many instructions. Load the immediate from a + // constpool entry. + emitThumbRegPlusImmInReg(MBB, MBBI, dl, + DestReg, BaseReg, NumBytes, true, + TII, MRI, MIFlags); + return; + } + + if (DstNotEqBase) { + if (isARMLowRegister(DestReg) && isARMLowRegister(BaseReg)) { + // If both are low registers, emit DestReg = add BaseReg, max(Imm, 7) + unsigned Chunk = (1 << 3) - 1; + unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; + Bytes -= ThisVal; + const MCInstrDesc &MCID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3); + const MachineInstrBuilder MIB = + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg) + .setMIFlags(MIFlags)); + AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal)); + } else { + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg) + .addReg(BaseReg, RegState::Kill)) + .setMIFlags(MIFlags); + } + BaseReg = DestReg; + } + + unsigned Chunk = ((1 << NumBits) - 1) * Scale; + while (Bytes) { + unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; + Bytes -= ThisVal; + ThisVal /= Scale; + // Build the new tADD / tSUB. + if (isTwoAddr) { + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); + if (NeedCC) + MIB = AddDefaultT1CC(MIB); + MIB.addReg(DestReg).addImm(ThisVal); + MIB = AddDefaultPred(MIB); + MIB.setMIFlags(MIFlags); + } else { + bool isKill = BaseReg != ARM::SP; + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); + if (NeedCC) + MIB = AddDefaultT1CC(MIB); + MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal); + MIB = AddDefaultPred(MIB); + MIB.setMIFlags(MIFlags); + + BaseReg = DestReg; + if (Opc == ARM::tADDrSPi) { + // r4 = add sp, imm + // r4 = add r4, imm + // ... + NumBits = 8; + Scale = 1; + Chunk = ((1 << NumBits) - 1) * Scale; + Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8; + NeedCC = isTwoAddr = true; + } + } + } + + if (ExtraOpc) { + const MCInstrDesc &MCID = TII.get(ExtraOpc); + AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg)) + .addReg(DestReg, RegState::Kill) + .addImm(((unsigned)NumBytes) & 3) + .setMIFlags(MIFlags)); + } +} + +static void emitSPUpdate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetInstrInfo &TII, DebugLoc dl, + const Thumb1RegisterInfo &MRI, + int NumBytes) { + emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII, + MRI); +} + +void Thumb1RegisterInfo:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + if (!TFI->hasReservedCallFrame(MF)) { + // If we have alloca, convert as follows: + // ADJCALLSTACKDOWN -> sub, sp, sp, amount + // ADJCALLSTACKUP -> add, sp, sp, amount + MachineInstr *Old = I; + DebugLoc dl = Old->getDebugLoc(); + unsigned Amount = Old->getOperand(0).getImm(); + if (Amount != 0) { + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. + unsigned Align = TFI->getStackAlignment(); + Amount = (Amount+Align-1)/Align*Align; + + // Replace the pseudo instruction with a new instruction... + unsigned Opc = Old->getOpcode(); + if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { + emitSPUpdate(MBB, I, TII, dl, *this, -Amount); + } else { + assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); + emitSPUpdate(MBB, I, TII, dl, *this, Amount); + } + } + } + MBB.erase(I); +} + +/// emitThumbConstant - Emit a series of instructions to materialize a +/// constant. +static void emitThumbConstant(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned DestReg, int Imm, + const TargetInstrInfo &TII, + const Thumb1RegisterInfo& MRI, + DebugLoc dl) { + bool isSub = Imm < 0; + if (isSub) Imm = -Imm; + + int Chunk = (1 << 8) - 1; + int ThisVal = (Imm > Chunk) ? Chunk : Imm; + Imm -= ThisVal; + AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), + DestReg)) + .addImm(ThisVal)); + if (Imm > 0) + emitThumbRegPlusImmediate(MBB, MBBI, dl, DestReg, DestReg, Imm, TII, MRI); + if (isSub) { + const MCInstrDesc &MCID = TII.get(ARM::tRSB); + AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg)) + .addReg(DestReg, RegState::Kill)); + } +} + +static void removeOperands(MachineInstr &MI, unsigned i) { + unsigned Op = i; + for (unsigned e = MI.getNumOperands(); i != e; ++i) + MI.RemoveOperand(Op); +} + +/// convertToNonSPOpcode - Change the opcode to the non-SP version, because +/// we're replacing the frame index with a non-SP register. +static unsigned convertToNonSPOpcode(unsigned Opcode) { + switch (Opcode) { + case ARM::tLDRspi: + return ARM::tLDRi; + + case ARM::tSTRspi: + return ARM::tSTRi; + } + + return Opcode; +} + +bool Thumb1RegisterInfo:: +rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) const { + MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc dl = MI.getDebugLoc(); + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MI.getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + + if (Opcode == ARM::tADDrSPi) { + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + + // Can't use tADDrSPi if it's based off the frame pointer. + unsigned NumBits = 0; + unsigned Scale = 1; + if (FrameReg != ARM::SP) { + Opcode = ARM::tADDi3; + NumBits = 3; + } else { + NumBits = 8; + Scale = 4; + assert((Offset & 3) == 0 && + "Thumb add/sub sp, #imm immediate must be multiple of 4!"); + } + + unsigned PredReg; + if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) { + // Turn it into a move. + MI.setDesc(TII.get(ARM::tMOVr)); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + // Remove offset + MI.RemoveOperand(FrameRegIdx+1); + MachineInstrBuilder MIB(&MI); + return true; + } + + // Common case: small offset, fits into instruction. + unsigned Mask = (1 << NumBits) - 1; + if (((Offset / Scale) & ~Mask) == 0) { + // Replace the FrameIndex with sp / fp + if (Opcode == ARM::tADDi3) { + MI.setDesc(TII.get(Opcode)); + removeOperands(MI, FrameRegIdx); + MachineInstrBuilder MIB(&MI); + AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg) + .addImm(Offset / Scale)); + } else { + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset / Scale); + } + return true; + } + + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned Bytes = (Offset > 0) ? Offset : -Offset; + unsigned NumMIs = calcNumMI(Opcode, 0, Bytes, NumBits, Scale); + // MI would expand into a large number of instructions. Don't try to + // simplify the immediate. + if (NumMIs > 2) { + emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII, + *this); + MBB.erase(II); + return true; + } + + if (Offset > 0) { + // Translate r0 = add sp, imm to + // r0 = add sp, 255*4 + // r0 = add r0, (imm - 255*4) + if (Opcode == ARM::tADDi3) { + MI.setDesc(TII.get(Opcode)); + removeOperands(MI, FrameRegIdx); + MachineInstrBuilder MIB(&MI); + AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg).addImm(Mask)); + } else { + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Mask); + } + Offset = (Offset - Mask * Scale); + MachineBasicBlock::iterator NII = llvm::next(II); + emitThumbRegPlusImmediate(MBB, NII, dl, DestReg, DestReg, Offset, TII, + *this); + } else { + // Translate r0 = add sp, -imm to + // r0 = -imm (this is then translated into a series of instructons) + // r0 = add r0, sp + emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl); + + MI.setDesc(TII.get(ARM::tADDhirr)); + MI.getOperand(FrameRegIdx).ChangeToRegister(DestReg, false, false, true); + MI.getOperand(FrameRegIdx+1).ChangeToRegister(FrameReg, false); + } + return true; + } else { + if (AddrMode != ARMII::AddrModeT1_s) + llvm_unreachable("Unsupported addressing mode!"); + + unsigned ImmIdx = FrameRegIdx + 1; + int InstrOffs = MI.getOperand(ImmIdx).getImm(); + unsigned NumBits = (FrameReg == ARM::SP) ? 8 : 5; + unsigned Scale = 4; + + Offset += InstrOffs * Scale; + assert((Offset & (Scale - 1)) == 0 && "Can't encode this offset!"); + + // Common case: small offset, fits into instruction. + MachineOperand &ImmOp = MI.getOperand(ImmIdx); + int ImmedOffset = Offset / Scale; + unsigned Mask = (1 << NumBits) - 1; + + if ((unsigned)Offset <= Mask * Scale) { + // Replace the FrameIndex with the frame register (e.g., sp). + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + ImmOp.ChangeToImmediate(ImmedOffset); + + // If we're using a register where sp was stored, convert the instruction + // to the non-SP version. + unsigned NewOpc = convertToNonSPOpcode(Opcode); + if (NewOpc != Opcode && FrameReg != ARM::SP) + MI.setDesc(TII.get(NewOpc)); + + return true; + } + + NumBits = 5; + Mask = (1 << NumBits) - 1; + + // If this is a thumb spill / restore, we will be using a constpool load to + // materialize the offset. + if (Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) { + ImmOp.ChangeToImmediate(0); + } else { + // Otherwise, it didn't fit. Pull in what we can to simplify the immed. + ImmedOffset = ImmedOffset & Mask; + ImmOp.ChangeToImmediate(ImmedOffset); + Offset &= ~(Mask * Scale); + } + } + + return Offset == 0; +} + +void +Thumb1RegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I, + unsigned BaseReg, int64_t Offset) const { + MachineInstr &MI = *I; + int Off = Offset; // ARM doesn't need the general 64-bit offsets + unsigned i = 0; + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + bool Done = rewriteFrameIndex(MI, i, BaseReg, Off, TII); + assert (Done && "Unable to resolve frame index!"); + (void)Done; +} + +/// saveScavengerRegister - Spill the register so it can be used by the +/// register scavenger. Return true. +bool +Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator &UseMI, + const TargetRegisterClass *RC, + unsigned Reg) const { + // Thumb1 can't use the emergency spill slot on the stack because + // ldr/str immediate offsets must be positive, and if we're referencing + // off the frame pointer (if, for example, there are alloca() calls in + // the function, the offset will be negative. Use R12 instead since that's + // a call clobbered register that we know won't be used in Thumb1 mode. + DebugLoc DL; + AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr)) + .addReg(ARM::R12, RegState::Define) + .addReg(Reg, RegState::Kill)); + + // The UseMI is where we would like to restore the register. If there's + // interference with R12 before then, however, we'll need to restore it + // before that instead and adjust the UseMI. + bool done = false; + for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) { + if (II->isDebugValue()) + continue; + // If this instruction affects R12, adjust our restore point. + for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = II->getOperand(i); + if (!MO.isReg() || MO.isUndef() || !MO.getReg() || + TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + if (MO.getReg() == ARM::R12) { + UseMI = II; + done = true; + break; + } + } + } + // Restore the register from R12 + AddDefaultPred(BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr)). + addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill)); + + return true; +} + +void +Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS) const { + unsigned VReg = 0; + unsigned i = 0; + MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + DebugLoc dl = MI.getDebugLoc(); + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + + unsigned FrameReg = ARM::SP; + int FrameIndex = MI.getOperand(i).getIndex(); + int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + + MF.getFrameInfo()->getStackSize() + SPAdj; + + if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex)) + Offset -= AFI->getGPRCalleeSavedArea1Offset(); + else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex)) + Offset -= AFI->getGPRCalleeSavedArea2Offset(); + else if (MF.getFrameInfo()->hasVarSizedObjects()) { + assert(SPAdj == 0 && MF.getTarget().getFrameLowering()->hasFP(MF) && + "Unexpected"); + // There are alloca()'s in this function, must reference off the frame + // pointer or base pointer instead. + if (!hasBasePointer(MF)) { + FrameReg = getFrameRegister(MF); + Offset -= AFI->getFramePtrSpillOffset(); + } else + FrameReg = BasePtr; + } + + // Special handling of dbg_value instructions. + if (MI.isDebugValue()) { + MI.getOperand(i). ChangeToRegister(FrameReg, false /*isDef*/); + MI.getOperand(i+1).ChangeToImmediate(Offset); + return; + } + + // Modify MI as necessary to handle as much of 'Offset' as possible + assert(AFI->isThumbFunction() && + "This eliminateFrameIndex only supports Thumb1!"); + if (rewriteFrameIndex(MI, i, FrameReg, Offset, TII)) + return; + + // If we get here, the immediate doesn't fit into the instruction. We folded + // as much as possible above, handle the rest, providing a register that is + // SP+LargeImm. + assert(Offset && "This code isn't needed if offset already handled!"); + + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MI.getDesc(); + + // Remove predicate first. + int PIdx = MI.findFirstPredOperandIdx(); + if (PIdx != -1) + removeOperands(MI, PIdx); + + if (Desc.mayLoad()) { + // Use the destination register to materialize sp + offset. + unsigned TmpReg = MI.getOperand(0).getReg(); + bool UseRR = false; + if (Opcode == ARM::tLDRspi) { + if (FrameReg == ARM::SP) + emitThumbRegPlusImmInReg(MBB, II, dl, TmpReg, FrameReg, + Offset, false, TII, *this); + else { + emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset); + UseRR = true; + } + } else { + emitThumbRegPlusImmediate(MBB, II, dl, TmpReg, FrameReg, Offset, TII, + *this); + } + + MI.setDesc(TII.get(UseRR ? ARM::tLDRr : ARM::tLDRi)); + MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true); + if (UseRR) + // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame + // register. The offset is already handled in the vreg value. + MI.getOperand(i+1).ChangeToRegister(FrameReg, false, false, false); + } else if (Desc.mayStore()) { + VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass); + bool UseRR = false; + + if (Opcode == ARM::tSTRspi) { + if (FrameReg == ARM::SP) + emitThumbRegPlusImmInReg(MBB, II, dl, VReg, FrameReg, + Offset, false, TII, *this); + else { + emitLoadConstPool(MBB, II, dl, VReg, 0, Offset); + UseRR = true; + } + } else + emitThumbRegPlusImmediate(MBB, II, dl, VReg, FrameReg, Offset, TII, + *this); + MI.setDesc(TII.get(UseRR ? ARM::tSTRr : ARM::tSTRi)); + MI.getOperand(i).ChangeToRegister(VReg, false, false, true); + if (UseRR) + // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame + // register. The offset is already handled in the vreg value. + MI.getOperand(i+1).ChangeToRegister(FrameReg, false, false, false); + } else { + assert(false && "Unexpected opcode!"); + } + + // Add predicate back if it's needed. + if (MI.getDesc().isPredicable()) { + MachineInstrBuilder MIB(&MI); + AddDefaultPred(MIB); + } +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h new file mode 100644 index 0000000..9060e59 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h @@ -0,0 +1,69 @@ +//===- Thumb1RegisterInfo.h - Thumb-1 Register Information Impl -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-1 implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#ifndef THUMB1REGISTERINFO_H +#define THUMB1REGISTERINFO_H + +#include "ARM.h" +#include "ARMRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +namespace llvm { + class ARMSubtarget; + class ARMBaseInstrInfo; + class Type; + +struct Thumb1RegisterInfo : public ARMBaseRegisterInfo { +public: + Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI); + + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + + const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + + /// emitLoadConstPool - Emits a load from constpool to materialize the + /// specified immediate. + void emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0, + unsigned MIFlags = MachineInstr::NoFlags) const; + + /// Code Generation virtual methods... + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + // rewrite MI to access 'Offset' bytes from the FP. Update Offset to be + // however much remains to be handled. Return 'true' if no further + // work is required. + bool rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) const; + void resolveFrameIndex(MachineBasicBlock::iterator I, + unsigned BaseReg, int64_t Offset) const; + bool saveScavengerRegister(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator &UseMI, + const TargetRegisterClass *RC, + unsigned Reg) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; +}; +} + +#endif // THUMB1REGISTERINFO_H diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp new file mode 100644 index 0000000..b627400 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -0,0 +1,273 @@ +//===-- Thumb2ITBlockPass.cpp - Insert Thumb IT blocks ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "thumb2-it" +#include "ARM.h" +#include "ARMMachineFunctionInfo.h" +#include "Thumb2InstrInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumITs, "Number of IT blocks inserted"); +STATISTIC(NumMovedInsts, "Number of predicated instructions moved"); + +namespace { + class Thumb2ITBlockPass : public MachineFunctionPass { + bool PreRegAlloc; + + public: + static char ID; + Thumb2ITBlockPass() : MachineFunctionPass(ID) {} + + const Thumb2InstrInfo *TII; + const TargetRegisterInfo *TRI; + ARMFunctionInfo *AFI; + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "Thumb IT blocks insertion pass"; + } + + private: + bool MoveCopyOutOfITBlock(MachineInstr *MI, + ARMCC::CondCodes CC, ARMCC::CondCodes OCC, + SmallSet<unsigned, 4> &Defs, + SmallSet<unsigned, 4> &Uses); + bool InsertITInstructions(MachineBasicBlock &MBB); + }; + char Thumb2ITBlockPass::ID = 0; +} + +/// TrackDefUses - Tracking what registers are being defined and used by +/// instructions in the IT block. This also tracks "dependencies", i.e. uses +/// in the IT block that are defined before the IT instruction. +static void TrackDefUses(MachineInstr *MI, + SmallSet<unsigned, 4> &Defs, + SmallSet<unsigned, 4> &Uses, + const TargetRegisterInfo *TRI) { + SmallVector<unsigned, 4> LocalDefs; + SmallVector<unsigned, 4> LocalUses; + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP) + continue; + if (MO.isUse()) + LocalUses.push_back(Reg); + else + LocalDefs.push_back(Reg); + } + + for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) { + unsigned Reg = LocalUses[i]; + Uses.insert(Reg); + for (const unsigned *Subreg = TRI->getSubRegisters(Reg); + *Subreg; ++Subreg) + Uses.insert(*Subreg); + } + + for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { + unsigned Reg = LocalDefs[i]; + Defs.insert(Reg); + for (const unsigned *Subreg = TRI->getSubRegisters(Reg); + *Subreg; ++Subreg) + Defs.insert(*Subreg); + if (Reg == ARM::CPSR) + continue; + } +} + +static bool isCopy(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + return false; + case ARM::MOVr: + case ARM::MOVr_TC: + case ARM::tMOVr: + case ARM::t2MOVr: + return true; + } +} + +bool +Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI, + ARMCC::CondCodes CC, ARMCC::CondCodes OCC, + SmallSet<unsigned, 4> &Defs, + SmallSet<unsigned, 4> &Uses) { + if (!isCopy(MI)) + return false; + // llvm models select's as two-address instructions. That means a copy + // is inserted before a t2MOVccr, etc. If the copy is scheduled in + // between selects we would end up creating multiple IT blocks. + assert(MI->getOperand(0).getSubReg() == 0 && + MI->getOperand(1).getSubReg() == 0 && + "Sub-register indices still around?"); + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + + // First check if it's safe to move it. + if (Uses.count(DstReg) || Defs.count(SrcReg)) + return false; + + // If the CPSR is defined by this copy, then we don't want to move it. E.g., + // if we have: + // + // movs r1, r1 + // rsb r1, 0 + // movs r2, r2 + // rsb r2, 0 + // + // we don't want this to be converted to: + // + // movs r1, r1 + // movs r2, r2 + // itt mi + // rsb r1, 0 + // rsb r2, 0 + // + const MCInstrDesc &MCID = MI->getDesc(); + if (MCID.hasOptionalDef() && + MI->getOperand(MCID.getNumOperands() - 1).getReg() == ARM::CPSR) + return false; + + // Then peek at the next instruction to see if it's predicated on CC or OCC. + // If not, then there is nothing to be gained by moving the copy. + MachineBasicBlock::iterator I = MI; ++I; + MachineBasicBlock::iterator E = MI->getParent()->end(); + while (I != E && I->isDebugValue()) + ++I; + if (I != E) { + unsigned NPredReg = 0; + ARMCC::CondCodes NCC = llvm::getITInstrPredicate(I, NPredReg); + if (NCC == CC || NCC == OCC) + return true; + } + return false; +} + +bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { + bool Modified = false; + + SmallSet<unsigned, 4> Defs; + SmallSet<unsigned, 4> Uses; + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineInstr *MI = &*MBBI; + DebugLoc dl = MI->getDebugLoc(); + unsigned PredReg = 0; + ARMCC::CondCodes CC = llvm::getITInstrPredicate(MI, PredReg); + if (CC == ARMCC::AL) { + ++MBBI; + continue; + } + + Defs.clear(); + Uses.clear(); + TrackDefUses(MI, Defs, Uses, TRI); + + // Insert an IT instruction. + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT)) + .addImm(CC); + + // Add implicit use of ITSTATE to IT block instructions. + MI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/, + true/*isImp*/, false/*isKill*/)); + + MachineInstr *LastITMI = MI; + MachineBasicBlock::iterator InsertPos = MIB; + ++MBBI; + + // Form IT block. + ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC); + unsigned Mask = 0, Pos = 3; + // Branches, including tricky ones like LDM_RET, need to end an IT + // block so check the instruction we just put in the block. + for (; MBBI != E && Pos && + (!MI->getDesc().isBranch() && !MI->getDesc().isReturn()) ; ++MBBI) { + if (MBBI->isDebugValue()) + continue; + + MachineInstr *NMI = &*MBBI; + MI = NMI; + + unsigned NPredReg = 0; + ARMCC::CondCodes NCC = llvm::getITInstrPredicate(NMI, NPredReg); + if (NCC == CC || NCC == OCC) { + Mask |= (NCC & 1) << Pos; + // Add implicit use of ITSTATE. + NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/, + true/*isImp*/, false/*isKill*/)); + LastITMI = NMI; + } else { + if (NCC == ARMCC::AL && + MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) { + --MBBI; + MBB.remove(NMI); + MBB.insert(InsertPos, NMI); + ++NumMovedInsts; + continue; + } + break; + } + TrackDefUses(NMI, Defs, Uses, TRI); + --Pos; + } + + // Finalize IT mask. + Mask |= (1 << Pos); + // Tag along (firstcond[0] << 4) with the mask. + Mask |= (CC & 1) << 4; + MIB.addImm(Mask); + + // Last instruction in IT block kills ITSTATE. + LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill(); + + Modified = true; + ++NumITs; + } + + return Modified; +} + +bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) { + const TargetMachine &TM = Fn.getTarget(); + AFI = Fn.getInfo<ARMFunctionInfo>(); + TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo()); + TRI = TM.getRegisterInfo(); + + if (!AFI->isThumbFunction()) + return false; + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ) { + MachineBasicBlock &MBB = *MFI; + ++MFI; + Modified |= InsertITInstructions(MBB); + } + + if (Modified) + AFI->setHasITBlocks(true); + + return Modified; +} + +/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks +/// insertion pass. +FunctionPass *llvm::createThumb2ITBlockPass() { + return new Thumb2ITBlockPass(); +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp new file mode 100644 index 0000000..cf040c82 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -0,0 +1,611 @@ +//===- Thumb2InstrInfo.cpp - Thumb-2 Instruction Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-2 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "Thumb2InstrInfo.h" +#include "ARM.h" +#include "ARMConstantPoolValue.h" +#include "ARMMachineFunctionInfo.h" +#include "Thumb2InstrInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +static cl::opt<bool> +OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden, + cl::desc("Use old-style Thumb2 if-conversion heuristics"), + cl::init(false)); + +Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI), RI(*this, STI) { +} + +unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const { + // FIXME + return 0; +} + +void +Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, + MachineBasicBlock *NewDest) const { + MachineBasicBlock *MBB = Tail->getParent(); + ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>(); + if (!AFI->hasITBlocks()) { + TargetInstrInfoImpl::ReplaceTailWithBranchTo(Tail, NewDest); + return; + } + + // If the first instruction of Tail is predicated, we may have to update + // the IT instruction. + unsigned PredReg = 0; + ARMCC::CondCodes CC = llvm::getInstrPredicate(Tail, PredReg); + MachineBasicBlock::iterator MBBI = Tail; + if (CC != ARMCC::AL) + // Expecting at least the t2IT instruction before it. + --MBBI; + + // Actually replace the tail. + TargetInstrInfoImpl::ReplaceTailWithBranchTo(Tail, NewDest); + + // Fix up IT. + if (CC != ARMCC::AL) { + MachineBasicBlock::iterator E = MBB->begin(); + unsigned Count = 4; // At most 4 instructions in an IT block. + while (Count && MBBI != E) { + if (MBBI->isDebugValue()) { + --MBBI; + continue; + } + if (MBBI->getOpcode() == ARM::t2IT) { + unsigned Mask = MBBI->getOperand(1).getImm(); + if (Count == 4) + MBBI->eraseFromParent(); + else { + unsigned MaskOn = 1 << Count; + unsigned MaskOff = ~(MaskOn - 1); + MBBI->getOperand(1).setImm((Mask & MaskOff) | MaskOn); + } + return; + } + --MBBI; + --Count; + } + + // Ctrl flow can reach here if branch folding is run before IT block + // formation pass. + } +} + +bool +Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const { + while (MBBI->isDebugValue()) { + ++MBBI; + if (MBBI == MBB.end()) + return false; + } + + unsigned PredReg = 0; + return llvm::getITInstrPredicate(MBBI, PredReg) == ARMCC::AL; +} + +void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + // Handle SPR, DPR, and QPR copies. + if (!ARM::GPRRegClass.contains(DestReg, SrcReg)) + return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc); + + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc))); +} + +void Thumb2InstrInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || + RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass || + RC == ARM::GPRnopcRegisterClass) { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand( + MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOStore, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + return; + } + + ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI); +} + +void Thumb2InstrInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || + RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass || + RC == ARM::GPRnopcRegisterClass) { + DebugLoc DL; + if (I != MBB.end()) DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand( + MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg) + .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + return; + } + + ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI); +} + +void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + unsigned DestReg, unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const ARMBaseInstrInfo &TII, unsigned MIFlags) { + bool isSub = NumBytes < 0; + if (isSub) NumBytes = -NumBytes; + + // If profitable, use a movw or movt to materialize the offset. + // FIXME: Use the scavenger to grab a scratch register. + if (DestReg != ARM::SP && DestReg != BaseReg && + NumBytes >= 4096 && + ARM_AM::getT2SOImmVal(NumBytes) == -1) { + bool Fits = false; + if (NumBytes < 65536) { + // Use a movw to materialize the 16-bit constant. + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg) + .addImm(NumBytes) + .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags); + Fits = true; + } else if ((NumBytes & 0xffff) == 0) { + // Use a movt to materialize the 32-bit constant. + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg) + .addReg(DestReg) + .addImm(NumBytes >> 16) + .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags); + Fits = true; + } + + if (Fits) { + if (isSub) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg) + .addReg(BaseReg, RegState::Kill) + .addReg(DestReg, RegState::Kill) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0) + .setMIFlags(MIFlags); + } else { + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg) + .addReg(DestReg, RegState::Kill) + .addReg(BaseReg, RegState::Kill) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0) + .setMIFlags(MIFlags); + } + return; + } + } + + while (NumBytes) { + unsigned ThisVal = NumBytes; + unsigned Opc = 0; + if (DestReg == ARM::SP && BaseReg != ARM::SP) { + // mov sp, rn. Note t2MOVr cannot be used. + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),DestReg) + .addReg(BaseReg).setMIFlags(MIFlags)); + BaseReg = ARM::SP; + continue; + } + + bool HasCCOut = true; + if (BaseReg == ARM::SP) { + // sub sp, sp, #imm7 + if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) { + assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?"); + Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) + .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags)); + NumBytes = 0; + continue; + } + + // sub rd, sp, so_imm + Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri; + if (ARM_AM::getT2SOImmVal(NumBytes) != -1) { + NumBytes = 0; + } else { + // FIXME: Move this to ARMAddressingModes.h? + unsigned RotAmt = CountLeadingZeros_32(ThisVal); + ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt); + NumBytes &= ~ThisVal; + assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 && + "Bit extraction didn't work?"); + } + } else { + assert(DestReg != ARM::SP && BaseReg != ARM::SP); + Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri; + if (ARM_AM::getT2SOImmVal(NumBytes) != -1) { + NumBytes = 0; + } else if (ThisVal < 4096) { + Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12; + HasCCOut = false; + NumBytes = 0; + } else { + // FIXME: Move this to ARMAddressingModes.h? + unsigned RotAmt = CountLeadingZeros_32(ThisVal); + ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt); + NumBytes &= ~ThisVal; + assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 && + "Bit extraction didn't work?"); + } + } + + // Build the new ADD / SUB. + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) + .addReg(BaseReg, RegState::Kill) + .addImm(ThisVal)).setMIFlags(MIFlags); + if (HasCCOut) + AddDefaultCC(MIB); + + BaseReg = DestReg; + } +} + +static unsigned +negativeOffsetOpcode(unsigned opcode) +{ + switch (opcode) { + case ARM::t2LDRi12: return ARM::t2LDRi8; + case ARM::t2LDRHi12: return ARM::t2LDRHi8; + case ARM::t2LDRBi12: return ARM::t2LDRBi8; + case ARM::t2LDRSHi12: return ARM::t2LDRSHi8; + case ARM::t2LDRSBi12: return ARM::t2LDRSBi8; + case ARM::t2STRi12: return ARM::t2STRi8; + case ARM::t2STRBi12: return ARM::t2STRBi8; + case ARM::t2STRHi12: return ARM::t2STRHi8; + + case ARM::t2LDRi8: + case ARM::t2LDRHi8: + case ARM::t2LDRBi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRSBi8: + case ARM::t2STRi8: + case ARM::t2STRBi8: + case ARM::t2STRHi8: + return opcode; + + default: + break; + } + + return 0; +} + +static unsigned +positiveOffsetOpcode(unsigned opcode) +{ + switch (opcode) { + case ARM::t2LDRi8: return ARM::t2LDRi12; + case ARM::t2LDRHi8: return ARM::t2LDRHi12; + case ARM::t2LDRBi8: return ARM::t2LDRBi12; + case ARM::t2LDRSHi8: return ARM::t2LDRSHi12; + case ARM::t2LDRSBi8: return ARM::t2LDRSBi12; + case ARM::t2STRi8: return ARM::t2STRi12; + case ARM::t2STRBi8: return ARM::t2STRBi12; + case ARM::t2STRHi8: return ARM::t2STRHi12; + + case ARM::t2LDRi12: + case ARM::t2LDRHi12: + case ARM::t2LDRBi12: + case ARM::t2LDRSHi12: + case ARM::t2LDRSBi12: + case ARM::t2STRi12: + case ARM::t2STRBi12: + case ARM::t2STRHi12: + return opcode; + + default: + break; + } + + return 0; +} + +static unsigned +immediateOffsetOpcode(unsigned opcode) +{ + switch (opcode) { + case ARM::t2LDRs: return ARM::t2LDRi12; + case ARM::t2LDRHs: return ARM::t2LDRHi12; + case ARM::t2LDRBs: return ARM::t2LDRBi12; + case ARM::t2LDRSHs: return ARM::t2LDRSHi12; + case ARM::t2LDRSBs: return ARM::t2LDRSBi12; + case ARM::t2STRs: return ARM::t2STRi12; + case ARM::t2STRBs: return ARM::t2STRBi12; + case ARM::t2STRHs: return ARM::t2STRHi12; + + case ARM::t2LDRi12: + case ARM::t2LDRHi12: + case ARM::t2LDRBi12: + case ARM::t2LDRSHi12: + case ARM::t2LDRSBi12: + case ARM::t2STRi12: + case ARM::t2STRBi12: + case ARM::t2STRHi12: + case ARM::t2LDRi8: + case ARM::t2LDRHi8: + case ARM::t2LDRBi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRSBi8: + case ARM::t2STRi8: + case ARM::t2STRBi8: + case ARM::t2STRHi8: + return opcode; + + default: + break; + } + + return 0; +} + +bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const ARMBaseInstrInfo &TII) { + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MI.getDesc(); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + bool isSub = false; + + // Memory operands in inline assembly always use AddrModeT2_i12. + if (Opcode == ARM::INLINEASM) + AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2? + + if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) { + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + + unsigned PredReg; + if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) { + // Turn it into a move. + MI.setDesc(TII.get(ARM::tMOVr)); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + // Remove offset and remaining explicit predicate operands. + do MI.RemoveOperand(FrameRegIdx+1); + while (MI.getNumOperands() > FrameRegIdx+1); + MachineInstrBuilder MIB(&MI); + AddDefaultPred(MIB); + return true; + } + + bool HasCCOut = Opcode != ARM::t2ADDri12; + + if (Offset < 0) { + Offset = -Offset; + isSub = true; + MI.setDesc(TII.get(ARM::t2SUBri)); + } else { + MI.setDesc(TII.get(ARM::t2ADDri)); + } + + // Common case: small offset, fits into instruction. + if (ARM_AM::getT2SOImmVal(Offset) != -1) { + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); + // Add cc_out operand if the original instruction did not have one. + if (!HasCCOut) + MI.addOperand(MachineOperand::CreateReg(0, false)); + Offset = 0; + return true; + } + // Another common case: imm12. + if (Offset < 4096 && + (!HasCCOut || MI.getOperand(MI.getNumOperands()-1).getReg() == 0)) { + unsigned NewOpc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12; + MI.setDesc(TII.get(NewOpc)); + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset); + // Remove the cc_out operand. + if (HasCCOut) + MI.RemoveOperand(MI.getNumOperands()-1); + Offset = 0; + return true; + } + + // Otherwise, extract 8 adjacent bits from the immediate into this + // t2ADDri/t2SUBri. + unsigned RotAmt = CountLeadingZeros_32(Offset); + unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt); + + // We will handle these bits from offset, clear them. + Offset &= ~ThisImmVal; + + assert(ARM_AM::getT2SOImmVal(ThisImmVal) != -1 && + "Bit extraction didn't work?"); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal); + // Add cc_out operand if the original instruction did not have one. + if (!HasCCOut) + MI.addOperand(MachineOperand::CreateReg(0, false)); + + } else { + + // AddrMode4 and AddrMode6 cannot handle any offset. + if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6) + return false; + + // AddrModeT2_so cannot handle any offset. If there is no offset + // register then we change to an immediate version. + unsigned NewOpc = Opcode; + if (AddrMode == ARMII::AddrModeT2_so) { + unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg(); + if (OffsetReg != 0) { + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + return Offset == 0; + } + + MI.RemoveOperand(FrameRegIdx+1); + MI.getOperand(FrameRegIdx+1).ChangeToImmediate(0); + NewOpc = immediateOffsetOpcode(Opcode); + AddrMode = ARMII::AddrModeT2_i12; + } + + unsigned NumBits = 0; + unsigned Scale = 1; + if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) { + // i8 supports only negative, and i12 supports only positive, so + // based on Offset sign convert Opcode to the appropriate + // instruction + Offset += MI.getOperand(FrameRegIdx+1).getImm(); + if (Offset < 0) { + NewOpc = negativeOffsetOpcode(Opcode); + NumBits = 8; + isSub = true; + Offset = -Offset; + } else { + NewOpc = positiveOffsetOpcode(Opcode); + NumBits = 12; + } + } else if (AddrMode == ARMII::AddrMode5) { + // VFP address mode. + const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1); + int InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm()); + if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub) + InstrOffs *= -1; + NumBits = 8; + Scale = 4; + Offset += InstrOffs * 4; + assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!"); + if (Offset < 0) { + Offset = -Offset; + isSub = true; + } + } else { + llvm_unreachable("Unsupported addressing mode!"); + } + + if (NewOpc != Opcode) + MI.setDesc(TII.get(NewOpc)); + + MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1); + + // Attempt to fold address computation + // Common case: small offset, fits into instruction. + int ImmedOffset = Offset / Scale; + unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) { + // Replace the FrameIndex with fp/sp + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + if (isSub) { + if (AddrMode == ARMII::AddrMode5) + // FIXME: Not consistent. + ImmedOffset |= 1 << NumBits; + else + ImmedOffset = -ImmedOffset; + } + ImmOp.ChangeToImmediate(ImmedOffset); + Offset = 0; + return true; + } + + // Otherwise, offset doesn't fit. Pull in what we can to simplify + ImmedOffset = ImmedOffset & Mask; + if (isSub) { + if (AddrMode == ARMII::AddrMode5) + // FIXME: Not consistent. + ImmedOffset |= 1 << NumBits; + else { + ImmedOffset = -ImmedOffset; + if (ImmedOffset == 0) + // Change the opcode back if the encoded offset is zero. + MI.setDesc(TII.get(positiveOffsetOpcode(NewOpc))); + } + } + ImmOp.ChangeToImmediate(ImmedOffset); + Offset &= ~(Mask*Scale); + } + + Offset = (isSub) ? -Offset : Offset; + return Offset == 0; +} + +/// scheduleTwoAddrSource - Schedule the copy / re-mat of the source of the +/// two-addrss instruction inserted by two-address pass. +void +Thumb2InstrInfo::scheduleTwoAddrSource(MachineInstr *SrcMI, + MachineInstr *UseMI, + const TargetRegisterInfo &TRI) const { + if (SrcMI->getOpcode() != ARM::tMOVr || SrcMI->getOperand(1).isKill()) + return; + + unsigned PredReg = 0; + ARMCC::CondCodes CC = llvm::getInstrPredicate(UseMI, PredReg); + if (CC == ARMCC::AL || PredReg != ARM::CPSR) + return; + + // Schedule the copy so it doesn't come between previous instructions + // and UseMI which can form an IT block. + unsigned SrcReg = SrcMI->getOperand(1).getReg(); + ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC); + MachineBasicBlock *MBB = UseMI->getParent(); + MachineBasicBlock::iterator MBBI = SrcMI; + unsigned NumInsts = 0; + while (--MBBI != MBB->begin()) { + if (MBBI->isDebugValue()) + continue; + + MachineInstr *NMI = &*MBBI; + ARMCC::CondCodes NCC = llvm::getInstrPredicate(NMI, PredReg); + if (!(NCC == CC || NCC == OCC) || + NMI->modifiesRegister(SrcReg, &TRI) || + NMI->definesRegister(ARM::CPSR)) + break; + if (++NumInsts == 4) + // Too many in a row! + return; + } + + if (NumInsts) { + MBB->remove(SrcMI); + MBB->insert(++MBBI, SrcMI); + } +} + +ARMCC::CondCodes +llvm::getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg) { + unsigned Opc = MI->getOpcode(); + if (Opc == ARM::tBcc || Opc == ARM::t2Bcc) + return ARMCC::AL; + return llvm::getInstrPredicate(MI, PredReg); +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h new file mode 100644 index 0000000..f2637d7 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -0,0 +1,78 @@ +//===- Thumb2InstrInfo.h - Thumb-2 Instruction Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-2 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef THUMB2INSTRUCTIONINFO_H +#define THUMB2INSTRUCTIONINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "ARM.h" +#include "ARMInstrInfo.h" +#include "Thumb2RegisterInfo.h" + +namespace llvm { +class ARMSubtarget; +class ScheduleHazardRecognizer; + +class Thumb2InstrInfo : public ARMBaseInstrInfo { + Thumb2RegisterInfo RI; +public: + explicit Thumb2InstrInfo(const ARMSubtarget &STI); + + // Return the non-pre/post incrementing version of 'Opc'. Return 0 + // if there is not such an opcode. + unsigned getUnindexedOpcode(unsigned Opc) const; + + void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, + MachineBasicBlock *NewDest) const; + + bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + /// scheduleTwoAddrSource - Schedule the copy / re-mat of the source of the + /// two-addrss instruction inserted by two-address pass. + void scheduleTwoAddrSource(MachineInstr *SrcMI, MachineInstr *UseMI, + const TargetRegisterInfo &TRI) const; + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + const Thumb2RegisterInfo &getRegisterInfo() const { return RI; } +}; + +/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical +/// to llvm::getInstrPredicate except it returns AL for conditional branch +/// instructions which are "predicated", but are not in IT blocks. +ARMCC::CondCodes getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg); + + +} + +#endif // THUMB2INSTRUCTIONINFO_H diff --git a/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp new file mode 100644 index 0000000..355c3bf --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp @@ -0,0 +1,52 @@ +//===- Thumb2RegisterInfo.cpp - Thumb-2 Register Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-2 implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMSubtarget.h" +#include "Thumb2InstrInfo.h" +#include "Thumb2RegisterInfo.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +using namespace llvm; + +Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, + const ARMSubtarget &sti) + : ARMBaseRegisterInfo(tii, sti) { +} + +/// emitLoadConstPool - Emits a load from constpool to materialize the +/// specified immediate. +void +Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned MIFlags) const { + MachineFunction &MF = *MBB.getParent(); + MachineConstantPool *ConstantPool = MF.getConstantPool(); + const Constant *C = ConstantInt::get( + Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci)) + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0) + .setMIFlags(MIFlags); +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.h new file mode 100644 index 0000000..824378a --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.h @@ -0,0 +1,43 @@ +//===- Thumb2RegisterInfo.h - Thumb-2 Register Information Impl -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Thumb-2 implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#ifndef THUMB2REGISTERINFO_H +#define THUMB2REGISTERINFO_H + +#include "ARM.h" +#include "ARMRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +namespace llvm { + class ARMSubtarget; + class ARMBaseInstrInfo; + class Type; + +struct Thumb2RegisterInfo : public ARMBaseRegisterInfo { +public: + Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI); + + /// emitLoadConstPool - Emits a load from constpool to materialize the + /// specified immediate. + void emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0, + unsigned MIFlags = MachineInstr::NoFlags) const; +}; +} + +#endif // THUMB2REGISTERINFO_H diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp new file mode 100644 index 0000000..89a155c --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -0,0 +1,890 @@ +//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "t2-reduce-size" +#include "ARM.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" +#include "Thumb2InstrInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones"); +STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones"); +STATISTIC(NumLdSts, "Number of 32-bit load / store reduced to 16-bit ones"); + +static cl::opt<int> ReduceLimit("t2-reduce-limit", + cl::init(-1), cl::Hidden); +static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2", + cl::init(-1), cl::Hidden); +static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3", + cl::init(-1), cl::Hidden); + +namespace { + /// ReduceTable - A static table with information on mapping from wide + /// opcodes to narrow + struct ReduceEntry { + unsigned WideOpc; // Wide opcode + unsigned NarrowOpc1; // Narrow opcode to transform to + unsigned NarrowOpc2; // Narrow opcode when it's two-address + uint8_t Imm1Limit; // Limit of immediate field (bits) + uint8_t Imm2Limit; // Limit of immediate field when it's two-address + unsigned LowRegs1 : 1; // Only possible if low-registers are used + unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr) + unsigned PredCC1 : 2; // 0 - If predicated, cc is on and vice versa. + // 1 - No cc field. + // 2 - Always set CPSR. + unsigned PredCC2 : 2; + unsigned PartFlag : 1; // 16-bit instruction does partial flag update + unsigned Special : 1; // Needs to be dealt with specially + }; + + static const ReduceEntry ReduceTable[] = { + // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, PF, S + { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0,0 }, + { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0,1 }, + { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0,0 }, + { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 0,1 }, + { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 0,1 }, + { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 1,0 }, + { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 1,0 }, + //FIXME: Disable CMN, as CCodes are backwards from compare expectations + //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0,1 }, + { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 1,0 }, + // FIXME: adr.n immediate offset must be multiple of 4. + //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 1,0 }, + { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 1,0 }, + { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 1,0 }, + // FIXME: tMOVi8 and tMVN also partially update CPSR but they are less + // likely to cause issue in the loop. As a size / performance workaround, + // they are not marked as such. + { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,0 }, + { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,1 }, + // FIXME: Do we need the 16-bit 'S' variant? + { ARM::t2MOVr,ARM::tMOVr, 0, 0, 0, 0, 0, 1,0, 0,0 }, + { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0,0 }, + { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 0,1 }, + { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0,0 }, + { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0,0 }, + { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0,0 }, + { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0,0 }, + { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2SXTB, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,1 }, + { ARM::t2SXTH, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,1 }, + { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2UXTB, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,1 }, + { ARM::t2UXTH, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,1 }, + + // FIXME: Clean this up after splitting each Thumb load / store opcode + // into multiple ones. + { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 0,1 }, + { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + + { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1 }, + { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1 }, + { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1 }, + // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent + { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1 }, + { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1 }, + }; + + class Thumb2SizeReduce : public MachineFunctionPass { + public: + static char ID; + Thumb2SizeReduce(); + + const Thumb2InstrInfo *TII; + const ARMSubtarget *STI; + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "Thumb2 instruction size reduction pass"; + } + + private: + /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable. + DenseMap<unsigned, unsigned> ReduceOpcodeMap; + + bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use); + + bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, + bool is2Addr, ARMCC::CondCodes Pred, + bool LiveCPSR, bool &HasCC, bool &CCDead); + + bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry); + + bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, bool LiveCPSR, + MachineInstr *CPSRDef); + + /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address + /// instruction. + bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR, MachineInstr *CPSRDef); + + /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit + /// non-two-address instruction. + bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR, MachineInstr *CPSRDef); + + /// ReduceMBB - Reduce width of instructions in the specified basic block. + bool ReduceMBB(MachineBasicBlock &MBB); + }; + char Thumb2SizeReduce::ID = 0; +} + +Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(ID) { + for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) { + unsigned FromOpc = ReduceTable[i].WideOpc; + if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second) + assert(false && "Duplicated entries?"); + } +} + +static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) { + for (const unsigned *Regs = MCID.ImplicitDefs; *Regs; ++Regs) + if (*Regs == ARM::CPSR) + return true; + return false; +} + +/// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations, +/// the 's' 16-bit instruction partially update CPSR. Abort the +/// transformation to avoid adding false dependency on last CPSR setting +/// instruction which hurts the ability for out-of-order execution engine +/// to do register renaming magic. +/// This function checks if there is a read-of-write dependency between the +/// last instruction that defines the CPSR and the current instruction. If there +/// is, then there is no harm done since the instruction cannot be retired +/// before the CPSR setting instruction anyway. +/// Note, we are not doing full dependency analysis here for the sake of compile +/// time. We're not looking for cases like: +/// r0 = muls ... +/// r1 = add.w r0, ... +/// ... +/// = mul.w r1 +/// In this case it would have been ok to narrow the mul.w to muls since there +/// are indirect RAW dependency between the muls and the mul.w +bool +Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use) { + if (!Def || !STI->avoidCPSRPartialUpdate()) + return false; + + SmallSet<unsigned, 2> Defs; + for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = Def->getOperand(i); + if (!MO.isReg() || MO.isUndef() || MO.isUse()) + continue; + unsigned Reg = MO.getReg(); + if (Reg == 0 || Reg == ARM::CPSR) + continue; + Defs.insert(Reg); + } + + for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = Use->getOperand(i); + if (!MO.isReg() || MO.isUndef() || MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (Defs.count(Reg)) + return false; + } + + // No read-after-write dependency. The narrowing will add false dependency. + return true; +} + +bool +Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, + bool is2Addr, ARMCC::CondCodes Pred, + bool LiveCPSR, bool &HasCC, bool &CCDead) { + if ((is2Addr && Entry.PredCC2 == 0) || + (!is2Addr && Entry.PredCC1 == 0)) { + if (Pred == ARMCC::AL) { + // Not predicated, must set CPSR. + if (!HasCC) { + // Original instruction was not setting CPSR, but CPSR is not + // currently live anyway. It's ok to set it. The CPSR def is + // dead though. + if (!LiveCPSR) { + HasCC = true; + CCDead = true; + return true; + } + return false; + } + } else { + // Predicated, must not set CPSR. + if (HasCC) + return false; + } + } else if ((is2Addr && Entry.PredCC2 == 2) || + (!is2Addr && Entry.PredCC1 == 2)) { + /// Old opcode has an optional def of CPSR. + if (HasCC) + return true; + // If old opcode does not implicitly define CPSR, then it's not ok since + // these new opcodes' CPSR def is not meant to be thrown away. e.g. CMP. + if (!HasImplicitCPSRDef(MI->getDesc())) + return false; + HasCC = true; + } else { + // 16-bit instruction does not set CPSR. + if (HasCC) + return false; + } + + return true; +} + +static bool VerifyLowRegs(MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + bool isPCOk = (Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA || + Opc == ARM::t2LDMDB || Opc == ARM::t2LDMIA_UPD || + Opc == ARM::t2LDMDB_UPD); + bool isLROk = (Opc == ARM::t2STMIA_UPD || Opc == ARM::t2STMDB_UPD); + bool isSPOk = isPCOk || isLROk; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || MO.isImplicit()) + continue; + unsigned Reg = MO.getReg(); + if (Reg == 0 || Reg == ARM::CPSR) + continue; + if (isPCOk && Reg == ARM::PC) + continue; + if (isLROk && Reg == ARM::LR) + continue; + if (Reg == ARM::SP) { + if (isSPOk) + continue; + if (i == 1 && (Opc == ARM::t2LDRi12 || Opc == ARM::t2STRi12)) + // Special case for these ldr / str with sp as base register. + continue; + } + if (!isARMLowRegister(Reg)) + return false; + } + return true; +} + +bool +Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry) { + if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt)) + return false; + + unsigned Scale = 1; + bool HasImmOffset = false; + bool HasShift = false; + bool HasOffReg = true; + bool isLdStMul = false; + unsigned Opc = Entry.NarrowOpc1; + unsigned OpNum = 3; // First 'rest' of operands. + uint8_t ImmLimit = Entry.Imm1Limit; + + switch (Entry.WideOpc) { + default: + llvm_unreachable("Unexpected Thumb2 load / store opcode!"); + case ARM::t2LDRi12: + case ARM::t2STRi12: + if (MI->getOperand(1).getReg() == ARM::SP) { + Opc = Entry.NarrowOpc2; + ImmLimit = Entry.Imm2Limit; + HasOffReg = false; + } + + Scale = 4; + HasImmOffset = true; + HasOffReg = false; + break; + case ARM::t2LDRBi12: + case ARM::t2STRBi12: + HasImmOffset = true; + HasOffReg = false; + break; + case ARM::t2LDRHi12: + case ARM::t2STRHi12: + Scale = 2; + HasImmOffset = true; + HasOffReg = false; + break; + case ARM::t2LDRs: + case ARM::t2LDRBs: + case ARM::t2LDRHs: + case ARM::t2LDRSBs: + case ARM::t2LDRSHs: + case ARM::t2STRs: + case ARM::t2STRBs: + case ARM::t2STRHs: + HasShift = true; + OpNum = 4; + break; + case ARM::t2LDMIA: + case ARM::t2LDMDB: { + unsigned BaseReg = MI->getOperand(0).getReg(); + if (!isARMLowRegister(BaseReg) || Entry.WideOpc != ARM::t2LDMIA) + return false; + + // For the non-writeback version (this one), the base register must be + // one of the registers being loaded. + bool isOK = false; + for (unsigned i = 4; i < MI->getNumOperands(); ++i) { + if (MI->getOperand(i).getReg() == BaseReg) { + isOK = true; + break; + } + } + + if (!isOK) + return false; + + OpNum = 0; + isLdStMul = true; + break; + } + case ARM::t2LDMIA_RET: { + unsigned BaseReg = MI->getOperand(1).getReg(); + if (BaseReg != ARM::SP) + return false; + Opc = Entry.NarrowOpc2; // tPOP_RET + OpNum = 2; + isLdStMul = true; + break; + } + case ARM::t2LDMIA_UPD: + case ARM::t2LDMDB_UPD: + case ARM::t2STMIA_UPD: + case ARM::t2STMDB_UPD: { + OpNum = 0; + + unsigned BaseReg = MI->getOperand(1).getReg(); + if (BaseReg == ARM::SP && + (Entry.WideOpc == ARM::t2LDMIA_UPD || + Entry.WideOpc == ARM::t2STMDB_UPD)) { + Opc = Entry.NarrowOpc2; // tPOP or tPUSH + OpNum = 2; + } else if (!isARMLowRegister(BaseReg) || + (Entry.WideOpc != ARM::t2LDMIA_UPD && + Entry.WideOpc != ARM::t2STMIA_UPD)) { + return false; + } + + isLdStMul = true; + break; + } + } + + unsigned OffsetReg = 0; + bool OffsetKill = false; + if (HasShift) { + OffsetReg = MI->getOperand(2).getReg(); + OffsetKill = MI->getOperand(2).isKill(); + + if (MI->getOperand(3).getImm()) + // Thumb1 addressing mode doesn't support shift. + return false; + } + + unsigned OffsetImm = 0; + if (HasImmOffset) { + OffsetImm = MI->getOperand(2).getImm(); + unsigned MaxOffset = ((1 << ImmLimit) - 1) * Scale; + + if ((OffsetImm & (Scale - 1)) || OffsetImm > MaxOffset) + // Make sure the immediate field fits. + return false; + } + + // Add the 16-bit load / store instruction. + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, TII->get(Opc)); + if (!isLdStMul) { + MIB.addOperand(MI->getOperand(0)); + MIB.addOperand(MI->getOperand(1)); + + if (HasImmOffset) + MIB.addImm(OffsetImm / Scale); + + assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!"); + + if (HasOffReg) + MIB.addReg(OffsetReg, getKillRegState(OffsetKill)); + } + + // Transfer the rest of operands. + for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum) + MIB.addOperand(MI->getOperand(OpNum)); + + // Transfer memoperands. + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + // Transfer MI flags. + MIB.setMIFlags(MI->getFlags()); + + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); + + MBB.erase(MI); + ++NumLdSts; + return true; +} + +bool +Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR, MachineInstr *CPSRDef) { + unsigned Opc = MI->getOpcode(); + if (Opc == ARM::t2ADDri) { + // If the source register is SP, try to reduce to tADDrSPi, otherwise + // it's a normal reduce. + if (MI->getOperand(1).getReg() != ARM::SP) { + if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) + return true; + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); + } + // Try to reduce to tADDrSPi. + unsigned Imm = MI->getOperand(2).getImm(); + // The immediate must be in range, the destination register must be a low + // reg, the predicate must be "always" and the condition flags must not + // be being set. + if (Imm & 3 || Imm > 1020) + return false; + if (!isARMLowRegister(MI->getOperand(0).getReg())) + return false; + if (MI->getOperand(3).getImm() != ARMCC::AL) + return false; + const MCInstrDesc &MCID = MI->getDesc(); + if (MCID.hasOptionalDef() && + MI->getOperand(MCID.getNumOperands()-1).getReg() == ARM::CPSR) + return false; + + MachineInstrBuilder MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), + TII->get(ARM::tADDrSPi)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(Imm / 4); // The tADDrSPi has an implied scale by four. + AddDefaultPred(MIB); + + // Transfer MI flags. + MIB.setMIFlags(MI->getFlags()); + + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " <<*MIB); + + MBB.erase(MI); + ++NumNarrows; + return true; + } + + if (Entry.LowRegs1 && !VerifyLowRegs(MI)) + return false; + + const MCInstrDesc &MCID = MI->getDesc(); + if (MCID.mayLoad() || MCID.mayStore()) + return ReduceLoadStore(MBB, MI, Entry); + + switch (Opc) { + default: break; + case ARM::t2ADDSri: + case ARM::t2ADDSrr: { + unsigned PredReg = 0; + if (getInstrPredicate(MI, PredReg) == ARMCC::AL) { + switch (Opc) { + default: break; + case ARM::t2ADDSri: { + if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) + return true; + // fallthrough + } + case ARM::t2ADDSrr: + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); + } + } + break; + } + case ARM::t2RSBri: + case ARM::t2RSBSri: + case ARM::t2SXTB: + case ARM::t2SXTH: + case ARM::t2UXTB: + case ARM::t2UXTH: + if (MI->getOperand(2).getImm() == 0) + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); + break; + case ARM::t2MOVi16: + // Can convert only 'pure' immediate operands, not immediates obtained as + // globals' addresses. + if (MI->getOperand(1).isImm()) + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); + break; + case ARM::t2CMPrr: { + // Try to reduce to the lo-reg only version first. Why there are two + // versions of the instruction is a mystery. + // It would be nice to just have two entries in the master table that + // are prioritized, but the table assumes a unique entry for each + // source insn opcode. So for now, we hack a local entry record to use. + static const ReduceEntry NarrowEntry = + { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1 }; + if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef)) + return true; + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); + } + } + return false; +} + +bool +Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR, MachineInstr *CPSRDef) { + + if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr)) + return false; + + unsigned Reg0 = MI->getOperand(0).getReg(); + unsigned Reg1 = MI->getOperand(1).getReg(); + if (Reg0 != Reg1) { + // Try to commute the operands to make it a 2-address instruction. + unsigned CommOpIdx1, CommOpIdx2; + if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) || + CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0) + return false; + MachineInstr *CommutedMI = TII->commuteInstruction(MI); + if (!CommutedMI) + return false; + } + if (Entry.LowRegs2 && !isARMLowRegister(Reg0)) + return false; + if (Entry.Imm2Limit) { + unsigned Imm = MI->getOperand(2).getImm(); + unsigned Limit = (1 << Entry.Imm2Limit) - 1; + if (Imm > Limit) + return false; + } else { + unsigned Reg2 = MI->getOperand(2).getReg(); + if (Entry.LowRegs2 && !isARMLowRegister(Reg2)) + return false; + } + + // Check if it's possible / necessary to transfer the predicate. + const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); + bool SkipPred = false; + if (Pred != ARMCC::AL) { + if (!NewMCID.isPredicable()) + // Can't transfer predicate, fail. + return false; + } else { + SkipPred = !NewMCID.isPredicable(); + } + + bool HasCC = false; + bool CCDead = false; + const MCInstrDesc &MCID = MI->getDesc(); + if (MCID.hasOptionalDef()) { + unsigned NumOps = MCID.getNumOperands(); + HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR); + if (HasCC && MI->getOperand(NumOps-1).isDead()) + CCDead = true; + } + if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead)) + return false; + + // Avoid adding a false dependency on partial flag update by some 16-bit + // instructions which has the 's' bit set. + if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC && + canAddPseudoFlagDep(CPSRDef, MI)) + return false; + + // Add the 16-bit instruction. + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewMCID); + MIB.addOperand(MI->getOperand(0)); + if (NewMCID.hasOptionalDef()) { + if (HasCC) + AddDefaultT1CC(MIB, CCDead); + else + AddNoT1CC(MIB); + } + + // Transfer the rest of operands. + unsigned NumOps = MCID.getNumOperands(); + for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { + if (i < NumOps && MCID.OpInfo[i].isOptionalDef()) + continue; + if (SkipPred && MCID.OpInfo[i].isPredicate()) + continue; + MIB.addOperand(MI->getOperand(i)); + } + + // Transfer MI flags. + MIB.setMIFlags(MI->getFlags()); + + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); + + MBB.erase(MI); + ++Num2Addrs; + return true; +} + +bool +Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, + const ReduceEntry &Entry, + bool LiveCPSR, MachineInstr *CPSRDef) { + if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit)) + return false; + + unsigned Limit = ~0U; + if (Entry.Imm1Limit) + Limit = (1 << Entry.Imm1Limit) - 1; + + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) { + if (MCID.OpInfo[i].isPredicate()) + continue; + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + if (!Reg || Reg == ARM::CPSR) + continue; + if (Entry.LowRegs1 && !isARMLowRegister(Reg)) + return false; + } else if (MO.isImm() && + !MCID.OpInfo[i].isPredicate()) { + if (((unsigned)MO.getImm()) > Limit) + return false; + } + } + + // Check if it's possible / necessary to transfer the predicate. + const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); + bool SkipPred = false; + if (Pred != ARMCC::AL) { + if (!NewMCID.isPredicable()) + // Can't transfer predicate, fail. + return false; + } else { + SkipPred = !NewMCID.isPredicable(); + } + + bool HasCC = false; + bool CCDead = false; + if (MCID.hasOptionalDef()) { + unsigned NumOps = MCID.getNumOperands(); + HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR); + if (HasCC && MI->getOperand(NumOps-1).isDead()) + CCDead = true; + } + if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead)) + return false; + + // Avoid adding a false dependency on partial flag update by some 16-bit + // instructions which has the 's' bit set. + if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC && + canAddPseudoFlagDep(CPSRDef, MI)) + return false; + + // Add the 16-bit instruction. + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewMCID); + MIB.addOperand(MI->getOperand(0)); + if (NewMCID.hasOptionalDef()) { + if (HasCC) + AddDefaultT1CC(MIB, CCDead); + else + AddNoT1CC(MIB); + } + + // Transfer the rest of operands. + unsigned NumOps = MCID.getNumOperands(); + for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { + if (i < NumOps && MCID.OpInfo[i].isOptionalDef()) + continue; + if ((MCID.getOpcode() == ARM::t2RSBSri || + MCID.getOpcode() == ARM::t2RSBri || + MCID.getOpcode() == ARM::t2SXTB || + MCID.getOpcode() == ARM::t2SXTH || + MCID.getOpcode() == ARM::t2UXTB || + MCID.getOpcode() == ARM::t2UXTH) && i == 2) + // Skip the zero immediate operand, it's now implicit. + continue; + bool isPred = (i < NumOps && MCID.OpInfo[i].isPredicate()); + if (SkipPred && isPred) + continue; + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR) + // Skip implicit def of CPSR. Either it's modeled as an optional + // def now or it's already an implicit def on the new instruction. + continue; + MIB.addOperand(MO); + } + if (!MCID.isPredicable() && NewMCID.isPredicable()) + AddDefaultPred(MIB); + + // Transfer MI flags. + MIB.setMIFlags(MI->getFlags()); + + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); + + MBB.erase(MI); + ++NumNarrows; + return true; +} + +static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) { + bool HasDef = false; + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() || MO.isUndef() || MO.isUse()) + continue; + if (MO.getReg() != ARM::CPSR) + continue; + + DefCPSR = true; + if (!MO.isDead()) + HasDef = true; + } + + return HasDef || LiveCPSR; +} + +static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) { + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() || MO.isUndef() || MO.isDef()) + continue; + if (MO.getReg() != ARM::CPSR) + continue; + assert(LiveCPSR && "CPSR liveness tracking is wrong!"); + if (MO.isKill()) { + LiveCPSR = false; + break; + } + } + + return LiveCPSR; +} + +bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + // Yes, CPSR could be livein. + bool LiveCPSR = MBB.isLiveIn(ARM::CPSR); + MachineInstr *CPSRDef = 0; + + MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end(); + MachineBasicBlock::iterator NextMII; + for (; MII != E; MII = NextMII) { + NextMII = llvm::next(MII); + + MachineInstr *MI = &*MII; + LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR); + + unsigned Opcode = MI->getOpcode(); + DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode); + if (OPI != ReduceOpcodeMap.end()) { + const ReduceEntry &Entry = ReduceTable[OPI->second]; + // Ignore "special" cases for now. + if (Entry.Special) { + if (ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef)) { + Modified = true; + MachineBasicBlock::iterator I = prior(NextMII); + MI = &*I; + } + goto ProcessNext; + } + + // Try to transform to a 16-bit two-address instruction. + if (Entry.NarrowOpc2 && + ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) { + Modified = true; + MachineBasicBlock::iterator I = prior(NextMII); + MI = &*I; + goto ProcessNext; + } + + // Try to transform to a 16-bit non-two-address instruction. + if (Entry.NarrowOpc1 && + ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef)) { + Modified = true; + MachineBasicBlock::iterator I = prior(NextMII); + MI = &*I; + } + } + + ProcessNext: + bool DefCPSR = false; + LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR); + if (MI->getDesc().isCall()) + // Calls don't really set CPSR. + CPSRDef = 0; + else if (DefCPSR) + // This is the last CPSR defining instruction. + CPSRDef = MI; + } + + return Modified; +} + +bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { + const TargetMachine &TM = MF.getTarget(); + TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo()); + STI = &TM.getSubtarget<ARMSubtarget>(); + + bool Modified = false; + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) + Modified |= ReduceMBB(*I); + return Modified; +} + +/// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size +/// reduction pass. +FunctionPass *llvm::createThumb2SizeReductionPass() { + return new Thumb2SizeReduce(); +} |