Diffstat (limited to 'lib/Target')
217 files changed, 12883 insertions, 5263 deletions
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index bf4315f..6af5f85 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -51,6 +51,12 @@ def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", // to just not use them. def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", "Disable VFP / NEON MAC instructions">; + +// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding. +def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", + "HasVMLxForwarding", "true", + "Has multiplier accumulator forwarding">; + // Some processors benefit from using NEON instructions for scalar // single-precision FP operations. def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", @@ -61,6 +67,14 @@ def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", "Prefer 32-bit Thumb instrs">; +/// Some instructions update CPSR partially, which can add false dependency for +/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is +/// mapped to a separate physical register. Avoid partial CPSR update for these +/// processors. +def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr", + "AvoidCPSRPartialUpdate", "true", + "Avoid CPSR partial update for OOO execution">; + // Multiprocessing extension. def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", "Supports Multiprocessing extension">; @@ -100,11 +114,13 @@ def ProcOthers : SubtargetFeature<"others", "ARMProcFamily", "Others", def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8", "Cortex-A8 ARM processors", [FeatureSlowFPBrcc, FeatureNEONForFP, - FeatureHasSlowFPVMLx, FeatureT2XtPk]>; + FeatureHasSlowFPVMLx, FeatureVMLxForwarding, + FeatureT2XtPk]>; def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9", "Cortex-A9 ARM processors", - [FeatureHasSlowFPVMLx, FeatureT2XtPk, - FeatureFP16]>; + [FeatureVMLxForwarding, + FeatureT2XtPk, FeatureFP16, + FeatureAvoidPartialCPSR]>; class ProcNoItin<string Name, list<SubtargetFeature> Features> : Processor<Name, GenericItineraries, Features>; @@ -171,6 +187,8 @@ def : Processor<"cortex-a8", CortexA8Itineraries, [ArchV7A, ProcA8]>; def : Processor<"cortex-a9", CortexA9Itineraries, [ArchV7A, ProcA9]>; +def : Processor<"cortex-a9-mp", CortexA9Itineraries, + [ArchV7A, ProcA9, FeatureMP]>; // V7M Processors. def : ProcNoItin<"cortex-m3", [ArchV7M]>; diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h index 19fbf05..595708f 100644 --- a/lib/Target/ARM/ARMAddressingModes.h +++ b/lib/Target/ARM/ARMAddressingModes.h @@ -408,16 +408,18 @@ namespace ARM_AM { // // The first operand is always a Reg. The second operand is a reg if in // reg/reg form, otherwise it's reg#0. The third field encodes the operation - // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. + // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The + // fourth operand 16-17 encodes the index mode. // // If this addressing mode is a frame index (before prolog/epilog insertion // and code rewriting), this operand will have the form: FI#, reg0, <offs> // with no shift amount for the frame offset. 
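The widened addrmode2 operand described above is a plain bit-packed word, so the round trip of the helpers changed in the hunk below can be checked in isolation. Here is a minimal self-contained sketch; the enum values are simplified stand-ins for LLVM's `ARM_AM::AddrOpc` and `ARM_AM::ShiftOpc`, and only the bit layout (immediate in bits 0-11, add/sub in bit 12, shift opcode in bits 13-15, index mode in bits 16-17) is taken from the patch:

```cpp
#include <cassert>
#include <cstdio>

// Simplified stand-ins for ARM_AM::AddrOpc / ARM_AM::ShiftOpc.
enum AddrOpc { add = 0, sub = 1 };
enum ShiftOpc { no_shift = 0, asr, lsl, lsr, ror, rrx };

// Pack: imm12 in bits 0-11, add/sub in bit 12, shift opcode in bits
// 13-15, index mode (none/pre/post/upd) in bits 16-17.
static unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
                          unsigned IdxMode = 0) {
  assert(Imm12 < (1u << 12) && "Imm too large!");
  bool isSub = Opc == sub;
  return Imm12 | ((unsigned)isSub << 12) | (SO << 13) | (IdxMode << 16);
}

static unsigned getAM2Offset(unsigned AM2Opc) { return AM2Opc & ((1 << 12) - 1); }
static AddrOpc getAM2Op(unsigned AM2Opc) { return ((AM2Opc >> 12) & 1) ? sub : add; }
// The "& 7" matters once bits 16-17 are in use: without the mask the
// index-mode bits would leak into the decoded shift opcode, which is
// exactly why the patch adds it to getAM2ShiftOpc.
static ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) { return (ShiftOpc)((AM2Opc >> 13) & 7); }
static unsigned getAM2IdxMode(unsigned AM2Opc) { return AM2Opc >> 16; }

int main() {
  unsigned Opc = getAM2Opc(sub, 100, lsl, /*IdxMode=*/2); // post-indexed
  assert(getAM2Offset(Opc) == 100 && getAM2Op(Opc) == sub);
  assert(getAM2ShiftOpc(Opc) == lsl && getAM2IdxMode(Opc) == 2);
  std::printf("packed AM2 opc = 0x%x\n", Opc);
}
```

The index-mode values stored here line up with the `ARMII::IndexMode` enum (None/Pre/Post/Upd) that this same commit moves into ARMBaseInfo.h.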
// - static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO) { + static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO, + unsigned IdxMode = 0) { assert(Imm12 < (1 << 12) && "Imm too large!"); bool isSub = Opc == sub; - return Imm12 | ((int)isSub << 12) | (SO << 13); + return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ; } static inline unsigned getAM2Offset(unsigned AM2Opc) { return AM2Opc & ((1 << 12)-1); @@ -426,7 +428,10 @@ namespace ARM_AM { return ((AM2Opc >> 12) & 1) ? sub : add; } static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) { - return (ShiftOpc)(AM2Opc >> 13); + return (ShiftOpc)((AM2Opc >> 13) & 7); + } + static inline unsigned getAM2IdxMode(unsigned AM2Opc) { + return (AM2Opc >> 16); } @@ -441,12 +446,14 @@ namespace ARM_AM { // // The first operand is always a Reg. The second operand is a reg if in // reg/reg form, otherwise it's reg#0. The third field encodes the operation - // in bit 8, the immediate in bits 0-7. + // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the + // index mode. /// getAM3Opc - This function encodes the addrmode3 opc field. - static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset) { + static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset, + unsigned IdxMode = 0) { bool isSub = Opc == sub; - return ((int)isSub << 8) | Offset; + return ((int)isSub << 8) | Offset | (IdxMode << 9); } static inline unsigned char getAM3Offset(unsigned AM3Opc) { return AM3Opc & 0xFF; @@ -454,6 +461,9 @@ namespace ARM_AM { static inline AddrOpc getAM3Op(unsigned AM3Opc) { return ((AM3Opc >> 8) & 1) ? sub : add; } + static inline unsigned getAM3IdxMode(unsigned AM3Opc) { + return (AM3Opc >> 9); + } //===--------------------------------------------------------------------===// // Addressing Mode #4 diff --git a/lib/Target/ARM/ARMAsmBackend.cpp b/lib/Target/ARM/ARMAsmBackend.cpp index ec23449..f062819 100644 --- a/lib/Target/ARM/ARMAsmBackend.cpp +++ b/lib/Target/ARM/ARMAsmBackend.cpp @@ -246,7 +246,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { } uint32_t out = (opc << 21); - out |= (Value & 0x800) << 14; + out |= (Value & 0x800) << 15; out |= (Value & 0x700) << 4; out |= (Value & 0x0FF); @@ -416,21 +416,22 @@ void ELFARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, // FIXME: This should be in a separate file. class DarwinARMAsmBackend : public ARMAsmBackend { public: - DarwinARMAsmBackend(const Target &T) : ARMAsmBackend(T) { } - - void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value) const; + const object::mach::CPUSubtypeARM Subtype; + DarwinARMAsmBackend(const Target &T, object::mach::CPUSubtypeARM st) + : ARMAsmBackend(T), Subtype(st) { } MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - // FIXME: Subtarget info should be derived. Force v7 for now. 
return createMachObjectWriter(new ARMMachObjectWriter( /*Is64Bit=*/false, object::mach::CTM_ARM, - object::mach::CSARM_V7), + Subtype), OS, /*IsLittleEndian=*/true); } + void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value) const; + virtual bool doesSectionRequireSymbols(const MCSection &Section) const { return false; } @@ -499,14 +500,17 @@ void DarwinARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, TargetAsmBackend *llvm::createARMAsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: - return new DarwinARMAsmBackend(T); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - assert(0 && "Windows not supported on ARM"); - default: - return new ELFARMAsmBackend(T, Triple(TT).getOS()); + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) { + if (TheTriple.getArchName() == "armv6" || + TheTriple.getArchName() == "thumbv6") + return new DarwinARMAsmBackend(T, object::mach::CSARM_V6); + return new DarwinARMAsmBackend(T, object::mach::CSARM_V7); } + + if (TheTriple.isOSWindows()) + assert(0 && "Windows not supported on ARM"); + + return new ELFARMAsmBackend(T, Triple(TT).getOS()); } diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index db12b8e..c428e18 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -88,6 +88,11 @@ namespace { case ARMBuildAttrs::CPU_name: Streamer.EmitRawText(StringRef("\t.cpu ") + LowercaseString(String)); break; + /* GAS requires .fpu to be emitted regardless of EABI attribute */ + case ARMBuildAttrs::Advanced_SIMD_arch: + case ARMBuildAttrs::VFP_arch: + Streamer.EmitRawText(StringRef("\t.fpu ") + LowercaseString(String)); + break; default: assert(0 && "Unsupported Text attribute in ASM Mode"); break; } } @@ -167,6 +172,117 @@ getDebugValueLocation(const MachineInstr *MI) const { return Location; } +/// getDwarfRegOpSize - get size required to emit given machine location using +/// dwarf encoding. +unsigned ARMAsmPrinter::getDwarfRegOpSize(const MachineLocation &MLoc) const { + const TargetRegisterInfo *RI = TM.getRegisterInfo(); + if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1) + return AsmPrinter::getDwarfRegOpSize(MLoc); + else { + unsigned Reg = MLoc.getReg(); + if (Reg >= ARM::S0 && Reg <= ARM::S31) { + assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering"); + // S registers are described as bit-pieces of a register + // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0) + // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32) + + unsigned SReg = Reg - ARM::S0; + unsigned Rx = 256 + (SReg >> 1); + OutStreamer.AddComment("Loc expr size"); + // DW_OP_regx + ULEB + DW_OP_bit_piece + ULEB + ULEB + // 1 + ULEB(Rx) + 1 + 1 + 1 + return 4 + MCAsmInfo::getULEB128Size(Rx); + } + + if (Reg >= ARM::Q0 && Reg <= ARM::Q15) { + assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering"); + // Q registers Q0-Q15 are described by composing two D registers together. + // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1) DW_OP_piece(8) + + unsigned QReg = Reg - ARM::Q0; + unsigned D1 = 256 + 2 * QReg; + unsigned D2 = D1 + 1; + + OutStreamer.AddComment("Loc expr size"); + // DW_OP_regx + ULEB + DW_OP_piece + ULEB(8) + + // DW_OP_regx + ULEB + DW_OP_piece + ULEB(8); + // 6 + ULEB(D1) + ULEB(D2) + return 6 + MCAsmInfo::getULEB128Size(D1) + MCAsmInfo::getULEB128Size(D2); + } + } + return 0; +} + +/// EmitDwarfRegOp - Emit dwarf register operation. 
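Before the matching emission code below, the size computation in getDwarfRegOpSize above can be sanity-checked by hand, since it is pure arithmetic over ULEB128 lengths. A hedged sketch follows; the ULEB128 size helper is re-implemented locally (in-tree it is `MCAsmInfo::getULEB128Size`), and the S5/Q8 picks are arbitrary worked examples:

```cpp
#include <cassert>
#include <cstdio>

// Number of bytes a value occupies in unsigned LEB128 encoding
// (7 payload bits per byte).
static unsigned getULEB128Size(unsigned Value) {
  unsigned Size = 0;
  do { Value >>= 7; ++Size; } while (Value);
  return Size;
}

int main() {
  // S5 lives in the high half of D2: Rx = 256 + (5 >> 1) = 258.
  // Expression: DW_OP_regx ULEB(258) DW_OP_bit_piece ULEB(32) ULEB(32)
  //           = 1 + 2 + 1 + 1 + 1 = 6 bytes, i.e. 4 + ULEB128Size(Rx).
  unsigned Rx = 256 + (5 >> 1);
  assert(4 + getULEB128Size(Rx) == 6);

  // Q8 is the D16/D17 pair: D1 = 256 + 2*8 = 272, D2 = 273.
  // Expression: DW_OP_regx ULEB(272) DW_OP_piece ULEB(8), then the same
  // again for D2 = 6 + ULEB128Size(D1) + ULEB128Size(D2) = 10 bytes.
  unsigned D1 = 256 + 2 * 8, D2 = D1 + 1;
  std::printf("Q8 loc expr size = %u\n",
              6 + getULEB128Size(D1) + getULEB128Size(D2));
}
```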
+void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const { + const TargetRegisterInfo *RI = TM.getRegisterInfo(); + if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1) + AsmPrinter::EmitDwarfRegOp(MLoc); + else { + unsigned Reg = MLoc.getReg(); + if (Reg >= ARM::S0 && Reg <= ARM::S31) { + assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering"); + // S registers are described as bit-pieces of a register + // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0) + // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32) + + unsigned SReg = Reg - ARM::S0; + bool odd = SReg & 0x1; + unsigned Rx = 256 + (SReg >> 1); + OutStreamer.AddComment("Loc expr size"); + // DW_OP_regx + ULEB + DW_OP_bit_piece + ULEB + ULEB + // 1 + ULEB(Rx) + 1 + 1 + 1 + EmitInt16(4 + MCAsmInfo::getULEB128Size(Rx)); + + OutStreamer.AddComment("DW_OP_regx for S register"); + EmitInt8(dwarf::DW_OP_regx); + + OutStreamer.AddComment(Twine(SReg)); + EmitULEB128(Rx); + + if (odd) { + OutStreamer.AddComment("DW_OP_bit_piece 32 32"); + EmitInt8(dwarf::DW_OP_bit_piece); + EmitULEB128(32); + EmitULEB128(32); + } else { + OutStreamer.AddComment("DW_OP_bit_piece 32 0"); + EmitInt8(dwarf::DW_OP_bit_piece); + EmitULEB128(32); + EmitULEB128(0); + } + } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) { + assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering"); + // Q registers Q0-Q15 are described by composing two D registers together. + // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1) DW_OP_piece(8) + + unsigned QReg = Reg - ARM::Q0; + unsigned D1 = 256 + 2 * QReg; + unsigned D2 = D1 + 1; + + OutStreamer.AddComment("Loc expr size"); + // DW_OP_regx + ULEB + DW_OP_piece + ULEB(8) + + // DW_OP_regx + ULEB + DW_OP_piece + ULEB(8); + // 6 + ULEB(D1) + ULEB(D2) + EmitInt16(6 + MCAsmInfo::getULEB128Size(D1) + MCAsmInfo::getULEB128Size(D2)); + + OutStreamer.AddComment("DW_OP_regx for Q register: D1"); + EmitInt8(dwarf::DW_OP_regx); + EmitULEB128(D1); + OutStreamer.AddComment("DW_OP_piece 8"); + EmitInt8(dwarf::DW_OP_piece); + EmitULEB128(8); + + OutStreamer.AddComment("DW_OP_regx for Q register: D2"); + EmitInt8(dwarf::DW_OP_regx); + EmitULEB128(D2); + OutStreamer.AddComment("DW_OP_piece 8"); + EmitInt8(dwarf::DW_OP_piece); + EmitULEB128(8); + } + } +} + void ARMAsmPrinter::EmitFunctionEntryLabel() { if (AFI->isThumbFunction()) { OutStreamer.EmitAssemblerFlag(MCAF_Code16); @@ -453,10 +569,13 @@ void ARMAsmPrinter::emitAttributes() { emitARMAttributeSection(); + /* GAS expect .fpu to be emitted, regardless of VFP build attribute */ + bool emitFPU = false; AttributeEmitter *AttrEmitter; - if (OutStreamer.hasRawTextSupport()) + if (OutStreamer.hasRawTextSupport()) { AttrEmitter = new AsmAttributeEmitter(OutStreamer); - else { + emitFPU = true; + } else { MCObjectStreamer &O = static_cast<MCObjectStreamer&>(OutStreamer); AttrEmitter = new ObjectAttributeEmitter(O); } @@ -490,10 +609,36 @@ void ARMAsmPrinter::emitAttributes() { ARMBuildAttrs::Allowed); } - // FIXME: Emit FPU type - if (Subtarget->hasVFP2()) + if (Subtarget->hasNEON() && emitFPU) { + /* NEON is not exactly a VFP architecture, but GAS emit one of + * neon/vfpv3/vfpv2 for .fpu parameters */ + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon"); + /* If emitted for NEON, omit from VFP below, since you can have both + * NEON and VFP in build attributes but only one .fpu */ + emitFPU = false; + } + + /* VFPv3 + .fpu */ + if (Subtarget->hasVFP3()) { + 
AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch, + ARMBuildAttrs::AllowFPv3A); + if (emitFPU) + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv3"); + + /* VFPv2 + .fpu */ + } else if (Subtarget->hasVFP2()) { AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch, ARMBuildAttrs::AllowFPv2); + if (emitFPU) + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv2"); + } + + /* TODO: ARMBuildAttrs::Allowed is not completely accurate, + * since NEON can have 1 (allowed) or 2 (fused MAC operations) */ + if (Subtarget->hasNEON()) { + AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch, + ARMBuildAttrs::Allowed); + } // Signal various FP modes. if (!UnsafeFPMath) { @@ -777,10 +922,161 @@ void ARMAsmPrinter::EmitPatchedInstruction(const MachineInstr *MI, OutStreamer.EmitInstruction(TmpInst); } +void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { + assert(MI->getFlag(MachineInstr::FrameSetup) && + "Only instruction which are involved into frame setup code are allowed"); + + const MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>(); + + unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned Opc = MI->getOpcode(); + unsigned SrcReg, DstReg; + + if (Opc == ARM::tPUSH || Opc == ARM::tLDRpci) { + // Two special cases: + // 1) tPUSH does not have src/dst regs. + // 2) for Thumb1 code we sometimes materialize the constant via constpool + // load. Yes, this is pretty fragile, but for now I don't see better + // way... :( + SrcReg = DstReg = ARM::SP; + } else { + SrcReg = MI->getOperand(1).getReg(); + DstReg = MI->getOperand(0).getReg(); + } + + // Try to figure out the unwinding opcode out of src / dst regs. + if (MI->getDesc().mayStore()) { + // Register saves. + assert(DstReg == ARM::SP && + "Only stack pointer as a destination reg is supported"); + + SmallVector<unsigned, 4> RegList; + // Skip src & dst reg, and pred ops. + unsigned StartOp = 2 + 2; + // Use all the operands. + unsigned NumOffset = 0; + + switch (Opc) { + default: + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + case ARM::tPUSH: + // Special case here: no src & dst reg, but two extra imp ops. + StartOp = 2; NumOffset = 2; + case ARM::STMDB_UPD: + case ARM::t2STMDB_UPD: + case ARM::VSTMDDB_UPD: + assert(SrcReg == ARM::SP && + "Only stack pointer as a source reg is supported"); + for (unsigned i = StartOp, NumOps = MI->getNumOperands() - NumOffset; + i != NumOps; ++i) + RegList.push_back(MI->getOperand(i).getReg()); + break; + case ARM::STR_PRE: + assert(MI->getOperand(2).getReg() == ARM::SP && + "Only stack pointer as a source reg is supported"); + RegList.push_back(SrcReg); + break; + } + OutStreamer.EmitRegSave(RegList, Opc == ARM::VSTMDDB_UPD); + } else { + // Changes of stack / frame pointer. 
+ if (SrcReg == ARM::SP) { + int64_t Offset = 0; + switch (Opc) { + default: + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + case ARM::MOVr: + case ARM::tMOVgpr2gpr: + case ARM::tMOVgpr2tgpr: + Offset = 0; + break; + case ARM::ADDri: + Offset = -MI->getOperand(2).getImm(); + break; + case ARM::SUBri: + case ARM::t2SUBrSPi: + Offset = MI->getOperand(2).getImm(); + break; + case ARM::tSUBspi: + Offset = MI->getOperand(2).getImm()*4; + break; + case ARM::tADDspi: + case ARM::tADDrSPi: + Offset = -MI->getOperand(2).getImm()*4; + break; + case ARM::tLDRpci: { + // Grab the constpool index and check, whether it corresponds to + // original or cloned constpool entry. + unsigned CPI = MI->getOperand(1).getIndex(); + const MachineConstantPool *MCP = MF.getConstantPool(); + if (CPI >= MCP->getConstants().size()) + CPI = AFI.getOriginalCPIdx(CPI); + assert(CPI != -1U && "Invalid constpool index"); + + // Derive the actual offset. + const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI]; + assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry"); + // FIXME: Check for user, it should be "add" instruction! + Offset = -cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue(); + break; + } + } + + if (DstReg == FramePtr && FramePtr != ARM::SP) + // Set-up of the frame pointer. Positive values correspond to "add" + // instruction. + OutStreamer.EmitSetFP(FramePtr, ARM::SP, -Offset); + else if (DstReg == ARM::SP) { + // Change of SP by an offset. Positive values correspond to "sub" + // instruction. + OutStreamer.EmitPad(Offset); + } else { + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + } + } else if (DstReg == ARM::SP) { + // FIXME: .movsp goes here + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + } + else { + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + } + } +} + +extern cl::opt<bool> EnableARMEHABI; + void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { unsigned Opc = MI->getOpcode(); switch (Opc) { default: break; + case ARM::B: { + // B is just a Bcc with an 'always' predicate. + MCInst TmpInst; + LowerARMMachineInstrToMCInst(MI, TmpInst, *this); + TmpInst.setOpcode(ARM::Bcc); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::LDMIA_RET: { + // LDMIA_RET is just a normal LDMIA_UPD instruction that targets PC and as + // such has additional code-gen properties and scheduling information. + // To emit it, we just construct as normal and set the opcode to LDMIA_UPD. + MCInst TmpInst; + LowerARMMachineInstrToMCInst(MI, TmpInst, *this); + TmpInst.setOpcode(ARM::LDMIA_UPD); + OutStreamer.EmitInstruction(TmpInst); + return; + } case ARM::t2ADDrSPi: case ARM::t2ADDrSPi12: case ARM::t2SUBrSPi: @@ -850,6 +1146,26 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { OutStreamer.EmitInstruction(TmpInst); return; } + // Darwin call instructions are just normal call instructions with different + // clobber semantics (they clobber R9). 
+ case ARM::BLr9: + case ARM::BLr9_pred: + case ARM::BLXr9: + case ARM::BLXr9_pred: { + unsigned newOpc; + switch (Opc) { + default: assert(0); + case ARM::BLr9: newOpc = ARM::BL; break; + case ARM::BLr9_pred: newOpc = ARM::BL_pred; break; + case ARM::BLXr9: newOpc = ARM::BLX; break; + case ARM::BLXr9_pred: newOpc = ARM::BLX_pred; break; + } + MCInst TmpInst; + LowerARMMachineInstrToMCInst(MI, TmpInst, *this); + TmpInst.setOpcode(newOpc); + OutStreamer.EmitInstruction(TmpInst); + return; + } case ARM::BXr9_CALL: case ARM::BX_CALL: { { @@ -1502,6 +1818,49 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { } return; } + // Tail jump branches are really just branch instructions with additional + // code-gen attributes. Convert them to the canonical form here. + case ARM::TAILJMPd: + case ARM::TAILJMPdND: { + MCInst TmpInst, TmpInst2; + // Lower the instruction as-is to get the operands properly converted. + LowerARMMachineInstrToMCInst(MI, TmpInst2, *this); + TmpInst.setOpcode(ARM::Bcc); + TmpInst.addOperand(TmpInst2.getOperand(0)); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.AddComment("TAILCALL"); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::tTAILJMPd: + case ARM::tTAILJMPdND: { + MCInst TmpInst, TmpInst2; + LowerARMMachineInstrToMCInst(MI, TmpInst2, *this); + TmpInst.setOpcode(ARM::tB); + TmpInst.addOperand(TmpInst2.getOperand(0)); + OutStreamer.AddComment("TAILCALL"); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::TAILJMPrND: + case ARM::tTAILJMPrND: + case ARM::TAILJMPr: + case ARM::tTAILJMPr: { + unsigned newOpc = (Opc == ARM::TAILJMPr || Opc == ARM::TAILJMPrND) + ? ARM::BX : ARM::tBX; + MCInst TmpInst; + TmpInst.setOpcode(newOpc); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.AddComment("TAILCALL"); + OutStreamer.EmitInstruction(TmpInst); + return; + } + // These are the pseudos created to comply with stricter operand restrictions // on ARMv5. Lower them now to "normal" instructions, since all the // restrictions are already satisfied. @@ -1530,6 +1889,11 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst TmpInst; LowerARMMachineInstrToMCInst(MI, TmpInst, *this); + + // Emit unwinding stuff for frame-related instructions + if (EnableARMEHABI && MI->getFlag(MachineInstr::FrameSetup)) + EmitUnwindingInstruction(MI); + OutStreamer.EmitInstruction(TmpInst); } @@ -1538,10 +1902,11 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { //===----------------------------------------------------------------------===// static MCInstPrinter *createARMMCInstPrinter(const Target &T, + TargetMachine &TM, unsigned SyntaxVariant, const MCAsmInfo &MAI) { if (SyntaxVariant == 0) - return new ARMInstPrinter(MAI); + return new ARMInstPrinter(TM, MAI); return 0; } diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index 5852684..1ee1b70 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -82,11 +82,20 @@ private: // Generic helper used to emit e.g. 
ARMv5 mul pseudos void EmitPatchedInstruction(const MachineInstr *MI, unsigned TargetOpc); + void EmitUnwindingInstruction(const MachineInstr *MI); + public: void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); MachineLocation getDebugValueLocation(const MachineInstr *MI) const; + /// getDwarfRegOpSize - get size required to emit given machine location + /// using dwarf encoding. + virtual unsigned getDwarfRegOpSize(const MachineLocation &MLoc) const; + + /// EmitDwarfRegOp - Emit dwarf register operation. + virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const; + virtual unsigned getISAEncoding() { // ARM/Darwin adds ISA to the DWARF info for each function. if (!Subtarget->isTargetDarwin()) diff --git a/lib/Target/ARM/ARMBaseInfo.h b/lib/Target/ARM/ARMBaseInfo.h index a56cc1a..36edbad 100644 --- a/lib/Target/ARM/ARMBaseInfo.h +++ b/lib/Target/ARM/ARMBaseInfo.h @@ -200,6 +200,59 @@ inline static unsigned getARMRegisterNumbering(unsigned Reg) { } namespace ARMII { + + /// ARM Index Modes + enum IndexMode { + IndexModeNone = 0, + IndexModePre = 1, + IndexModePost = 2, + IndexModeUpd = 3 + }; + + /// ARM Addressing Modes + enum AddrMode { + AddrModeNone = 0, + AddrMode1 = 1, + AddrMode2 = 2, + AddrMode3 = 3, + AddrMode4 = 4, + AddrMode5 = 5, + AddrMode6 = 6, + AddrModeT1_1 = 7, + AddrModeT1_2 = 8, + AddrModeT1_4 = 9, + AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data + AddrModeT2_i12 = 11, + AddrModeT2_i8 = 12, + AddrModeT2_so = 13, + AddrModeT2_pc = 14, // +/- i12 for pc relative data + AddrModeT2_i8s4 = 15, // i8 * 4 + AddrMode_i12 = 16 + }; + + inline static const char *AddrModeToString(AddrMode addrmode) { + switch (addrmode) { + default: llvm_unreachable("Unknown memory operation"); + case AddrModeNone: return "AddrModeNone"; + case AddrMode1: return "AddrMode1"; + case AddrMode2: return "AddrMode2"; + case AddrMode3: return "AddrMode3"; + case AddrMode4: return "AddrMode4"; + case AddrMode5: return "AddrMode5"; + case AddrMode6: return "AddrMode6"; + case AddrModeT1_1: return "AddrModeT1_1"; + case AddrModeT1_2: return "AddrModeT1_2"; + case AddrModeT1_4: return "AddrModeT1_4"; + case AddrModeT1_s: return "AddrModeT1_s"; + case AddrModeT2_i12: return "AddrModeT2_i12"; + case AddrModeT2_i8: return "AddrModeT2_i8"; + case AddrModeT2_so: return "AddrModeT2_so"; + case AddrModeT2_pc: return "AddrModeT2_pc"; + case AddrModeT2_i8s4: return "AddrModeT2_i8s4"; + case AddrMode_i12: return "AddrMode_i12"; + } + } + /// Target Operand Flag enum. 
enum TOF { //===------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 2268e59..44a3976 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1021,7 +1021,7 @@ reMaterialize(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode), DestReg) .addConstantPoolIndex(CPI).addImm(PCLabelId); - (*MIB).setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end()); + MIB->setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end()); break; } } @@ -1080,11 +1080,18 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, int CPI1 = MO1.getIndex(); const MachineConstantPoolEntry &MCPE0 = MCP->getConstants()[CPI0]; const MachineConstantPoolEntry &MCPE1 = MCP->getConstants()[CPI1]; - ARMConstantPoolValue *ACPV0 = - static_cast<ARMConstantPoolValue*>(MCPE0.Val.MachineCPVal); - ARMConstantPoolValue *ACPV1 = - static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal); - return ACPV0->hasSameValue(ACPV1); + bool isARMCP0 = MCPE0.isMachineConstantPoolEntry(); + bool isARMCP1 = MCPE1.isMachineConstantPoolEntry(); + if (isARMCP0 && isARMCP1) { + ARMConstantPoolValue *ACPV0 = + static_cast<ARMConstantPoolValue*>(MCPE0.Val.MachineCPVal); + ARMConstantPoolValue *ACPV1 = + static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal); + return ACPV0->hasSameValue(ACPV1); + } else if (!isARMCP0 && !isARMCP1) { + return MCPE0.Val.ConstVal == MCPE1.Val.ConstVal; + } + return false; } else if (Opcode == ARM::PICLDR) { if (MI1->getOpcode() != Opcode) return false; @@ -1194,7 +1201,7 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, } /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to -/// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should +/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should /// be scheduled togther. On some targets if two loads are loading from /// addresses in the same cache line, it's better if they are scheduled /// together. This function takes two integers that represent the load offsets @@ -1263,19 +1270,19 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, } bool ARMBaseInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, + unsigned NumCycles, unsigned ExtraPredCycles, float Probability, float Confidence) const { - if (!NumCyles) + if (!NumCycles) return false; // Attempt to estimate the relative costs of predication versus branching. - float UnpredCost = Probability * NumCyles; + float UnpredCost = Probability * NumCycles; UnpredCost += 1.0; // The branch itself UnpredCost += (1.0 - Confidence) * Subtarget.getMispredictionPenalty(); - return (float)(NumCyles + ExtraPredCycles) < UnpredCost; + return (float)(NumCycles + ExtraPredCycles) < UnpredCost; } bool ARMBaseInstrInfo:: @@ -1328,7 +1335,7 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg, - const ARMBaseInstrInfo &TII) { + const ARMBaseInstrInfo &TII, unsigned MIFlags) { bool isSub = NumBytes < 0; if (isSub) NumBytes = -NumBytes; @@ -1346,7 +1353,8 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, unsigned Opc = isSub ? 
ARM::SUBri : ARM::ADDri; BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) .addReg(BaseReg, RegState::Kill).addImm(ThisVal) - .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + .addImm((unsigned)Pred).addReg(PredReg).addReg(0) + .setMIFlags(MIFlags); BaseReg = DestReg; } } @@ -1610,18 +1618,84 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, // Set the "zero" bit in CPSR. switch (MI->getOpcode()) { default: break; + case ARM::RSBrr: + case ARM::RSBri: + case ARM::RSCrr: + case ARM::RSCri: + case ARM::ADDrr: case ARM::ADDri: - case ARM::ANDri: - case ARM::t2ANDri: + case ARM::ADCrr: + case ARM::ADCri: + case ARM::SUBrr: case ARM::SUBri: + case ARM::SBCrr: + case ARM::SBCri: + case ARM::t2RSBri: + case ARM::t2ADDrr: case ARM::t2ADDri: + case ARM::t2ADCrr: + case ARM::t2ADCri: + case ARM::t2SUBrr: case ARM::t2SUBri: + case ARM::t2SBCrr: + case ARM::t2SBCri: + case ARM::ANDrr: + case ARM::ANDri: + case ARM::t2ANDrr: + case ARM::t2ANDri: + case ARM::ORRrr: + case ARM::ORRri: + case ARM::t2ORRrr: + case ARM::t2ORRri: + case ARM::EORrr: + case ARM::EORri: + case ARM::t2EORrr: + case ARM::t2EORri: { + // Scan forward for the use of CPSR, if it's a conditional code requires + // checking of V bit, then this is not safe to do. If we can't find the + // CPSR use (i.e. used in another block), then it's not safe to perform + // the optimization. + bool isSafe = false; + I = CmpInstr; + E = MI->getParent()->end(); + while (!isSafe && ++I != E) { + const MachineInstr &Instr = *I; + for (unsigned IO = 0, EO = Instr.getNumOperands(); + !isSafe && IO != EO; ++IO) { + const MachineOperand &MO = Instr.getOperand(IO); + if (!MO.isReg() || MO.getReg() != ARM::CPSR) + continue; + if (MO.isDef()) { + isSafe = true; + break; + } + // Condition code is after the operand before CPSR. + ARMCC::CondCodes CC = (ARMCC::CondCodes)Instr.getOperand(IO-1).getImm(); + switch (CC) { + default: + isSafe = true; + break; + case ARMCC::VS: + case ARMCC::VC: + case ARMCC::GE: + case ARMCC::LT: + case ARMCC::GT: + case ARMCC::LE: + return false; + } + } + } + + if (!isSafe) + return false; + // Toggle the optional operand to CPSR. MI->getOperand(5).setReg(ARM::CPSR); MI->getOperand(5).setIsDef(true); CmpInstr->eraseFromParent(); return true; } + } return false; } @@ -1741,9 +1815,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, llvm_unreachable("Unexpected multi-uops instruction!"); break; case ARM::VLDMQIA: - case ARM::VLDMQDB: case ARM::VSTMQIA: - case ARM::VSTMQDB: return 2; // The number of uOps for load / store multiple are determined by the number @@ -1757,19 +1829,15 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, // is not 64-bit aligned, then AGU would take an extra cycle. For VFP / NEON // load / store multiple, the formula is (#reg / 2) + (#reg % 2) + 1. 
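The uop formula quoted in that comment, (#reg / 2) + (#reg % 2) + 1, is easy to tabulate before reading the long opcode list that follows. My reading is one uop per 64-bit register pair, one for a trailing odd register, plus one more; the patch itself does not spell out that interpretation. A one-line sketch:

```cpp
#include <cstdio>

// uop count for VFP/NEON load/store multiple, per the comment above:
// (#reg / 2) + (#reg % 2) + 1.
static unsigned vldmMicroOps(unsigned NumRegs) {
  return NumRegs / 2 + NumRegs % 2 + 1;
}

int main() {
  for (unsigned n = 1; n <= 6; ++n)
    std::printf("%u regs -> %u uops\n", n, vldmMicroOps(n));
  // Prints: 1 -> 2, 2 -> 2, 3 -> 3, 4 -> 3, 5 -> 4, 6 -> 4
}
```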
case ARM::VLDMDIA: - case ARM::VLDMDDB: case ARM::VLDMDIA_UPD: case ARM::VLDMDDB_UPD: case ARM::VLDMSIA: - case ARM::VLDMSDB: case ARM::VLDMSIA_UPD: case ARM::VLDMSDB_UPD: case ARM::VSTMDIA: - case ARM::VSTMDDB: case ARM::VSTMDIA_UPD: case ARM::VSTMDDB_UPD: case ARM::VSTMSIA: - case ARM::VSTMSDB: case ARM::VSTMSIA_UPD: case ARM::VSTMSDB_UPD: { unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands(); @@ -1859,7 +1927,6 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData, switch (DefTID.getOpcode()) { default: break; case ARM::VLDMSIA: - case ARM::VLDMSDB: case ARM::VLDMSIA_UPD: case ARM::VLDMSDB_UPD: isSLoad = true; @@ -1935,7 +2002,6 @@ ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData, switch (UseTID.getOpcode()) { default: break; case ARM::VSTMSIA: - case ARM::VSTMSDB: case ARM::VSTMSIA_UPD: case ARM::VSTMSDB_UPD: isSStore = true; @@ -2006,11 +2072,9 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, break; case ARM::VLDMDIA: - case ARM::VLDMDDB: case ARM::VLDMDIA_UPD: case ARM::VLDMDDB_UPD: case ARM::VLDMSIA: - case ARM::VLDMSDB: case ARM::VLDMSIA_UPD: case ARM::VLDMSDB_UPD: DefCycle = getVLDMDefCycle(ItinData, DefTID, DefClass, DefIdx, DefAlign); @@ -2049,11 +2113,9 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, break; case ARM::VSTMDIA: - case ARM::VSTMDDB: case ARM::VSTMDIA_UPD: case ARM::VSTMDDB_UPD: case ARM::VSTMSIA: - case ARM::VSTMSDB: case ARM::VSTMSIA_UPD: case ARM::VSTMSDB_UPD: UseCycle = getVSTMUseCycle(ItinData, UseTID, UseClass, UseIdx, UseAlign); @@ -2160,6 +2222,101 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, } } + if (DefAlign < 8 && Subtarget.isCortexA9()) + switch (DefTID.getOpcode()) { + default: break; + case ARM::VLD1q8: + case ARM::VLD1q16: + case ARM::VLD1q32: + case ARM::VLD1q64: + case ARM::VLD1q8_UPD: + case ARM::VLD1q16_UPD: + case ARM::VLD1q32_UPD: + case ARM::VLD1q64_UPD: + case ARM::VLD2d8: + case ARM::VLD2d16: + case ARM::VLD2d32: + case ARM::VLD2q8: + case ARM::VLD2q16: + case ARM::VLD2q32: + case ARM::VLD2d8_UPD: + case ARM::VLD2d16_UPD: + case ARM::VLD2d32_UPD: + case ARM::VLD2q8_UPD: + case ARM::VLD2q16_UPD: + case ARM::VLD2q32_UPD: + case ARM::VLD3d8: + case ARM::VLD3d16: + case ARM::VLD3d32: + case ARM::VLD1d64T: + case ARM::VLD3d8_UPD: + case ARM::VLD3d16_UPD: + case ARM::VLD3d32_UPD: + case ARM::VLD1d64T_UPD: + case ARM::VLD3q8_UPD: + case ARM::VLD3q16_UPD: + case ARM::VLD3q32_UPD: + case ARM::VLD4d8: + case ARM::VLD4d16: + case ARM::VLD4d32: + case ARM::VLD1d64Q: + case ARM::VLD4d8_UPD: + case ARM::VLD4d16_UPD: + case ARM::VLD4d32_UPD: + case ARM::VLD1d64Q_UPD: + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + case ARM::VLD1DUPq8: + case ARM::VLD1DUPq16: + case ARM::VLD1DUPq32: + case ARM::VLD1DUPq8_UPD: + case ARM::VLD1DUPq16_UPD: + case ARM::VLD1DUPq32_UPD: + case ARM::VLD2DUPd8: + case ARM::VLD2DUPd16: + case ARM::VLD2DUPd32: + case ARM::VLD2DUPd8_UPD: + case ARM::VLD2DUPd16_UPD: + case ARM::VLD2DUPd32_UPD: + case ARM::VLD4DUPd8: + case ARM::VLD4DUPd16: + case ARM::VLD4DUPd32: + case ARM::VLD4DUPd8_UPD: + case ARM::VLD4DUPd16_UPD: + case ARM::VLD4DUPd32_UPD: + case ARM::VLD1LNd8: + case ARM::VLD1LNd16: + case ARM::VLD1LNd32: + case ARM::VLD1LNd8_UPD: + case ARM::VLD1LNd16_UPD: + case ARM::VLD1LNd32_UPD: + case ARM::VLD2LNd8: + case ARM::VLD2LNd16: + case ARM::VLD2LNd32: + case ARM::VLD2LNq16: + case ARM::VLD2LNq32: + case ARM::VLD2LNd8_UPD: + case ARM::VLD2LNd16_UPD: + case 
ARM::VLD2LNd32_UPD: + case ARM::VLD2LNq16_UPD: + case ARM::VLD2LNq32_UPD: + case ARM::VLD4LNd8: + case ARM::VLD4LNd16: + case ARM::VLD4LNd32: + case ARM::VLD4LNq16: + case ARM::VLD4LNq32: + case ARM::VLD4LNd8_UPD: + case ARM::VLD4LNd16_UPD: + case ARM::VLD4LNd32_UPD: + case ARM::VLD4LNq16_UPD: + case ARM::VLD4LNq32_UPD: + // If the address is not 64-bit aligned, the latencies of these + // instructions increases by one. + ++Latency; + break; + } + return Latency; } @@ -2226,6 +2383,113 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, } } + if (DefAlign < 8 && Subtarget.isCortexA9()) + switch (DefTID.getOpcode()) { + default: break; + case ARM::VLD1q8Pseudo: + case ARM::VLD1q16Pseudo: + case ARM::VLD1q32Pseudo: + case ARM::VLD1q64Pseudo: + case ARM::VLD1q8Pseudo_UPD: + case ARM::VLD1q16Pseudo_UPD: + case ARM::VLD1q32Pseudo_UPD: + case ARM::VLD1q64Pseudo_UPD: + case ARM::VLD2d8Pseudo: + case ARM::VLD2d16Pseudo: + case ARM::VLD2d32Pseudo: + case ARM::VLD2q8Pseudo: + case ARM::VLD2q16Pseudo: + case ARM::VLD2q32Pseudo: + case ARM::VLD2d8Pseudo_UPD: + case ARM::VLD2d16Pseudo_UPD: + case ARM::VLD2d32Pseudo_UPD: + case ARM::VLD2q8Pseudo_UPD: + case ARM::VLD2q16Pseudo_UPD: + case ARM::VLD2q32Pseudo_UPD: + case ARM::VLD3d8Pseudo: + case ARM::VLD3d16Pseudo: + case ARM::VLD3d32Pseudo: + case ARM::VLD1d64TPseudo: + case ARM::VLD3d8Pseudo_UPD: + case ARM::VLD3d16Pseudo_UPD: + case ARM::VLD3d32Pseudo_UPD: + case ARM::VLD1d64TPseudo_UPD: + case ARM::VLD3q8Pseudo_UPD: + case ARM::VLD3q16Pseudo_UPD: + case ARM::VLD3q32Pseudo_UPD: + case ARM::VLD3q8oddPseudo: + case ARM::VLD3q16oddPseudo: + case ARM::VLD3q32oddPseudo: + case ARM::VLD3q8oddPseudo_UPD: + case ARM::VLD3q16oddPseudo_UPD: + case ARM::VLD3q32oddPseudo_UPD: + case ARM::VLD4d8Pseudo: + case ARM::VLD4d16Pseudo: + case ARM::VLD4d32Pseudo: + case ARM::VLD1d64QPseudo: + case ARM::VLD4d8Pseudo_UPD: + case ARM::VLD4d16Pseudo_UPD: + case ARM::VLD4d32Pseudo_UPD: + case ARM::VLD1d64QPseudo_UPD: + case ARM::VLD4q8Pseudo_UPD: + case ARM::VLD4q16Pseudo_UPD: + case ARM::VLD4q32Pseudo_UPD: + case ARM::VLD4q8oddPseudo: + case ARM::VLD4q16oddPseudo: + case ARM::VLD4q32oddPseudo: + case ARM::VLD4q8oddPseudo_UPD: + case ARM::VLD4q16oddPseudo_UPD: + case ARM::VLD4q32oddPseudo_UPD: + case ARM::VLD1DUPq8Pseudo: + case ARM::VLD1DUPq16Pseudo: + case ARM::VLD1DUPq32Pseudo: + case ARM::VLD1DUPq8Pseudo_UPD: + case ARM::VLD1DUPq16Pseudo_UPD: + case ARM::VLD1DUPq32Pseudo_UPD: + case ARM::VLD2DUPd8Pseudo: + case ARM::VLD2DUPd16Pseudo: + case ARM::VLD2DUPd32Pseudo: + case ARM::VLD2DUPd8Pseudo_UPD: + case ARM::VLD2DUPd16Pseudo_UPD: + case ARM::VLD2DUPd32Pseudo_UPD: + case ARM::VLD4DUPd8Pseudo: + case ARM::VLD4DUPd16Pseudo: + case ARM::VLD4DUPd32Pseudo: + case ARM::VLD4DUPd8Pseudo_UPD: + case ARM::VLD4DUPd16Pseudo_UPD: + case ARM::VLD4DUPd32Pseudo_UPD: + case ARM::VLD1LNq8Pseudo: + case ARM::VLD1LNq16Pseudo: + case ARM::VLD1LNq32Pseudo: + case ARM::VLD1LNq8Pseudo_UPD: + case ARM::VLD1LNq16Pseudo_UPD: + case ARM::VLD1LNq32Pseudo_UPD: + case ARM::VLD2LNd8Pseudo: + case ARM::VLD2LNd16Pseudo: + case ARM::VLD2LNd32Pseudo: + case ARM::VLD2LNq16Pseudo: + case ARM::VLD2LNq32Pseudo: + case ARM::VLD2LNd8Pseudo_UPD: + case ARM::VLD2LNd16Pseudo_UPD: + case ARM::VLD2LNd32Pseudo_UPD: + case ARM::VLD2LNq16Pseudo_UPD: + case ARM::VLD2LNq32Pseudo_UPD: + case ARM::VLD4LNd8Pseudo: + case ARM::VLD4LNd16Pseudo: + case ARM::VLD4LNd32Pseudo: + case ARM::VLD4LNq16Pseudo: + case ARM::VLD4LNq32Pseudo: + case ARM::VLD4LNd8Pseudo_UPD: + case ARM::VLD4LNd16Pseudo_UPD: + case 
ARM::VLD4LNd32Pseudo_UPD: + case ARM::VLD4LNq16Pseudo_UPD: + case ARM::VLD4LNq32Pseudo_UPD: + // If the address is not 64-bit aligned, the latencies of these + // instructions increases by one. + ++Latency; + break; + } + return Latency; } @@ -2264,9 +2528,7 @@ int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, default: return ItinData->getStageLatency(get(Opcode).getSchedClass()); case ARM::VLDMQIA: - case ARM::VLDMQDB: case ARM::VSTMQIA: - case ARM::VSTMQDB: return 2; } } diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 7e2183d..9a2faf8 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -34,25 +34,7 @@ namespace ARMII { //===------------------------------------------------------------------===// // This four-bit field describes the addressing mode used. - - AddrModeMask = 0x1f, - AddrModeNone = 0, - AddrMode1 = 1, - AddrMode2 = 2, - AddrMode3 = 3, - AddrMode4 = 4, - AddrMode5 = 5, - AddrMode6 = 6, - AddrModeT1_1 = 7, - AddrModeT1_2 = 8, - AddrModeT1_4 = 9, - AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data - AddrModeT2_i12 = 11, - AddrModeT2_i8 = 12, - AddrModeT2_so = 13, - AddrModeT2_pc = 14, // +/- i12 for pc relative data - AddrModeT2_i8s4 = 15, // i8 * 4 - AddrMode_i12 = 16, + AddrModeMask = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h // Size* - Flags to keep track of the size of an instruction. SizeShift = 5, @@ -64,11 +46,9 @@ namespace ARMII { // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load // and store ops only. Generic "updating" flag is used for ld/st multiple. + // The index mode enums are declared in ARMBaseInfo.h IndexModeShift = 8, IndexModeMask = 3 << IndexModeShift, - IndexModePre = 1, - IndexModePost = 2, - IndexModeUpd = 3, //===------------------------------------------------------------------===// // Instruction encoding formats. @@ -311,7 +291,7 @@ public: int64_t &Offset1, int64_t &Offset2)const; /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to - /// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should + /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should /// be scheduled togther. On some targets if two loads are loading from /// addresses in the same cache line, it's better if they are scheduled /// together. 
This function takes two integers that represent the load offsets @@ -327,7 +307,7 @@ public: const MachineFunction &MF) const; virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, unsigned ExtraPredCycles, + unsigned NumCycles, unsigned ExtraPredCycles, float Prob, float Confidence) const; virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB, @@ -337,10 +317,10 @@ public: float Probability, float Confidence) const; virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, + unsigned NumCycles, float Probability, float Confidence) const { - return NumCyles == 1; + return NumCycles == 1; } /// AnalyzeCompare - For a comparison instruction, return the source register @@ -496,19 +476,19 @@ void emitARMRegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg, - const ARMBaseInstrInfo &TII); + const ARMBaseInstrInfo &TII, unsigned MIFlags = 0); void emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg, - const ARMBaseInstrInfo &TII); + const ARMBaseInstrInfo &TII, unsigned MIFlags = 0); void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, const TargetInstrInfo &TII, const ARMBaseRegisterInfo& MRI, - DebugLoc dl); + unsigned MIFlags = 0); /// rewriteARMFrameIndex / rewriteT2FrameIndex - diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 67a4b7d..ea1f08a 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -88,7 +88,7 @@ BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - // FIXME: avoid re-calculating this everytime. + // FIXME: avoid re-calculating this every time. BitVector Reserved(getNumRegs()); Reserved.set(ARM::SP); Reserved.set(ARM::PC); @@ -342,12 +342,51 @@ ARMBaseRegisterInfo::canCombineSubRegIndices(const TargetRegisterClass *RC, return false; } +const TargetRegisterClass* +ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) + const { + const TargetRegisterClass *Super = RC; + TargetRegisterClass::sc_iterator I = RC->superclasses_begin(); + do { + switch (Super->getID()) { + case ARM::GPRRegClassID: + case ARM::SPRRegClassID: + case ARM::DPRRegClassID: + case ARM::QPRRegClassID: + case ARM::QQPRRegClassID: + case ARM::QQQQPRRegClassID: + return Super; + } + Super = *I++; + } while (Super); + return RC; +} const TargetRegisterClass * ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const { return ARM::GPRRegisterClass; } +unsigned +ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + switch (RC->getID()) { + default: + return 0; + case ARM::tGPRRegClassID: + return TFI->hasFP(MF) ? 4 : 5; + case ARM::GPRRegClassID: { + unsigned FP = TFI->hasFP(MF) ? 1 : 0; + return 10 - FP - (STI.isR9Reserved() ? 1 : 0); + } + case ARM::SPRRegClassID: // Currently not used as 'rep' register class. 
+ case ARM::DPRRegClassID: + return 32 - 10; + } +} + /// getAllocationOrder - Returns the register allocation order for a specified /// register class in the form of a pair of TargetRegisterClass iterators. std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> @@ -428,6 +467,10 @@ ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC, ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8 }; + // We only support even/odd hints for GPR and rGPR. + if (RC != ARM::GPRRegisterClass && RC != ARM::rGPRRegisterClass) + return std::make_pair(RC->allocation_order_begin(MF), + RC->allocation_order_end(MF)); if (HintType == ARMRI::RegPairEven) { if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0) @@ -530,6 +573,29 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg, } } +bool +ARMBaseRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const { + // CortexA9 has a Write-after-write hazard for NEON registers. + if (!STI.isCortexA9()) + return false; + + switch (RC->getID()) { + case ARM::DPRRegClassID: + case ARM::DPR_8RegClassID: + case ARM::DPR_VFP2RegClassID: + case ARM::QPRRegClassID: + case ARM::QPR_8RegClassID: + case ARM::QPR_VFP2RegClassID: + case ARM::SPRRegClassID: + case ARM::SPR_8RegClassID: + // Avoid reusing S, D, and Q registers. + // Don't increase register pressure for QQ and QQQQ. + return true; + default: + return false; + } +} + bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -806,7 +872,7 @@ emitLoadConstPool(MachineBasicBlock &MBB, DebugLoc dl, unsigned DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred, - unsigned PredReg) const { + unsigned PredReg, unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); MachineConstantPool *ConstantPool = MF.getConstantPool(); const Constant *C = @@ -816,7 +882,8 @@ emitLoadConstPool(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp)) .addReg(DestReg, getDefRegState(true), SubIdx) .addConstantPoolIndex(Idx) - .addImm(0).addImm(Pred).addReg(PredReg); + .addImm(0).addImm(Pred).addReg(PredReg) + .setMIFlags(MIFlags); } bool ARMBaseRegisterInfo:: diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index ba6bd2b..9edf72d 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -128,6 +128,12 @@ public: const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const; + std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> getAllocationOrder(const TargetRegisterClass *RC, unsigned HintType, unsigned HintReg, @@ -139,6 +145,8 @@ public: void UpdateRegAllocHint(unsigned Reg, unsigned NewReg, MachineFunction &MF) const; + virtual bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const; + bool hasBasePointer(const MachineFunction &MF) const; bool canRealignStack(const MachineFunction &MF) const; @@ -176,7 +184,8 @@ public: unsigned DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred = ARMCC::AL, - unsigned PredReg = 0) const; + unsigned PredReg = 0, + unsigned MIFlags = MachineInstr::NoFlags)const; /// Code Generation virtual methods... 
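The new getRegPressureLimit values above are small fixed budgets: for GPR the limit is 10 minus one for a dedicated frame pointer and one for a reserved R9, and for tGPR it is 4 or 5 depending on the frame pointer. A worked sketch under assumed subtarget settings; the booleans are hypothetical inputs rather than values queried from a real `MachineFunction`:

```cpp
#include <cstdio>

// Mirrors the GPR case of ARMBaseRegisterInfo::getRegPressureLimit:
// start from 10 allocatable GPRs, subtract one for a frame pointer
// and one for a reserved R9.
static unsigned gprPressureLimit(bool HasFP, bool R9Reserved) {
  return 10 - (HasFP ? 1 : 0) - (R9Reserved ? 1 : 0);
}

// Mirrors the tGPR case: Thumb1 low registers, minus one if a frame
// pointer is in use.
static unsigned tgprPressureLimit(bool HasFP) { return HasFP ? 4 : 5; }

int main() {
  // e.g. a Darwin-like target that reserves R9 and keeps a frame pointer.
  std::printf("GPR limit  = %u\n", gprPressureLimit(true, true));   // 8
  std::printf("tGPR limit = %u\n", tgprPressureLimit(true));        // 4
}
```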
virtual bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index 426ba13..d2981c0 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -22,6 +22,9 @@ class CCIfAlign<string Align, CCAction A>: //===----------------------------------------------------------------------===// def CC_ARM_APCS : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<4, 4>>, + CCIfType<[i8, i16], CCPromoteToType<i32>>, // Handle all vector types as either f64 or v2f64. diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index 9bbf6a0..fa73716 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -312,6 +312,15 @@ namespace { unsigned getRegisterListOpValue(const MachineInstr &MI, unsigned Op) const { return 0; } + unsigned getShiftRight8Imm(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getShiftRight16Imm(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getShiftRight32Imm(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getShiftRight64Imm(const MachineInstr &MI, unsigned Op) + const { return 0; } + /// getMovi32Value - Return binary encoding of operand for movw/movt. If the /// machine operand requires relocation, record the relocation and return /// zero. @@ -969,7 +978,7 @@ unsigned ARMCodeEmitter::getMachineSoImmOpValue(unsigned SoImm) { unsigned ARMCodeEmitter::getAddrModeSBit(const MachineInstr &MI, const TargetInstrDesc &TID) const { - for (unsigned i = MI.getNumOperands(), e = TID.getNumOperands(); i != e; --i){ + for (unsigned i = MI.getNumOperands(), e = TID.getNumOperands(); i >= e; --i){ const MachineOperand &MO = MI.getOperand(i-1); if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) return 1 << ARMII::S_BitShift; diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 13d1b33..baf95a3 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -1650,24 +1650,27 @@ bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) { unsigned BrOffset = GetOffsetOf(Br.MI) + 4 - 2; unsigned DestOffset = BBOffsets[DestBB->getNumber()]; if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) { - MachineBasicBlock::iterator CmpMI = Br.MI; --CmpMI; - if (CmpMI->getOpcode() == ARM::tCMPi8) { - unsigned Reg = CmpMI->getOperand(0).getReg(); - Pred = llvm::getInstrPredicate(CmpMI, PredReg); - if (Pred == ARMCC::AL && - CmpMI->getOperand(1).getImm() == 0 && - isARMLowRegister(Reg)) { - MachineBasicBlock *MBB = Br.MI->getParent(); - MachineInstr *NewBR = - BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc)) - .addReg(Reg).addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags()); - CmpMI->eraseFromParent(); - Br.MI->eraseFromParent(); - Br.MI = NewBR; - BBSizes[MBB->getNumber()] -= 2; - AdjustBBOffsetsAfter(MBB, -2); - ++NumCBZ; - MadeChange = true; + MachineBasicBlock::iterator CmpMI = Br.MI; + if (CmpMI != Br.MI->getParent()->begin()) { + --CmpMI; + if (CmpMI->getOpcode() == ARM::tCMPi8) { + unsigned Reg = CmpMI->getOperand(0).getReg(); + Pred = llvm::getInstrPredicate(CmpMI, PredReg); + if (Pred == ARMCC::AL && + CmpMI->getOperand(1).getImm() == 0 && + isARMLowRegister(Reg)) { + MachineBasicBlock *MBB = Br.MI->getParent(); + MachineInstr *NewBR = + BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc)) + 
.addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags()); + CmpMI->eraseFromParent(); + Br.MI->eraseFromParent(); + Br.MI = NewBR; + BBSizes[MBB->getNumber()] -= 2; + AdjustBBOffsetsAfter(MBB, -2); + ++NumCBZ; + MadeChange = true; + } } } } diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index bd753d2..b6b3c75 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -455,6 +455,10 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { // Add an implicit def for the super-register. MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); TransferImpOps(MI, MIB, MIB); + + // Transfer memoperands. + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MI.eraseFromParent(); } @@ -496,10 +500,13 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { MIB.addOperand(MI.getOperand(OpIdx++)); MIB.addOperand(MI.getOperand(OpIdx++)); - if (SrcIsKill) - // Add an implicit kill for the super-reg. - (*MIB).addRegisterKilled(SrcReg, TRI, true); + if (SrcIsKill) // Add an implicit kill for the super-reg. + MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); + + // Transfer memoperands. + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MI.eraseFromParent(); } @@ -622,9 +629,8 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, MIB.addOperand(MI.getOperand(OpIdx++)); MIB.addOperand(MI.getOperand(OpIdx++)); - if (SrcIsKill) - // Add an implicit kill for the super-reg. - (*MIB).addRegisterKilled(SrcReg, TRI, true); + if (SrcIsKill) // Add an implicit kill for the super-reg. + MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); } @@ -655,8 +661,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); LO16 = LO16.addImm(SOImmValV1); HI16 = HI16.addImm(SOImmValV2); - (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); LO16.addImm(Pred).addReg(PredReg).addReg(0); HI16.addImm(Pred).addReg(PredReg).addReg(0); TransferImpOps(MI, LO16, HI16); @@ -692,8 +698,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); } - (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); LO16.addImm(Pred).addReg(PredReg); HI16.addImm(Pred).addReg(PredReg); @@ -708,6 +714,78 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, switch (Opcode) { default: return false; + case ARM::VMOVScc: + case ARM::VMOVDcc: { + unsigned newOpc = Opcode == ARM::VMOVScc ? 
ARM::VMOVS : ARM::VMOVD; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc), + MI.getOperand(1).getReg()) + .addReg(MI.getOperand(2).getReg(), + getKillRegState(MI.getOperand(2).isKill())) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()); + + MI.eraseFromParent(); + return true; + } + case ARM::MOVCCr: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVr), + MI.getOperand(1).getReg()) + .addReg(MI.getOperand(2).getReg(), + getKillRegState(MI.getOperand(2).isKill())) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } + case ARM::MOVCCs: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVs), + (MI.getOperand(1).getReg())) + .addReg(MI.getOperand(2).getReg(), + getKillRegState(MI.getOperand(2).isKill())) + .addReg(MI.getOperand(3).getReg(), + getKillRegState(MI.getOperand(3).isKill())) + .addImm(MI.getOperand(4).getImm()) + .addImm(MI.getOperand(5).getImm()) // 'pred' + .addReg(MI.getOperand(6).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } + case ARM::MOVCCi16: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi16), + MI.getOperand(1).getReg()) + .addImm(MI.getOperand(2).getImm()) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()); + + MI.eraseFromParent(); + return true; + } + case ARM::MOVCCi: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), + MI.getOperand(1).getReg()) + .addImm(MI.getOperand(2).getImm()) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } + case ARM::MVNCCi: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MVNi), + MI.getOperand(1).getReg()) + .addImm(MI.getOperand(2).getImm()) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } case ARM::Int_eh_sjlj_dispatchsetup: { MachineFunction &MF = *MI.getParent()->getParent(); const ARMBaseInstrInfo *AII = @@ -726,9 +804,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, llvm::emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, FramePtr, -NumBytes, ARMCC::AL, 0, *TII); } else if (AFI->isThumbFunction()) { - llvm::emitThumbRegPlusImmediate(MBB, MBBI, ARM::R6, - FramePtr, -NumBytes, - *TII, RI, MI.getDebugLoc()); + llvm::emitThumbRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, *TII, RI); } else { llvm::emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, FramePtr, -NumBytes, ARMCC::AL, 0, @@ -785,7 +862,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, TII->get(ARM::BL)) .addExternalSymbol("__aeabi_read_tp", 0); - (*MIB).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); return true; @@ -800,7 +877,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg) .addOperand(MI.getOperand(1))); - (*MIB1).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPICADD)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) @@ -823,7 +900,7 @@ bool 
ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, const MachineOperand &MO1 = MI.getOperand(1); const GlobalValue *GV = MO1.getGlobal(); unsigned TF = MO1.getTargetFlags(); - bool isARM = Opcode != ARM::t2MOV_ga_pcrel; + bool isARM = (Opcode != ARM::t2MOV_ga_pcrel && Opcode != ARM::t2MOV_ga_dyn); bool isPIC = (Opcode != ARM::MOV_ga_dyn && Opcode != ARM::t2MOV_ga_dyn); unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel; unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel : ARM::t2MOVTi16_ga_pcrel; @@ -856,7 +933,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, if (isARM) { AddDefaultPred(MIB3); if (Opcode == ARM::MOV_ga_pcrel_ldr) - (*MIB2).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB2->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); } TransferImpOps(MI, MIB1, MIB3); MI.eraseFromParent(); @@ -896,9 +973,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, return true; } - case ARM::VLDMQIA: - case ARM::VLDMQDB: { - unsigned NewOpc = (Opcode == ARM::VLDMQIA) ? ARM::VLDMDIA : ARM::VLDMDDB; + case ARM::VLDMQIA: { + unsigned NewOpc = ARM::VLDMDIA; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); unsigned OpIdx = 0; @@ -927,9 +1003,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, return true; } - case ARM::VSTMQIA: - case ARM::VSTMQDB: { - unsigned NewOpc = (Opcode == ARM::VSTMQIA) ? ARM::VSTMDIA : ARM::VSTMDDB; + case ARM::VSTMQIA: { + unsigned NewOpc = ARM::VSTMDIA; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); unsigned OpIdx = 0; @@ -950,9 +1025,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); MIB.addReg(D0).addReg(D1); - if (SrcIsKill) - // Add an implicit kill for the Q register. - (*MIB).addRegisterKilled(SrcReg, TRI, true); + if (SrcIsKill) // Add an implicit kill for the Q register. + MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); @@ -960,14 +1034,16 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, } case ARM::VDUPfqf: case ARM::VDUPfdf:{ - unsigned NewOpc = Opcode == ARM::VDUPfqf ? ARM::VDUPLNfq : ARM::VDUPLNfd; + unsigned NewOpc = Opcode == ARM::VDUPfqf ? ARM::VDUPLN32q : + ARM::VDUPLN32d; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); unsigned OpIdx = 0; unsigned SrcReg = MI.getOperand(1).getReg(); unsigned Lane = getARMRegisterNumbering(SrcReg) & 1; unsigned DReg = TRI->getMatchingSuperReg(SrcReg, - Lane & 1 ? ARM::ssub_1 : ARM::ssub_0, &ARM::DPR_VFP2RegClass); + Lane & 1 ? ARM::ssub_1 : ARM::ssub_0, + &ARM::DPR_VFP2RegClass); // The lane is [0,1] for the containing DReg superregister. // Copy the dst/src register operands. 
MIB.addOperand(MI.getOperand(OpIdx++)); diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 26f48b3..3baf274 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "ARM.h" +#include "ARMAddressingModes.h" #include "ARMBaseInstrInfo.h" #include "ARMCallingConv.h" #include "ARMRegisterInfo.h" @@ -26,6 +27,7 @@ #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" +#include "llvm/Operator.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -115,6 +117,11 @@ class ARMFastISel : public FastISel { const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill); + virtual unsigned FastEmitInst_rrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + unsigned Op2, bool Op2IsKill); virtual unsigned FastEmitInst_ri(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, @@ -123,14 +130,18 @@ class ARMFastISel : public FastISel { const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, const ConstantFP *FPImm); - virtual unsigned FastEmitInst_i(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - uint64_t Imm); virtual unsigned FastEmitInst_rri(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill, uint64_t Imm); + virtual unsigned FastEmitInst_i(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm); + virtual unsigned FastEmitInst_ii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm1, uint64_t Imm2); + virtual unsigned FastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill, uint32_t Idx); @@ -193,6 +204,7 @@ class ARMFastISel : public FastISel { // OptionalDef handling routines. private: + bool isARMNEONPred(const MachineInstr *MI); bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR); const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB); void AddLoadStoreOperands(EVT VT, Address &Addr, @@ -221,6 +233,21 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { return true; } +bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) { + const TargetInstrDesc &TID = MI->getDesc(); + + // If we're a thumb2 or not NEON function we were handled via isPredicable. + if ((TID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON || + AFI->isThumb2Function()) + return false; + + for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i) + if (TID.OpInfo[i].isPredicate()) + return true; + + return false; +} + // If the machine is predicable go ahead and add the predicate operands, if // it needs default CC operands add those. // TODO: If we want to support thumb1 then we'll need to deal with optional @@ -230,8 +257,10 @@ const MachineInstrBuilder & ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { MachineInstr *MI = &*MIB; - // Do we use a predicate? - if (TII.isPredicable(MI)) + // Do we use a predicate? or... + // Are we NEON in ARM mode and have a predicate operand? If so, I know + // we're not predicable but add it anyways. + if (TII.isPredicable(MI) || isARMNEONPred(MI)) AddDefaultPred(MIB); // Do we optionally set a predicate? 
Preds is size > 0 iff the predicate @@ -296,6 +325,31 @@ unsigned ARMFastISel::FastEmitInst_rr(unsigned MachineInstOpcode, return ResultReg; } +unsigned ARMFastISel::FastEmitInst_rrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + unsigned Op2, bool Op2IsKill) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addReg(Op2, Op2IsKill * RegState::Kill)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addReg(Op2, Op2IsKill * RegState::Kill)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + unsigned ARMFastISel::FastEmitInst_ri(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, @@ -384,6 +438,26 @@ unsigned ARMFastISel::FastEmitInst_i(unsigned MachineInstOpcode, return ResultReg; } +unsigned ARMFastISel::FastEmitInst_ii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm1, uint64_t Imm2) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addImm(Imm1).addImm(Imm2)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addImm(Imm1).addImm(Imm2)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), + ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + unsigned ARMFastISel::FastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill, uint32_t Idx) { @@ -667,24 +741,29 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { TmpOffset += SL->getElementOffset(Idx); } else { uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); - SmallVector<const Value *, 4> Worklist; - Worklist.push_back(Op); - do { - Op = Worklist.pop_back_val(); + for (;;) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { // Constant-offset addressing. TmpOffset += CI->getSExtValue() * S; - } else if (isa<AddOperator>(Op) && - isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { - // An add with a constant operand. Fold the constant. + break; + } + if (isa<AddOperator>(Op) && + (!isa<Instruction>(Op) || + FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()] + == FuncInfo.MBB) && + isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { + // An add (in the same block) with a constant operand. Fold the + // constant. ConstantInt *CI = - cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); TmpOffset += CI->getSExtValue() * S; - // Add the other operand back to the work list. - Worklist.push_back(cast<AddOperator>(Op)->getOperand(0)); - } else - goto unsupported_gep; - } while (!Worklist.empty()); + // Iterate on the other operand. 
+ Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + // Unsupported + goto unsupported_gep; + } } } @@ -767,26 +846,9 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT) { // Since the offset is too large for the load/store instruction // get the reg+offset into a register. if (needsLowering) { - ARMCC::CondCodes Pred = ARMCC::AL; - unsigned PredReg = 0; - - TargetRegisterClass *RC = isThumb ? ARM::tGPRRegisterClass : - ARM::GPRRegisterClass; - unsigned BaseReg = createResultReg(RC); - - if (!isThumb) - emitARMRegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - BaseReg, Addr.Base.Reg, Addr.Offset, - Pred, PredReg, - static_cast<const ARMBaseInstrInfo&>(TII)); - else { - assert(AFI->isThumb2Function()); - emitT2RegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - BaseReg, Addr.Base.Reg, Addr.Offset, Pred, PredReg, - static_cast<const ARMBaseInstrInfo&>(TII)); - } + Addr.Base.Reg = FastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg, + /*Op0IsKill*/false, Addr.Offset, MVT::i32); Addr.Offset = 0; - Addr.Base.Reg = BaseReg; } } @@ -797,7 +859,7 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, if (VT.getSimpleVT().SimpleTy == MVT::f32 || VT.getSimpleVT().SimpleTy == MVT::f64) Addr.Offset /= 4; - + // Frame base works a bit differently. Handle it separately. if (Addr.BaseType == Address::FrameIndexBase) { int FI = Addr.Base.FI; @@ -819,7 +881,7 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, } else { // Now add the rest of the operands. MIB.addReg(Addr.Base.Reg); - + // ARM halfword load/stores need an additional operand. if (!isThumb && VT.getSimpleVT().SimpleTy == MVT::i16) MIB.addReg(0); @@ -1007,18 +1069,16 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { // behavior. // TODO: Factor this out. if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { - if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { - MVT VT; - const Type *Ty = CI->getOperand(0)->getType(); - if (!isTypeLegal(Ty, VT)) - return false; - + MVT SourceVT; + const Type *Ty = CI->getOperand(0)->getType(); + if (CI->hasOneUse() && (CI->getParent() == I->getParent()) + && isTypeLegal(Ty, SourceVT)) { bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy()); if (isFloat && !Subtarget->hasVFP2()) return false; unsigned CmpOpc; - switch (VT.SimpleTy) { + switch (SourceVT.SimpleTy) { default: return false; // TODO: Verify compares. case MVT::f32: @@ -1033,7 +1093,14 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { } // Get the compare predicate. - ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate()); + // Try to take advantage of fallthrough opportunities. + CmpInst::Predicate Predicate = CI->getPredicate(); + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + ARMCC::CondCodes ARMPred = getComparePred(Predicate); // We may not handle every CC for now. if (ARMPred == ARMCC::AL) return false; @@ -1061,19 +1128,55 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { FuncInfo.MBB->addSuccessor(TBB); return true; } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + (isTypeLegal(TI->getOperand(0)->getType(), SourceVT))) { + unsigned TstOpc = isThumb ? 
ARM::t2TSTri : ARM::TSTri; + unsigned OpReg = getRegForValue(TI->getOperand(0)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TstOpc)) + .addReg(OpReg).addImm(1)); + + unsigned CCMode = ARMCC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CCMode = ARMCC::EQ; + } + + unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc)) + .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); + + FastEmitBranch(FBB, DL); + FuncInfo.MBB->addSuccessor(TBB); + return true; + } } unsigned CmpReg = getRegForValue(BI->getCondition()); if (CmpReg == 0) return false; - // Re-set the flags just in case. - unsigned CmpOpc = isThumb ? ARM::t2CMPri : ARM::CMPri; - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc)) - .addReg(CmpReg).addImm(0)); + // We've been divorced from our compare! Our block was split, and + // now our compare lives in a predecessor block. We mustn't + // re-compare here, as the children of the compare aren't guaranteed + // live across the block boundary (we *could* check for this). + // Regardless, the compare has been done in the predecessor block, + // and it left a value for us in a virtual register. Ergo, we test + // the one-bit value left in the virtual register. + unsigned TstOpc = isThumb ? ARM::t2TSTri : ARM::TSTri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TstOpc)) + .addReg(CmpReg).addImm(1)); + + unsigned CCMode = ARMCC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CCMode = ARMCC::EQ; + } unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc)) - .addMBB(TBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); FastEmitBranch(FBB, DL); FuncInfo.MBB->addSuccessor(TBB); return true; @@ -1636,17 +1739,9 @@ bool ARMFastISel::SelectRet(const Instruction *I) { unsigned ARMFastISel::ARMSelectCallOp(const GlobalValue *GV) { - // Depend our opcode for thumb on whether or not we're targeting an - // externally callable function. For libcalls we'll just pass a NULL GV - // in here. - bool isExternal = false; - if (!GV || GV->hasExternalLinkage()) isExternal = true; - // Darwin needs the r9 versions of the opcodes. bool isDarwin = Subtarget->isTargetDarwin(); - if (isThumb && isExternal) { - return isDarwin ? ARM::tBLXi_r9 : ARM::tBLXi; - } else if (isThumb) { + if (isThumb) { return isDarwin ? ARM::tBLr9 : ARM::tBL; } else { return isDarwin ? ARM::BLr9 : ARM::BL; } @@ -1671,9 +1766,6 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { else if (!isTypeLegal(RetTy, RetVT)) return false; - // For now we're using BLX etc on the assumption that we have v5t ops. - if (!Subtarget->hasV5TOps()) return false; - // TODO: For now if we have long calls specified we don't handle the call. if (EnableARMLongCalls) return false; @@ -1711,7 +1803,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) return false; - // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops. + // Issue the call, BLr9 for darwin, BL otherwise. // TODO: Turn this into the table of arm call ops.
MachineInstrBuilder MIB; unsigned CallOpc = ARMSelectCallOp(NULL); @@ -1772,13 +1864,9 @@ bool ARMFastISel::SelectCall(const Instruction *I) { else if (!isTypeLegal(RetTy, RetVT)) return false; - // For now we're using BLX etc on the assumption that we have v5t ops. - // TODO: Maybe? - if (!Subtarget->hasV5TOps()) return false; - // TODO: For now if we have long calls specified we don't handle the call. if (EnableARMLongCalls) return false; - + // Set up the argument vectors. SmallVector<Value*, 8> Args; SmallVector<unsigned, 8> ArgRegs; @@ -1827,7 +1915,7 @@ bool ARMFastISel::SelectCall(const Instruction *I) { if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) return false; - // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops. + // Issue the call, BLr9 for darwin, BL otherwise. // TODO: Turn this into the table of arm call ops. MachineInstrBuilder MIB; unsigned CallOpc = ARMSelectCallOp(GV); @@ -1842,7 +1930,7 @@ bool ARMFastISel::SelectCall(const Instruction *I) { MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) .addGlobalAddress(GV, 0, 0)); - + // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) MIB.addReg(RegArgs[i]); diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 68c33f0..e2e95d4 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -106,14 +106,13 @@ static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, const ARMBaseInstrInfo &TII, - int NumBytes, - ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) { + int NumBytes, unsigned MIFlags = MachineInstr::NoFlags) { if (isARM) emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, - Pred, PredReg, TII); + ARMCC::AL, 0, TII, MIFlags); else emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, - Pred, PredReg, TII); + ARMCC::AL, 0, TII, MIFlags); } void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { @@ -141,11 +140,13 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { // Allocate the vararg register save area. This is not counted in NumBytes. if (VARegSaveSize) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize, + MachineInstr::FrameSetup); if (!AFI->hasStackFrame()) { if (NumBytes != 0) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, + MachineInstr::FrameSetup); return; } @@ -196,7 +197,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr) - .addFrameIndex(FramePtrSpillFI).addImm(0); + .addFrameIndex(FramePtrSpillFI).addImm(0) + .setMIFlag(MachineInstr::FrameSetup); AddDefaultCC(AddDefaultPred(MIB)); } @@ -226,7 +228,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { NumBytes = DPRCSOffset; if (NumBytes) { // Adjust SP after all the callee-save spills. - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, + MachineInstr::FrameSetup); if (HasFP && isARM) // Restore from fp only in ARM mode: e.g. 
sub sp, r7, #24 // Note it's not safe to do this in Thumb2 mode because it would have @@ -282,6 +285,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer // to reference locals. + // FIXME: Clarify FrameSetup flags here. if (RegInfo->hasBasePointer(MF)) { if (isARM) BuildMI(MBB, MBBI, dl, @@ -396,8 +400,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, // Jump to label or value in register. if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND) { unsigned TCOpcode = (RetOpcode == ARM::TCRETURNdi) - ? (STI.isThumb() ? ARM::TAILJMPdt : ARM::TAILJMPd) - : (STI.isThumb() ? ARM::TAILJMPdNDt : ARM::TAILJMPdND); + ? (STI.isThumb() ? ARM::tTAILJMPd : ARM::TAILJMPd) + : (STI.isThumb() ? ARM::tTAILJMPdND : ARM::TAILJMPdND); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); if (JumpTarget.isGlobal()) MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), @@ -408,10 +412,12 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, JumpTarget.getTargetFlags()); } } else if (RetOpcode == ARM::TCRETURNri) { - BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPr)). + BuildMI(MBB, MBBI, dl, + TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)). addReg(JumpTarget.getReg(), RegState::Kill); } else if (RetOpcode == ARM::TCRETURNriND) { - BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPrND)). + BuildMI(MBB, MBBI, dl, + TII.get(STI.isThumb() ? ARM::tTAILJMPrND : ARM::TAILJMPrND)). addReg(JumpTarget.getReg(), RegState::Kill); } @@ -439,8 +445,7 @@ ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, int ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, - int FI, - unsigned &FrameReg, + int FI, unsigned &FrameReg, int SPAdj) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const ARMBaseRegisterInfo *RegInfo = @@ -484,19 +489,23 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, return FPOffset; } else if (MFI->hasVarSizedObjects()) { assert(RegInfo->hasBasePointer(MF) && "missing base pointer!"); - // Try to use the frame pointer if we can, else use the base pointer - // since it's available. This is handy for the emergency spill slot, in - // particular. if (AFI->isThumb2Function()) { + // Try to use the frame pointer if we can, else use the base pointer + // since it's available. This is handy for the emergency spill slot, in + // particular. if (FPOffset >= -255 && FPOffset < 0) { FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; } - } else - FrameReg = RegInfo->getBaseRegister(); + } } else if (AFI->isThumb2Function()) { + // Use add <rd>, sp, #<imm8> + // ldr <rd>, [sp, #<imm8>] + // if at all possible to save space. + if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020) + return Offset; // In Thumb2 mode, the negative offset is very limited. Try to avoid - // out of range references. + // out of range references. 
ldr <rt>,[<rn>, #-<imm8>] if (FPOffset >= -255 && FPOffset < 0) { FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; @@ -524,7 +533,8 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc, unsigned StrOpc, bool NoGap, - bool(*Func)(unsigned, bool)) const { + bool(*Func)(unsigned, bool), + unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); @@ -567,14 +577,14 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, if (Regs.size() > 1 || StrOpc== 0) { MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP) - .addReg(ARM::SP)); + .addReg(ARM::SP).setMIFlags(MIFlags)); for (unsigned i = 0, e = Regs.size(); i < e; ++i) MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second)); } else if (Regs.size() == 1) { MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc), ARM::SP) .addReg(Regs[0].first, getKillRegState(Regs[0].second)) - .addReg(ARM::SP); + .addReg(ARM::SP).setMIFlags(MIFlags); // ARM mode needs an extra reg0 here due to addrmode2. Will go away once // that refactoring is complete (eventually). if (StrOpc == ARM::STR_PRE) { @@ -676,9 +686,12 @@ bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD; unsigned PushOneOpc = AFI->isThumbFunction() ? ARM::t2STR_PRE : ARM::STR_PRE; unsigned FltOpc = ARM::VSTMDDB_UPD; - emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register); - emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register); - emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register); + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, + MachineInstr::FrameSetup); + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, + MachineInstr::FrameSetup); + emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register, + MachineInstr::FrameSetup); return true; } diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index 1288b70..61bb8af 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -51,7 +51,8 @@ public: bool canSimplifyCallFramePseudos(const MachineFunction &MF) const; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const; - int ResolveFrameIndexReference(const MachineFunction &MF, int FI, + int ResolveFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg, int SPAdj) const; int getFrameIndexOffset(const MachineFunction &MF, int FI) const; @@ -62,7 +63,8 @@ public: void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc, unsigned StrOpc, bool NoGap, - bool(*Func)(unsigned, bool)) const; + bool(*Func)(unsigned, bool), + unsigned MIFlags = 0) const; void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc, unsigned LdrOpc, bool isVarArg, bool NoGap, diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp index e97ce50..517bba8 100644 --- a/lib/Target/ARM/ARMHazardRecognizer.cpp +++ b/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -49,6 +49,8 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { const TargetInstrDesc &LastTID = LastMI->getDesc(); // Skip over one non-VFP / NEON instruction. 
if (!LastTID.isBarrier() && + // On A9, AGU and NEON/FPU are muxed. + !(STI.isCortexA9() && (LastTID.mayLoad() || LastTID.mayStore())) && (LastTID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) { MachineBasicBlock::iterator I = LastMI; if (I != LastMI->getParent()->begin()) { diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index f0d5a7d..abe5a31 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -45,7 +45,7 @@ DisableShifterOp("disable-shifter-op", cl::Hidden, static cl::opt<bool> CheckVMLxHazard("check-vmlx-hazard", cl::Hidden, cl::desc("Check fp vmla / vmls hazard at isel time"), - cl::init(false)); + cl::init(true)); //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine @@ -91,9 +91,14 @@ public: bool isShifterOpProfitable(const SDValue &Shift, ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt); bool SelectShifterOperandReg(SDValue N, SDValue &A, - SDValue &B, SDValue &C); + SDValue &B, SDValue &C, + bool CheckProfitability = true); bool SelectShiftShifterOperandReg(SDValue N, SDValue &A, - SDValue &B, SDValue &C); + SDValue &B, SDValue &C) { + // Don't apply the profitability check + return SelectShifterOperandReg(N, A, B, C, false); + } + bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc); @@ -174,16 +179,6 @@ public: return ARM_AM::getT2SOImmVal(~Imm) != -1; } - inline bool Pred_so_imm(SDNode *inN) const { - ConstantSDNode *N = cast<ConstantSDNode>(inN); - return is_so_imm(N->getZExtValue()); - } - - inline bool Pred_t2_so_imm(SDNode *inN) const { - ConstantSDNode *N = cast<ConstantSDNode>(inN); - return is_t2_so_imm(N->getZExtValue()); - } - // Include the pieces autogenerated from the target description. #include "ARMGenDAGISel.inc" @@ -373,7 +368,8 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift, bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N, SDValue &BaseReg, SDValue &ShReg, - SDValue &Opc) { + SDValue &Opc, + bool CheckProfitability) { if (DisableShifterOp) return false; @@ -390,7 +386,7 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N, ShImmVal = RHS->getZExtValue() & 31; } else { ShReg = N.getOperand(1); - if (!isShifterOpProfitable(N, ShOpcVal, ShImmVal)) + if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, ShImmVal)) return false; } Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), @@ -398,30 +394,6 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N, return true; } -bool ARMDAGToDAGISel::SelectShiftShifterOperandReg(SDValue N, - SDValue &BaseReg, - SDValue &ShReg, - SDValue &Opc) { - ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N); - - // Don't match base register only case. That is matched to a separate - // lower complexity pattern with explicit register operand. - if (ShOpcVal == ARM_AM::no_shift) return false; - - BaseReg = N.getOperand(0); - unsigned ShImmVal = 0; - // Do not check isShifterOpProfitable. This must return true. 
- if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - ShReg = CurDAG->getRegister(0, MVT::i32); - ShImmVal = RHS->getZExtValue() & 31; - } else { - ShReg = N.getOperand(1); - } - Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), - MVT::i32); - return true; -} - bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm) { @@ -437,7 +409,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; } - + if (N.getOpcode() == ARMISD::Wrapper && !(Subtarget->useMovt() && N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { @@ -1138,7 +1110,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; } - + if (N.getOpcode() == ARMISD::Wrapper && !(Subtarget->useMovt() && N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { @@ -1183,7 +1155,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && !CurDAG->isBaseWithConstantOffset(N)) return false; - + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { int RHSC = (int)RHS->getSExtValue(); if (N.getOpcode() == ISD::SUB) @@ -1571,6 +1543,11 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.data(), Ops.size()); } + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1); + if (NumVecs == 1) return VLd; @@ -1600,6 +1577,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return NULL; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + SDValue Chain = N->getOperand(0); EVT VT = N->getOperand(Vec0Idx).getValueType(); bool is64BitVector = VT.is64BitVector(); @@ -1672,7 +1652,13 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); - return CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + SDNode *VSt = + CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + + // Transfer memoperands. + cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1); + + return VSt; } // Otherwise, quad registers are stored with two separate instructions, @@ -1693,6 +1679,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, MemAddr.getValueType(), MVT::Other, OpsA, 7); + cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1); Chain = SDValue(VStA, 1); // Store the odd D registers. 
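The setMemRefs additions scattered through SelectVLD and SelectVST above all repeat one small idiom. A minimal sketch of it, assuming the SelectionDAG API of this LLVM vintage, with MF, N, and VSt standing for the current MachineFunction, the memory-intrinsic node being selected, and the freshly built machine node in the surrounding code:

    // Allocate a one-element memoperand array from the MachineFunction,
    // seed it with the intrinsic's MachineMemOperand, and attach it to the
    // new machine node so alias analysis and scheduling still see the
    // underlying memory access after instruction selection.
    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
    MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
    cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);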
@@ -1709,8 +1696,10 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); - return CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, - Ops.data(), Ops.size()); + SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, + Ops.data(), Ops.size()); + cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1); + return VStB; } SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, @@ -1726,6 +1715,9 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return NULL; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + SDValue Chain = N->getOperand(0); unsigned Lane = cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); @@ -1812,6 +1804,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, QOpcodes[OpcodeIndex]); SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1); if (!IsLoad) return VLdLn; @@ -1838,6 +1831,9 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align)) return NULL; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + SDValue Chain = N->getOperand(0); EVT VT = N->getValueType(0); @@ -1882,12 +1878,13 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; std::vector<EVT> ResTys; - ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts)); + ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts)); if (isUpdating) ResTys.push_back(MVT::i32); ResTys.push_back(MVT::Other); SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1); SuperReg = SDValue(VLdDup, 0); // Extract the subregisters. @@ -2168,7 +2165,7 @@ SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) { // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) // Pattern complexity = 6 cost = 11 size = 0 // - // Also FCPYScc and FCPYDcc. + // Also VMOVScc and VMOVDcc. 
SDValue Tmp2 = CurDAG->getTargetConstant(CCVal, MVT::i32); SDValue Ops[] = { FalseVal, TrueVal, Tmp2, CCR, InFlag }; unsigned Opc = 0; @@ -2450,34 +2447,6 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::CMOV: return SelectCMOVOp(N); - case ARMISD::CNEG: { - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDValue N2 = N->getOperand(2); - SDValue N3 = N->getOperand(3); - SDValue InFlag = N->getOperand(4); - assert(N2.getOpcode() == ISD::Constant); - assert(N3.getOpcode() == ISD::Register); - - SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned) - cast<ConstantSDNode>(N2)->getZExtValue()), - MVT::i32); - SDValue Ops[] = { N0, N1, Tmp2, N3, InFlag }; - unsigned Opc = 0; - switch (VT.getSimpleVT().SimpleTy) { - default: assert(false && "Illegal conditional move type!"); - break; - case MVT::f32: - Opc = ARM::VNEGScc; - break; - case MVT::f64: - Opc = ARM::VNEGDcc; - break; - } - return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5); - } - case ARMISD::VZIP: { unsigned Opc = 0; EVT VT = N->getValueType(0); @@ -2870,6 +2839,35 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { break; } + case ARMISD::VTBL1: { + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + SmallVector<SDValue, 6> Ops; + + Ops.push_back(N->getOperand(0)); + Ops.push_back(N->getOperand(1)); + Ops.push_back(getAL(CurDAG)); // Predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register + return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops.data(), Ops.size()); + } + case ARMISD::VTBL2: { + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + // Form a REG_SEQUENCE to force register allocation. + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + SDValue RegSeq = SDValue(PairDRegs(MVT::v16i8, V0, V1), 0); + + SmallVector<SDValue, 6> Ops; + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(2)); + Ops.push_back(getAL(CurDAG)); // Predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register + return CurDAG->getMachineNode(ARM::VTBL2Pseudo, dl, VT, + Ops.data(), Ops.size()); + } + case ISD::CONCAT_VECTORS: return SelectConcatVector(N); } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index ab9f9e1..0a31b87 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -72,6 +72,11 @@ ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true)); +// The APCS parameter registers. +static const unsigned GPRArgRegs[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3 +}; + void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, EVT PromotedBitwiseVT) { if (VT != PromotedLdStVT) { @@ -393,6 +398,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS); } + // Use divmod iOS compiler-rt calls. 
+ if (Subtarget->getTargetTriple().getOS() == Triple::IOS) { + setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); + setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); + } + if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, ARM::tGPRRegisterClass); else @@ -461,6 +472,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::UDIV, MVT::v8i8, Custom); setOperationAction(ISD::VSETCC, MVT::v1i64, Expand); setOperationAction(ISD::VSETCC, MVT::v2i64, Expand); + // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with + // a destination type that is wider than the source. + setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); @@ -502,18 +517,15 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } // i64 operation support. + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); if (Subtarget->isThumb1Only()) { - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i32, Expand); - setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); - } else { - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i32, Expand); - if (!Subtarget->hasV6Ops()) - setOperationAction(ISD::MULHS, MVT::i32, Expand); } + if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()) + setOperationAction(ISD::MULHS, MVT::i32, Expand); + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); @@ -597,6 +609,18 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8, Expand); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); // Since the libcalls include locking, fold in the fences setShouldFoldAtomicFences(true); } @@ -716,7 +740,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // pressure of the register class's representative and all of it's super // classes' representatives transitively. We have not implemented this because // of the difficulty prior to coalescing of modeling operand register classes -// due to the common occurence of cross class copies and subregister insertions +// due to the common occurrence of cross class copies and subregister insertions // and extractions. 
std::pair<const TargetRegisterClass*, uint8_t> ARMTargetLowering::findRepresentativeClass(EVT VT) const{ @@ -778,7 +802,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; case ARMISD::CMOV: return "ARMISD::CMOV"; - case ARMISD::CNEG: return "ARMISD::CNEG"; case ARMISD::RBIT: return "ARMISD::RBIT"; @@ -853,6 +876,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VZIP: return "ARMISD::VZIP"; case ARMISD::VUZP: return "ARMISD::VUZP"; case ARMISD::VTRN: return "ARMISD::VTRN"; + case ARMISD::VTBL1: return "ARMISD::VTBL1"; + case ARMISD::VTBL2: return "ARMISD::VTBL2"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; @@ -861,6 +886,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; + case ARMISD::VBSL: return "ARMISD::VBSL"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; @@ -946,27 +972,6 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { return Sched::RegPressure; } -// FIXME: Move to RegInfo -unsigned -ARMTargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - switch (RC->getID()) { - default: - return 0; - case ARM::tGPRRegClassID: - return TFI->hasFP(MF) ? 4 : 5; - case ARM::GPRRegClassID: { - unsigned FP = TFI->hasFP(MF) ? 1 : 0; - return 10 - FP - (Subtarget->isR9Reserved() ? 1 : 0); - } - case ARM::SPRRegClassID: // Currently not used as 'rep' register class. - case ARM::DPRRegClassID: - return 32 - 10; - } -} - //===----------------------------------------------------------------------===// // Lowering Code //===----------------------------------------------------------------------===// @@ -1130,22 +1135,6 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, return Chain; } -/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified -/// by "Src" to address "Dst" of size "Size". Alignment information is -/// specified by the specific parameter attribute. The copy will be passed as -/// a byval function parameter. -/// Sometimes what we are copying is the end of a larger object, the part that -/// does not fit in registers. -static SDValue -CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, - ISD::ArgFlagsTy Flags, SelectionDAG &DAG, - DebugLoc dl) { - SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); - return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), - /*isVolatile=*/false, /*AlwaysInline=*/false, - MachinePointerInfo(0), MachinePointerInfo(0)); -} - /// LowerMemOpCallTo - Store the argument to the stack. 
SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, @@ -1156,9 +1145,6 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); - if (Flags.isByVal()) - return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); - return DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(LocMemOffset), false, false, 0); @@ -1224,6 +1210,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, *DAG.getContext()); + CCInfo.setCallOrPrologue(Call); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv, /* Return*/ false, isVarArg)); @@ -1253,6 +1240,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { @@ -1299,6 +1287,43 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, } } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else if (isByVal) { + assert(VA.isMemLoc()); + unsigned offset = 0; + + // True if this byval aggregate will be split between registers + // and memory. + if (CCInfo.isFirstByValRegValid()) { + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + unsigned int i, j; + for (i = 0, j = CCInfo.getFirstByValReg(); j < ARM::R4; i++, j++) { + SDValue Const = DAG.getConstant(4*i, MVT::i32); + SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); + SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, + MachinePointerInfo(), + false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(j, Load)); + } + offset = ARM::R4 - CCInfo.getFirstByValReg(); + CCInfo.clearFirstByValReg(); + } + + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset); + SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, + StkPtrOff); + SDValue SrcOffset = DAG.getIntPtrConstant(4*offset); + SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); + SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, + MVT::i32); + MemOpChains.push_back(DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, + Flags.getByValAlign(), + /*isVolatile=*/false, + /*AlwaysInline=*/false, + MachinePointerInfo(0), + MachinePointerInfo(0))); + } else if (!IsSibCall) { assert(VA.isMemLoc()); @@ -1332,7 +1357,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. - // Do not flag preceeding copytoreg stuff together with the following stuff. + // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, @@ -1492,6 +1517,35 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, dl, DAG, InVals); } +/// HandleByVal - Every parameter *after* a byval parameter is passed +/// on the stack. Remember the next parameter register to allocate, +/// and then confiscate the rest of the parameter registers to ensure +/// this.
+void +llvm::ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const { + unsigned reg = State->AllocateReg(GPRArgRegs, 4); + assert((State->getCallOrPrologue() == Prologue || + State->getCallOrPrologue() == Call) && + "unhandled ParmContext"); + if ((!State->isFirstByValRegValid()) && + (ARM::R0 <= reg) && (reg <= ARM::R3)) { + State->setFirstByValReg(reg); + // At a call site, a byval parameter that is split between + // registers and memory needs its size truncated here. In a + // function prologue, such byval parameters are reassembled in + // memory, and are not truncated. + if (State->getCallOrPrologue() == Call) { + unsigned excess = 4 * (ARM::R4 - reg); + assert(size >= excess && "expected larger existing stack allocation"); + size -= excess; + } + } + // Confiscate any remaining parameter registers to preclude their + // assignment to subsequent parameters. + while (State->AllocateReg(GPRArgRegs, 4)) + ; +} + /// MatchingStackOffset - Return true if the given stack call argument is /// already available in the same position (relatively) of the caller's /// incoming argument stack. @@ -1813,6 +1867,16 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N) const { return HasRet; } +bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + if (!EnableARMTailCalls) + return false; + + if (!CI->isTailCall()) + return false; + + return !Subtarget->isThumb1Only(); +} + // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise @@ -2096,7 +2160,7 @@ ARMTargetLowering::LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other, - Op.getOperand(0), Op.getOperand(1)); + Op.getOperand(0)); } SDValue @@ -2151,6 +2215,13 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, } return Result; } + case Intrinsic::arm_neon_vmulls: + case Intrinsic::arm_neon_vmullu: { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) + ? ARMISD::VMULLs : ARMISD::VMULLu; + return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } } } @@ -2257,6 +2328,88 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); } +void +ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, + unsigned &VARegSize, unsigned &VARegSaveSize) + const { + unsigned NumGPRs; + if (CCInfo.isFirstByValRegValid()) + NumGPRs = ARM::R4 - CCInfo.getFirstByValReg(); + else { + unsigned int firstUnalloced; + firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs, + sizeof(GPRArgRegs) / + sizeof(GPRArgRegs[0])); + NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0; + } + + unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); + VARegSize = NumGPRs * 4; + VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1); +} + +// The remaining GPRs hold either the beginning of variable-argument +// data, or the beginning of an aggregate passed by value (usually +// byval). Either way, we allocate stack slots adjacent to the data +// provided by our caller, and store the unallocated registers there.
+// If this is a variadic function, the va_list pointer will begin with +// these values; otherwise, this reassembles a (byval) structure that +// was split between registers and memory. +void +ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc dl, SDValue &Chain, + unsigned ArgOffset) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned firstRegToSaveIndex; + if (CCInfo.isFirstByValRegValid()) + firstRegToSaveIndex = CCInfo.getFirstByValReg() - ARM::R0; + else { + firstRegToSaveIndex = CCInfo.getFirstUnallocated + (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); + } + + unsigned VARegSize, VARegSaveSize; + computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); + if (VARegSaveSize) { + // If this function is vararg, store any remaining integer argument regs + // to their spots on the stack so that they may be loaded by dereferencing + // the result of va_next. + AFI->setVarArgsRegSaveSize(VARegSaveSize); + AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(VARegSaveSize, + ArgOffset + VARegSaveSize + - VARegSize, + false)); + SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), + getPointerTy()); + + SmallVector<SDValue, 4> MemOps; + for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) { + TargetRegisterClass *RC; + if (AFI->isThumb1OnlyFunction()) + RC = ARM::tGPRRegisterClass; + else + RC = ARM::GPRRegisterClass; + + unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()), + false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, + DAG.getConstant(4, getPointerTy())); + } + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOps[0], MemOps.size()); + } else + // This will point to the next argument passed via stack. + AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true)); +} + SDValue ARMTargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -2265,7 +2418,6 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2275,12 +2427,15 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, *DAG.getContext()); + CCInfo.setCallOrPrologue(Prologue); CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv, /* Return*/ false, isVarArg)); SmallVector<SDValue, 16> ArgValues; + int lastInsIndex = -1; + SDValue ArgValue; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -2288,7 +2443,6 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); - SDValue ArgValue; if (VA.needsCustom()) { // f64 and vector types are split up into multiple registers or // combinations of registers and stack slots.
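As a worked instance of the computeRegArea arithmetic above: if the first byval register is r2, two GPRs (r2 and r3) still hold caller-provided data, so VARegSize is 8 and, with 8-byte stack alignment, VARegSaveSize is also 8. A hypothetical standalone model of that calculation (regAreaSize is illustrative only and not part of this patch):

    #include <cassert>

    // Model of computeRegArea: firstReg is the index of the first APCS
    // register still holding argument data (0..3 for r0..r3, 4 if none);
    // returns the save-area size in bytes, rounded up to the stack
    // alignment (align must be a power of two, as stack alignments are).
    static unsigned regAreaSize(unsigned firstReg, unsigned align) {
      unsigned numGPRs = (firstReg <= 3) ? (4 - firstReg) : 0;
      unsigned varRegSize = numGPRs * 4;              // 4 bytes per GPR
      return (varRegSize + align - 1) & ~(align - 1); // align up
    }

    int main() {
      assert(regAreaSize(2, 8) == 8);  // byval split at r2: spill r2 and r3
      assert(regAreaSize(0, 8) == 16); // all of r0-r3 spilled for varargs
      assert(regAreaSize(4, 8) == 0);  // nothing left in registers
      return 0;
    }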
@@ -2364,67 +2518,45 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, assert(VA.isMemLoc()); assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); - unsigned ArgSize = VA.getLocVT().getSizeInBits()/8; - int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), true); + int index = ArgLocs[i].getValNo(); + + // Some Ins[] entries become multiple ArgLoc[] entries. + // Process them only once. + if (index != lastInsIndex) + { + ISD::ArgFlagsTy Flags = Ins[index].Flags; + // FIXME: For now, all byval parameter objects are marked mutable. + // This can be changed with more analysis. + // In case of tail call optimization mark all arguments mutable. + // Since they could be overwritten by lowering of arguments in case of + // a tail call. + if (Flags.isByVal()) { + unsigned VARegSize, VARegSaveSize; + computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0); + unsigned Bytes = Flags.getByValSize() - VARegSize; + if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. + int FI = MFI->CreateFixedObject(Bytes, + VA.getLocMemOffset(), false); + InVals.push_back(DAG.getFrameIndex(FI, getPointerTy())); + } else { + int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, + VA.getLocMemOffset(), true); - // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + // Create load nodes to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, 0)); + } + lastInsIndex = index; + } } } // varargs - if (isVarArg) { - static const unsigned GPRArgRegs[] = { - ARM::R0, ARM::R1, ARM::R2, ARM::R3 - }; - - unsigned NumGPRs = CCInfo.getFirstUnallocated - (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); - - unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); - unsigned VARegSize = (4 - NumGPRs) * 4; - unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1); - unsigned ArgOffset = CCInfo.getNextStackOffset(); - if (VARegSaveSize) { - // If this function is vararg, store any remaining integer argument regs - // to their spots on the stack so that they may be loaded by deferencing - // the result of va_next. - AFI->setVarArgsRegSaveSize(VARegSaveSize); - AFI->setVarArgsFrameIndex( - MFI->CreateFixedObject(VARegSaveSize, - ArgOffset + VARegSaveSize - VARegSize, - false)); - SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), - getPointerTy()); - - SmallVector<SDValue, 4> MemOps; - for (; NumGPRs < 4; ++NumGPRs) { - TargetRegisterClass *RC; - if (AFI->isThumb1OnlyFunction()) - RC = ARM::tGPRRegisterClass; - else - RC = ARM::GPRRegisterClass; - - unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()), - false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, - DAG.getConstant(4, getPointerTy())); - } - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOps[0], MemOps.size()); - } else - // This will point to the next argument passed via stack. 
- AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true)); - } + if (isVarArg) + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset()); return Chain; } @@ -2517,6 +2649,27 @@ ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); } +/// duplicateCmp - Glue values can have only one use, so this function +/// duplicates a comparison node. +SDValue +ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { + unsigned Opc = Cmp.getOpcode(); + DebugLoc DL = Cmp.getDebugLoc(); + if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) + return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); + + assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); + Cmp = Cmp.getOperand(0); + Opc = Cmp.getOpcode(); + if (Opc == ARMISD::CMPFP) + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); + else { + assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); + } + return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); +} + SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue SelectTrue = Op.getOperand(1); @@ -2552,7 +2705,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Cond.getValueType(); SDValue ARMcc = Cond.getOperand(2); SDValue CCR = Cond.getOperand(3); - SDValue Cmp = Cond.getOperand(4); + SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp); } } @@ -2681,8 +2834,8 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { // If one of the operand is zero, it's safe to ignore the NaN case since // we only care about equality comparisons. (SeenZero || (DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS)))) { - // If unsafe fp math optimization is enabled and there are no othter uses of - // the CMP operands, and the condition code is EQ oe NE, we can optimize it + // If unsafe fp math optimization is enabled and there are no other uses of + // the CMP operands, and the condition code is EQ or NE, we can optimize it // to an integer comparison. 
if (CC == ISD::SETOEQ) CC = ISD::SETEQ; @@ -2811,8 +2964,39 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); } +static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + + EVT OperandVT = Op.getOperand(0).getValueType(); + assert(OperandVT == MVT::v4i16 && "Invalid type for custom lowering!"); + if (VT != MVT::v4f32) + return DAG.UnrollVectorOp(Op.getNode()); + + unsigned CastOpc; + unsigned Opc; + switch (Op.getOpcode()) { + default: + assert(0 && "Invalid opcode!"); + case ISD::SINT_TO_FP: + CastOpc = ISD::SIGN_EXTEND; + Opc = ISD::SINT_TO_FP; + break; + case ISD::UINT_TO_FP: + CastOpc = ISD::ZERO_EXTEND; + Opc = ISD::UINT_TO_FP; + break; + } + + Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0)); + return DAG.getNode(Opc, dl, VT, Op); +} + static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); + if (VT.isVector()) + return LowerVectorINT_TO_FP(Op, DAG); + DebugLoc dl = Op.getDebugLoc(); unsigned Opc; @@ -2860,7 +3044,10 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), DAG.getConstant(32, MVT::i32)); - } + } else if (VT == MVT::f32) + Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), + DAG.getConstant(32, MVT::i32)); Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); @@ -2869,11 +3056,11 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); - + SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); - if (SrcVT == MVT::f32) { + if (VT == MVT::f32) { Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, DAG.getConstant(0, MVT::i32)); @@ -3508,6 +3695,13 @@ static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT, return true; } +static bool isVTBLMask(const SmallVectorImpl<int> &M, EVT VT) { + // We can handle <8 x i8> vector shuffles. If the index in the mask is out of + // range, then 0 is placed into the resulting vector. So pretty much any mask + // of 8 elements can work here. + return VT == MVT::v8i8 && M.size() == 8; +} + static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); @@ -3947,6 +4141,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16) || isVEXTMask(M, VT, ReverseVEXT, Imm) || + isVTBLMask(M, VT) || isVTRNMask(M, VT, WhichResult) || isVUZPMask(M, VT, WhichResult) || isVZIPMask(M, VT, WhichResult) || @@ -4024,6 +4219,29 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, } } +static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, + SmallVectorImpl<int> &ShuffleMask, + SelectionDAG &DAG) { + // Check to see if we can use the VTBL instruction. 
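+  // VTBL's architected behavior is what makes the "any mask of 8 elements"
+  // rule above safe: a byte index outside the table reads as zero instead
+  // of being undefined. One lane of VTBL1, modeled as a scalar sketch:
+  //
+  //   uint8_t vtbl1_lane(const uint8_t table[8], uint8_t index) {
+  //     return index < 8 ? table[index] : 0;  // out-of-range selects 0
+  //   }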
+ SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + DebugLoc DL = Op.getDebugLoc(); + + SmallVector<SDValue, 8> VTBLMask; + for (SmallVectorImpl<int>::iterator + I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) + VTBLMask.push_back(DAG.getConstant(*I, MVT::i32)); + + if (V2.getNode()->getOpcode() == ISD::UNDEF) + return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, + &VTBLMask[0], 8)); + + return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, + &VTBLMask[0], 8)); +} + static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); @@ -4141,6 +4359,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, VT, Val); } + if (VT == MVT::v8i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG); + if (NewOp.getNode()) + return NewOp; + } + return SDValue(); } @@ -4290,6 +4514,28 @@ static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) { MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts); } +static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::ADD || Opcode == ISD::SUB) { + SDNode *N0 = N->getOperand(0).getNode(); + SDNode *N1 = N->getOperand(1).getNode(); + return N0->hasOneUse() && N1->hasOneUse() && + isSignExtended(N0, DAG) && isSignExtended(N1, DAG); + } + return false; +} + +static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::ADD || Opcode == ISD::SUB) { + SDNode *N0 = N->getOperand(0).getNode(); + SDNode *N1 = N->getOperand(1).getNode(); + return N0->hasOneUse() && N1->hasOneUse() && + isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); + } + return false; +} + static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. @@ -4298,29 +4544,73 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { SDNode *N0 = Op.getOperand(0).getNode(); SDNode *N1 = Op.getOperand(1).getNode(); unsigned NewOpc = 0; - if (isSignExtended(N0, DAG) && isSignExtended(N1, DAG)) + bool isMLA = false; + bool isN0SExt = isSignExtended(N0, DAG); + bool isN1SExt = isSignExtended(N1, DAG); + if (isN0SExt && isN1SExt) NewOpc = ARMISD::VMULLs; - else if (isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG)) - NewOpc = ARMISD::VMULLu; - else if (VT == MVT::v2i64) - // Fall through to expand this. It is not legal. - return SDValue(); - else - // Other vector multiplications are legal. - return Op; + else { + bool isN0ZExt = isZeroExtended(N0, DAG); + bool isN1ZExt = isZeroExtended(N1, DAG); + if (isN0ZExt && isN1ZExt) + NewOpc = ARMISD::VMULLu; + else if (isN1SExt || isN1ZExt) { + // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these + // into (s/zext A * s/zext C) + (s/zext B * s/zext C) + if (isN1SExt && isAddSubSExt(N0, DAG)) { + NewOpc = ARMISD::VMULLs; + isMLA = true; + } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { + NewOpc = ARMISD::VMULLu; + isMLA = true; + } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { + std::swap(N0, N1); + NewOpc = ARMISD::VMULLu; + isMLA = true; + } + } + + if (!NewOpc) { + if (VT == MVT::v2i64) + // Fall through to expand this. It is not legal. + return SDValue(); + else + // Other vector multiplications are legal. 
+        return Op;
+    }
+  }
 
   // Legalize to a VMULL instruction.
   DebugLoc DL = Op.getDebugLoc();
-  SDValue Op0 = SkipExtension(N0, DAG);
+  SDValue Op0;
   SDValue Op1 = SkipExtension(N1, DAG);
-
-  assert(Op0.getValueType().is64BitVector() &&
-         Op1.getValueType().is64BitVector() &&
-         "unexpected types for extended operands to VMULL");
-  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  if (!isMLA) {
+    Op0 = SkipExtension(N0, DAG);
+    assert(Op0.getValueType().is64BitVector() &&
+           Op1.getValueType().is64BitVector() &&
+           "unexpected types for extended operands to VMULL");
+    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  }
+
+  // Optimizing (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during
+  // isel lowering to take advantage of no-stall back-to-back vmul + vmla.
+  //   vmull q0, d4, d6
+  //   vmlal q0, d5, d6
+  // is faster than
+  //   vaddl q0, d4, d5
+  //   vmovl q1, d6
+  //   vmul  q0, q0, q1
+  SDValue N00 = SkipExtension(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = SkipExtension(N0->getOperand(1).getNode(), DAG);
+  EVT Op1VT = Op1.getValueType();
+  return DAG.getNode(N0->getOpcode(), DL, VT,
+                     DAG.getNode(NewOpc, DL, VT,
+                              DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+                     DAG.getNode(NewOpc, DL, VT,
+                              DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
 }
 
-static SDValue
+static SDValue
 LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
   // Convert to float
   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
@@ -4331,7 +4621,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
   Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
   // Get reciprocal estimate.
   // float4 recip = vrecpeq_f32(yf);
-  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
   // Because char has a smaller range than uchar, we can actually get away
   // without any newton steps.  This requires that we use a weird bias
@@ -4349,7 +4639,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
   return X;
 }
 
-static SDValue
+static SDValue
 LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
   SDValue N2;
   // Convert to float.
@@ -4359,13 +4649,13 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
   N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
   N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
-  
+
   // Use reciprocal estimate and one refinement step.
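  // VRECPE only returns a coarse estimate, and vrecps(y, r) computes 2 - y*r,
  // so "recip *= vrecps(y, recip)" is one Newton-Raphson step for 1/y,
  // roughly doubling the number of correct bits. Scalar model of the
  // refinement (sketch only):
  //
  //   float refined = r * (2.0f - y * r);  // one vrecps + one vmul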
// float4 recip = vrecpeq_f32(yf); // recip *= vrecpsq_f32(yf, recip); - N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1); - N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), N1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); @@ -4395,15 +4685,15 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; - + if (VT == MVT::v8i8) { N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); - + N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(4)); N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, - DAG.getIntPtrConstant(4)); + DAG.getIntPtrConstant(4)); N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(0)); N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, @@ -4414,7 +4704,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); N0 = LowerCONCAT_VECTORS(N0, DAG); - + N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); return N0; } @@ -4430,32 +4720,32 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; - + if (VT == MVT::v8i8) { N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); - + N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(4)); N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, - DAG.getIntPtrConstant(4)); + DAG.getIntPtrConstant(4)); N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(0)); N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(0)); - + N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 - + N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); N0 = LowerCONCAT_VECTORS(N0, DAG); - - N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, + + N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32), N0); return N0; } - + // v4i16 sdiv ... Convert to float. 
  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
@@ -4468,13 +4758,13 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
   // float4 recip = vrecpeq_f32(yf);
   // recip *= vrecpsq_f32(yf, recip);
   // recip *= vrecpsq_f32(yf, recip);
-  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
-  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
                    N1, N2);
   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
-  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
                    N1, N2);
   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
@@ -4503,7 +4793,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::GlobalAddress: return Subtarget->isTargetDarwin() ?
       LowerGlobalAddressDarwin(Op, DAG) : LowerGlobalAddressELF(Op, DAG);
-  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG); 
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
   case ISD::SELECT:        return LowerSELECT(Op, DAG);
   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
   case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
@@ -4524,7 +4814,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::EH_SJLJ_DISPATCHSETUP: return LowerEH_SJLJ_DISPATCHSETUP(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                                Subtarget);
-  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG); 
+  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
   case ISD::SHL:
   case ISD::SRL:
   case ISD::SRA:          return LowerShift(Op.getNode(), DAG, Subtarget);
@@ -4754,6 +5044,109 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   return BB;
 }
 
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned Size,
+                                          bool signExtend,
+                                          ARMCC::CondCodes Cond) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *MF = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptr = MI->getOperand(1).getReg();
+  unsigned incr = MI->getOperand(2).getReg();
+  unsigned oldval = dest;
+  DebugLoc dl = MI->getDebugLoc();
+
+  bool isThumb2 = Subtarget->isThumb2();
+  unsigned ldrOpc, strOpc, extendOpc;
+  switch (Size) {
+  default: llvm_unreachable("unsupported size for AtomicBinaryMinMax!");
+  case 1:
+    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
+    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
+    extendOpc = isThumb2 ? ARM::t2SXTBr : ARM::SXTBr;
+    break;
+  case 2:
+    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
+    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
+    extendOpc = isThumb2 ? ARM::t2SXTHr : ARM::SXTHr;
+    break;
+  case 4:
+    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
+    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
+    extendOpc = 0;
+    break;
+  }
+
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
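+  // For orientation: the ldrex/strex loop assembled below has the semantics
+  // of this portable CAS loop (a sketch of the result for the signed-min
+  // case, not of the emitted code):
+  //
+  //   int32_t atomic_load_min(std::atomic<int32_t> &a, int32_t incr) {
+  //     int32_t old = a.load();
+  //     while (!a.compare_exchange_weak(old, old < incr ? old : incr)) {
+  //       // 'old' is refreshed on failure, like the strex/bne retry
+  //     }
+  //     return old;  // the result is the value before the update
+  //   }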
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+  unsigned scratch2 = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   ldrex dest, ptr
+  //   (sign extend dest, if required)
+  //   cmp dest, incr
+  //   cmov.cond scratch2, dest, incr
+  //   strex scratch, scratch2, ptr
+  //   cmp scratch, #0
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+
+  // Sign extend the value, if necessary.
+  if (signExtend && extendOpc) {
+    oldval = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval).addReg(dest));
+  }
+
+  // Build compare and cmov instructions.
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+                 .addReg(oldval).addReg(incr));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
+    .addReg(oldval).addReg(incr).addImm(Cond).addReg(ARM::CPSR);
+
+  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2)
+                 .addReg(ptr));
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(scratch).addImm(0));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
 static
 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
@@ -4763,6 +5156,72 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
   llvm_unreachable("Expecting a BB with two successors!");
 }
 
+// FIXME: This opcode table should obviously be expressed in the target
+// description. We probably just need a "machine opcode" value in the pseudo
+// instruction. But the ideal solution may be to simply remove the "S" version
+// of the opcode altogether.
+struct AddSubFlagsOpcodePair {
+  unsigned PseudoOpc;
+  unsigned MachineOpc;
+};
+
+static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
+  {ARM::ADCSri, ARM::ADCri},
+  {ARM::ADCSrr, ARM::ADCrr},
+  {ARM::ADCSrs, ARM::ADCrs},
+  {ARM::SBCSri, ARM::SBCri},
+  {ARM::SBCSrr, ARM::SBCrr},
+  {ARM::SBCSrs, ARM::SBCrs},
+  {ARM::RSBSri, ARM::RSBri},
+  {ARM::RSBSrr, ARM::RSBrr},
+  {ARM::RSBSrs, ARM::RSBrs},
+  {ARM::RSCSri, ARM::RSCri},
+  {ARM::RSCSrs, ARM::RSCrs},
+  {ARM::t2ADCSri, ARM::t2ADCri},
+  {ARM::t2ADCSrr, ARM::t2ADCrr},
+  {ARM::t2ADCSrs, ARM::t2ADCrs},
+  {ARM::t2SBCSri, ARM::t2SBCri},
+  {ARM::t2SBCSrr, ARM::t2SBCrr},
+  {ARM::t2SBCSrs, ARM::t2SBCrs},
+  {ARM::t2RSBSri, ARM::t2RSBri},
+  {ARM::t2RSBSrs, ARM::t2RSBrs},
+};
+
+// Convert an Add or Subtract with Carry and Flags to a generic opcode with
+// CPSR<def> operand. e.g. ADCS (...) -> ADC (... CPSR<def>).
+//
+// FIXME: Somewhere we should assert that CPSR<def> is in the correct
+// position to be recognized by the target description as the 'S' bit.
+bool ARMTargetLowering::RemapAddSubWithFlags(MachineInstr *MI, + MachineBasicBlock *BB) const { + unsigned OldOpc = MI->getOpcode(); + unsigned NewOpc = 0; + + // This is only called for instructions that need remapping, so iterating over + // the tiny opcode table is not costly. + static const int NPairs = + sizeof(AddSubFlagsOpcodeMap) / sizeof(AddSubFlagsOpcodePair); + for (AddSubFlagsOpcodePair *Pair = &AddSubFlagsOpcodeMap[0], + *End = &AddSubFlagsOpcodeMap[NPairs]; Pair != End; ++Pair) { + if (OldOpc == Pair->PseudoOpc) { + NewOpc = Pair->MachineOpc; + break; + } + } + if (!NewOpc) + return false; + + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); + for (unsigned i = 0; i < MI->getNumOperands(); ++i) + MIB.addOperand(MI->getOperand(i)); + AddDefaultPred(MIB); + MIB.addReg(ARM::CPSR, RegState::Define); // S bit + MI->eraseFromParent(); + return true; +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -4770,10 +5229,13 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); switch (MI->getOpcode()) { - default: + default: { + if (RemapAddSubWithFlags(MI, BB)) + return BB; + MI->dump(); llvm_unreachable("Unexpected instr type to insert"); - + } case ARM::ATOMIC_LOAD_ADD_I8: return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr); case ARM::ATOMIC_LOAD_ADD_I16: @@ -4816,6 +5278,34 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::ATOMIC_LOAD_SUB_I32: return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr); + case ARM::ATOMIC_LOAD_MIN_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::LT); + case ARM::ATOMIC_LOAD_MIN_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::LT); + case ARM::ATOMIC_LOAD_MIN_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::LT); + + case ARM::ATOMIC_LOAD_MAX_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::GT); + case ARM::ATOMIC_LOAD_MAX_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::GT); + case ARM::ATOMIC_LOAD_MAX_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::GT); + + case ARM::ATOMIC_LOAD_UMIN_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::LO); + case ARM::ATOMIC_LOAD_UMIN_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::LO); + case ARM::ATOMIC_LOAD_UMIN_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::LO); + + case ARM::ATOMIC_LOAD_UMAX_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::HI); + case ARM::ATOMIC_LOAD_UMAX_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::HI); + case ARM::ATOMIC_LOAD_UMAX_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::HI); + case ARM::ATOMIC_SWAP_I8: return EmitAtomicBinary(MI, BB, 1, 0); case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0); case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0); @@ -5034,6 +5524,42 @@ static SDValue PerformSUBCombine(SDNode *N, return SDValue(); } +/// PerformVMULCombine +/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the +/// special multiplier accumulator forwarding. 
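+/// Algebraically this is plain distributivity,
+///   (A + B) * C ==> (A * C) + (B * C)
+/// (for the FADD/FSUB forms the rewrite can change rounding, so it is a
+/// performance canonicalization rather than a bit-exact identity). On cores
+/// with VMLx forwarding the right-hand side maps onto the cheaper sequence: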
+///   vmul d3, d0, d2
+///   vmla d3, d1, d2
+/// is faster than
+///   vadd d3, d0, d1
+///   vmul d3, d3, d2
+static SDValue PerformVMULCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasVMLxForwarding())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  unsigned Opcode = N0.getOpcode();
+  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
+    Opcode = N1.getOpcode();
+    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+        Opcode != ISD::FADD && Opcode != ISD::FSUB)
+      return SDValue();
+    std::swap(N0, N1);
+  }
+
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  return DAG.getNode(Opcode, DL, VT,
+                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
+                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
+}
+
 static SDValue PerformMULCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
@@ -5046,6 +5572,8 @@ static SDValue PerformMULCombine(SDNode *N,
     return SDValue();
 
   EVT VT = N->getValueType(0);
+  if (VT.is64BitVector() || VT.is128BitVector())
+    return PerformVMULCombine(N, DCI, Subtarget);
   if (VT != MVT::i32)
     return SDValue();
 
@@ -5088,12 +5616,16 @@ static SDValue PerformMULCombine(SDNode *N,
 
 static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
+  // Attempt to use immediate-form VBIC
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
   DebugLoc dl = N->getDebugLoc();
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
 
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
@@ -5127,6 +5659,9 @@ static SDValue PerformORCombine(SDNode *N,
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
 
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
@@ -5147,6 +5682,37 @@ static SDValue PerformORCombine(SDNode *N,
     }
   }
 
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::AND)
+    return SDValue();
+  SDValue N1 = N->getOperand(1);
+
+  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
+      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+    APInt SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+
+    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
+    APInt SplatBits0;
+    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs) && !HasAnyUndefs) {
+      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+      APInt SplatBits1;
+      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+                                        HasAnyUndefs) && !HasAnyUndefs &&
+          SplatBits0 == ~SplatBits1) {
+        // Canonicalize the vector type to make instruction selection simpler.
+        EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+        SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
+                                     N0->getOperand(1), N0->getOperand(0),
+                                     N1->getOperand(0));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+      }
+    }
+  }
+
   // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
   // reasonable.
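// BFI Rd, Rn, #lsb, #width overwrites Rd[lsb+width-1:lsb] with Rn[width-1:0]
// and preserves every other bit of Rd. In the inverted-mask convention used
// by this combine (Mask marks the bits of the first operand that survive),
// that behavior can be modeled as follows (a sketch; it assumes ~Mask is a
// proper, non-empty bitfield so the trailing-zero count is well defined):

static inline unsigned BFIModel(unsigned A, unsigned Src, unsigned Mask) {
  unsigned Lsb = CountTrailingZeros_32(~Mask);  // start of the open field
  return (A & Mask) | ((Src << Lsb) & ~Mask);   // insert Src into the hole
}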
@@ -5154,19 +5720,16 @@ static SDValue PerformORCombine(SDNode *N,
   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
     return SDValue();
 
-  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
   DebugLoc DL = N->getDebugLoc();
   // 1) or (and A, mask), val => ARMbfi A, val, mask
   //      iff (val & mask) == val
   //
   // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
   // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
-  //          && CountPopulation_32(mask) == CountPopulation_32(~mask2)
+  //          && mask == ~mask2
   // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
-  //          && CountPopulation_32(mask) == CountPopulation_32(~mask2)
+  //          && ~mask == mask2
   //          (i.e., copy a bitfield value into another bitfield of the same
   //          width)
-  if (N0.getOpcode() != ISD::AND)
-    return SDValue();
 
   if (VT != MVT::i32)
     return SDValue();
@@ -5209,26 +5772,26 @@ static SDValue PerformORCombine(SDNode *N,
       return SDValue();
     unsigned Mask2 = N11C->getZExtValue();
 
+    // Mask and ~Mask2 (or the reverse) must be equal for the BFI pattern
+    // to match as-is.
     if (ARM::isBitFieldInvertedMask(Mask) &&
-        ARM::isBitFieldInvertedMask(~Mask2) &&
-        (CountPopulation_32(Mask) == CountPopulation_32(~Mask2))) {
+        (Mask == ~Mask2)) {
       // The pack halfword instruction works better for masks that fit it,
       // so use that when it's available.
       if (Subtarget->hasT2ExtractPack() &&
           (Mask == 0xffff || Mask == 0xffff0000))
         return SDValue();
       // 2a
-      unsigned lsb = CountTrailingZeros_32(Mask2);
+      unsigned amt = CountTrailingZeros_32(Mask2);
       Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
-                        DAG.getConstant(lsb, MVT::i32));
+                        DAG.getConstant(amt, MVT::i32));
       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                         DAG.getConstant(Mask, MVT::i32));
       // Do not add new nodes to DAG combiner worklist.
       DCI.CombineTo(N, Res, false);
       return SDValue();
     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
-               ARM::isBitFieldInvertedMask(Mask2) &&
-               (CountPopulation_32(~Mask) == CountPopulation_32(Mask2))) {
+               (~Mask == Mask2)) {
       // The pack halfword instruction works better for masks that fit it,
       // so use that when it's available.
       if (Subtarget->hasT2ExtractPack() &&
@@ -5239,7 +5802,7 @@ static SDValue PerformORCombine(SDNode *N,
       Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                         DAG.getConstant(lsb, MVT::i32));
       Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
-                        DAG.getConstant(Mask2, MVT::i32)); 
+                        DAG.getConstant(Mask2, MVT::i32));
       // Do not add new nodes to DAG combiner worklist.
       DCI.CombineTo(N, Res, false);
       return SDValue();
@@ -5294,6 +5857,37 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
   SDValue InDouble = N->getOperand(0);
   if (InDouble.getOpcode() == ARMISD::VMOVDRR)
     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
+
+  // vmovrrd(load f64) -> (load i32), (load i32)
+  SDNode *InNode = InDouble.getNode();
+  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
+      InNode->getValueType(0) == MVT::f64 &&
+      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
+      !cast<LoadSDNode>(InNode)->isVolatile()) {
+    // TODO: Should this be done for non-FrameIndex operands?
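+    // The win: the f64 value no longer round-trips through a VFP D register
+    // just to be torn into two core registers. In source-level terms the two
+    // loads built below compute (little-endian word order assumed; sketch):
+    //
+    //   const uint32_t *w = reinterpret_cast<const uint32_t *>(p);
+    //   uint32_t lo = w[0];  // i32 load at p+0
+    //   uint32_t hi = w[1];  // i32 load at p+4
+    //
+    // in place of an f64 load at p followed by VMOVRRD.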
+ LoadSDNode *LD = cast<LoadSDNode>(InNode); + + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = LD->getDebugLoc(); + SDValue BasePtr = LD->getBasePtr(); + SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, + LD->getPointerInfo(), LD->isVolatile(), + LD->isNonTemporal(), LD->getAlignment()); + + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, + DAG.getConstant(4, MVT::i32)); + SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, + LD->getPointerInfo(), LD->isVolatile(), + LD->isNonTemporal(), + std::min(4U, LD->getAlignment() / 2)); + + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); + SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); + DCI.RemoveFromWorklist(LD); + DAG.DeleteNode(LD); + return Result; + } + return SDValue(); } @@ -5323,8 +5917,28 @@ static SDValue PerformSTORECombine(SDNode *N, // Otherwise, the i64 value will be legalized to a pair of i32 values. StoreSDNode *St = cast<StoreSDNode>(N); SDValue StVal = St->getValue(); - if (!ISD::isNormalStore(St) || St->isVolatile() || - StVal.getValueType() != MVT::i64 || + if (!ISD::isNormalStore(St) || St->isVolatile()) + return SDValue(); + + if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && + StVal.getNode()->hasOneUse() && !St->isVolatile()) { + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = St->getDebugLoc(); + SDValue BasePtr = St->getBasePtr(); + SDValue NewST1 = DAG.getStore(St->getChain(), DL, + StVal.getNode()->getOperand(0), BasePtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, + DAG.getConstant(4, MVT::i32)); + return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1), + OffsetPtr, St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), + std::min(4U, St->getAlignment() / 2)); + } + + if (StVal.getValueType() != MVT::i64 || StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); @@ -5553,7 +6167,7 @@ static SDValue CombineBaseUpdate(SDNode *N, EVT VecTy; if (isLoad) VecTy = N->getValueType(0); - else + else VecTy = N->getOperand(AddrOpIdx+1).getValueType(); unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (isLaneOp) @@ -5603,7 +6217,7 @@ static SDValue CombineBaseUpdate(SDNode *N, DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); break; - } + } return SDValue(); } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index dc400c4..a2e6260 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -57,7 +57,6 @@ namespace llvm { CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR. FMSTAT, // ARM fmstat instruction. CMOV, // ARM conditional move instructions. - CNEG, // ARM conditional negate instructions. BCC_i64, @@ -89,7 +88,7 @@ namespace llvm { MEMBARRIER_MCR, // Memory barrier (MCR) PRELOAD, // Preload - + VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. VCGE, // Vector compare greater than or equal. 
@@ -154,6 +153,8 @@ namespace llvm { VZIP, // zip (interleave) VUZP, // unzip (deinterleave) VTRN, // transpose + VTBL1, // 1-register shuffle with mask + VTBL2, // 2-register shuffle with mask // Vector multiply long: VMULLs, // ...signed @@ -172,12 +173,15 @@ namespace llvm { // Bit-field insert BFI, - + // Vector OR with immediate VORRIMM, // Vector AND with NOT of immediate VBICIMM, + // Vector bitwise select + VBSL, + // Vector load N-element structure to all lanes: VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD3DUP, @@ -330,9 +334,6 @@ namespace llvm { Sched::Preference getSchedulingPreference(SDNode *N) const; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const; - bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; @@ -407,7 +408,7 @@ namespace llvm { SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; @@ -425,6 +426,13 @@ namespace llvm { DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; + void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc dl, SDValue &Chain, unsigned ArgOffset) + const; + + void computeRegArea(CCState &CCInfo, MachineFunction &MF, + unsigned &VARegSize, unsigned &VARegSaveSize) const; + virtual SDValue LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, @@ -435,6 +443,9 @@ namespace llvm { DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; + /// HandleByVal - Target-specific cleanup for ByVal support. + virtual void HandleByVal(CCState *, unsigned &) const; + /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. 
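// HandleByVal, declared above, exists for arguments like the following, where
// the AAPCS splits a by-value aggregate between r0-r3 and the stack
// (illustrative user code; the struct size is chosen so the split happens):

struct Six { int A[6]; };   // 24 bytes
int TakeSix(Six S) {        // roughly: A[0..3] in r0-r3, A[4..5] on the stack
  return S.A[5];            // the callee must still see one contiguous
}                           // object, hence the register-save area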
@@ -456,10 +467,13 @@ namespace llvm { virtual bool isUsedByReturnOnly(SDNode *N) const; + virtual bool mayBeEmittedAsTailCall(CallInst *CI) const; + SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMcc, SelectionDAG &DAG, DebugLoc dl) const; SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, DebugLoc dl) const; + SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const; SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const; @@ -470,16 +484,22 @@ namespace llvm { MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode) const; + MachineBasicBlock * EmitAtomicBinaryMinMax(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size, + bool signExtend, + ARMCC::CondCodes Cond) const; + bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const; }; - + enum NEONModImmType { VMOVModImm, VMVNModImm, OtherModImm }; - - + + namespace ARM { FastISel *createFastISel(FunctionLoweringInfo &funcInfo); } diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 359ac45..f5fb98e 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -206,19 +206,30 @@ def setend_op : Operand<i32> { let PrintMethod = "printSetendOperand"; } -def cps_opt : Operand<i32> { - let PrintMethod = "printCPSOptionOperand"; -} - def msr_mask : Operand<i32> { let PrintMethod = "printMSRMaskOperand"; let ParserMatchClass = MSRMaskOperand; } -// A8.6.117, A8.6.118. Different instructions are generated for #0 and #-0. -// The neg_zero operand translates -0 to -1, -1 to -2, ..., etc. -def neg_zero : Operand<i32> { - let PrintMethod = "printNegZeroOperand"; +// Shift Right Immediate - A shift right immediate is encoded differently from +// other shift immediates. The imm6 field is encoded like so: +// +// Offset Encoding +// 8 imm6<5:3> = '001', 8 - <imm> is encoded in imm6<2:0> +// 16 imm6<5:4> = '01', 16 - <imm> is encoded in imm6<3:0> +// 32 imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0> +// 64 64 - <imm> is encoded in imm6<5:0> +def shr_imm8 : Operand<i32> { + let EncoderMethod = "getShiftRight8Imm"; +} +def shr_imm16 : Operand<i32> { + let EncoderMethod = "getShiftRight16Imm"; +} +def shr_imm32 : Operand<i32> { + let EncoderMethod = "getShiftRight32Imm"; +} +def shr_imm64 : Operand<i32> { + let EncoderMethod = "getShiftRight64Imm"; } //===----------------------------------------------------------------------===// @@ -279,6 +290,7 @@ class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern> let OutOperandList = oops; let InOperandList = iops; let Pattern = pattern; + let isCodeGenOnly = 1; } // PseudoInst that's ARM-mode only. @@ -422,11 +434,11 @@ class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, opc, asm, "", pattern> { bits<4> Rd; bits<4> Rt; - bits<4> Rn; + bits<4> addr; let Inst{27-23} = 0b00011; let Inst{22-21} = opcod; let Inst{20} = 0; - let Inst{19-16} = Rn; + let Inst{19-16} = addr; let Inst{15-12} = Rd; let Inst{11-4} = 0b11111001; let Inst{3-0} = Rt; @@ -513,6 +525,24 @@ class AI2stridx<bit isByte, bit isPre, dag oops, dag iops, let Inst{19-16} = Rn; let Inst{11-0} = offset{11-0}; } +// FIXME: Merge with the above class when addrmode2 gets used for STR, STRB +// but for now use this class for STRT and STRBT. 
+class AI2stridxT<bit isByte, bit isPre, dag oops, dag iops,
+                 IndexMode im, Format f, InstrItinClass itin, string opc,
+                 string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {17-14}  Rn
+  // {13}     1 == Rm, 0 == imm12
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<18> addr;
+  let Inst{25} = addr{13};
+  let Inst{23} = addr{12};
+  let Inst{19-16} = addr{17-14};
+  let Inst{11-0} = addr{11-0};
+}
 
 // addrmode3 instructions
 class AI3ld<bits<4> op, bit op20, dag oops, dag iops, Format f,
@@ -547,6 +577,34 @@ class AI3ldstidx<bits<4> op, bit op20, bit isLd, bit isPre, dag oops, dag iops,
   let Inst{15-12} = Rt; // Rt
   let Inst{7-4} = op;
 }
+
+// FIXME: Merge with the above class when addrmode2 gets used for LDR, LDRB
+// but for now use this class for LDRSBT, LDRHT, LDRSHT.
+class AI3ldstidxT<bits<4> op, bit op20, bit isLd, bit isPre, dag oops, dag iops,
+                  IndexMode im, Format f, InstrItinClass itin, string opc,
+                  string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, im, f, itin,
+      opc, asm, cstr, pattern> {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  bits<14> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = isPre;       // P bit
+  let Inst{23}    = addr{8};     // U bit
+  let Inst{22}    = addr{13};    // 1 == imm8, 0 == Rm
+  let Inst{20}    = op20;        // L bit
+  let Inst{19-16} = addr{12-9};  // Rn
+  let Inst{15-12} = Rt;          // Rt
+  let Inst{11-8}  = addr{7-4};   // imm7_4/zero
+  let Inst{7-4}   = op;
+  let Inst{3-0}   = addr{3-0};   // imm3_0/Rm
+  let AsmMatchConverter = "CvtLdWriteBackRegAddrMode3";
+}
+
 class AI3stridx<bits<4> op, bit isByte, bit isPre, dag oops, dag iops,
                 IndexMode im, Format f, InstrItinClass itin, string opc,
                 string asm, string cstr, list<dag> pattern>
@@ -619,12 +677,25 @@ class AI3sthpo<dag oops, dag iops, Format f, InstrItinClass itin,
              string opc, string asm, string cstr, list<dag> pattern>
   : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
       opc, asm, cstr,pattern> {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  bits<14> addr;
+  bits<4> Rt;
+  let Inst{3-0}   = addr{3-0};   // imm3_0/Rm
   let Inst{4}     = 1;
   let Inst{5}     = 1; // H bit
   let Inst{6}     = 0; // S bit
   let Inst{7}     = 1;
+  let Inst{11-8}  = addr{7-4};   // imm7_4/zero
+  let Inst{15-12} = Rt;          // Rt
+  let Inst{19-16} = addr{12-9};  // Rn
   let Inst{20}    = 0; // L bit
   let Inst{21}    = 0; // W bit
+  let Inst{22}    = addr{13};    // 1 == imm8, 0 == Rm
+  let Inst{23}    = addr{8};     // U bit
   let Inst{24}    = 0; // P bit
   let Inst{27-25} = 0b000;
 }
@@ -1670,7 +1741,8 @@ class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
 }
 
 // NEON 3 vector register format.
-class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, + +class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { @@ -1680,6 +1752,13 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, let Inst{11-8} = op11_8; let Inst{6} = op6; let Inst{4} = op4; +} + +class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, + oops, iops, f, itin, opc, dt, asm, cstr, pattern> { // Instruction operands. bits<5> Vd; @@ -1694,6 +1773,47 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, let Inst{5} = Vm{4}; } +class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, + oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + bit lane; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = lane; +} + +class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, + oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + bits<2> lane; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{2-0} = Vm{2-0}; + let Inst{5} = lane{1}; + let Inst{3} = lane{0}; +} + // Same as N3V except it doesn't have a data type suffix. 
class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, @@ -1730,6 +1850,8 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, let Inst{11-8} = opcod2; let Inst{6-5} = opcod3; let Inst{4} = 1; + // A8.6.303, A8.6.328, A8.6.329 + let Inst{3-0} = 0b0000; let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 6e3fe2e..209c1a3 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -58,7 +58,7 @@ def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>; -def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 0, []>; def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>; @@ -93,8 +93,6 @@ def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone, def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, [SDNPInGlue]>; -def ARMcneg : SDNode<"ARMISD::CNEG", SDT_ARMCMov, - [SDNPInGlue]>; def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond, [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; @@ -205,13 +203,13 @@ def so_imm_not_XFORM : SDNodeXForm<imm, [{ }]>; /// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15]. -def imm1_15 : PatLeaf<(i32 imm), [{ - return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 16; +def imm1_15 : ImmLeaf<i32, [{ + return (int32_t)Imm >= 1 && (int32_t)Imm < 16; }]>; /// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31]. -def imm16_31 : PatLeaf<(i32 imm), [{ - return (int32_t)N->getZExtValue() >= 16 && (int32_t)N->getZExtValue() < 32; +def imm16_31 : ImmLeaf<i32, [{ + return (int32_t)Imm >= 16 && (int32_t)Imm < 32; }]>; def so_imm_neg : @@ -241,8 +239,8 @@ def lo16AllZero : PatLeaf<(i32 imm), [{ /// imm0_65535 predicate - True if the 32-bit immediate is in the range /// [0.65535]. -def imm0_65535 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 65536; +def imm0_65535 : ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 65536; }]>; class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; @@ -377,17 +375,23 @@ def neon_vcvt_imm32 : Operand<i32> { } // rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24. -def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{ - int32_t v = (int32_t)N->getZExtValue(); +def rot_imm : Operand<i32>, ImmLeaf<i32, [{ + int32_t v = (int32_t)Imm; return v == 8 || v == 16 || v == 24; }]> { let EncoderMethod = "getRotImmOpValue"; } +def ShifterAsmOperand : AsmOperandClass { + let Name = "Shifter"; + let SuperClasses = []; +} + // shift_imm: An integer that encodes a shift amount and the type of shift // (currently either asr or lsl) using the same encoding used for the // immediates in so_reg operands. def shift_imm : Operand<i32> { let PrintMethod = "printShiftImmOperand"; + let ParserMatchClass = ShifterAsmOperand; } // shifter_operand operands: so_reg and so_imm. 
@@ -396,19 +400,21 @@
 def so_reg : Operand<i32>,    // reg reg imm
              ComplexPattern<i32, 3, "SelectShifterOperandReg",
                             [shl,srl,sra,rotr]> {
   let EncoderMethod = "getSORegOpValue";
   let PrintMethod = "printSORegOperand";
-  let MIOperandInfo = (ops GPR, GPR, i32imm);
+  let MIOperandInfo = (ops GPR, GPR, shift_imm);
 }
 def shift_so_reg : Operand<i32>,    // reg reg imm
                    ComplexPattern<i32, 3, "SelectShiftShifterOperandReg",
                                   [shl,srl,sra,rotr]> {
   let EncoderMethod = "getSORegOpValue";
   let PrintMethod = "printSORegOperand";
-  let MIOperandInfo = (ops GPR, GPR, i32imm);
+  let MIOperandInfo = (ops GPR, GPR, shift_imm);
 }
 
 // so_imm - Match a 32-bit shifter_operand immediate operand, which is an
 // 8-bit immediate rotated by an arbitrary number of bits.
-def so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_so_imm(N); }]> {
+def so_imm : Operand<i32>, ImmLeaf<i32, [{
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  }]> {
   let EncoderMethod = "getSOImmOpValue";
   let PrintMethod = "printSOImmOperand";
 }
@@ -429,13 +435,13 @@ def arm_i32imm : PatLeaf<(imm), [{
 }]>;
 
 /// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
-def imm0_31 : Operand<i32>, PatLeaf<(imm), [{
-  return (int32_t)N->getZExtValue() < 32;
+def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
 }]>;
 
 /// imm0_31_m1 - Matches and prints like imm0_31, but encodes as 'value - 1'.
-def imm0_31_m1 : Operand<i32>, PatLeaf<(imm), [{
-  return (int32_t)N->getZExtValue() < 32;
+def imm0_31_m1 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
 }]> {
   let EncoderMethod = "getImmMinusOneOpValue";
 }
@@ -458,19 +464,30 @@ def bf_inv_mask_imm : Operand<i32>,
 }
 
 /// lsb_pos_imm - position of the lsb bit, used by BFI4p and t2BFI4p
-def lsb_pos_imm : Operand<i32>, PatLeaf<(imm), [{
-  return isInt<5>(N->getSExtValue());
+def lsb_pos_imm : Operand<i32>, ImmLeaf<i32, [{
+  return isInt<5>(Imm);
 }]>;
 
 /// width_imm - number of bits to be copied, used by BFI4p and t2BFI4p
-def width_imm : Operand<i32>, PatLeaf<(imm), [{
-  return N->getSExtValue() > 0 &&  N->getSExtValue() <= 32;
+def width_imm : Operand<i32>, ImmLeaf<i32, [{
+  return Imm > 0 && Imm <= 32;
 }] > {
   let EncoderMethod = "getMsbOpValue";
 }
 
 // Define ARM specific addressing modes.
+def MemMode2AsmOperand : AsmOperandClass {
+  let Name = "MemMode2";
+  let SuperClasses = [];
+  let ParserMethod = "tryParseMemMode2Operand";
+}
+
+def MemMode3AsmOperand : AsmOperandClass {
+  let Name = "MemMode3";
+  let SuperClasses = [];
+  let ParserMethod = "tryParseMemMode3Operand";
+}
 
 // addrmode_imm12 := reg +/- imm12
 //
@@ -501,6 +518,7 @@ def addrmode2 : Operand<i32>,
                 ComplexPattern<i32, 3, "SelectAddrMode2", []> {
   let EncoderMethod = "getAddrMode2OpValue";
   let PrintMethod = "printAddrMode2Operand";
+  let ParserMatchClass = MemMode2AsmOperand;
   let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
 }
 
@@ -519,6 +537,7 @@ def addrmode3 : Operand<i32>,
                 ComplexPattern<i32, 3, "SelectAddrMode3", []> {
   let EncoderMethod = "getAddrMode3OpValue";
   let PrintMethod = "printAddrMode3Operand";
+  let ParserMatchClass = MemMode3AsmOperand;
   let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
 }
 
@@ -586,6 +605,21 @@ def addrmodepc : Operand<i32>,
   let MIOperandInfo = (ops GPR, i32imm);
 }
 
+def MemMode7AsmOperand : AsmOperandClass {
+  let Name = "MemMode7";
+  let SuperClasses = [];
+}
+
+// addrmode7 := reg
+// Used by load/store exclusive instructions. Enables correct assembly
+// parsing and printing. Not used for any codegen matching.
+// +def addrmode7 : Operand<i32> { + let PrintMethod = "printAddrMode7Operand"; + let MIOperandInfo = (ops GPR); + let ParserMatchClass = MemMode7AsmOperand; +} + def nohash_imm : Operand<i32> { let PrintMethod = "printNoHashImmediate"; } @@ -902,52 +936,23 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{19-16} = Rn; } } +} + // Carry setting variants -let isCodeGenOnly = 1, Defs = [CPSR] in { -multiclass AI1_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, - bit Commutable = 0> { - def Sri : AXI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), - DPFrm, IIC_iALUi, !strconcat(opc, "\t$Rd, $Rn, $imm"), - [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>, - Requires<[IsARM]> { - bits<4> Rd; - bits<4> Rn; - bits<12> imm; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; - let Inst{11-0} = imm; - let Inst{20} = 1; - let Inst{25} = 1; - } - def Srr : AXI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), - DPFrm, IIC_iALUr, !strconcat(opc, "\t$Rd, $Rn, $Rm"), - [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>, - Requires<[IsARM]> { - bits<4> Rd; - bits<4> Rn; - bits<4> Rm; - let Inst{11-4} = 0b00000000; +// NOTE: CPSR def omitted because it will be handled by the custom inserter. +let usesCustomInserter = 1 in { +multiclass AI1_adde_sube_s_irs<PatFrag opnode, bit Commutable = 0> { + def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + Size4Bytes, IIC_iALUi, + [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>; + def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + Size4Bytes, IIC_iALUr, + [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> { let isCommutable = Commutable; - let Inst{3-0} = Rm; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; - let Inst{20} = 1; - let Inst{25} = 0; - } - def Srs : AXI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), - DPSoRegFrm, IIC_iALUsr, !strconcat(opc, "\t$Rd, $Rn, $shift"), - [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>, - Requires<[IsARM]> { - bits<4> Rd; - bits<4> Rn; - bits<12> shift; - let Inst{11-0} = shift; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; - let Inst{20} = 1; - let Inst{25} = 0; } -} + def rs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + Size4Bytes, IIC_iALUsr, + [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>; } } @@ -972,6 +977,7 @@ multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii, [(set GPR:$Rt, (opnode ldst_so_reg:$shift))]> { bits<4> Rt; bits<17> shift; + let shift{4} = 0; // Inst{4} = 0 let Inst{23} = shift{12}; // U (add = ('U' == 1)) let Inst{19-16} = shift{16-13}; // Rn let Inst{15-12} = Rt; @@ -1001,6 +1007,7 @@ multiclass AI_str1<bit isByte, string opc, InstrItinClass iii, [(opnode GPR:$Rt, ldst_so_reg:$shift)]> { bits<4> Rt; bits<17> shift; + let shift{4} = 0; // Inst{4} = 0 let Inst{23} = shift{12}; // U (add = ('U' == 1)) let Inst{19-16} = shift{16-13}; // Rn let Inst{15-12} = Rt; @@ -1249,7 +1256,7 @@ let neverHasSideEffects = 1, isReMaterializable = 1 in // The 'adr' mnemonic encodes differently if the label is before or after // the instruction. The {24-21} opcode bits are set by the fixup, as we don't // know until then which form of the instruction will be used. -def ADR : AI1<0, (outs GPR:$Rd), (ins adrlabel:$label), +def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label), MiscFrm, IIC_iALUi, "adr", "\t$Rd, #$label", []> { bits<4> Rd; bits<12> label; @@ -1311,6 +1318,9 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { // before calls from potentially appearing dead. 
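// The S-variants exist to chain multi-word arithmetic: an i64 add on 32-bit
// registers is an ADDS (set carry) followed by an ADC (consume carry), and
// ADCS additionally sets the flags again so longer chains can continue. A
// C++ model of the carry (sketch; the carry out of the low half is exactly
// the unsigned-overflow test):

uint64_t Add64Model(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi) {
  uint32_t Lo = ALo + BLo;                       // ADDS: sets C
  uint32_t Hi = AHi + BHi + (Lo < ALo ? 1 : 0);  // ADC:  consumes C
  return ((uint64_t)Hi << 32) | Lo;
}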
let isCall = 1, // On non-Darwin platforms R9 is callee-saved. + // FIXME: Do we really need a non-predicated version? If so, it should + // at least be a pseudo instruction expanding to the predicated version + // at MC lowering time. Defs = [R0, R1, R2, R3, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D16, D17, D18, D19, D20, D21, D22, D23, @@ -1340,7 +1350,16 @@ let isCall = 1, Requires<[IsARM, HasV5T, IsNotDarwin]> { bits<4> func; let Inst{31-4} = 0b1110000100101111111111110011; - let Inst{3-0} = func; + let Inst{3-0} = func; + } + + def BLX_pred : AI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, + IIC_Br, "blx", "\t$func", + [(ARMcall_pred GPR:$func)]>, + Requires<[IsARM, HasV5T, IsNotDarwin]> { + bits<4> func; + let Inst{27-4} = 0b000100101111111111110011; + let Inst{3-0} = func; } // ARMv4T @@ -1364,30 +1383,25 @@ let isCall = 1, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR], Uses = [R7, SP] in { - def BLr9 : ABXI<0b1011, (outs), (ins bltarget:$func, variable_ops), - IIC_Br, "bl\t$func", - [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM, IsDarwin]> { - let Inst{31-28} = 0b1110; - bits<24> func; - let Inst{23-0} = func; - } + def BLr9 : ARMPseudoInst<(outs), (ins bltarget:$func, variable_ops), + Size4Bytes, IIC_Br, + [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM, IsDarwin]>; - def BLr9_pred : ABI<0b1011, (outs), (ins bltarget:$func, variable_ops), - IIC_Br, "bl", "\t$func", + def BLr9_pred : ARMPseudoInst<(outs), + (ins bltarget:$func, pred:$p, variable_ops), + Size4Bytes, IIC_Br, [(ARMcall_pred tglobaladdr:$func)]>, - Requires<[IsARM, IsDarwin]> { - bits<24> func; - let Inst{23-0} = func; - } + Requires<[IsARM, IsDarwin]>; // ARMv5T and above - def BLXr9 : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, - IIC_Br, "blx\t$func", - [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]> { - bits<4> func; - let Inst{31-4} = 0b1110000100101111111111110011; - let Inst{3-0} = func; - } + def BLXr9 : ARMPseudoInst<(outs), (ins GPR:$func, variable_ops), + Size4Bytes, IIC_Br, + [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]>; + + def BLXr9_pred: ARMPseudoInst<(outs), (ins GPR:$func, pred:$p, variable_ops), + Size4Bytes, IIC_Br, + [(ARMcall_pred GPR:$func)]>, + Requires<[IsARM, HasV5T, IsDarwin]>; // ARMv4T // Note: Restrict $func to the tGPR regclass to prevent it being in LR. @@ -1403,11 +1417,7 @@ let isCall = 1, // Tail calls. -// FIXME: These should probably be xformed into the non-TC versions of the -// instructions as part of MC lowering. -// FIXME: These seem to be used for both Thumb and ARM instruction selection. -// Thumb should have its own version since the instruction is actually -// different, even though the mnemonic is the same. +// FIXME: The Thumb versions of these should live in ARMInstrThumb.td let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { // Darwin versions. 
let Defs = [R0, R1, R2, R3, R9, R12, @@ -1421,21 +1431,21 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops), IIC_Br, []>, Requires<[IsDarwin]>; - def TAILJMPd : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), - IIC_Br, "b\t$dst @ TAILCALL", + def TAILJMPd : ARMPseudoInst<(outs), (ins brtarget:$dst, variable_ops), + Size4Bytes, IIC_Br, []>, Requires<[IsARM, IsDarwin]>; - def TAILJMPdt: ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), - IIC_Br, "b.w\t$dst @ TAILCALL", + def tTAILJMPd: tPseudoInst<(outs), (ins brtarget:$dst, variable_ops), + Size4Bytes, IIC_Br, []>, Requires<[IsThumb, IsDarwin]>; - def TAILJMPr : AXI<(outs), (ins tcGPR:$dst, variable_ops), - BrMiscFrm, IIC_Br, "bx\t$dst @ TAILCALL", - []>, Requires<[IsDarwin]> { - bits<4> dst; - let Inst{31-4} = 0b1110000100101111111111110001; - let Inst{3-0} = dst; - } + def TAILJMPr : ARMPseudoInst<(outs), (ins tcGPR:$dst, variable_ops), + Size4Bytes, IIC_Br, + []>, Requires<[IsARM, IsDarwin]>; + + def tTAILJMPr : tPseudoInst<(outs), (ins tcGPR:$dst, variable_ops), + Size4Bytes, IIC_Br, + []>, Requires<[IsThumb, IsDarwin]>; } // Non-Darwin versions (the difference is R9). @@ -1450,34 +1460,31 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { def TCRETURNriND : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops), IIC_Br, []>, Requires<[IsNotDarwin]>; - def TAILJMPdND : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), - IIC_Br, "b\t$dst @ TAILCALL", + def TAILJMPdND : ARMPseudoInst<(outs), (ins brtarget:$dst, variable_ops), + Size4Bytes, IIC_Br, []>, Requires<[IsARM, IsNotDarwin]>; - def TAILJMPdNDt : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), - IIC_Br, "b.w\t$dst @ TAILCALL", + def tTAILJMPdND : tPseudoInst<(outs), (ins brtarget:$dst, variable_ops), + Size4Bytes, IIC_Br, []>, Requires<[IsThumb, IsNotDarwin]>; - def TAILJMPrND : AXI<(outs), (ins tcGPR:$dst, variable_ops), - BrMiscFrm, IIC_Br, "bx\t$dst @ TAILCALL", - []>, Requires<[IsNotDarwin]> { - bits<4> dst; - let Inst{31-4} = 0b1110000100101111111111110001; - let Inst{3-0} = dst; - } + def TAILJMPrND : ARMPseudoInst<(outs), (ins tcGPR:$dst, variable_ops), + Size4Bytes, IIC_Br, + []>, Requires<[IsARM, IsNotDarwin]>; + def tTAILJMPrND : tPseudoInst<(outs), (ins tcGPR:$dst, variable_ops), + Size4Bytes, IIC_Br, + []>, Requires<[IsThumb, IsNotDarwin]>; } } let isBranch = 1, isTerminator = 1 in { - // B is "predicable" since it can be xformed into a Bcc. + // B is "predicable" since it's just a Bcc with an 'always' condition. let isBarrier = 1 in { let isPredicable = 1 in - def B : ABXI<0b1010, (outs), (ins brtarget:$target), IIC_Br, - "b\t$target", [(br bb:$target)]> { - bits<24> target; - let Inst{31-28} = 0b1110; - let Inst{23-0} = target; - } + // FIXME: We shouldn't need this pseudo at all. Just using Bcc directly + // should be sufficient. 
+ def B : ARMPseudoInst<(outs), (ins brtarget:$target), Size4Bytes, IIC_Br, + [(br bb:$target)]>; let isNotDuplicable = 1, isIndirectBranch = 1 in { def BR_JTr : ARMPseudoInst<(outs), @@ -1509,6 +1516,16 @@ let isBranch = 1, isTerminator = 1 in { } } +// BLX (immediate) -- for disassembly only +def BLXi : AXI<(outs), (ins br_target:$target), BrMiscFrm, NoItinerary, + "blx\t$target", [/* pattern left blank */]>, + Requires<[IsARM, HasV5T]> { + let Inst{31-25} = 0b1111101; + bits<25> target; + let Inst{23-0} = target{24-1}; + let Inst{24} = target{0}; +} + // Branch and Exchange Jazelle -- for disassembly only def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", [/* For disassembly only; pattern left blank */]> { @@ -1533,6 +1550,7 @@ def SVC : ABI<0b1111, (outs), (ins i32imm:$svc), IIC_Br, "svc", "\t$svc", let Inst{23-0} = svc; } } +def : MnemonicAlias<"swi", "svc">; // Store Return State is a system instruction -- for disassembly only let isCodeGenOnly = 1 in { // FIXME: This should not use submode! @@ -1541,6 +1559,8 @@ def SRSW : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode), [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{22-20} = 0b110; // W = 1 + let Inst{19-8} = 0xd05; + let Inst{7-5} = 0b000; } def SRS : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode), @@ -1548,6 +1568,8 @@ def SRS : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode), [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{22-20} = 0b100; // W = 0 + let Inst{19-8} = 0xd05; + let Inst{7-5} = 0b000; } // Return From Exception is a system instruction -- for disassembly only @@ -1556,6 +1578,7 @@ def RFEW : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base), [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{22-20} = 0b011; // W = 1 + let Inst{15-0} = 0x0a00; } def RFE : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base), @@ -1563,6 +1586,7 @@ def RFE : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base), [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{22-20} = 0b001; // W = 0 + let Inst{15-0} = 0x0a00; } } // isCodeGenOnly = 1 @@ -1610,15 +1634,11 @@ def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm, IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr", [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>; -let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, - isCodeGenOnly = 1 in { // $dst2 doesn't exist in asmstring? -// FIXME: $dst2 isn't in the asm string as it's implied by $Rd (dst2 = Rd+1) -// how to represent that such that tblgen is happy and we don't -// mark this codegen only? 
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { // Load doubleword def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2), (ins addrmode3:$addr), LdMiscFrm, - IIC_iLoad_d_r, "ldrd", "\t$Rd, $addr", + IIC_iLoad_d_r, "ldrd", "\t$Rd, $dst2, $addr", []>, Requires<[IsARM, HasV5TE]>; } @@ -1636,6 +1656,7 @@ multiclass AI2_ldridx<bit isByte, string opc, InstrItinClass itin> { let Inst{23} = addr{12}; let Inst{19-16} = addr{17-14}; let Inst{11-0} = addr{11-0}; + let AsmMatchConverter = "CvtLdWriteBackRegAddrMode2"; } def _POST : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb), (ins GPR:$Rn, am2offset:$offset), @@ -1688,40 +1709,80 @@ let mayLoad = 1, neverHasSideEffects = 1 in { defm LDRH : AI3_ldridx<0b1011, 1, "ldrh", IIC_iLoad_bh_ru>; defm LDRSH : AI3_ldridx<0b1111, 1, "ldrsh", IIC_iLoad_bh_ru>; defm LDRSB : AI3_ldridx<0b1101, 1, "ldrsb", IIC_iLoad_bh_ru>; -let hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in -defm LDRD : AI3_ldridx<0b1101, 0, "ldrd", IIC_iLoad_d_ru>; +let hasExtraDefRegAllocReq = 1 in { +def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb), + (ins addrmode3:$addr), IndexModePre, + LdMiscFrm, IIC_iLoad_d_ru, + "ldrd", "\t$Rt, $Rt2, $addr!", + "$addr.base = $Rn_wb", []> { + bits<14> addr; + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr{12-9}; // Rn + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{3-0} = addr{3-0}; // imm3_0/Rm +} +def LDRD_POST: AI3ldstidx<0b1101, 0, 1, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb), + (ins GPR:$Rn, am3offset:$offset), IndexModePost, + LdMiscFrm, IIC_iLoad_d_ru, + "ldrd", "\t$Rt, $Rt2, [$Rn], $offset", + "$Rn = $Rn_wb", []> { + bits<10> offset; + bits<4> Rn; + let Inst{23} = offset{8}; // U bit + let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm + let Inst{19-16} = Rn; + let Inst{11-8} = offset{7-4}; // imm7_4/zero + let Inst{3-0} = offset{3-0}; // imm3_0/Rm +} +} // hasExtraDefRegAllocReq = 1 } // mayLoad = 1, neverHasSideEffects = 1 // LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT are for disassembly only. 
let mayLoad = 1, neverHasSideEffects = 1 in { -def LDRT : AI2ldstidx<1, 0, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am2offset:$offset), IndexModeNone, - LdFrm, IIC_iLoad_ru, - "ldrt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { +def LDRT : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$base_wb), + (ins addrmode2:$addr), IndexModePost, LdFrm, IIC_iLoad_ru, + "ldrt", "\t$Rt, $addr", "$addr.base = $base_wb", []> { + // {17-14} Rn + // {13} 1 == Rm, 0 == imm12 + // {12} isAdd + // {11-0} imm12/Rm + bits<18> addr; + let Inst{25} = addr{13}; + let Inst{23} = addr{12}; let Inst{21} = 1; // overwrite -} -def LDRBT : AI2ldstidx<1, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am2offset:$offset), IndexModeNone, - LdFrm, IIC_iLoad_bh_ru, - "ldrbt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { + let Inst{19-16} = addr{17-14}; + let Inst{11-0} = addr{11-0}; + let AsmMatchConverter = "CvtLdWriteBackRegAddrMode2"; +} +def LDRBT : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$base_wb), + (ins addrmode2:$addr), IndexModePost, LdFrm, IIC_iLoad_bh_ru, + "ldrbt", "\t$Rt, $addr", "$addr.base = $base_wb", []> { + // {17-14} Rn + // {13} 1 == Rm, 0 == imm12 + // {12} isAdd + // {11-0} imm12/Rm + bits<18> addr; + let Inst{25} = addr{13}; + let Inst{23} = addr{12}; let Inst{21} = 1; // overwrite + let Inst{19-16} = addr{17-14}; + let Inst{11-0} = addr{11-0}; + let AsmMatchConverter = "CvtLdWriteBackRegAddrMode2"; } -def LDRSBT : AI3ldstidx<0b1101, 1, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am3offset:$offset), IndexModePost, - LdMiscFrm, IIC_iLoad_bh_ru, - "ldrsbt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { +def LDRSBT : AI3ldstidxT<0b1101, 1, 1, 0, (outs GPR:$Rt, GPR:$base_wb), + (ins addrmode3:$addr), IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, + "ldrsbt", "\t$Rt, $addr", "$addr.base = $base_wb", []> { let Inst{21} = 1; // overwrite } -def LDRHT : AI3ldstidx<0b1011, 1, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am3offset:$offset), IndexModePost, - LdMiscFrm, IIC_iLoad_bh_ru, - "ldrht", "\t$dst, [$base], $offset", "$base = $base_wb", []> { +def LDRHT : AI3ldstidxT<0b1011, 1, 1, 0, (outs GPR:$Rt, GPR:$base_wb), + (ins addrmode3:$addr), IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, + "ldrht", "\t$Rt, $addr", "$addr.base = $base_wb", []> { let Inst{21} = 1; // overwrite } -def LDRSHT : AI3ldstidx<0b1111, 1, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am3offset:$offset), IndexModePost, - LdMiscFrm, IIC_iLoad_bh_ru, - "ldrsht", "\t$dst, [$base], $offset", "$base = $base_wb", []> { +def LDRSHT : AI3ldstidxT<0b1111, 1, 1, 0, (outs GPR:$Rt, GPR:$base_wb), + (ins addrmode3:$addr), IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, + "ldrsht", "\t$Rt, $addr", "$addr.base = $base_wb", []> { let Inst{21} = 1; // overwrite } } @@ -1734,55 +1795,61 @@ def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm, [(truncstorei16 GPR:$Rt, addrmode3:$addr)]>; // Store doubleword -let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1, - isCodeGenOnly = 1 in // $src2 doesn't exist in asm string -def STRD : AI3str<0b1111, (outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr), +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in +def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$src2, addrmode3:$addr), StMiscFrm, IIC_iStore_d_r, - "strd", "\t$src1, $addr", []>, Requires<[IsARM, HasV5TE]>; + "strd", "\t$Rt, $src2, $addr", []>, Requires<[IsARM, HasV5TE]>; // Indexed stores def STR_PRE : AI2stridx<0, 1, (outs GPR:$Rn_wb), 
(ins GPR:$Rt, GPR:$Rn, am2offset:$offset), IndexModePre, StFrm, IIC_iStore_ru, - "str", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb", + "str", "\t$Rt, [$Rn, $offset]!", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (pre_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; def STR_POST : AI2stridx<0, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), IndexModePost, StFrm, IIC_iStore_ru, - "str", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", + "str", "\t$Rt, [$Rn], $offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (post_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; def STRB_PRE : AI2stridx<1, 1, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), IndexModePre, StFrm, IIC_iStore_bh_ru, - "strb", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb", + "strb", "\t$Rt, [$Rn, $offset]!", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; def STRB_POST: AI2stridx<1, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), IndexModePost, StFrm, IIC_iStore_bh_ru, - "strb", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", + "strb", "\t$Rt, [$Rn], $offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (post_truncsti8 GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; def STRH_PRE : AI3stridx<0b1011, 0, 1, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am3offset:$offset), IndexModePre, StMiscFrm, IIC_iStore_ru, - "strh", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb", + "strh", "\t$Rt, [$Rn, $offset]!", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>; def STRH_POST: AI3stridx<0b1011, 0, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am3offset:$offset), IndexModePost, StMiscFrm, IIC_iStore_bh_ru, - "strh", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", + "strh", "\t$Rt, [$Rn], $offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>; // For disassembly only +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { def STRD_PRE : AI3stdpr<(outs GPR:$base_wb), (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset), StMiscFrm, IIC_iStore_d_ru, @@ -1795,31 +1862,32 @@ def STRD_POST: AI3stdpo<(outs GPR:$base_wb), StMiscFrm, IIC_iStore_d_ru, "strd", "\t$src1, $src2, [$base], $offset", "$base = $base_wb", []>; +} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 // STRT, STRBT, and STRHT are for disassembly only. 
-def STRT : AI2stridx<0, 0, (outs GPR:$Rn_wb), - (ins GPR:$Rt, GPR:$Rn,am2offset:$offset), - IndexModeNone, StFrm, IIC_iStore_ru, - "strt", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", - [/* For disassembly only; pattern left blank */]> { +def STRT : AI2stridxT<0, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, addrmode2:$addr), + IndexModePost, StFrm, IIC_iStore_ru, + "strt", "\t$Rt, $addr", "$addr.base = $Rn_wb", + [/* For disassembly only; pattern left blank */]> { let Inst{21} = 1; // overwrite + let AsmMatchConverter = "CvtStWriteBackRegAddrMode2"; } -def STRBT : AI2stridx<1, 0, (outs GPR:$Rn_wb), - (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), - IndexModeNone, StFrm, IIC_iStore_bh_ru, - "strbt", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", - [/* For disassembly only; pattern left blank */]> { +def STRBT : AI2stridxT<1, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, addrmode2:$addr), + IndexModePost, StFrm, IIC_iStore_bh_ru, + "strbt", "\t$Rt, $addr", "$addr.base = $Rn_wb", + [/* For disassembly only; pattern left blank */]> { let Inst{21} = 1; // overwrite + let AsmMatchConverter = "CvtStWriteBackRegAddrMode2"; } -def STRHT: AI3sthpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am3offset:$offset), +def STRHT: AI3sthpo<(outs GPR:$base_wb), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm, IIC_iStore_bh_ru, - "strht", "\t$src, [$base], $offset", "$base = $base_wb", + "strht", "\t$Rt, $addr", "$addr.base = $base_wb", [/* For disassembly only; pattern left blank */]> { let Inst{21} = 1; // overwrite + let AsmMatchConverter = "CvtStWriteBackRegAddrMode3"; } //===----------------------------------------------------------------------===// @@ -1892,7 +1960,7 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f, let Inst{21} = 1; // Writeback let Inst{20} = L_bit; } -} +} let neverHasSideEffects = 1 in { @@ -1912,16 +1980,10 @@ def : MnemonicAlias<"stm", "stmia">; // FIXME: Should pc be an implicit operand like PICADD, etc? let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in -// FIXME: Should be a pseudo-instruction. -def LDMIA_RET : AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, - reglist:$regs, variable_ops), - IndexModeUpd, LdStMulFrm, IIC_iLoad_mBr, - "ldmia${p}\t$Rn!, $regs", - "$Rn = $wb", []> { - let Inst{24-23} = 0b01; // Increment After - let Inst{21} = 1; // Writeback - let Inst{20} = 1; // Load -} +def LDMIA_RET : ARMPseudoInst<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, + reglist:$regs, variable_ops), + Size4Bytes, IIC_iLoad_mBr, []>, + RegConstraint<"$Rn = $wb">; //===----------------------------------------------------------------------===// // Move Instructions. @@ -1933,6 +1995,7 @@ def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr, bits<4> Rd; bits<4> Rm; + let Inst{19-16} = 0b0000; let Inst{11-4} = 0b00000000; let Inst{25} = 0; let Inst{3-0} = Rm; @@ -1959,6 +2022,7 @@ def MOVs : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg:$src), bits<4> Rd; bits<12> src; let Inst{15-12} = Rd; + let Inst{19-16} = 0b0000; let Inst{11-0} = src; let Inst{25} = 0; } @@ -2145,10 +2209,12 @@ defm SBC : AI1_adde_sube_irs<0b0110, "sbc", BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>; // ADC and SUBC with 's' bit set. 
-defm ADCS : AI1_adde_sube_s_irs<0b0101, "adcs", - BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; -defm SBCS : AI1_adde_sube_s_irs<0b0110, "sbcs", - BinOpFrag<(sube_live_carry node:$LHS, node:$RHS) >>; +let usesCustomInserter = 1 in { +defm ADCS : AI1_adde_sube_s_irs< + BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; +defm SBCS : AI1_adde_sube_s_irs< + BinOpFrag<(sube_live_carry node:$LHS, node:$RHS) >>; +} def RSBri : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iALUi, "rsb", "\t$Rd, $Rn, $imm", @@ -2190,31 +2256,17 @@ def RSBrs : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), } // RSB with 's' bit set. -let isCodeGenOnly = 1, Defs = [CPSR] in { -def RSBSri : AI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, - IIC_iALUi, "rsbs", "\t$Rd, $Rn, $imm", - [(set GPR:$Rd, (subc so_imm:$imm, GPR:$Rn))]> { - bits<4> Rd; - bits<4> Rn; - bits<12> imm; - let Inst{25} = 1; - let Inst{20} = 1; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; - let Inst{11-0} = imm; -} -def RSBSrs : AI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), - DPSoRegFrm, IIC_iALUsr, "rsbs", "\t$Rd, $Rn, $shift", - [(set GPR:$Rd, (subc so_reg:$shift, GPR:$Rn))]> { - bits<4> Rd; - bits<4> Rn; - bits<12> shift; - let Inst{25} = 0; - let Inst{20} = 1; - let Inst{11-0} = shift; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; -} +// NOTE: CPSR def omitted because it will be handled by the custom inserter. +let usesCustomInserter = 1 in { +def RSBSri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + Size4Bytes, IIC_iALUi, + [(set GPR:$Rd, (subc so_imm:$imm, GPR:$Rn))]>; +def RSBSrr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + Size4Bytes, IIC_iALUr, + [/* For disassembly only; pattern left blank */]>; +def RSBSrs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + Size4Bytes, IIC_iALUsr, + [(set GPR:$Rd, (subc so_reg:$shift, GPR:$Rn))]>; } let Uses = [CPSR] in { @@ -2258,34 +2310,14 @@ def RSCrs : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), } } -// FIXME: Allow these to be predicated. -let isCodeGenOnly = 1, Defs = [CPSR], Uses = [CPSR] in { -def RSCSri : AXI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), - DPFrm, IIC_iALUi, "rscs\t$Rd, $Rn, $imm", - [(set GPR:$Rd, (sube_dead_carry so_imm:$imm, GPR:$Rn))]>, - Requires<[IsARM]> { - bits<4> Rd; - bits<4> Rn; - bits<12> imm; - let Inst{25} = 1; - let Inst{20} = 1; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; - let Inst{11-0} = imm; -} -def RSCSrs : AXI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), - DPSoRegFrm, IIC_iALUsr, "rscs\t$Rd, $Rn, $shift", - [(set GPR:$Rd, (sube_dead_carry so_reg:$shift, GPR:$Rn))]>, - Requires<[IsARM]> { - bits<4> Rd; - bits<4> Rn; - bits<12> shift; - let Inst{25} = 0; - let Inst{20} = 1; - let Inst{11-0} = shift; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; -} +// NOTE: CPSR def omitted because it will be handled by the custom inserter. +let usesCustomInserter = 1, Uses = [CPSR] in { +def RSCSri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + Size4Bytes, IIC_iALUi, + [(set GPR:$Rd, (sube_dead_carry so_imm:$imm, GPR:$Rn))]>; +def RSCSrs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + Size4Bytes, IIC_iALUsr, + [(set GPR:$Rd, (sube_dead_carry so_reg:$shift, GPR:$Rn))]>; } // (sub X, imm) gets canonicalized to (add X, -imm). Match this form. @@ -2300,8 +2332,10 @@ def : ARMPat<(addc GPR:$src, so_imm_neg:$imm), // The with-carry-in form matches bitwise not instead of the negation. 
// Effectively, the inverse interpretation of the carry flag already accounts // for part of the negation. -def : ARMPat<(adde GPR:$src, so_imm_not:$imm), +def : ARMPat<(adde_dead_carry GPR:$src, so_imm_not:$imm), (SBCri GPR:$src, so_imm_not:$imm)>; +def : ARMPat<(adde_live_carry GPR:$src, so_imm_not:$imm), + (SBCSri GPR:$src, so_imm_not:$imm)>; // Note: These are implemented in C++ code, because they have to generate // ADD/SUBrs instructions, which use a complex pattern that a xform function @@ -2617,14 +2651,16 @@ def MULv5: ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, def MUL : AsMul1I32<0b0000000, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", [(set GPR:$Rd, (mul GPR:$Rn, GPR:$Rm))]>, - Requires<[IsARM, HasV6]>; + Requires<[IsARM, HasV6]> { + let Inst{15-12} = 0b0000; +} } let Constraints = "@earlyclobber $Rd" in def MLAv5: ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s), - Size4Bytes, IIC_iMAC32, - [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, + Size4Bytes, IIC_iMAC32, + [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, Requires<[IsARM, NoV6]> { bits<4> Ra; let Inst{15-12} = Ra; @@ -2657,7 +2693,7 @@ let neverHasSideEffects = 1 in { let isCommutable = 1 in { let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { def SMULLv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), Size4Bytes, IIC_iMUL64, []>, Requires<[IsARM, NoV6]>; @@ -2681,15 +2717,15 @@ def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi), // Multiply + accumulate let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { def SMLALv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), Size4Bytes, IIC_iMAC64, []>, Requires<[IsARM, NoV6]>; def UMLALv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), Size4Bytes, IIC_iMAC64, []>, Requires<[IsARM, NoV6]>; def UMAALv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), Size4Bytes, IIC_iMAC64, []>, Requires<[IsARM, NoV6]>; @@ -2970,17 +3006,25 @@ def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), IIC_iUNAr, "revsh", "\t$Rd, $Rm", [(set GPR:$Rd, (sext_inreg - (or (srl (and GPR:$Rm, 0xFF00), (i32 8)), + (or (srl GPR:$Rm, (i32 8)), (shl GPR:$Rm, (i32 8))), i16))]>, Requires<[IsARM, HasV6]>; +def : ARMV6Pat<(sext_inreg (or (srl (and GPR:$Rm, 0xFF00), (i32 8)), + (shl GPR:$Rm, (i32 8))), i16), + (REVSH GPR:$Rm)>; + +// Need the AddedComplexity or else MOVs + REV would be chosen. 
+let AddedComplexity = 5 in +def : ARMV6Pat<(sra (bswap GPR:$Rm), (i32 16)), (REVSH GPR:$Rm)>; + def lsl_shift_imm : SDNodeXForm<imm, [{ unsigned Sh = ARM_AM::getSORegOpc(ARM_AM::lsl, N->getZExtValue()); return CurDAG->getTargetConstant(Sh, MVT::i32); }]>; -def lsl_amt : PatLeaf<(i32 imm), [{ - return (N->getZExtValue() < 32); +def lsl_amt : ImmLeaf<i32, [{ + return Imm > 0 && Imm < 32; }], lsl_shift_imm>; def PKHBT : APKHI<0b01101000, 0, (outs GPR:$Rd), @@ -3002,8 +3046,8 @@ def asr_shift_imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(Sh, MVT::i32); }]>; -def asr_amt : PatLeaf<(i32 imm), [{ - return (N->getZExtValue() <= 32); +def asr_amt : ImmLeaf<i32, [{ + return Imm > 0 && Imm <= 32; }], asr_shift_imm>; // Note: Shifts of 1-15 bits will be transformed to srl instead of sra and @@ -3119,88 +3163,43 @@ def BCCZi64 : PseudoInst<(outs), // Conditional moves // FIXME: should be able to write a pattern for ARMcmov, but can't use // a two-value operand where a dag node expects two operands. :( -// FIXME: These should all be pseudo-instructions that get expanded to -// the normal MOV instructions. That would fix the dependency on -// special casing them in tblgen. let neverHasSideEffects = 1 in { -def MOVCCr : AI1<0b1101, (outs GPR:$Rd), (ins GPR:$false, GPR:$Rm), DPFrm, - IIC_iCMOVr, "mov", "\t$Rd, $Rm", - [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">, UnaryDP { - bits<4> Rd; - bits<4> Rm; - let Inst{25} = 0; - let Inst{20} = 0; - let Inst{15-12} = Rd; - let Inst{11-4} = 0b00000000; - let Inst{3-0} = Rm; -} - -def MOVCCs : AI1<0b1101, (outs GPR:$Rd), - (ins GPR:$false, so_reg:$shift), DPSoRegFrm, IIC_iCMOVsr, - "mov", "\t$Rd, $shift", - [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg:$shift, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">, UnaryDP { - bits<4> Rd; - bits<12> shift; - let Inst{25} = 0; - let Inst{20} = 0; - let Inst{19-16} = 0; - let Inst{15-12} = Rd; - let Inst{11-0} = shift; -} +def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, GPR:$Rm, pred:$p), + Size4Bytes, IIC_iCMOVr, + [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">; +def MOVCCs : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, so_reg:$shift, pred:$p), + Size4Bytes, IIC_iCMOVsr, + [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg:$shift, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">; let isMoveImm = 1 in -def MOVCCi16 : AI1<0b1000, (outs GPR:$Rd), (ins GPR:$false, i32imm_hilo16:$imm), - DPFrm, IIC_iMOVi, - "movw", "\t$Rd, $imm", - []>, - RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>, - UnaryDP { - bits<4> Rd; - bits<16> imm; - let Inst{25} = 1; - let Inst{20} = 0; - let Inst{19-16} = imm{15-12}; - let Inst{15-12} = Rd; - let Inst{11-0} = imm{11-0}; -} +def MOVCCi16 : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, i32imm_hilo16:$imm, pred:$p), + Size4Bytes, IIC_iMOVi, + []>, + RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>; let isMoveImm = 1 in -def MOVCCi : AI1<0b1101, (outs GPR:$Rd), - (ins GPR:$false, so_imm:$imm), DPFrm, IIC_iCMOVi, - "mov", "\t$Rd, $imm", +def MOVCCi : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, so_imm:$imm, pred:$p), + Size4Bytes, IIC_iCMOVi, [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">, UnaryDP { - bits<4> Rd; - bits<12> imm; - let Inst{25} = 1; - let Inst{20} = 0; - let Inst{19-16} = 0b0000; - let Inst{15-12} = Rd; - let Inst{11-0} = imm; -} + 
RegConstraint<"$false = $Rd">; // Two instruction predicate mov immediate. let isMoveImm = 1 in -def MOVCCi32imm : PseudoInst<(outs GPR:$Rd), - (ins GPR:$false, i32imm:$src, pred:$p), - IIC_iCMOVix2, []>, RegConstraint<"$false = $Rd">; +def MOVCCi32imm : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, i32imm:$src, pred:$p), + Size8Bytes, IIC_iCMOVix2, []>, RegConstraint<"$false = $Rd">; let isMoveImm = 1 in -def MVNCCi : AI1<0b1111, (outs GPR:$Rd), - (ins GPR:$false, so_imm:$imm), DPFrm, IIC_iCMOVi, - "mvn", "\t$Rd, $imm", +def MVNCCi : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, so_imm:$imm, pred:$p), + Size4Bytes, IIC_iCMOVi, [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">, UnaryDP { - bits<4> Rd; - bits<12> imm; - let Inst{25} = 1; - let Inst{20} = 0; - let Inst{19-16} = 0b0000; - let Inst{15-12} = Rd; - let Inst{11-0} = imm; -} + RegConstraint<"$false = $Rd">; } // neverHasSideEffects //===----------------------------------------------------------------------===// @@ -3221,13 +3220,6 @@ def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, let Inst{31-4} = 0xf57ff05; let Inst{3-0} = opt; } - -def DMB_MCR : AInoP<(outs), (ins GPR:$zero), MiscFrm, NoItinerary, - "mcr", "\tp15, 0, $zero, c7, c10, 5", - [(ARMMemBarrierMCR GPR:$zero)]>, - Requires<[IsARM, HasV6]> { - // FIXME: add encoding -} } def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, @@ -3266,6 +3258,18 @@ let usesCustomInserter = 1 in { def ATOMIC_LOAD_NAND_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_MIN_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_MAX_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMIN_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMAX_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>; def ATOMIC_LOAD_ADD_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>; @@ -3284,6 +3288,18 @@ let usesCustomInserter = 1 in { def ATOMIC_LOAD_NAND_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_MIN_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_MAX_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMIN_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMAX_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>; def ATOMIC_LOAD_ADD_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>; @@ -3302,6 +3318,18 @@ let usesCustomInserter = 1 in { def 
@@ -3302,6 +3318,18 @@ let usesCustomInserter = 1 in { def ATOMIC_LOAD_NAND_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_MIN_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_MAX_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMIN_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_umin_32 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMAX_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_umax_32 GPR:$ptr, GPR:$val))]>; def ATOMIC_SWAP_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, @@ -3326,39 +3354,26 @@ let usesCustomInserter = 1 in { } let mayLoad = 1 in { -def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins GPR:$Rn), NoItinerary, - "ldrexb", "\t$Rt, [$Rn]", - []>; -def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins GPR:$Rn), NoItinerary, - "ldrexh", "\t$Rt, [$Rn]", - []>; -def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins GPR:$Rn), NoItinerary, - "ldrex", "\t$Rt, [$Rn]", - []>; -def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins GPR:$Rn), - NoItinerary, - "ldrexd", "\t$Rt, $Rt2, [$Rn]", - []>; +def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary, + "ldrexb", "\t$Rt, $addr", []>; +def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary, + "ldrexh", "\t$Rt, $addr", []>; +def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary, + "ldrex", "\t$Rt, $addr", []>; +def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode7:$addr), + NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []>; } let mayStore = 1, Constraints = "@earlyclobber $Rd" in { -def STREXB : AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$src, GPR:$Rn), - NoItinerary, - "strexb", "\t$Rd, $src, [$Rn]", - []>; -def STREXH : AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, GPR:$Rn), - NoItinerary, - "strexh", "\t$Rd, $Rt, [$Rn]", - []>; -def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, GPR:$Rn), - NoItinerary, - "strex", "\t$Rd, $Rt, [$Rn]", - []>; +def STREXB : AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr), + NoItinerary, "strexb", "\t$Rd, $Rt, $addr", []>; +def STREXH : AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr), + NoItinerary, "strexh", "\t$Rd, $Rt, $addr", []>; +def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr), + NoItinerary, "strex", "\t$Rd, $Rt, $addr", []>; def STREXD : AIstrex<0b01, (outs GPR:$Rd), - (ins GPR:$Rt, GPR:$Rt2, GPR:$Rn), - NoItinerary, - "strexd", "\t$Rd, $Rt, $Rt2, [$Rn]", - []>; + (ins GPR:$Rt, GPR:$Rt2, addrmode7:$addr), + NoItinerary, "strexd", "\t$Rd, $Rt, $Rt2, $addr", []>; } // Clear-Exclusive is for disassembly only. @@ -3377,238 +3392,7 @@ def SWPB : AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swpb", } //===----------------------------------------------------------------------===// -// TLS Instructions -// - -// __aeabi_read_tp preserves the registers r1-r3. -// This is a pseudo inst so that we can get the encoding right, -// complete with fixup for the aeabi_read_tp function.
-let isCall = 1, - Defs = [R0, R12, LR, CPSR], Uses = [SP] in { - def TPsoft : PseudoInst<(outs), (ins), IIC_Br, - [(set R0, ARMthread_pointer)]>; -} - -//===----------------------------------------------------------------------===// -// SJLJ Exception handling intrinsics -// eh_sjlj_setjmp() is an instruction sequence to store the return -// address and save #0 in R0 for the non-longjmp case. -// Since by its nature we may be coming from some other function to get -// here, and we're using the stack frame for the containing function to -// save/restore registers, we can't keep anything live in regs across -// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon -// when we get here from a longjmp(). We force everthing out of registers -// except for our own input by listing the relevant registers in Defs. By -// doing so, we also cause the prologue/epilogue code to actively preserve -// all of the callee-saved resgisters, which is exactly what we want. -// A constant value is passed in $val, and we use the location as a scratch. -// -// These are pseudo-instructions and are lowered to individual MC-insts, so -// no encoding information is necessary. -let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, - D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, - D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, - D31 ], hasSideEffects = 1, isBarrier = 1 in { - def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), - NoItinerary, - [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, - Requires<[IsARM, HasVFP2]>; -} - -let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], - hasSideEffects = 1, isBarrier = 1 in { - def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), - NoItinerary, - [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, - Requires<[IsARM, NoVFP]>; -} - -// FIXME: Non-Darwin version(s) -let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, - Defs = [ R7, LR, SP ] in { -def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch), - NoItinerary, - [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, - Requires<[IsARM, IsDarwin]>; -} - -// eh.sjlj.dispatchsetup pseudo-instruction. -// This pseudo is used for ARM, Thumb1 and Thumb2. Any differences are -// handled when the pseudo is expanded (which happens before any passes -// that need the instruction size). -let isBarrier = 1, hasSideEffects = 1 in -def Int_eh_sjlj_dispatchsetup : - PseudoInst<(outs), (ins GPR:$src), NoItinerary, - [(ARMeh_sjlj_dispatchsetup GPR:$src)]>, - Requires<[IsDarwin]>; - -//===----------------------------------------------------------------------===// -// Non-Instruction Patterns -// - -// Large immediate handling. - -// 32-bit immediate using two piece so_imms or movw + movt. -// This is a single pseudo instruction, the benefit is that it can be remat'd -// as a single unit instead of having to handle reg inputs. -// FIXME: Remove this when we can do generalized remat. -let isReMaterializable = 1, isMoveImm = 1 in -def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2, - [(set GPR:$dst, (arm_i32imm:$src))]>, - Requires<[IsARM]>; - -// Pseudo instruction that combines movw + movt + add pc (if PIC). -// It also makes it possible to rematerialize the instructions. -// FIXME: Remove this when we can do generalized remat and when machine licm -// can properly the instructions. 
-let isReMaterializable = 1 in { -def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), - IIC_iMOVix2addpc, - [(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>, - Requires<[IsARM, UseMovt]>; - -def MOV_ga_dyn : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), - IIC_iMOVix2, - [(set GPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>, - Requires<[IsARM, UseMovt]>; - -let AddedComplexity = 10 in -def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), - IIC_iMOVix2ld, - [(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>, - Requires<[IsARM, UseMovt]>; -} // isReMaterializable - -// ConstantPool, GlobalAddress, and JumpTable -def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>, - Requires<[IsARM, DontUseMovt]>; -def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; -def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, - Requires<[IsARM, UseMovt]>; -def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), - (LEApcrelJT tjumptable:$dst, imm:$id)>; - -// TODO: add,sub,and, 3-instr forms? - -// Tail calls -def : ARMPat<(ARMtcret tcGPR:$dst), - (TCRETURNri tcGPR:$dst)>, Requires<[IsDarwin]>; - -def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), - (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; - -def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), - (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; - -def : ARMPat<(ARMtcret tcGPR:$dst), - (TCRETURNriND tcGPR:$dst)>, Requires<[IsNotDarwin]>; - -def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), - (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; - -def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), - (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; - -// Direct calls -def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>, - Requires<[IsARM, IsNotDarwin]>; -def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>, - Requires<[IsARM, IsDarwin]>; - -// zextload i1 -> zextload i8 -def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; -def : ARMPat<(zextloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; - -// extload -> zextload -def : ARMPat<(extloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; -def : ARMPat<(extloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; -def : ARMPat<(extloadi8 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; -def : ARMPat<(extloadi8 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; - -def : ARMPat<(extloadi16 addrmode3:$addr), (LDRH addrmode3:$addr)>; - -def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>; -def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>; - -// smul* and smla* -def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), - (sra (shl GPR:$b, (i32 16)), (i32 16))), - (SMULBB GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b), - (SMULBB GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), - (sra GPR:$b, (i32 16))), - (SMULBT GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))), - (SMULBT GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), - (sra (shl GPR:$b, (i32 16)), (i32 16))), - (SMULTB GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b), - (SMULTB GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))), - (i32 16)), - (SMULWB GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)), 
- (SMULWB GPR:$a, GPR:$b)>; - -def : ARMV5TEPat<(add GPR:$acc, - (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), - (sra (shl GPR:$b, (i32 16)), (i32 16)))), - (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (mul sext_16_node:$a, sext_16_node:$b)), - (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), - (sra GPR:$b, (i32 16)))), - (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))), - (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (mul (sra GPR:$a, (i32 16)), - (sra (shl GPR:$b, (i32 16)), (i32 16)))), - (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)), - (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))), - (i32 16))), - (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (sra (mul GPR:$a, sext_16_node:$b), (i32 16))), - (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; - -//===----------------------------------------------------------------------===// -// Thumb Support -// - -include "ARMInstrThumb.td" - -//===----------------------------------------------------------------------===// -// Thumb2 Support -// - -include "ARMInstrThumb2.td" - -//===----------------------------------------------------------------------===// -// Floating Point Support -// - -include "ARMInstrVFP.td" - -//===----------------------------------------------------------------------===// -// Advanced SIMD (NEON) Support -// - -include "ARMInstrNEON.td" - -//===----------------------------------------------------------------------===// -// Coprocessor Instructions. For disassembly only. +// Coprocessor Instructions. 
// def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, @@ -3652,17 +3436,18 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, let Inst{23-20} = opc1; } -class ACI<dag oops, dag iops, string opc, string asm> - : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, NoItinerary, - opc, asm, "", [/* For disassembly only; pattern left blank */]> { +class ACI<dag oops, dag iops, string opc, string asm, + IndexMode im = IndexModeNone> + : InoP<oops, iops, AddrModeNone, Size4Bytes, im, BrFrm, NoItinerary, + opc, asm, "", [/* For disassembly only; pattern left blank */]> { let Inst{27-25} = 0b110; } -multiclass LdStCop<bits<4> op31_28, bit load, string opc> { +multiclass LdStCop<bits<4> op31_28, bit load, dag ops, string opc, string cond>{ def _OFFSET : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), - opc, "\tp$cop, cr$CRd, $addr"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops), + !strconcat(opc, cond), "\tp$cop, cr$CRd, $addr"> { let Inst{31-28} = op31_28; let Inst{24} = 1; // P = 1 let Inst{21} = 0; // W = 0 @@ -3671,8 +3456,8 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def _PRE : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), - opc, "\tp$cop, cr$CRd, $addr!"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops), + !strconcat(opc, cond), "\tp$cop, cr$CRd, $addr!", IndexModePre> { let Inst{31-28} = op31_28; let Inst{24} = 1; // P = 1 let Inst{21} = 1; // W = 1 @@ -3681,8 +3466,8 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def _POST : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, am2offset:$offset), - opc, "\tp$cop, cr$CRd, [$base], $offset"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops), + !strconcat(opc, cond), "\tp$cop, cr$CRd, $addr", IndexModePost> { let Inst{31-28} = op31_28; let Inst{24} = 0; // P = 0 let Inst{21} = 1; // W = 1 @@ -3691,8 +3476,9 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def _OPTION : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, i32imm:$option), - opc, "\tp$cop, cr$CRd, [$base], $option"> { + !con((ins nohash_imm:$cop,nohash_imm:$CRd,GPR:$base, nohash_imm:$option), + ops), + !strconcat(opc, cond), "\tp$cop, cr$CRd, [$base], \\{$option\\}"> { let Inst{31-28} = op31_28; let Inst{24} = 0; // P = 0 let Inst{23} = 1; // U = 1 @@ -3702,8 +3488,8 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def L_OFFSET : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), - !strconcat(opc, "l"), "\tp$cop, cr$CRd, $addr"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops), + !strconcat(!strconcat(opc, "l"), cond), "\tp$cop, cr$CRd, $addr"> { let Inst{31-28} = op31_28; let Inst{24} = 1; // P = 1 let Inst{21} = 0; // W = 0 @@ -3712,8 +3498,9 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def L_PRE : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), - !strconcat(opc, "l"), "\tp$cop, cr$CRd, $addr!"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops), + !strconcat(!strconcat(opc, "l"), cond), "\tp$cop, cr$CRd, $addr!", + IndexModePre> { let Inst{31-28} = op31_28; let Inst{24} = 1; // P = 1 let Inst{21} = 1; // W = 1 @@ -3722,8 +3509,9 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def L_POST : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, am2offset:$offset), - !strconcat(opc, "l"), "\tp$cop, cr$CRd, 
[$base], $offset"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops), + !strconcat(!strconcat(opc, "l"), cond), "\tp$cop, cr$CRd, $addr", + IndexModePost> { let Inst{31-28} = op31_28; let Inst{24} = 0; // P = 0 let Inst{21} = 1; // W = 1 @@ -3732,8 +3520,10 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def L_OPTION : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, nohash_imm:$option), - !strconcat(opc, "l"), "\tp$cop, cr$CRd, [$base], $option"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd,GPR:$base,nohash_imm:$option), + ops), + !strconcat(!strconcat(opc, "l"), cond), + "\tp$cop, cr$CRd, [$base], \\{$option\\}"> { let Inst{31-28} = op31_28; let Inst{24} = 0; // P = 0 let Inst{23} = 1; // U = 1 @@ -3743,19 +3533,18 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } } -defm LDC : LdStCop<{?,?,?,?}, 1, "ldc">; -defm LDC2 : LdStCop<0b1111, 1, "ldc2">; -defm STC : LdStCop<{?,?,?,?}, 0, "stc">; -defm STC2 : LdStCop<0b1111, 0, "stc2">; +defm LDC : LdStCop<{?,?,?,?}, 1, (ins pred:$p), "ldc", "${p}">; +defm LDC2 : LdStCop<0b1111, 1, (ins), "ldc2", "">; +defm STC : LdStCop<{?,?,?,?}, 0, (ins pred:$p), "stc", "${p}">; +defm STC2 : LdStCop<0b1111, 0, (ins), "stc2", "">; //===----------------------------------------------------------------------===// // Move between coprocessor and ARM core register -- for disassembly only // -class MovRCopro<string opc, bit direction> - : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, - GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), - NoItinerary, opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", +class MovRCopro<string opc, bit direction, dag oops, dag iops> + : ABI<0b1110, oops, iops, NoItinerary, opc, + "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", [/* For disassembly only; pattern left blank */]> { let Inst{20} = direction; let Inst{4} = 1; @@ -3775,13 +3564,17 @@ class MovRCopro<string opc, bit direction> let Inst{19-16} = CRn; } -def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */>; -def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */>; +def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, + (outs), (ins p_imm:$cop, i32imm:$opc1, + GPR:$Rt, c_imm:$CRn, c_imm:$CRm, + i32imm:$opc2)>; +def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */, + (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, + c_imm:$CRn, c_imm:$CRm, i32imm:$opc2)>; -class MovRCopro2<string opc, bit direction> - : ABXI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, - GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), - NoItinerary, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), +class MovRCopro2<string opc, bit direction, dag oops, dag iops> + : ABXI<0b1110, oops, iops, NoItinerary, + !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{20} = direction; @@ -3802,8 +3595,14 @@ class MovRCopro2<string opc, bit direction> let Inst{19-16} = CRn; } -def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */>; -def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */>; +def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */, + (outs), (ins p_imm:$cop, i32imm:$opc1, + GPR:$Rt, c_imm:$CRn, c_imm:$CRm, + i32imm:$opc2)>; +def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */, + (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, + c_imm:$CRn, c_imm:$CRm, + i32imm:$opc2)>; class 
MovRRCopro<string opc, bit direction> : ABI<0b1100, (outs), (ins p_imm:$cop, i32imm:$opc1, @@ -3909,3 +3708,241 @@ def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, so_imm:$a), NoItinerary, let Inst{15-12} = 0b1111; let Inst{11-0} = a; } + +//===----------------------------------------------------------------------===// +// TLS Instructions +// + +// __aeabi_read_tp preserves the registers r1-r3. +// This is a pseudo inst so that we can get the encoding right, +// complete with fixup for the aeabi_read_tp function. +let isCall = 1, + Defs = [R0, R12, LR, CPSR], Uses = [SP] in { + def TPsoft : PseudoInst<(outs), (ins), IIC_Br, + [(set R0, ARMthread_pointer)]>; +} + +//===----------------------------------------------------------------------===// +// SJLJ Exception handling intrinsics +// eh_sjlj_setjmp() is an instruction sequence to store the return +// address and save #0 in R0 for the non-longjmp case. +// Since by its nature we may be coming from some other function to get +// here, and we're using the stack frame for the containing function to +// save/restore registers, we can't keep anything live in regs across +// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon +// when we get here from a longjmp(). We force everything out of registers +// except for our own input by listing the relevant registers in Defs. By +// doing so, we also cause the prologue/epilogue code to actively preserve +// all of the callee-saved registers, which is exactly what we want. +// A constant value is passed in $val, and we use the location as a scratch. +// +// These are pseudo-instructions and are lowered to individual MC-insts, so +// no encoding information is necessary. +let Defs = + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, + D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, + D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, + D31 ], hasSideEffects = 1, isBarrier = 1 in { + def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), + NoItinerary, + [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, + Requires<[IsARM, HasVFP2]>; +} + +let Defs = + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], + hasSideEffects = 1, isBarrier = 1 in { + def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), + NoItinerary, + [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, + Requires<[IsARM, NoVFP]>; +} + +// FIXME: Non-Darwin version(s) +let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, + Defs = [ R7, LR, SP ] in { +def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch), + NoItinerary, + [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, + Requires<[IsARM, IsDarwin]>; +} + +// eh.sjlj.dispatchsetup pseudo-instruction. +// This pseudo is used for ARM, Thumb1 and Thumb2. Any differences are +// handled when the pseudo is expanded (which happens before any passes +// that need the instruction size). +let isBarrier = 1, hasSideEffects = 1 in +def Int_eh_sjlj_dispatchsetup : + PseudoInst<(outs), (ins), NoItinerary, + [(ARMeh_sjlj_dispatchsetup)]>, + Requires<[IsDarwin]>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +// + +// Large immediate handling. + +// 32-bit immediate using two piece so_imms or movw + movt. +// This is a single pseudo instruction, the benefit is that it can be remat'd +// as a single unit instead of having to handle reg inputs. +// FIXME: Remove this when we can do generalized remat. +let isReMaterializable = 1, isMoveImm = 1 in +def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2, + [(set GPR:$dst, (arm_i32imm:$src))]>, + Requires<[IsARM]>;
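MOVi32imm stays rematerializable as a single unit because its expansion is purely mechanical. A sketch of the movw/movt split it lowers to when MOVT is available (v6t2 and later; pre-v6t2 targets use two shifter-operand immediate pieces instead, and splitMovwMovt is an illustrative name, not an LLVM API):

#include <cstdint>
#include <utility>

// Split a 32-bit immediate into the halves of a movw/movt pair:
//   movw Rd, #lo16   ; writes the low half, zeroing the rest of Rd
//   movt Rd, #hi16   ; writes the high half, preserving the low half
std::pair<uint16_t, uint16_t> splitMovwMovt(uint32_t Imm) {
  uint16_t Lo = static_cast<uint16_t>(Imm & 0xffff);
  uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
  return {Lo, Hi};
}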
+ +// Pseudo instruction that combines movw + movt + add pc (if PIC). +// It also makes it possible to rematerialize the instructions. +// FIXME: Remove this when we can do generalized remat and when machine licm +// can properly hoist the instructions. +let isReMaterializable = 1 in { +def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2addpc, + [(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>, + Requires<[IsARM, UseMovt]>; + +def MOV_ga_dyn : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2, + [(set GPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>, + Requires<[IsARM, UseMovt]>; + +let AddedComplexity = 10 in +def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2ld, + [(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>, + Requires<[IsARM, UseMovt]>; +} // isReMaterializable + +// ConstantPool, GlobalAddress, and JumpTable +def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>, + Requires<[IsARM, DontUseMovt]>; +def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; +def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, + Requires<[IsARM, UseMovt]>; +def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (LEApcrelJT tjumptable:$dst, imm:$id)>; + +// TODO: add,sub,and, 3-instr forms? + +// Tail calls +def : ARMPat<(ARMtcret tcGPR:$dst), + (TCRETURNri tcGPR:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), + (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), + (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret tcGPR:$dst), + (TCRETURNriND tcGPR:$dst)>, Requires<[IsNotDarwin]>; + +def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), + (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; + +def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), + (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; + +// Direct calls +def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>, + Requires<[IsARM, IsNotDarwin]>; +def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>, + Requires<[IsARM, IsDarwin]>; + +// zextload i1 -> zextload i8 +def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(zextloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; + +// extload -> zextload +def : ARMPat<(extloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(extloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; +def : ARMPat<(extloadi8 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(extloadi8 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; + +def : ARMPat<(extloadi16 addrmode3:$addr), (LDRH addrmode3:$addr)>; + +def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>; +def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>; + +// smul* and smla* +def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16))), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra GPR:$b, (i32 16))), + (SMULBT GPR:$a, 
GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))), + (SMULBT GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16))), + (SMULTB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b), + (SMULTB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))), + (i32 16)), + (SMULWB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)), + (SMULWB GPR:$a, GPR:$b)>; + +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16)))), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul sext_16_node:$a, sext_16_node:$b)), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra GPR:$b, (i32 16)))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra GPR:$a, (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16)))), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))), + (i32 16))), + (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (sra (mul GPR:$a, sext_16_node:$b), (i32 16))), + (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; + + +// Pre-v7 uses MCR for synchronization barriers. +def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>, + Requires<[IsARM, HasV6]>; + + +//===----------------------------------------------------------------------===// +// Thumb Support +// + +include "ARMInstrThumb.td" + +//===----------------------------------------------------------------------===// +// Thumb2 Support +// + +include "ARMInstrThumb2.td" + +//===----------------------------------------------------------------------===// +// Floating Point Support +// + +include "ARMInstrVFP.td" + +//===----------------------------------------------------------------------===// +// Advanced SIMD (NEON) Support +// + +include "ARMInstrNEON.td" + diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index dc3d63e..e34d69a 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -80,6 +80,12 @@ def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, def NEONvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>; def NEONvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>; +def NEONvbsl : SDNode<"ARMISD::VBSL", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>>; + def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; // VDUPLANE can produce a quad-register result from a double-register source, @@ -146,10 +152,6 @@ def VLDMQIA : PseudoVFPLdStM<(outs QPR:$dst), (ins GPR:$Rn), IIC_fpLoad_m, "", [(set QPR:$dst, (v2f64 (load GPR:$Rn)))]>; -def VLDMQDB - : PseudoVFPLdStM<(outs QPR:$dst), (ins GPR:$Rn), - IIC_fpLoad_m, "", - [(set QPR:$dst, (v2f64 (load GPR:$Rn)))]>; // Use VSTM to store a Q register as a D register pair. // This is a pseudo instruction that is expanded to VSTMD after reg alloc. 
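The NEONvbsl node added in the hunk above wraps the NEON bitwise-select instruction. VBSL computes Vd = (Vn & Vd) | (Vm & ~Vd), with the original destination supplying the mask; a scalar C++ model of one 32-bit lane:

#include <cstdint>

// Bitwise select: take bits of a where mask is 1, bits of b where it is 0.
// VBSL applies this per bit across the whole D or Q register.
uint32_t vbsl(uint32_t mask, uint32_t a, uint32_t b) {
  return (a & mask) | (b & ~mask);
}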
@@ -157,10 +159,6 @@ def VSTMQIA : PseudoVFPLdStM<(outs), (ins QPR:$src, GPR:$Rn), IIC_fpStore_m, "", [(store (v2f64 QPR:$src), GPR:$Rn)]>; -def VSTMQDB - : PseudoVFPLdStM<(outs), (ins QPR:$src, GPR:$Rn), - IIC_fpStore_m, "", - [(store (v2f64 QPR:$src), GPR:$Rn)]>; // Classes for VLD* pseudo-instructions with multi-register operands. // These are expanded to real instructions after register allocation. @@ -1801,7 +1799,7 @@ class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VDSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> - : N3V<0, 1, op21_20, op11_8, 1, 0, + : N3VLane32<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (Ty DPR:$Vd), @@ -1811,7 +1809,7 @@ class N3VDSL<bits<2> op21_20, bits<4> op11_8, } class N3VDSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> - : N3V<0, 1, op21_20, op11_8, 1, 0, + : N3VLane16<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm[$lane]","", [(set (Ty DPR:$Vd), @@ -1841,7 +1839,7 @@ class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VQSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode ShOp> - : N3V<1, 1, op21_20, op11_8, 1, 0, + : N3VLane32<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (ResTy QPR:$Vd), @@ -1852,7 +1850,7 @@ class N3VQSL<bits<2> op21_20, bits<4> op11_8, } class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode ShOp> - : N3V<1, 1, op21_20, op11_8, 1, 0, + : N3VLane16<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm[$lane]","", [(set (ResTy QPR:$Vd), @@ -1874,7 +1872,7 @@ class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, } class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> - : N3V<0, 1, op21_20, op11_8, 1, 0, + : N3VLane32<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (Ty DPR:$Vd), @@ -1885,7 +1883,7 @@ class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, } class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> - : N3V<0, 1, op21_20, op11_8, 1, 0, + : N3VLane16<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (Ty DPR:$Vd), @@ -1915,7 +1913,7 @@ class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N3V<1, 1, op21_20, op11_8, 1, 0, + : N3VLane32<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (ResTy QPR:$Vd), @@ -1927,7 +1925,7 @@ class 
N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N3V<1, 1, op21_20, op11_8, 1, 0, + : N3VLane16<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (ResTy QPR:$Vd), @@ -1959,7 +1957,7 @@ class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp> - : N3V<0, 1, op21_20, op11_8, 1, 0, + : N3VLane32<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, @@ -1972,7 +1970,7 @@ class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode MulOp, SDNode ShOp> - : N3V<0, 1, op21_20, op11_8, 1, 0, + : N3VLane16<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, @@ -1994,7 +1992,7 @@ class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator MulOp, SDPatternOperator ShOp> - : N3V<1, 1, op21_20, op11_8, 1, 0, + : N3VLane32<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, @@ -2008,7 +2006,7 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode MulOp, SDNode ShOp> - : N3V<1, 1, op21_20, op11_8, 1, 0, + : N3VLane16<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, @@ -2069,7 +2067,7 @@ class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> - : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", @@ -2081,7 +2079,7 @@ class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8, class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> - : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", @@ -2116,7 +2114,7 @@ class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N3V<op24, 1, op21_20, op11_8, 1, 0, + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, @@ 
-2129,7 +2127,7 @@ class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N3V<op24, 1, op21_20, op11_8, 1, 0, + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, @@ -2164,7 +2162,7 @@ class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode OpNode> - : N3V<op24, 1, op21_20, op11_8, 1, 0, + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set QPR:$Vd, @@ -2173,7 +2171,7 @@ class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8, class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode OpNode> - : N3V<op24, 1, op21_20, op11_8, 1, 0, + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set QPR:$Vd, @@ -2219,7 +2217,7 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N3V<op24, 1, op21_20, op11_8, 1, 0, + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (ResTy QPR:$Vd), @@ -2229,7 +2227,7 @@ class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N3V<op24, 1, op21_20, op11_8, 1, 0, + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (ResTy QPR:$Vd), @@ -2288,17 +2286,17 @@ class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // Shift by immediate, // both double- and quad-register. 
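A note on the ImmTy operand being threaded through the shift classes below: NEON immediate shifts left and right accept different ranges, so a bare i32imm no longer fits every instruction. For an N-bit element a left shift takes 0..N-1 while a right shift takes 1..N, which is what the shr_imm8/shr_imm16/shr_imm32/shr_imm64 operands used later in this patch enforce. The constraint written out as a C++ sketch under those assumptions:

// Sketch only -- the real constraint lives in the shr_imm* operand
// definitions, not in C++ code.
static bool isValidNEONShiftImm(unsigned EltBits, unsigned Imm,
                                bool IsRightShift) {
  if (IsRightShift)
    return Imm >= 1 && Imm <= EltBits;  // vshr/vrshr/vsra/vsri: 1..N
  return Imm < EltBits;                 // vshl/vsli: 0..N-1
}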
class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - Format f, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode OpNode> + Format f, InstrItinClass itin, Operand ImmTy, + string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, 0, op4, - (outs DPR:$Vd), (ins DPR:$Vm, i32imm:$SIMM), f, itin, + (outs DPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), f, itin, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", [(set DPR:$Vd, (Ty (OpNode (Ty DPR:$Vm), (i32 imm:$SIMM))))]>; class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - Format f, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode OpNode> + Format f, InstrItinClass itin, Operand ImmTy, + string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, 1, op4, - (outs QPR:$Vd), (ins QPR:$Vm, i32imm:$SIMM), f, itin, + (outs QPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), f, itin, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", [(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>; @@ -2315,9 +2313,9 @@ class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, // Narrow shift by immediate. class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, SDNode OpNode> + ValueType ResTy, ValueType OpTy, Operand ImmTy, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, op6, op4, - (outs DPR:$Vd), (ins QPR:$Vm, i32imm:$SIMM), N2RegVShRFrm, itin, + (outs DPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, itin, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", [(set DPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm), (i32 imm:$SIMM))))]>; @@ -2325,16 +2323,18 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, // Shift right by immediate and accumulate, // both double- and quad-register. class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + Operand ImmTy, string OpcodeStr, string Dt, + ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd), - (ins DPR:$src1, DPR:$Vm, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, + (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", [(set DPR:$Vd, (Ty (add DPR:$src1, (Ty (ShOp DPR:$Vm, (i32 imm:$SIMM))))))]>; class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + Operand ImmTy, string OpcodeStr, string Dt, + ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd), - (ins QPR:$src1, QPR:$Vm, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, + (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", [(set QPR:$Vd, (Ty (add QPR:$src1, (Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>; @@ -2342,15 +2342,17 @@ class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, // Shift by immediate and insert, // both double- and quad-register. 
class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> + Operand ImmTy, Format f, string OpcodeStr, string Dt, + ValueType Ty,SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd), - (ins DPR:$src1, DPR:$Vm, i32imm:$SIMM), f, IIC_VSHLiD, + (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", [(set DPR:$Vd, (Ty (ShOp DPR:$src1, DPR:$Vm, (i32 imm:$SIMM))))]>; class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> + Operand ImmTy, Format f, string OpcodeStr, string Dt, + ValueType Ty,SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd), - (ins QPR:$src1, QPR:$Vm, i32imm:$SIMM), f, IIC_VSHLiQ, + (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", [(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>; @@ -3010,40 +3012,77 @@ multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, // Neon 2-register vector shift by immediate, // with f of either N2RegVShLFrm or N2RegVShRFrm // element sizes of 8, 16, 32 and 64 bits: -multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, - SDNode OpNode, Format f> { +multiclass N2VShL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; + // imm6 = xxxxxx +} +multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { // 64-bit vector types. 
- def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, + def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8, OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, + def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16, OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, + def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32, OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, f, itin, + def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64, OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>; // imm6 = xxxxxx // 128-bit vector types. - def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, + def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8, OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, + def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16, OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, + def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32, OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, f, itin, + def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64, OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; // imm6 = xxxxxx } @@ -3053,79 +3092,113 @@ multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, SDNode ShOp> { // 64-bit vector types. - def v8i8 : N2VDShAdd<op24, op23, op11_8, 0, op4, + def v8i8 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm8, OpcodeStr, !strconcat(Dt, "8"), v8i8, ShOp> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4, + def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm16, OpcodeStr, !strconcat(Dt, "16"), v4i16, ShOp> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4, + def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm32, OpcodeStr, !strconcat(Dt, "32"), v2i32, ShOp> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4, + def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4, shr_imm64, OpcodeStr, !strconcat(Dt, "64"), v1i64, ShOp>; // imm6 = xxxxxx // 128-bit vector types. 
- def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4, + def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm8, OpcodeStr, !strconcat(Dt, "8"), v16i8, ShOp> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4, + def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm16, OpcodeStr, !strconcat(Dt, "16"), v8i16, ShOp> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4, + def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm32, OpcodeStr, !strconcat(Dt, "32"), v4i32, ShOp> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4, + def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4, shr_imm64, OpcodeStr, !strconcat(Dt, "64"), v2i64, ShOp>; // imm6 = xxxxxx } - // Neon Shift-Insert vector operations, // with f of either N2RegVShLFrm or N2RegVShRFrm // element sizes of 8, 16, 32 and 64 bits: -multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, - string OpcodeStr, SDNode ShOp, - Format f> { +multiclass N2VShInsL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr> { + // 64-bit vector types. + def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsli> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsli> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsli> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsli>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsli> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsli> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsli> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsli>; + // imm6 = xxxxxx +} +multiclass N2VShInsR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr> { // 64-bit vector types. - def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, - f, OpcodeStr, "8", v8i8, ShOp> { + def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm8, + N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsri> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, - f, OpcodeStr, "16", v4i16, ShOp> { + def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm16, + N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsri> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, - f, OpcodeStr, "32", v2i32, ShOp> { + def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm32, + N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsri> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, - f, OpcodeStr, "64", v1i64, ShOp>; + def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, shr_imm64, + N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsri>; // imm6 = xxxxxx // 128-bit vector types. 
- def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, - f, OpcodeStr, "8", v16i8, ShOp> { + def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm8, + N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsri> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, - f, OpcodeStr, "16", v8i16, ShOp> { + def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm16, + N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsri> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, - f, OpcodeStr, "32", v4i32, ShOp> { + def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm32, + N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsri> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, - f, OpcodeStr, "64", v2i64, ShOp>; + def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, shr_imm64, + N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsri>; // imm6 = xxxxxx } @@ -3153,15 +3226,18 @@ multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, SDNode OpNode> { def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, - OpcodeStr, !strconcat(Dt, "16"), v8i8, v8i16, OpNode> { + OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, shr_imm8, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v4i16 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, - OpcodeStr, !strconcat(Dt, "32"), v4i16, v4i32, OpNode> { + OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, shr_imm16, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v2i32 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, - OpcodeStr, !strconcat(Dt, "64"), v2i32, v2i64, OpNode> { + OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, shr_imm32, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } } @@ -3697,16 +3773,21 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VCNTiD, "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", - [(set DPR:$Vd, - (v2i32 (or (and DPR:$Vn, DPR:$src1), - (and DPR:$Vm, (vnotd DPR:$src1)))))]>; + [(set DPR:$Vd, (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>; + +def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd), + (and DPR:$Vm, (vnotd DPR:$Vd)))), + (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; + def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VCNTiQ, "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", - [(set QPR:$Vd, - (v4i32 (or (and QPR:$Vn, QPR:$src1), - (and QPR:$Vm, (vnotq QPR:$src1)))))]>; + [(set QPR:$Vd, (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>; + +def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd), + (and QPR:$Vm, (vnotq QPR:$Vd)))), + (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; // VBIF : Vector Bitwise Insert if False // like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", @@ -3917,14 +3998,13 @@ defm VSHLs : N3VInt_QHSDSh<0, 0, 0b0100, 0, N3RegVShFrm, defm VSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, "vshl", "u", int_arm_neon_vshiftu>; + // VSHL : Vector Shift Left (Immediate) -defm VSHLi : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl, - N2RegVShLFrm>; +defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>; + // VSHR : Vector Shift Right (Immediate) -defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", NEONvshrs, - N2RegVShRFrm>; -defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", NEONvshru, - N2RegVShRFrm>; +defm VSHRs : N2VShR_QHSD<0, 1, 
0b0000, 1, IIC_VSHLiD, "vshr", "s",NEONvshrs>; +defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u",NEONvshru>; // VSHLL : Vector Shift Left Long defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", NEONvshlls>; @@ -3957,10 +4037,8 @@ defm VRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, "vrshl", "u", int_arm_neon_vrshiftu>; // VRSHR : Vector Rounding Shift Right -defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs, - N2RegVShRFrm>; -defm VRSHRu : N2VSh_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru, - N2RegVShRFrm>; +defm VRSHRs : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s",NEONvrshrs>; +defm VRSHRu : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u",NEONvrshru>; // VRSHRN : Vector Rounding Shift Right and Narrow defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", @@ -3974,13 +4052,11 @@ defm VQSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 1, N3RegVShFrm, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, "vqshl", "u", int_arm_neon_vqshiftu>; // VQSHL : Vector Saturating Shift Left (Immediate) -defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls, - N2RegVShLFrm>; -defm VQSHLui : N2VSh_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu, - N2RegVShLFrm>; +defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls>; +defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu>; + // VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) -defm VQSHLsu : N2VSh_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu, - N2RegVShLFrm>; +defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu>; // VQSHRN : Vector Saturating Shift Right and Narrow defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s", @@ -4018,9 +4094,10 @@ defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>; defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>; // VSLI : Vector Shift Left and Insert -defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli", NEONvsli, N2RegVShLFrm>; +defm VSLI : N2VShInsL_QHSD<1, 1, 0b0101, 1, "vsli">; + // VSRI : Vector Shift Right and Insert -defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri, N2RegVShRFrm>; +defm VSRI : N2VShInsR_QHSD<1, 1, 0b0100, 1, "vsri">; // Vector Absolute and Saturating Absolute. 
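All of the imm6 = 001xxx / 01xxxx / 1xxxxx annotations in the shift multiclasses above follow from a single encoding rule: the immediate field carries both the element size and the shift amount. The arithmetic, sketched in C++ (derived from the ARM ARM shift encodings, not code from this patch):

// Right shifts encode 2*N - shamt, left shifts encode N + shamt,
// truncated to six bits; for 64-bit elements the op7 "L" bit supplies
// the seventh bit. The leading imm6 bits therefore identify the element
// size, which is what the 001xxx/01xxxx/1xxxxx comments pin down.
static unsigned encodeNEONShiftImm6(unsigned EltBits, unsigned Sh,
                                    bool IsRightShift) {
  return (IsRightShift ? 2 * EltBits - Sh : EltBits + Sh) & 63;
}
// e.g. vshr.s8 #3 -> (16 - 3) & 63 = 13 = 0b001101, matching imm6 = 001xxx.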
@@ -4362,14 +4439,8 @@ def VDUP8q : VDUPQ<0b11101110, 0b00, "8", v16i8>; def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>; def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>; -def VDUPfd : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$V), (ins GPR:$R), - IIC_VMOVIS, "vdup", "32", "$V, $R", - [(set DPR:$V, (v2f32 (NEONvdup - (f32 (bitconvert GPR:$R)))))]>; -def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$V), (ins GPR:$R), - IIC_VMOVIS, "vdup", "32", "$V, $R", - [(set QPR:$V, (v4f32 (NEONvdup - (f32 (bitconvert GPR:$R)))))]>; +def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>; +def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>; // VDUP : Vector Duplicate Lane (from scalar to all elements) @@ -4397,9 +4468,6 @@ def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16> { def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32> { let Inst{19} = lane{0}; } -def VDUPLNfd : VDUPLND<{?,1,0,0}, "vdup", "32", v2f32> { - let Inst{19} = lane{0}; -} def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8> { let Inst{19-17} = lane{2-0}; } @@ -4409,9 +4477,12 @@ def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16> { def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32> { let Inst{19} = lane{0}; } -def VDUPLNfq : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4f32, v2f32> { - let Inst{19} = lane{0}; -} + +def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), + (VDUPLN32d DPR:$Vm, imm:$lane)>; + +def : Pat<(v4f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), + (VDUPLN32q DPR:$Vm, imm:$lane)>; def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)), (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src, @@ -4426,7 +4497,7 @@ def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)), (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)), - (v4f32 (VDUPLNfq (v2f32 (EXTRACT_SUBREG QPR:$src, + (v4f32 (VDUPLN32q (v2f32 (EXTRACT_SUBREG QPR:$src, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; @@ -4517,12 +4588,12 @@ class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> def VREV64d8 : VREV64D<0b00, "vrev64", "8", v8i8>; def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>; def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>; -def VREV64df : VREV64D<0b10, "vrev64", "32", v2f32>; +def : Pat<(v2f32 (NEONvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>; def VREV64q8 : VREV64Q<0b00, "vrev64", "8", v16i8>; def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>; def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>; -def VREV64qf : VREV64Q<0b10, "vrev64", "32", v4f32>; +def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>; // VREV32 : Vector Reverse elements within 32-bit words @@ -4628,8 +4699,8 @@ def VEXTq32 : VEXTq<"vext", "32", v4i32> { let Inst{9-8} = 0b00; } def VEXTqf : VEXTq<"vext", "32", v4f32> { - let Inst{11} = index{0}; - let Inst{10-8} = 0b000; + let Inst{11-10} = index{1-0}; + let Inst{9-8} = 0b00; } // VTRN : Vector Transpose diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 826ef46..8c542fe 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -27,22 +27,22 @@ def imm_comp_XFORM : SDNodeXForm<imm, [{ }]>; /// imm0_7 predicate - True if the 32-bit immediate is in the range [0,7]. 
-def imm0_7 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 8; +def imm0_7 : ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 8; }]>; def imm0_7_neg : PatLeaf<(i32 imm), [{ return (uint32_t)-N->getZExtValue() < 8; }], imm_neg_XFORM>; -def imm0_255 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 256; +def imm0_255 : ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 256; }]>; def imm0_255_comp : PatLeaf<(i32 imm), [{ return ~((uint32_t)N->getZExtValue()) < 256; }]>; -def imm8_255 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() >= 8 && (uint32_t)N->getZExtValue() < 256; +def imm8_255 : ImmLeaf<i32, [{ + return Imm >= 8 && Imm < 256; }]>; def imm8_255_neg : PatLeaf<(i32 imm), [{ unsigned Val = -N->getZExtValue(); @@ -369,6 +369,15 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in { let Inst{2-0} = 0b000; } + def tBX_Rm : TI<(outs), (ins pred:$p, GPR:$Rm), IIC_Br, "bx${p}\t$Rm", + [/* for disassembly only */]>, + T1Special<{1,1,0,?}> { + // A6.2.3 & A8.6.25 + bits<4> Rm; + let Inst{6-3} = Rm; + let Inst{2-0} = 0b000; + } + // Alternative return instruction used by vararg functions. def tBX_RET_vararg : TI<(outs), (ins tGPR:$Rm), IIC_Br, "bx\t$Rm", @@ -712,6 +721,19 @@ def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, let Inst{7-0} = addr; } +// FIXME: Remove this entry when the above ldr.n workaround is fixed. +// For disassembly use only. +def tLDRpciDIS : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [/* disassembly only */]>, + T1Encoding<{0,1,0,0,1,?}> { + // A6.2 & A8.6.59 + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + // A8.6.194 & A8.6.192 defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4, t_addrmode_is4, AddrModeT1_4, @@ -1175,10 +1197,18 @@ def tREVSH : // A8.6.136 "revsh", "\t$Rd, $Rm", [(set tGPR:$Rd, (sext_inreg - (or (srl (and tGPR:$Rm, 0xFF00), (i32 8)), + (or (srl tGPR:$Rm, (i32 8)), (shl tGPR:$Rm, (i32 8))), i16))]>, Requires<[IsThumb, IsThumb1Only, HasV6]>; +def : T1Pat<(sext_inreg (or (srl (and tGPR:$Rm, 0xFF00), (i32 8)), + (shl tGPR:$Rm, (i32 8))), i16), + (tREVSH tGPR:$Rm)>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +def : T1Pat<(sra (bswap tGPR:$Rm), (i32 16)), (tREVSH tGPR:$Rm)>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + // Rotate right register def tROR : // A8.6.139 T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), @@ -1322,10 +1352,8 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), // Move between coprocessor and ARM core register -- for disassembly only // -class tMovRCopro<string opc, bit direction> - : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, - GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), - !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), +class tMovRCopro<string opc, bit direction, dag oops, dag iops> + : T1Cop<oops, iops, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), [/* For disassembly only; pattern left blank */]> { let Inst{27-24} = 0b1110; let Inst{20} = direction; @@ -1346,8 +1374,12 @@ class tMovRCopro<string opc, bit direction> let Inst{19-16} = CRn; } -def tMCR : tMovRCopro<"mcr", 0 /* from ARM core register to coprocessor */>; -def tMRC : tMovRCopro<"mrc", 1 /* from coprocessor to ARM core register */>; +def tMCR : tMovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, + (outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, i32imm:$opc2)>; +def tMRC : tMovRCopro<"mrc", 1 /* from coprocessor to ARM core register */, + (outs 
GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, + c_imm:$CRm, i32imm:$opc2)>; class tMovRRCopro<string opc, bit direction> : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), @@ -1420,7 +1452,7 @@ def tTPsoft : TIx2<0b11110, 0b11, 1, (outs), (ins), IIC_Br, // from some other function to get here, and we're using the stack frame for the // containing function to save/restore registers, we can't keep anything live in // regs across the eh_sjlj_setjmp(), else it will almost certainly have been -// tromped upon when we get here from a longjmp(). We force everthing out of +// tromped upon when we get here from a longjmp(). We force everything out of // registers except for our own input by listing the relevant registers in // Defs. By doing so, we also cause the prologue/epilogue code to actively // preserve all of the callee-saved registers, which is exactly what we want. diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 0e01be5..600a121 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -44,7 +44,9 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ // t2_so_imm - Match a 32-bit immediate operand, which is an // 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit // immediate splatted into multiple bytes of the word. -def t2_so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_t2_so_imm(N); }]> { +def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{ + return ARM_AM::getT2SOImmVal(Imm) != -1; + }]> { let EncoderMethod = "getT2SOImmOpValue"; } @@ -61,49 +63,15 @@ def t2_so_imm_neg : Operand<i32>, return ARM_AM::getT2SOImmVal(-((uint32_t)N->getZExtValue())) != -1; }], t2_so_imm_neg_XFORM>; -// Break t2_so_imm's up into two pieces. This handles immediates with up to 16 -// bits set in them. This uses t2_so_imm2part to match and t2_so_imm2part_[12] -// to get the first/second pieces. -def t2_so_imm2part : Operand<i32>, - PatLeaf<(imm), [{ - return ARM_AM::isT2SOImmTwoPartVal((unsigned)N->getZExtValue()); - }]> { -} - -def t2_so_imm2part_1 : SDNodeXForm<imm, [{ - unsigned V = ARM_AM::getT2SOImmTwoPartFirst((unsigned)N->getZExtValue()); - return CurDAG->getTargetConstant(V, MVT::i32); -}]>; - -def t2_so_imm2part_2 : SDNodeXForm<imm, [{ - unsigned V = ARM_AM::getT2SOImmTwoPartSecond((unsigned)N->getZExtValue()); - return CurDAG->getTargetConstant(V, MVT::i32); -}]>; - -def t2_so_neg_imm2part : Operand<i32>, PatLeaf<(imm), [{ - return ARM_AM::isT2SOImmTwoPartVal(-(int)N->getZExtValue()); - }]> { -} - -def t2_so_neg_imm2part_1 : SDNodeXForm<imm, [{ - unsigned V = ARM_AM::getT2SOImmTwoPartFirst(-(int)N->getZExtValue()); - return CurDAG->getTargetConstant(V, MVT::i32); -}]>; - -def t2_so_neg_imm2part_2 : SDNodeXForm<imm, [{ - unsigned V = ARM_AM::getT2SOImmTwoPartSecond(-(int)N->getZExtValue()); - return CurDAG->getTargetConstant(V, MVT::i32); -}]>; - /// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31]. -def imm1_31 : PatLeaf<(i32 imm), [{ - return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 32; +def imm1_31 : ImmLeaf<i32, [{ + return (int32_t)Imm >= 1 && (int32_t)Imm < 32; }]>; /// imm0_4095 predicate - True if the 32-bit immediate is in the range [0,4095]. 
def imm0_4095 : Operand<i32>, - PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 4096; + ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 4096; }]>; def imm0_4095_neg : PatLeaf<(i32 imm), [{ @@ -118,6 +86,11 @@ def imm0_255_not : PatLeaf<(i32 imm), [{ return (uint32_t)(~N->getZExtValue()) < 255; }], imm_comp_XFORM>; +def lo5AllOne : PatLeaf<(i32 imm), [{ + // Returns true if all low 5 bits are 1. + return (((uint32_t)N->getZExtValue()) & 0x1FUL) == 0x1FUL; +}]>; + // Define Thumb2 specific addressing modes. // t2addrmode_imm12 := reg + imm12 @@ -129,6 +102,12 @@ def t2addrmode_imm12 : Operand<i32>, let ParserMatchClass = MemMode5AsmOperand; } +// t2ldrlabel := imm12 +def t2ldrlabel : Operand<i32> { + let EncoderMethod = "getAddrModeImm12OpValue"; +} + + // ADR instruction labels. def t2adrlabel : Operand<i32> { let EncoderMethod = "getT2AdrLabelOpValue"; @@ -173,6 +152,15 @@ def t2addrmode_so_reg : Operand<i32>, let ParserMatchClass = MemMode5AsmOperand; } +// t2addrmode_reg := reg +// Used by load/store exclusive instructions. Useful to enable correct assembly +// parsing and printing. Not used for any codegen matching. +// +def t2addrmode_reg : Operand<i32> { + let PrintMethod = "printAddrMode7Operand"; + let MIOperandInfo = (ops tGPR); + let ParserMatchClass = MemMode7AsmOperand; +} //===----------------------------------------------------------------------===// // Multiclass helpers... @@ -700,49 +688,27 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{24-21} = opcod; } } +} // Carry setting variants -let isCodeGenOnly = 1, Defs = [CPSR] in { -multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, - bit Commutable = 0> { +// NOTE: CPSR def omitted because it will be handled by the custom inserter. +let usesCustomInserter = 1 in { +multiclass T2I_adde_sube_s_irs<PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2sTwoRegImm< - (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi, - opc, "\t$Rd, $Rn, $imm", - [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>, - Requires<[IsThumb2]> { - let Inst{31-27} = 0b11110; - let Inst{25} = 0; - let Inst{24-21} = opcod; - let Inst{20} = 1; // The S bit. - let Inst{15} = 0; - } + def ri : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), + Size4Bytes, IIC_iALUi, + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>; // register - def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr, - opc, ".w\t$Rd, $Rn, $Rm", - [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>, - Requires<[IsThumb2]> { + def rr : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + Size4Bytes, IIC_iALUr, + [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> { let isCommutable = Commutable; - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b01; - let Inst{24-21} = opcod; - let Inst{20} = 1; // The S bit. - let Inst{14-12} = 0b000; // imm3 - let Inst{7-6} = 0b00; // imm2 - let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2sTwoRegShiftedReg< - (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), - IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm", - [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>, - Requires<[IsThumb2]> { - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b01; - let Inst{24-21} = opcod; - let Inst{20} = 1; // The S bit. 
- } -} + def rs : t2PseudoInst< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), + Size4Bytes, IIC_iALUsi, + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>; } } @@ -864,6 +830,7 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, let Inst{15-12} = Rt; bits<17> addr; + let addr{12} = 1; // add = TRUE let Inst{19-16} = addr{16-13}; // Rn let Inst{23} = addr{12}; // U let Inst{11-0} = addr{11-0}; // imm @@ -911,7 +878,7 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, } // FIXME: Is the pci variant actually needed? - def pci : T2Ipc <(outs GPR:$Rt), (ins i32imm:$addr), iii, + def pci : T2Ipc <(outs GPR:$Rt), (ins t2ldrlabel:$addr), iii, opc, ".w\t$Rt, $addr", [(set GPR:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> { let isReMaterializable = 1; @@ -944,6 +911,7 @@ multiclass T2I_st<bits<2> opcod, string opc, let Inst{15-12} = Rt; bits<17> addr; + let addr{12} = 1; // add = TRUE let Inst{19-16} = addr{16-13}; // Rn let Inst{23} = addr{12}; // U let Inst{11-0} = addr{11-0}; // imm @@ -1398,7 +1366,7 @@ def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$Rn), // for disassembly only. // Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4 class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii> - : T2Ii8<(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, + : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, "\t$Rt, $addr", []> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; @@ -1440,42 +1408,48 @@ def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs), def t2STR_PRE : T2Iidxldst<0, 0b10, 0, 1, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePre, IIC_iStore_iu, - "str", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb", + "str", "\t$Rt, [$Rn, $addr]!", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (pre_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STR_POST : T2Iidxldst<0, 0b10, 0, 0, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePost, IIC_iStore_iu, - "str", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb", + "str", "\t$Rt, [$Rn], $addr", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (post_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRH_PRE : T2Iidxldst<0, 0b01, 0, 1, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePre, IIC_iStore_iu, - "strh", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb", + "strh", "\t$Rt, [$Rn, $addr]!", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (pre_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRH_POST : T2Iidxldst<0, 0b01, 0, 0, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu, - "strh", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb", + "strh", "\t$Rt, [$Rn], $addr", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (post_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRB_PRE : T2Iidxldst<0, 0b00, 0, 1, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu, - "strb", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb", + "strb", "\t$Rt, [$Rn, $addr]!", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (pre_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu, - "strb", "\t$Rt, [$Rn], $addr", 
"$Rn = $base_wb", + "strb", "\t$Rt, [$Rn], $addr", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (post_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; @@ -1483,7 +1457,7 @@ def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb), // only. // Ref: A8.6.193 STR (immediate, Thumb) Encoding T4 class T2IstT<bits<2> type, string opc, InstrItinClass ii> - : T2Ii8<(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, + : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, "\t$Rt, $addr", []> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; @@ -1508,20 +1482,20 @@ def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>; // ldrd / strd pre / post variants // For disassembly only. -def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs GPR:$Rt, GPR:$Rt2), +def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2), (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, [$base, $imm]!", []>; -def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs GPR:$Rt, GPR:$Rt2), +def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2), (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, [$base], $imm", []>; def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs), - (ins GPR:$Rt, GPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), + (ins rGPR:$Rt, rGPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base, $imm]!", []>; def t2STRD_POST : T2Ii8s4<0, 1, 0, (outs), - (ins GPR:$Rt, GPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), + (ins rGPR:$Rt, rGPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base], $imm", []>; // T2Ipl (Preload Data/Instruction) signals the memory system of possible future @@ -1541,6 +1515,7 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> { let Inst{15-12} = 0b1111; bits<17> addr; + let addr{12} = 1; // add = TRUE let Inst{19-16} = addr{16-13}; // Rn let Inst{23} = addr{12}; // U let Inst{11-0} = addr{11-0}; // imm12 @@ -1813,10 +1788,8 @@ defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>, 1>; defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>; -defm t2ADCS : T2I_adde_sube_s_irs<0b1010, "adc", - BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; -defm t2SBCS : T2I_adde_sube_s_irs<0b1011, "sbc", - BinOpFrag<(sube_live_carry node:$LHS, node:$RHS)>>; +defm t2ADCS : T2I_adde_sube_s_irs<BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; +defm t2SBCS : T2I_adde_sube_s_irs<BinOpFrag<(sube_live_carry node:$LHS, node:$RHS)>>; // RSB defm t2RSB : T2I_rbin_irs <0b1110, "rsb", @@ -1847,9 +1820,14 @@ def : T2Pat<(addc rGPR:$src, t2_so_imm_neg:$imm), // Effectively, the inverse interpretation of the carry flag already accounts // for part of the negation. 
let AddedComplexity = 1 in -def : T2Pat<(adde rGPR:$src, imm0_255_not:$imm), +def : T2Pat<(adde_dead_carry rGPR:$src, imm0_255_not:$imm), + (t2SBCri rGPR:$src, imm0_255_not:$imm)>; +def : T2Pat<(adde_dead_carry rGPR:$src, t2_so_imm_not:$imm), + (t2SBCri rGPR:$src, t2_so_imm_not:$imm)>; +let AddedComplexity = 1 in +def : T2Pat<(adde_live_carry rGPR:$src, imm0_255_not:$imm), (t2SBCSri rGPR:$src, imm0_255_not:$imm)>; -def : T2Pat<(adde rGPR:$src, t2_so_imm_not:$imm), +def : T2Pat<(adde_live_carry rGPR:$src, t2_so_imm_not:$imm), (t2SBCSri rGPR:$src, t2_so_imm_not:$imm)>; // Select Bytes -- for disassembly only @@ -2052,6 +2030,10 @@ defm t2LSR : T2I_sh_ir<0b01, "lsr", BinOpFrag<(srl node:$LHS, node:$RHS)>>; defm t2ASR : T2I_sh_ir<0b10, "asr", BinOpFrag<(sra node:$LHS, node:$RHS)>>; defm t2ROR : T2I_sh_ir<0b11, "ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>; +// (rotr x, (and y, 0x...1f)) ==> (ROR x, y) +def : Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)), + (t2RORrr rGPR:$lhs, rGPR:$rhs)>; + let Uses = [CPSR] in { def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, "rrx", "\t$Rd, $Rm", @@ -2140,10 +2122,12 @@ def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm), IIC_iUNAsi, "bfc", "\t$Rd, $imm", [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> { let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. let Inst{25} = 1; let Inst{24-20} = 0b10110; let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; + let Inst{5} = 0; // should be 0. bits<10> imm; let msb{4-0} = imm{9-5}; @@ -2176,9 +2160,11 @@ let Constraints = "$src = $Rd" in { [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm))]> { let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. let Inst{25} = 1; let Inst{24-20} = 0b10110; let Inst{15} = 0; + let Inst{5} = 0; // should be 0. bits<10> imm; let msb{4-0} = imm{9-5}; @@ -2193,9 +2179,11 @@ let Constraints = "$src = $Rd" in { IIC_iBITi, "bfi", "\t$Rd, $Rn, $lsbit, $width", []> { let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. let Inst{25} = 1; let Inst{24-20} = 0b10110; let Inst{15} = 0; + let Inst{5} = 0; // should be 0. 
bits<5> lsbit; bits<5> width; @@ -2607,9 +2595,15 @@ def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, "revsh", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (sext_inreg - (or (srl (and rGPR:$Rm, 0xFF00), (i32 8)), + (or (srl rGPR:$Rm, (i32 8)), (shl rGPR:$Rm, (i32 8))), i16))]>; +def : T2Pat<(sext_inreg (or (srl (and rGPR:$Rm, 0xFF00), (i32 8)), + (shl rGPR:$Rm, (i32 8))), i16), + (t2REVSH rGPR:$Rm)>; + +def : T2Pat<(sra (bswap rGPR:$Rm), (i32 16)), (t2REVSH rGPR:$Rm)>; + def t2PKHBT : T2ThreeReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, shift_imm:$sh), IIC_iBITsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh", @@ -2843,9 +2837,9 @@ class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, let Inst{5-4} = opcod; let Inst{3-0} = 0b1111; - bits<4> Rn; + bits<4> addr; bits<4> Rt; - let Inst{19-16} = Rn; + let Inst{19-16} = addr; let Inst{15-12} = Rt; } class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, @@ -2859,37 +2853,37 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, let Inst{5-4} = opcod; bits<4> Rd; - bits<4> Rn; + bits<4> addr; bits<4> Rt; - let Inst{11-8} = Rd; - let Inst{19-16} = Rn; + let Inst{3-0} = Rd; + let Inst{19-16} = addr; let Inst{15-12} = Rt; } let mayLoad = 1 in { -def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins rGPR:$Rn), AddrModeNone, - Size4Bytes, NoItinerary, "ldrexb", "\t$Rt, [$Rn]", +def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone, + Size4Bytes, NoItinerary, "ldrexb", "\t$Rt, $addr", "", []>; -def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins rGPR:$Rn), AddrModeNone, - Size4Bytes, NoItinerary, "ldrexh", "\t$Rt, [$Rn]", +def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone, + Size4Bytes, NoItinerary, "ldrexh", "\t$Rt, $addr", "", []>; -def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins rGPR:$Rn), AddrModeNone, +def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone, Size4Bytes, NoItinerary, - "ldrex", "\t$Rt, [$Rn]", "", + "ldrex", "\t$Rt, $addr", "", []> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0000101; let Inst{11-8} = 0b1111; let Inst{7-0} = 0b00000000; // imm8 = 0 - bits<4> Rn; bits<4> Rt; - let Inst{19-16} = Rn; + bits<4> addr; + let Inst{19-16} = addr; let Inst{15-12} = Rt; } -def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins rGPR:$Rn), +def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins t2addrmode_reg:$addr), AddrModeNone, Size4Bytes, NoItinerary, - "ldrexd", "\t$Rt, $Rt2, [$Rn]", "", + "ldrexd", "\t$Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}> { bits<4> Rt2; let Inst{11-8} = Rt2; @@ -2897,31 +2891,31 @@ def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins rGPR:$Rn), } let mayStore = 1, Constraints = "@earlyclobber $Rd" in { -def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rn), - AddrModeNone, Size4Bytes, NoItinerary, - "strexb", "\t$Rd, $Rt, [$Rn]", "", []>; -def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rn), - AddrModeNone, Size4Bytes, NoItinerary, - "strexh", "\t$Rd, $Rt, [$Rn]", "", []>; -def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rn), - AddrModeNone, Size4Bytes, NoItinerary, - "strex", "\t$Rd, $Rt, [$Rn]", "", - []> { +def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr), + AddrModeNone, Size4Bytes, NoItinerary, + "strexb", "\t$Rd, $Rt, $addr", "", []>; +def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr), 
+ AddrModeNone, Size4Bytes, NoItinerary, + "strexh", "\t$Rd, $Rt, $addr", "", []>; +def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr), + AddrModeNone, Size4Bytes, NoItinerary, + "strex", "\t$Rd, $Rt, $addr", "", + []> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0000100; let Inst{7-0} = 0b00000000; // imm8 = 0 bits<4> Rd; - bits<4> Rn; + bits<4> addr; bits<4> Rt; let Inst{11-8} = Rd; - let Inst{19-16} = Rn; + let Inst{19-16} = addr; let Inst{15-12} = Rt; } def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd), - (ins rGPR:$Rt, rGPR:$Rt2, rGPR:$Rn), + (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_reg:$addr), AddrModeNone, Size4Bytes, NoItinerary, - "strexd", "\t$Rd, $Rt, $Rt2, [$Rn]", "", [], + "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}> { bits<4> Rt2; let Inst{11-8} = Rt2; @@ -2965,7 +2959,7 @@ let isCall = 1, // here, and we're using the stack frame for the containing function to // save/restore registers, we can't keep anything live in regs across // the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon -// when we get here from a longjmp(). We force everthing out of registers +// when we get here from a longjmp(). We force everything out of registers // except for our own input by listing the relevant registers in Defs. By // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved registers, which is exactly what we want. @@ -3238,19 +3232,20 @@ class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin, bits<4> Rn; let Inst{19-16} = Rn; + let Inst{15-0} = 0xc000; } def t2RFEDBW : T2RFE<0b111010000011, - (outs), (ins rGPR:$Rn), NoItinerary, "rfedb", "\t$Rn!", + (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn!", [/* For disassembly only; pattern left blank */]>; def t2RFEDB : T2RFE<0b111010000001, - (outs), (ins rGPR:$Rn), NoItinerary, "rfeab", "\t$Rn", + (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn", [/* For disassembly only; pattern left blank */]>; def t2RFEIAW : T2RFE<0b111010011011, - (outs), (ins rGPR:$Rn), NoItinerary, "rfeia", "\t$Rn!", + (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn!", [/* For disassembly only; pattern left blank */]>; def t2RFEIA : T2RFE<0b111010011001, - (outs), (ins rGPR:$Rn), NoItinerary, "rfeia", "\t$Rn", + (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn", [/* For disassembly only; pattern left blank */]>; //===----------------------------------------------------------------------===// @@ -3352,10 +3347,8 @@ def t2MSR : T2SpecialReg<0b111100111000 /* op31-20 */, 0b10 /* op15-14 */, // Move between coprocessor and ARM core register -- for disassembly only // -class t2MovRCopro<string opc, bit direction> - : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, - GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), - !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), +class t2MovRCopro<string opc, bit direction, dag oops, dag iops> + : T2Cop<oops, iops, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), [/* For disassembly only; pattern left blank */]> { let Inst{27-24} = 0b1110; let Inst{20} = direction; @@ -3376,8 +3369,12 @@ class t2MovRCopro<string opc, bit direction> let Inst{19-16} = CRn; } -def t2MCR2 : t2MovRCopro<"mcr2", 0 /* from ARM core register to coprocessor */>; -def t2MRC2 : t2MovRCopro<"mrc2", 1 /* from coprocessor to ARM core register */>; +def t2MCR2 : t2MovRCopro<"mcr2", 0 /* from ARM core register to coprocessor */, + (outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, i32imm:$opc2)>; +def 
t2MRC2 : t2MovRCopro<"mrc2", 1 /* from coprocessor to ARM core register */, + (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, + c_imm:$CRm, i32imm:$opc2)>; class t2MovRRCopro<string opc, bit direction> : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 2990283..376bd96 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -101,14 +101,6 @@ multiclass vfp_ldst_mult<string asm, bit L_bit, let Inst{21} = 1; // Writeback let Inst{20} = L_bit; } - def DDB : - AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), - IndexModeNone, itin, - !strconcat(asm, "db${p}\t$Rn, $regs"), "", []> { - let Inst{24-23} = 0b10; // Decrement Before - let Inst{21} = 0; // No writeback - let Inst{20} = L_bit; - } def DDB_UPD : AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), IndexModeUpd, itin_upd, @@ -143,18 +135,6 @@ multiclass vfp_ldst_mult<string asm, bit L_bit, // VFP pipelines. let D = VFPNeonDomain; } - def SDB : - AXSI4<(outs), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops), - IndexModeNone, itin, - !strconcat(asm, "db${p}\t$Rn, $regs"), "", []> { - let Inst{24-23} = 0b10; // Decrement Before - let Inst{21} = 0; // No writeback - let Inst{20} = L_bit; - - // Some single precision VFP instructions may be executed on both NEON and - // VFP pipelines. - let D = VFPNeonDomain; - } def SDB_UPD : AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops), IndexModeUpd, itin_upd, @@ -467,6 +447,10 @@ def VMOVRS : AVConv2I<0b11100001, 0b1010, let Inst{6-5} = 0b00; let Inst{3-0} = 0b0000; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } def VMOVSR : AVConv4I<0b11100000, 0b1010, @@ -484,6 +468,10 @@ def VMOVSR : AVConv4I<0b11100000, 0b1010, let Inst{6-5} = 0b00; let Inst{3-0} = 0b0000; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } let neverHasSideEffects = 1 in { @@ -503,6 +491,10 @@ def VMOVRRD : AVConv3I<0b11000101, 0b1011, let Inst{19-16} = Rt2; let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } def VMOVRRS : AVConv3I<0b11000101, 0b1010, @@ -510,6 +502,10 @@ def VMOVRRS : AVConv3I<0b11000101, 0b1010, IIC_fpMOVDI, "vmov", "\t$wb, $dst2, $src1, $src2", [/* For disassembly only; pattern left blank */]> { let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } } // neverHasSideEffects @@ -532,6 +528,10 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011, let Inst{19-16} = Rt2; let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } let neverHasSideEffects = 1 in @@ -540,6 +540,10 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010, IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2", [/* For disassembly only; pattern left blank */]> { let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. 
+ let D = VFPNeonDomain; } // FMRDH: SPR -> GPR @@ -972,33 +976,15 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), // let neverHasSideEffects = 1 in { -def VMOVDcc : ADuI<0b11101, 0b11, 0b0000, 0b01, 0, - (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), - IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", +def VMOVDcc : ARMPseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, pred:$p), + Size4Bytes, IIC_fpUNA64, [/*(set DPR:$Dd, (ARMcmov DPR:$Dn, DPR:$Dm, imm:$cc))*/]>, RegConstraint<"$Dn = $Dd">; -def VMOVScc : ASuI<0b11101, 0b11, 0b0000, 0b01, 0, - (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), - IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", +def VMOVScc : ARMPseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, pred:$p), + Size4Bytes, IIC_fpUNA32, [/*(set SPR:$Sd, (ARMcmov SPR:$Sn, SPR:$Sm, imm:$cc))*/]>, RegConstraint<"$Sn = $Sd">; - -def VNEGDcc : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, - (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), - IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm", - [/*(set DPR:$Dd, (ARMcneg DPR:$Dn, DPR:$Dm, imm:$cc))*/]>, - RegConstraint<"$Dn = $Dd">; - -def VNEGScc : ASuI<0b11101, 0b11, 0b0001, 0b01, 0, - (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), - IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm", - [/*(set SPR:$Sd, (ARMcneg SPR:$Sn, SPR:$Sm, imm:$cc))*/]>, - RegConstraint<"$Sn = $Sd"> { - // Some single precision VFP instructions may be executed on both NEON and - // VFP pipelines on A8. - let D = VFPNeonA8Domain; -} } // neverHasSideEffects //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index d9dc5cd..df89fad 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -79,7 +79,7 @@ namespace { unsigned Position; MachineBasicBlock::iterator MBBI; bool Merged; - MemOpQueueEntry(int o, unsigned r, bool k, unsigned p, + MemOpQueueEntry(int o, unsigned r, bool k, unsigned p, MachineBasicBlock::iterator i) : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {} }; @@ -174,7 +174,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VLDMSIA; - case ARM_AM::db: return ARM::VLDMSDB; + case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists. } break; case ARM::VSTRS: @@ -182,7 +182,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VSTMSIA; - case ARM_AM::db: return ARM::VSTMSDB; + case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists. } break; case ARM::VLDRD: @@ -190,7 +190,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VLDMDIA; - case ARM_AM::db: return ARM::VLDMDDB; + case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists. } break; case ARM::VSTRD: @@ -198,7 +198,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VSTMDIA; - case ARM_AM::db: return ARM::VSTMDDB; + case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists. 
} break; } @@ -246,13 +246,9 @@ AMSubMode getLoadStoreMultipleSubMode(int Opcode) { case ARM::t2LDMDB_UPD: case ARM::t2STMDB: case ARM::t2STMDB_UPD: - case ARM::VLDMSDB: case ARM::VLDMSDB_UPD: - case ARM::VSTMSDB: case ARM::VSTMSDB_UPD: - case ARM::VLDMDDB: case ARM::VLDMDDB_UPD: - case ARM::VSTMDDB: case ARM::VSTMDDB_UPD: return ARM_AM::db; @@ -312,6 +308,10 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, // VLDM/VSTM do not support DB mode without also updating the base reg. Mode = ARM_AM::db; else if (Offset != 0) { + // Check if this is a supported opcode before we insert instructions to + // calculate a new base register. + if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false; + // If starting offset isn't zero, insert a MI to materialize a new base. // But only do so if it is cost effective, i.e. merging more than two // loads / stores. @@ -354,6 +354,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD); Opcode = getLoadStoreMultipleOpcode(Opcode, Mode); + if (!Opcode) return false; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)) .addReg(Base, getKillRegState(BaseKill)) .addImm(Pred).addReg(PredReg); @@ -453,6 +454,25 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned PRegNum = PMO.isUndef() ? UINT_MAX : getARMRegisterNumbering(PReg); unsigned Count = 1; + unsigned Limit = ~0U; + + // vldm / vstm limit are 32 for S variants, 16 for D variants. + + switch (Opcode) { + default: break; + case ARM::VSTRS: + Limit = 32; + break; + case ARM::VSTRD: + Limit = 16; + break; + case ARM::VLDRD: + Limit = 16; + break; + case ARM::VLDRS: + Limit = 32; + break; + } for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) { int NewOffset = MemOps[i].Offset; @@ -460,13 +480,13 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Reg = MO.getReg(); unsigned RegNum = MO.isUndef() ? UINT_MAX : getARMRegisterNumbering(Reg); - // Register numbers must be in ascending order. For VFP, the registers - // must also be consecutive and there is a limit of 16 double-word - // registers per instruction. + // Register numbers must be in ascending order. For VFP / NEON load and + // store multiples, the registers must also be consecutive and within the + // limit on the number of registers per instruction. 
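The condition on the next lines implements the rule just described: offsets must stay contiguous, VFP registers must be consecutive, and the running count must stay under the per-opcode limit (32 for the S variants, 16 for the D variants). A minimal standalone sketch of the same arithmetic, with simplified stand-in types rather than the real MachineOperand-based code:

#include <vector>

struct MemOp { int Offset; unsigned RegNum; }; // simplified stand-in

// Count how many ops starting at index 0 could fold into one VLDM/VSTM,
// given the per-opcode register limit (32 for S variants, 16 for D).
static unsigned countMergeable(const std::vector<MemOp> &Ops,
                               unsigned Size, unsigned Limit) {
  if (Ops.empty()) return 0;
  unsigned Count = 1;
  int Offset = Ops[0].Offset;
  unsigned PRegNum = Ops[0].RegNum;
  for (unsigned i = 1, e = Ops.size(); i != e; ++i) {
    // Contiguous offsets, consecutive registers, and room under the limit.
    if (Ops[i].Offset != Offset + (int)Size ||
        Ops[i].RegNum != PRegNum + 1 || Count >= Limit)
      break;
    Offset += (int)Size;
    PRegNum = Ops[i].RegNum;
    ++Count;
  }
  return Count;
}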
if (Reg != ARM::SP && NewOffset == Offset + (int)Size && - ((isNotVFP && RegNum > PRegNum) - || ((Size < 8 || Count < 16) && RegNum == PRegNum+1))) { + ((isNotVFP && RegNum > PRegNum) || + ((Count < Limit) && RegNum == PRegNum+1))) { Offset += Size; PRegNum = RegNum; ++Count; @@ -567,14 +587,10 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { case ARM::t2STMIA: case ARM::t2STMDB: case ARM::VLDMSIA: - case ARM::VLDMSDB: case ARM::VSTMSIA: - case ARM::VSTMSDB: return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4; case ARM::VLDMDIA: - case ARM::VLDMDDB: case ARM::VSTMDIA: - case ARM::VSTMDDB: return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8; } } @@ -624,7 +640,6 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } break; case ARM::VLDMSIA: - case ARM::VLDMSDB: switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VLDMSIA_UPD; @@ -632,7 +647,6 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } break; case ARM::VLDMDIA: - case ARM::VLDMDDB: switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VLDMDIA_UPD; @@ -640,7 +654,6 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } break; case ARM::VSTMSIA: - case ARM::VSTMSDB: switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VSTMSIA_UPD; @@ -648,7 +661,6 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } break; case ARM::VSTMDIA: - case ARM::VSTMDDB: switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VSTMDIA_UPD; @@ -749,7 +761,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, MIB.addOperand(MI->getOperand(OpNum)); // Transfer memoperands. - (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); MBB.erase(MBBI); return true; @@ -1275,14 +1287,14 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize, CurrPred, CurrPredReg, Scratch, MemOps, Merges); - // Try folding preceeding/trailing base inc/dec into the generated + // Try folding preceding/trailing base inc/dec into the generated // LDM/STM ops. for (unsigned i = 0, e = Merges.size(); i < e; ++i) if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI)) ++NumMerges; NumMerges += Merges.size(); - // Try folding preceeding/trailing base inc/dec into those load/store + // Try folding preceding/trailing base inc/dec into those load/store // that were not merged to form LDM/STM ops. for (unsigned i = 0; i != NumMemOps; ++i) if (!MemOps[i].Merged) @@ -1292,7 +1304,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // RS may be pointing to an instruction that's deleted. RS->skipTo(prior(MBBI)); } else if (NumMemOps == 1) { - // Try folding preceeding/trailing base inc/dec into the single + // Try folding preceding/trailing base inc/dec into the single // load/store. if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) { ++NumMerges; @@ -1322,7 +1334,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { } /// MergeReturnIntoLDM - If this is a exit BB, try merging the return ops -/// ("bx lr" and "mov pc, lr") into the preceeding stack restore so it +/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it /// directly restore the value of LR into pc. 
/// ldmfd sp!, {..., lr} /// bx lr @@ -1530,15 +1542,9 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, // Then make sure the immediate offset fits. int OffImm = getMemoryOpOffset(Op0); if (isT2) { - if (OffImm < 0) { - if (OffImm < -255) - // Can't fall back to t2LDRi8 / t2STRi8. - return false; - } else { - int Limit = (1 << 8) * Scale; - if (OffImm >= Limit || (OffImm & (Scale-1))) - return false; - } + int Limit = (1 << 8) * Scale; + if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1))) + return false; Offset = OffImm; } else { ARM_AM::AddrOpc AddSub = ARM_AM::add; diff --git a/lib/Target/ARM/ARMMCAsmInfo.cpp b/lib/Target/ARM/ARMMCAsmInfo.cpp index 53edfca..a3f89e9 100644 --- a/lib/Target/ARM/ARMMCAsmInfo.cpp +++ b/lib/Target/ARM/ARMMCAsmInfo.cpp @@ -12,8 +12,16 @@ //===----------------------------------------------------------------------===// #include "ARMMCAsmInfo.h" +#include "llvm/Support/CommandLine.h" + using namespace llvm; +cl::opt<bool> +EnableARMEHABI("arm-enable-ehabi", cl::Hidden, + cl::desc("Generate ARM EHABI tables"), + cl::init(false)); + + static const char *const arm_asm_table[] = { "{r0}", "r0", "{r1}", "r1", @@ -65,4 +73,8 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() { DwarfRequiresFrameSection = false; SupportsDebugInformation = true; + + // Exceptions handling + if (EnableARMEHABI) + ExceptionsType = ExceptionHandling::ARM; } diff --git a/lib/Target/ARM/ARMMCCodeEmitter.cpp b/lib/Target/ARM/ARMMCCodeEmitter.cpp index 6d7b485..10607b1 100644 --- a/lib/Target/ARM/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/ARMMCCodeEmitter.cpp @@ -278,6 +278,15 @@ public: unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRight8Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRight16Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRight32Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRight64Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned NEONThumb2DataIPostEncoder(const MCInst &MI, unsigned EncodedValue) const; unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI, @@ -1201,6 +1210,30 @@ getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op, return MO.getReg(); } +unsigned ARMMCCodeEmitter:: +getShiftRight8Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 8 - MI.getOperand(Op).getImm(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight16Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 16 - MI.getOperand(Op).getImm(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight32Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 32 - MI.getOperand(Op).getImm(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight64Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 64 - MI.getOperand(Op).getImm(); +} + void ARMMCCodeEmitter:: EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups) const { diff --git a/lib/Target/ARM/ARMMCExpr.h b/lib/Target/ARM/ARMMCExpr.h index d42f766..0a2e883 100644 --- a/lib/Target/ARM/ARMMCExpr.h +++ b/lib/Target/ARM/ARMMCExpr.h @@ -60,6 +60,9 @@ public: bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout) const; void AddValueSymbols(MCAssembler *) const; + const MCSection 
*FindAssociatedSection() const { + return getSubExpr()->FindAssociatedSection(); + } static bool classof(const MCExpr *E) { return E->getKind() == MCExpr::Target; diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp index ad51bc1..1cba1ba 100644 --- a/lib/Target/ARM/ARMRegisterInfo.cpp +++ b/lib/Target/ARM/ARMRegisterInfo.cpp @@ -12,26 +12,8 @@ //===----------------------------------------------------------------------===// #include "ARM.h" -#include "ARMAddressingModes.h" #include "ARMBaseInstrInfo.h" -#include "ARMInstrInfo.h" -#include "ARMMachineFunctionInfo.h" #include "ARMRegisterInfo.h" -#include "ARMSubtarget.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLocation.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallVector.h" using namespace llvm; ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii, diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 22d15b5..54bf82a 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -70,6 +70,8 @@ def R4 : ARMReg< 4, "r4">, DwarfRegNum<[4]>; def R5 : ARMReg< 5, "r5">, DwarfRegNum<[5]>; def R6 : ARMReg< 6, "r6">, DwarfRegNum<[6]>; def R7 : ARMReg< 7, "r7">, DwarfRegNum<[7]>; +// These require 32-bit instructions. +let CostPerUse = 1 in { def R8 : ARMReg< 8, "r8">, DwarfRegNum<[8]>; def R9 : ARMReg< 9, "r9">, DwarfRegNum<[9]>; def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>; @@ -78,6 +80,7 @@ def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>; def SP : ARMReg<13, "sp">, DwarfRegNum<[13]>; def LR : ARMReg<14, "lr">, DwarfRegNum<[14]>; def PC : ARMReg<15, "pc">, DwarfRegNum<[15]>; +} // Float registers def S0 : ARMFReg< 0, "s0">; def S1 : ARMFReg< 1, "s1">; @@ -99,33 +102,41 @@ def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">; // Aliases of the F* registers used to hold 64-bit fp values (doubles) let SubRegIndices = [ssub_0, ssub_1] in { -def D0 : ARMReg< 0, "d0", [S0, S1]>; -def D1 : ARMReg< 1, "d1", [S2, S3]>; -def D2 : ARMReg< 2, "d2", [S4, S5]>; -def D3 : ARMReg< 3, "d3", [S6, S7]>; -def D4 : ARMReg< 4, "d4", [S8, S9]>; -def D5 : ARMReg< 5, "d5", [S10, S11]>; -def D6 : ARMReg< 6, "d6", [S12, S13]>; -def D7 : ARMReg< 7, "d7", [S14, S15]>; -def D8 : ARMReg< 8, "d8", [S16, S17]>; -def D9 : ARMReg< 9, "d9", [S18, S19]>; -def D10 : ARMReg<10, "d10", [S20, S21]>; -def D11 : ARMReg<11, "d11", [S22, S23]>; -def D12 : ARMReg<12, "d12", [S24, S25]>; -def D13 : ARMReg<13, "d13", [S26, S27]>; -def D14 : ARMReg<14, "d14", [S28, S29]>; -def D15 : ARMReg<15, "d15", [S30, S31]>; +def D0 : ARMReg< 0, "d0", [S0, S1]>, DwarfRegNum<[256]>; +def D1 : ARMReg< 1, "d1", [S2, S3]>, DwarfRegNum<[257]>; +def D2 : ARMReg< 2, "d2", [S4, S5]>, DwarfRegNum<[258]>; +def D3 : ARMReg< 3, "d3", [S6, S7]>, DwarfRegNum<[259]>; +def D4 : ARMReg< 4, "d4", [S8, S9]>, DwarfRegNum<[260]>; +def D5 : ARMReg< 5, "d5", [S10, S11]>, DwarfRegNum<[261]>; +def D6 : ARMReg< 6, "d6", [S12, S13]>, DwarfRegNum<[262]>; +def D7 : ARMReg< 7, "d7", [S14, S15]>, DwarfRegNum<[263]>; +def D8 : ARMReg< 8, "d8", [S16, S17]>, DwarfRegNum<[264]>; 
+def D9 : ARMReg< 9, "d9", [S18, S19]>, DwarfRegNum<[265]>; +def D10 : ARMReg<10, "d10", [S20, S21]>, DwarfRegNum<[266]>; +def D11 : ARMReg<11, "d11", [S22, S23]>, DwarfRegNum<[267]>; +def D12 : ARMReg<12, "d12", [S24, S25]>, DwarfRegNum<[268]>; +def D13 : ARMReg<13, "d13", [S26, S27]>, DwarfRegNum<[269]>; +def D14 : ARMReg<14, "d14", [S28, S29]>, DwarfRegNum<[270]>; +def D15 : ARMReg<15, "d15", [S30, S31]>, DwarfRegNum<[271]>; } // VFP3 defines 16 additional double registers -def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">; -def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">; -def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">; -def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">; -def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">; -def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">; -def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">; -def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">; +def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>; +def D17 : ARMFReg<17, "d17">, DwarfRegNum<[273]>; +def D18 : ARMFReg<18, "d18">, DwarfRegNum<[274]>; +def D19 : ARMFReg<19, "d19">, DwarfRegNum<[275]>; +def D20 : ARMFReg<20, "d20">, DwarfRegNum<[276]>; +def D21 : ARMFReg<21, "d21">, DwarfRegNum<[277]>; +def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>; +def D23 : ARMFReg<23, "d23">, DwarfRegNum<[279]>; +def D24 : ARMFReg<24, "d24">, DwarfRegNum<[280]>; +def D25 : ARMFReg<25, "d25">, DwarfRegNum<[281]>; +def D26 : ARMFReg<26, "d26">, DwarfRegNum<[282]>; +def D27 : ARMFReg<27, "d27">, DwarfRegNum<[283]>; +def D28 : ARMFReg<28, "d28">, DwarfRegNum<[284]>; +def D29 : ARMFReg<29, "d29">, DwarfRegNum<[285]>; +def D30 : ARMFReg<30, "d30">, DwarfRegNum<[286]>; +def D31 : ARMFReg<31, "d31">, DwarfRegNum<[287]>; // Advanced SIMD (NEON) defines 16 quad-word aliases let SubRegIndices = [dsub_0, dsub_1], diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 82c6735..49fedf6 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -656,19 +656,19 @@ def CortexA9Itineraries : ProcessorItineraries< [1, 1, 1]>, // // Single-precision to Integer Move + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, - InstrStage<1, [A9_MUX0], 0>, - InstrStage<1, [A9_DRegsVFP], 0, Required>, - InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_NPipe]>], + InstrStage<1, [A9_MUX0], 0>], [2, 1]>, // // Double-precision to Integer Move + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, - InstrStage<1, [A9_MUX0], 0>, - InstrStage<1, [A9_DRegsVFP], 0, Required>, - InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_NPipe]>], + InstrStage<1, [A9_MUX0], 0>], [2, 1, 1]>, // // Single-precision FP Load @@ -691,20 +691,22 @@ def CortexA9Itineraries : ProcessorItineraries< [2, 1]>, // // FP Load Multiple + // FIXME: assumes 2 doubles which requires 2 LS cycles. 
InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, // // FP Load Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, // // Single-precision FP Store InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -725,205 +727,206 @@ def CortexA9Itineraries : ProcessorItineraries< [1, 1]>, // // FP Store Multiple + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, // // FP Store Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, // NEON // VLD1 - // FIXME: Conservatively assume insufficent alignment. InstrItinData<IIC_VLD1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, // VLD1x2 InstrItinData<IIC_VLD1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, // VLD1x3 InstrItinData<IIC_VLD1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 1]>, // VLD1x4 InstrItinData<IIC_VLD1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 3, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, // VLD1u InstrItinData<IIC_VLD1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - 
InstrStage<2, [A9_LSUnit]>], - [2, 2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 2, 1]>, // VLD1x2u InstrItinData<IIC_VLD1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 2, 2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 2, 1]>, // VLD1x3u InstrItinData<IIC_VLD1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 2, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, // VLD1x4u InstrItinData<IIC_VLD1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 3, 2, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 2, 1]>, // // VLD1ln InstrItinData<IIC_VLD1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 1, 1, 1]>, // // VLD1lnu InstrItinData<IIC_VLD1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 2, 1, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 2, 1, 1, 1, 1]>, // // VLD1dup InstrItinData<IIC_VLD1dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1]>, // // VLD1dupu InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 2, 1, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1, 1]>, // // VLD2 InstrItinData<IIC_VLD2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, // // VLD2x2 InstrItinData<IIC_VLD2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, 
InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 4, 3, 4, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 1]>, // // VLD2ln InstrItinData<IIC_VLD2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 4, 1, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 1, 1, 1, 1]>, // // VLD2u InstrItinData<IIC_VLD2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 2, 1, 1, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1, 1]>, // // VLD2x2u InstrItinData<IIC_VLD2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 4, 3, 4, 2, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 2, 1]>, // // VLD2lnu InstrItinData<IIC_VLD2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 4, 2, 1, 1, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 2, 1, 1, 1, 1, 1]>, // // VLD2dup InstrItinData<IIC_VLD2dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, // // VLD2dupu InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 2, 1, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1]>, // // VLD3 InstrItinData<IIC_VLD3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, // // VLD3ln InstrItinData<IIC_VLD3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -938,10 +941,10 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VLD3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, 
Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 2, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1]>, // // VLD3lnu InstrItinData<IIC_VLD3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -974,108 +977,108 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VLD4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 5, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 1]>, // // VLD4ln InstrItinData<IIC_VLD4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<11,[A9_DRegsVFP], 0, Reserved>, - InstrStage<5, [A9_NPipe], 0>, - InstrStage<5, [A9_LSUnit]>], - [5, 5, 6, 6, 1, 1, 1, 1, 2, 2]>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>, // // VLD4u InstrItinData<IIC_VLD4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 5, 2, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 2, 1]>, // // VLD4lnu InstrItinData<IIC_VLD4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<11,[A9_DRegsVFP], 0, Reserved>, - InstrStage<5, [A9_NPipe], 0>, - InstrStage<5, [A9_LSUnit]>], - [5, 5, 6, 6, 2, 1, 1, 1, 1, 1, 2, 2]>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>, // // VLD4dup InstrItinData<IIC_VLD4dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 3, 4, 4, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 1]>, // // VLD4dupu InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 3, 4, 4, 2, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 2, 1, 1]>, // // VST1 InstrItinData<IIC_VST1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, // // VST1x2 InstrItinData<IIC_VST1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + 
InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, // // VST1x3 InstrItinData<IIC_VST1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2]>, // // VST1x4 InstrItinData<IIC_VST1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2, 2]>, // // VST1u InstrItinData<IIC_VST1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1]>, // // VST1x2u InstrItinData<IIC_VST1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1]>, // // VST1x3u @@ -1083,44 +1086,44 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2]>, // // VST1x4u InstrItinData<IIC_VST1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2, 2]>, // // VST1ln InstrItinData<IIC_VST1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, // // VST1lnu InstrItinData<IIC_VST1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1]>, // // VST2 InstrItinData<IIC_VST2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 
1, 1, 1]>, // // VST2x2 @@ -1136,9 +1139,9 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VST2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1]>, // // VST2x2u @@ -1154,36 +1157,36 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VST2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, // // VST2lnu InstrItinData<IIC_VST2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1]>, // // VST3 InstrItinData<IIC_VST3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2]>, // // VST3u InstrItinData<IIC_VST3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2]>, // // VST3ln @@ -1208,36 +1211,36 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VST4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2, 2]>, // // VST4u InstrItinData<IIC_VST4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2, 2]>, // // VST4ln InstrItinData<IIC_VST4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2, 2]>, // // VST4lnu InstrItinData<IIC_VST4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, 
[A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2, 2]>, // diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index 2b9202b..aa1e398 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -35,7 +35,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, // This requires 4-byte alignment. if ((Align & 3) != 0) return SDValue(); - // This requires the copy size to be a constant, preferrably + // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); if (!ConstantSize) diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 1465984..c6f266b 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -38,6 +38,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, , ARMFPUType(None) , UseNEONForSinglePrecisionFP(false) , SlowFPVMLx(false) + , HasVMLxForwarding(false) , SlowFPBrcc(false) , IsThumb(isT) , ThumbMode(Thumb1) @@ -51,6 +52,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, , HasT2ExtractPack(false) , HasDataBarrier(false) , Pref32BitThumb(false) + , AvoidCPSRPartialUpdate(false) , HasMPExtension(false) , FPOnlySP(false) , AllowsUnalignedMem(false) diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 76c1c3f..0271c87 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -61,6 +61,10 @@ protected: /// whether the FP VML[AS] instructions are slow (if so, don't use them). bool SlowFPVMLx; + /// HasVMLxForwarding - If true, NEON has special multiplier accumulator + /// forwarding to allow mul + mla being issued back to back. + bool HasVMLxForwarding; + /// SlowFPBrcc - True if floating point compare + branch is slow. bool SlowFPBrcc; @@ -106,6 +110,11 @@ protected: /// over 16-bit ones. bool Pref32BitThumb; + /// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions + /// that partially update CPSR and add false dependency on the previous + /// CPSR setting instruction. + bool AvoidCPSRPartialUpdate; + /// HasMPExtension - True if the subtarget supports Multiprocessing /// extension (ARMv7 only). 
bool HasMPExtension; @@ -182,15 +191,19 @@ protected: bool hasT2ExtractPack() const { return HasT2ExtractPack; } bool hasDataBarrier() const { return HasDataBarrier; } bool useFPVMLx() const { return !SlowFPVMLx; } + bool hasVMLxForwarding() const { return HasVMLxForwarding; } bool isFPBrccSlow() const { return SlowFPBrcc; } bool isFPOnlySP() const { return FPOnlySP; } bool prefers32BitThumb() const { return Pref32BitThumb; } + bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; } bool hasMPExtension() const { return HasMPExtension; } bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } - bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; } + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } bool isTargetELF() const { return !isTargetDarwin(); } bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 0ee773b..29aa4f7 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -22,16 +22,13 @@ #include "llvm/Target/TargetRegistry.h" using namespace llvm; -static cl::opt<bool>ExpandMLx("expand-fp-mlx", cl::init(false), cl::Hidden); - static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: + + if (TheTriple.isOSDarwin()) return new ARMMCAsmInfoDarwin(); - default: - return new ARMELFMCAsmInfo(); - } + + return new ARMELFMCAsmInfo(); } // This is duplicated code. Refactor this. @@ -41,17 +38,17 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, MCCodeEmitter *Emitter, bool RelaxAll, bool NoExecStack) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: + + if (TheTriple.isOSWindows()) { llvm_unreachable("ARM does not support Windows COFF format"); return NULL; - default: - return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); } + + return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); } extern "C" void LLVMInitializeARMTarget() { @@ -86,8 +83,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, : LLVMTargetMachine(T, TT), Subtarget(TT, FS, isThumb), JITInfo(), - InstrItins(Subtarget.getInstrItineraryData()) -{ + InstrItins(Subtarget.getInstrItineraryData()) { DefRelocModel = getRelocationModel(); } @@ -149,8 +145,7 @@ bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM, // FIXME: temporarily disabling load / store optimization pass for Thumb1. 
if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) PM.add(createARMLoadStoreOptimizationPass(true)); - if (ExpandMLx && - OptLevel != CodeGenOpt::None && Subtarget.hasVFP2()) + if (OptLevel != CodeGenOpt::None && Subtarget.isCortexA9()) PM.add(createMLxExpansionPass()); return true; diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp index 7535da5..19defa1 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -36,8 +36,9 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, ELF::SHF_WRITE | ELF::SHF_ALLOC, SectionKind::getDataRel()); + LSDASection = NULL; } - + AttributesSection = getContext().getELFSection(".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 129af20..29ecc18 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -29,15 +29,6 @@ #include "llvm/ADT/Twine.h" using namespace llvm; -/// Shift types used for register controlled shifts in ARM memory addressing. -enum ShiftType { - Lsl, - Lsr, - Asr, - Ror, - Rrx -}; - namespace { class ARMOperand; @@ -55,8 +46,10 @@ class ARMAsmParser : public TargetAsmParser { int TryParseRegister(); virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); bool TryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &); + bool TryParseShiftRegister(SmallVectorImpl<MCParsedAsmOperand*> &); bool ParseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &); - bool ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &); + bool ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &, + ARMII::AddrMode AddrMode); bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic); bool ParsePrefix(ARMMCExpr::VariantKind &RefKind); const MCExpr *ApplyPrefixToExpr(const MCExpr *E, @@ -65,13 +58,14 @@ class ARMAsmParser : public TargetAsmParser { bool ParseMemoryOffsetReg(bool &Negative, bool &OffsetRegShifted, - enum ShiftType &ShiftType, + enum ARM_AM::ShiftOpc &ShiftType, const MCExpr *&ShiftAmount, const MCExpr *&Offset, bool &OffsetIsReg, int &OffsetRegNum, SMLoc &E); - bool ParseShift(enum ShiftType &St, const MCExpr *&ShiftAmount, SMLoc &E); + bool ParseShift(enum ARM_AM::ShiftOpc &St, + const MCExpr *&ShiftAmount, SMLoc &E); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveThumb(SMLoc L); bool ParseDirectiveThumbFunc(SMLoc L); @@ -102,10 +96,25 @@ class ARMAsmParser : public TargetAsmParser { SmallVectorImpl<MCParsedAsmOperand*>&); OperandMatchResultTy tryParseMSRMaskOperand( SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy tryParseMemMode2Operand( + SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy tryParseMemMode3Operand( + SmallVectorImpl<MCParsedAsmOperand*>&); + + // Asm Match Converter Methods + bool CvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool CvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool CvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool CvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); public: ARMAsmParser(const Target &T, MCAsmParser &_Parser, TargetMachine &_TM) : TargetAsmParser(T), Parser(_Parser), TM(_TM) { + MCAsmParserExtension::Initialize(_Parser); // 
Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures( &TM.getSubtarget<ARMSubtarget>())); @@ -136,6 +145,7 @@ class ARMOperand : public MCParsedAsmOperand { RegisterList, DPRRegisterList, SPRRegisterList, + Shifter, Token } Kind; @@ -178,13 +188,14 @@ class ARMOperand : public MCParsedAsmOperand { /// Combined record for all forms of ARM address expressions. struct { + ARMII::AddrMode AddrMode; unsigned BaseRegNum; union { unsigned RegNum; ///< Offset register num, when OffsetIsReg. const MCExpr *Value; ///< Offset value, when !OffsetIsReg. } Offset; const MCExpr *ShiftAmount; // used when OffsetRegShifted is true - enum ShiftType ShiftType; // used when OffsetRegShifted is true + enum ARM_AM::ShiftOpc ShiftType; // used when OffsetRegShifted is true unsigned OffsetRegShifted : 1; // only used when OffsetIsReg is true unsigned Preindexed : 1; unsigned Postindexed : 1; @@ -192,6 +203,11 @@ class ARMOperand : public MCParsedAsmOperand { unsigned Negative : 1; // only used when OffsetIsReg is true unsigned Writeback : 1; } Mem; + + struct { + ARM_AM::ShiftOpc ShiftTy; + unsigned RegNum; + } Shift; }; ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} @@ -234,6 +250,10 @@ public: break; case ProcIFlags: IFlags = o.IFlags; + break; + case Shifter: + Shift = o.Shift; + break; } } @@ -290,7 +310,9 @@ public: /// @name Memory Operand Accessors /// @{ - + ARMII::AddrMode getMemAddrMode() const { + return Mem.AddrMode; + } unsigned getMemBaseRegNum() const { return Mem.BaseRegNum; } @@ -310,7 +332,7 @@ public: assert(Mem.OffsetIsReg && Mem.OffsetRegShifted && "Invalid access!"); return Mem.ShiftAmount; } - enum ShiftType getMemShiftType() const { + enum ARM_AM::ShiftOpc getMemShiftType() const { assert(Mem.OffsetIsReg && Mem.OffsetRegShifted && "Invalid access!"); return Mem.ShiftType; } @@ -334,6 +356,52 @@ public: bool isToken() const { return Kind == Token; } bool isMemBarrierOpt() const { return Kind == MemBarrierOpt; } bool isMemory() const { return Kind == Memory; } + bool isShifter() const { return Kind == Shifter; } + bool isMemMode2() const { + if (getMemAddrMode() != ARMII::AddrMode2) + return false; + + if (getMemOffsetIsReg()) + return true; + + if (getMemNegative() && + !(getMemPostindexed() || getMemPreindexed())) + return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + if (!CE) return false; + int64_t Value = CE->getValue(); + + // The offset must be in the range 0-4095 (imm12). + if (Value > 4095 || Value < -4095) + return false; + + return true; + } + bool isMemMode3() const { + if (getMemAddrMode() != ARMII::AddrMode3) + return false; + + if (getMemOffsetIsReg()) { + if (getMemOffsetRegShifted()) + return false; // No shift with offset reg allowed + return true; + } + + if (getMemNegative() && + !(getMemPostindexed() || getMemPreindexed())) + return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + if (!CE) return false; + int64_t Value = CE->getValue(); + + // The offset must be in the range 0-255 (imm8). 
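The range check that follows accepts both signs even though the immediate field itself is only eight bits: the magnitude goes into imm8 and the sign travels separately as the add/sub flag, which is how addMemMode3Operands later splits it via ARM_AM::getAM3Opc. A small illustrative helper under that assumption (relying on the in-tree ARMAddressingModes.h; not part of the patch):

#include "ARMAddressingModes.h" // in-tree header providing ARM_AM
#include <cassert>
#include <cstdlib>

// Split a validated mode-3 offset in [-255, 255] into an add/sub flag
// plus an 8-bit magnitude, packed together with the index mode.
static unsigned encodeAM3Offset(int64_t Value, unsigned IdxMode) {
  assert(Value <= 255 && Value >= -255 && "offset must fit in imm8");
  ARM_AM::AddrOpc Opc = Value < 0 ? ARM_AM::sub : ARM_AM::add;
  return ARM_AM::getAM3Opc(Opc, (unsigned char)std::llabs(Value), IdxMode);
}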
+ if (Value > 255 || Value < -255) + return false; + + return true; + } bool isMemMode5() const { if (!isMemory() || getMemOffsetIsReg() || getMemWriteback() || getMemNegative()) @@ -346,6 +414,23 @@ public: int64_t Value = CE->getValue(); return ((Value & 0x3) == 0 && Value <= 1020 && Value >= -1020); } + bool isMemMode7() const { + if (!isMemory() || + getMemPreindexed() || + getMemPostindexed() || + getMemOffsetIsReg() || + getMemNegative() || + getMemWriteback()) + return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + if (!CE) return false; + + if (CE->getValue()) + return false; + + return true; + } bool isMemModeRegThumb() const { if (!isMemory() || !getMemOffsetIsReg() || getMemWriteback()) return false; @@ -402,6 +487,12 @@ public: Inst.addOperand(MCOperand::CreateReg(getReg())); } + void addShifterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm( + ARM_AM::getSORegOpc(Shift.ShiftTy, 0))); + } + void addRegListOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const SmallVectorImpl<unsigned> &RegList = getRegList(); @@ -428,6 +519,88 @@ public: Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt()))); } + void addMemMode7Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && isMemMode7() && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum())); + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + (void)CE; + assert((CE || CE->getValue() == 0) && + "No offset operand support in mode 7"); + } + + void addMemMode2Operands(MCInst &Inst, unsigned N) const { + assert(isMemMode2() && "Invalid mode or number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum())); + unsigned IdxMode = (getMemPreindexed() | getMemPostindexed() << 1); + + if (getMemOffsetIsReg()) { + Inst.addOperand(MCOperand::CreateReg(getMemOffsetRegNum())); + + ARM_AM::AddrOpc AMOpc = getMemNegative() ? ARM_AM::sub : ARM_AM::add; + ARM_AM::ShiftOpc ShOpc = ARM_AM::no_shift; + int64_t ShiftAmount = 0; + + if (getMemOffsetRegShifted()) { + ShOpc = getMemShiftType(); + const MCConstantExpr *CE = + dyn_cast<MCConstantExpr>(getMemShiftAmount()); + ShiftAmount = CE->getValue(); + } + + Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM2Opc(AMOpc, ShiftAmount, + ShOpc, IdxMode))); + return; + } + + // Create a operand placeholder to always yield the same number of operands. + Inst.addOperand(MCOperand::CreateReg(0)); + + // FIXME: #-0 is encoded differently than #0. Does the parser preserve + // the difference? + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + assert(CE && "Non-constant mode 2 offset operand!"); + int64_t Offset = CE->getValue(); + + if (Offset >= 0) + Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM2Opc(ARM_AM::add, + Offset, ARM_AM::no_shift, IdxMode))); + else + Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM2Opc(ARM_AM::sub, + -Offset, ARM_AM::no_shift, IdxMode))); + } + + void addMemMode3Operands(MCInst &Inst, unsigned N) const { + assert(isMemMode3() && "Invalid mode or number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum())); + unsigned IdxMode = (getMemPreindexed() | getMemPostindexed() << 1); + + if (getMemOffsetIsReg()) { + Inst.addOperand(MCOperand::CreateReg(getMemOffsetRegNum())); + + ARM_AM::AddrOpc AMOpc = getMemNegative() ? 
+      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM3Opc(AMOpc, 0,
+                                                             IdxMode)));
+      return;
+    }
+
+    // Create an operand placeholder to always yield the same number of
+    // operands.
+    Inst.addOperand(MCOperand::CreateReg(0));
+
+    // FIXME: #-0 is encoded differently than #0. Does the parser preserve
+    // the difference?
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+    assert(CE && "Non-constant mode 3 offset operand!");
+    int64_t Offset = CE->getValue();
+
+    if (Offset >= 0)
+      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM3Opc(ARM_AM::add,
+                                                             Offset, IdxMode)));
+    else
+      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM3Opc(ARM_AM::sub,
+                                                             -Offset, IdxMode)));
+  }
+
   void addMemMode5Operands(MCInst &Inst, unsigned N) const {
     assert(N == 2 && isMemMode5() && "Invalid number of operands!");
@@ -525,6 +698,15 @@ public:
     return Op;
   }

+  static ARMOperand *CreateShifter(ARM_AM::ShiftOpc ShTy,
+                                   SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(Shifter);
+    Op->Shift.ShiftTy = ShTy;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
   static ARMOperand *
   CreateRegList(const SmallVectorImpl<std::pair<unsigned, SMLoc> > &Regs,
                 SMLoc StartLoc, SMLoc EndLoc) {
@@ -553,9 +735,10 @@ public:
     return Op;
   }

-  static ARMOperand *CreateMem(unsigned BaseRegNum, bool OffsetIsReg,
-                               const MCExpr *Offset, int OffsetRegNum,
-                               bool OffsetRegShifted, enum ShiftType ShiftType,
+  static ARMOperand *CreateMem(ARMII::AddrMode AddrMode, unsigned BaseRegNum,
+                               bool OffsetIsReg, const MCExpr *Offset,
+                               int OffsetRegNum, bool OffsetRegShifted,
+                               enum ARM_AM::ShiftOpc ShiftType,
                                const MCExpr *ShiftAmount, bool Preindexed,
                                bool Postindexed, bool Negative, bool Writeback,
                                SMLoc S, SMLoc E) {
@@ -571,6 +754,7 @@ public:
            "Cannot have expression offset and register offset!");

     ARMOperand *Op = new ARMOperand(Memory);
+    Op->Mem.AddrMode = AddrMode;
     Op->Mem.BaseRegNum = BaseRegNum;
     Op->Mem.OffsetIsReg = OffsetIsReg;
     if (OffsetIsReg)
@@ -642,7 +826,8 @@ void ARMOperand::dump(raw_ostream &OS) const {
     break;
   case Memory:
     OS << "<memory "
-       << "base:" << getMemBaseRegNum();
+       << "am:" << ARMII::AddrModeToString(getMemAddrMode())
+       << " base:" << getMemBaseRegNum();
     if (getMemOffsetIsReg()) {
       OS << " offset:<register " << getMemOffsetRegNum();
       if (getMemOffsetRegShifted()) {
@@ -676,6 +861,9 @@ void ARMOperand::dump(raw_ostream &OS) const {
   case Register:
     OS << "<register " << getReg() << ">";
     break;
+  case Shifter:
+    OS << "<shifter " << getShiftOpcStr(Shift.ShiftTy) << ">";
+    break;
   case RegisterList:
   case DPRRegisterList:
   case SPRRegisterList: {
@@ -738,6 +926,42 @@ int ARMAsmParser::TryParseRegister() {
   return RegNum;
 }

+/// Try to parse a shift operation. The token must be an Identifier when
+/// called, and if it names a shift type the tokens are eaten, register and
+/// shifter operands are created, and false is returned. Otherwise return
+/// true.
+///
+bool ARMAsmParser::TryParseShiftRegister(
+  SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+  std::string upperCase = Tok.getString().str();
+  std::string lowerCase = LowercaseString(upperCase);
+  ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase)
+    .Case("lsl", ARM_AM::lsl)
+    .Case("lsr", ARM_AM::lsr)
+    .Case("asr", ARM_AM::asr)
+    .Case("ror", ARM_AM::ror)
+    .Case("rrx", ARM_AM::rrx)
+    .Default(ARM_AM::no_shift);
+
+  if (ShiftTy == ARM_AM::no_shift)
+    return true;
+
+  Parser.Lex(); // Eat the shift type token.
+  int RegNum = TryParseRegister();
+  if (RegNum == -1)
+    return Error(Parser.getTok().getLoc(), "register expected");
+
+  Operands.push_back(ARMOperand::CreateReg(RegNum, S, Parser.getTok().getLoc()));
+  Operands.push_back(ARMOperand::CreateShifter(ShiftTy,
+                                               S, Parser.getTok().getLoc()));
+
+  return false;
+}
+
+
 /// Try to parse a register name. The token must be an Identifier when called.
 /// If it's a register, an AsmOperand is created. Another AsmOperand is created
 /// if there is a "writeback". 'true' if it's not a register.
@@ -1046,13 +1270,96 @@ tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   return MatchOperand_Success;
 }

+/// tryParseMemMode2Operand - Try to parse memory addressing mode 2 operand.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+tryParseMemMode2Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a \"[\"");
+
+  if (ParseMemory(Operands, ARMII::AddrMode2))
+    return MatchOperand_NoMatch;
+
+  return MatchOperand_Success;
+}
+
+/// tryParseMemMode3Operand - Try to parse memory addressing mode 3 operand.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+tryParseMemMode3Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a \"[\"");
+
+  if (ParseMemory(Operands, ARMII::AddrMode3))
+    return MatchOperand_NoMatch;
+
+  return MatchOperand_Success;
+}
+
+/// CvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+CvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+                           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  ((ARMOperand*)Operands[3])->addMemMode2Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// CvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+CvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+                           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addMemMode2Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// CvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+CvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+                           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  ((ARMOperand*)Operands[3])->addMemMode3Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// CvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+CvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+                           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addMemMode3Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
 /// Parse an ARM memory expression, return false if successful else return true
 /// or an error. The first token must be a '[' when called.
 ///
 /// TODO Only preindexing and postindexing addressing are started, unindexed
 /// with option, etc are still to do.
 bool ARMAsmParser::
-ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+            ARMII::AddrMode AddrMode = ARMII::AddrModeNone) {
   SMLoc S, E;
   assert(Parser.getTok().is(AsmToken::LBrac) &&
          "Token is not a Left Bracket");
@@ -1083,7 +1390,7 @@ ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   ARMOperand *WBOp = 0;
   int OffsetRegNum = -1;
   bool OffsetRegShifted = false;
-  enum ShiftType ShiftType = Lsl;
+  enum ARM_AM::ShiftOpc ShiftType = ARM_AM::lsl;
   const MCExpr *ShiftAmount = 0;
   const MCExpr *Offset = 0;
@@ -1106,10 +1413,17 @@ ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {

     const AsmToken &ExclaimTok = Parser.getTok();
     if (ExclaimTok.is(AsmToken::Exclaim)) {
+      // No addrmode3 instruction uses "!".
+      if (AddrMode == ARMII::AddrMode3)
+        return true;
+
       WBOp = ARMOperand::CreateToken(ExclaimTok.getString(),
                                      ExclaimTok.getLoc());
       Writeback = true;
       Parser.Lex(); // Eat exclaim token
+    } else { // In addressing mode 2, pre-indexed mode always ends with "!".
+      if (AddrMode == ARMII::AddrMode2)
+        Preindexed = false;
     }
   } else {
     // The "[Rn" we have so far was not followed by a comma.
@@ -1143,13 +1457,17 @@ ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   if (!OffsetIsReg) {
     if (!Offset)
       Offset = MCConstantExpr::Create(0, getContext());
+  } else {
+    if (AddrMode == ARMII::AddrMode3 && OffsetRegShifted) {
+      Error(E, "shift amount not supported");
+      return true;
+    }
   }

-  Operands.push_back(ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset,
-                                           OffsetRegNum, OffsetRegShifted,
-                                           ShiftType, ShiftAmount, Preindexed,
-                                           Postindexed, Negative, Writeback,
-                                           S, E));
+  Operands.push_back(ARMOperand::CreateMem(AddrMode, BaseRegNum, OffsetIsReg,
+                                           Offset, OffsetRegNum, OffsetRegShifted,
+                                           ShiftType, ShiftAmount, Preindexed,
+                                           Postindexed, Negative, Writeback, S, E));
   if (WBOp)
     Operands.push_back(WBOp);
@@ -1165,7 +1483,7 @@ ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
 /// we return false on success or an error otherwise.
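ParseMemory now threads the address mode down into the operand, and the addMemMode{2,3}Operands methods above fold the pre/post-indexed flags into the index-mode value that ARM_AM::getAM2Opc/getAM3Opc receive. A minimal standalone sketch of that folding, in plain C++ rather than the LLVM types (illustrative only):

    #include <cassert>

    // Index-mode values as packed by addMemMode2Operands/addMemMode3Operands:
    //   0 = offset form      ldr r0, [r1, #4]
    //   1 = pre-indexed      ldr r0, [r1, #4]!
    //   2 = post-indexed     ldr r0, [r1], #4
    static unsigned encodeIdxMode(bool Preindexed, bool Postindexed) {
      assert(!(Preindexed && Postindexed) && "at most one indexing form");
      return (unsigned)Preindexed | ((unsigned)Postindexed << 1);
    }

    int main() {
      assert(encodeIdxMode(false, false) == 0);
      assert(encodeIdxMode(true, false) == 1);
      assert(encodeIdxMode(false, true) == 2);
    }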
bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative, bool &OffsetRegShifted, - enum ShiftType &ShiftType, + enum ARM_AM::ShiftOpc &ShiftType, const MCExpr *&ShiftAmount, const MCExpr *&Offset, bool &OffsetIsReg, @@ -1226,28 +1544,28 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative, /// ( lsl | lsr | asr | ror ) , # shift_amount /// rrx /// and returns true if it parses a shift otherwise it returns false. -bool ARMAsmParser::ParseShift(ShiftType &St, const MCExpr *&ShiftAmount, - SMLoc &E) { +bool ARMAsmParser::ParseShift(ARM_AM::ShiftOpc &St, + const MCExpr *&ShiftAmount, SMLoc &E) { const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) return true; StringRef ShiftName = Tok.getString(); if (ShiftName == "lsl" || ShiftName == "LSL") - St = Lsl; + St = ARM_AM::lsl; else if (ShiftName == "lsr" || ShiftName == "LSR") - St = Lsr; + St = ARM_AM::lsr; else if (ShiftName == "asr" || ShiftName == "ASR") - St = Asr; + St = ARM_AM::asr; else if (ShiftName == "ror" || ShiftName == "ROR") - St = Ror; + St = ARM_AM::ror; else if (ShiftName == "rrx" || ShiftName == "RRX") - St = Rrx; + St = ARM_AM::rrx; else return true; Parser.Lex(); // Eat shift type token. // Rrx stands alone. - if (St == Rrx) + if (St == ARM_AM::rrx) return false; // Otherwise, there must be a '#' and a shift amount. @@ -1286,6 +1604,9 @@ bool ARMAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, case AsmToken::Identifier: if (!TryParseRegisterWithWriteBack(Operands)) return false; + if (!TryParseShiftRegister(Operands)) + return false; + // Fall though for the Identifier case that is not a register or a // special name. diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 78d73d3..bdce2c4 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -18,6 +18,7 @@ #include "ARMDisassembler.h" #include "ARMDisassemblerCore.h" +#include "llvm/ADT/OwningPtr.h" #include "llvm/MC/EDInstInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/Target/TargetRegistry.h" @@ -94,6 +95,9 @@ static unsigned decodeARMInstruction(uint32_t &insn) { // As a result, the decoder fails to deocode USAT properly. if (slice(insn, 27, 21) == 0x37 && slice(insn, 5, 4) == 1) return ARM::USAT; + // As a result, the decoder fails to deocode UQADD16 properly. + if (slice(insn, 27, 20) == 0x66 && slice(insn, 7, 4) == 1) + return ARM::UQADD16; // Ditto for ADDSrs, which is a super-instruction for A8.6.7 & A8.6.8. // As a result, the decoder fails to decode UMULL properly. @@ -280,6 +284,24 @@ static unsigned T2Morph2LoadLiteral(unsigned Opcode) { } } +// Helper function for special case handling of PLD (literal) and friends. +// See A8.6.117 T1 & T2 and friends for why we morphed the opcode +// before returning it. +static unsigned T2Morph2PLDLiteral(unsigned Opcode) { + switch (Opcode) { + default: + return Opcode; // Return unmorphed opcode. + + case ARM::t2PLDi8: case ARM::t2PLDs: + case ARM::t2PLDWi12: case ARM::t2PLDWi8: + case ARM::t2PLDWs: + return ARM::t2PLDi12; + + case ARM::t2PLIi8: case ARM::t2PLIs: + return ARM::t2PLIi12; + } +} + /// decodeThumbSideEffect is a decorator function which can potentially twiddle /// the instruction or morph the returned opcode under Thumb2. 
/// @@ -330,12 +352,27 @@ static unsigned decodeThumbSideEffect(bool IsThumb2, unsigned &insn) { } // --------- Transform End Marker --------- + unsigned unmorphed = decodeThumbInstruction(insn); + // See, for example, A6.3.7 Load word: Table A6-18 Load word. // See A8.6.57 T3, T4 & A8.6.60 T2 and friends for why we morphed the opcode // before returning it to our caller. if (op1 == 3 && slice(op2, 6, 5) == 0 && slice(op2, 0, 0) == 1 - && slice(insn, 19, 16) == 15) - return T2Morph2LoadLiteral(decodeThumbInstruction(insn)); + && slice(insn, 19, 16) == 15) { + unsigned morphed = T2Morph2LoadLiteral(unmorphed); + if (morphed != unmorphed) + return morphed; + } + + // See, for example, A8.6.117 PLD,PLDW (immediate) T1 & T2, and friends for + // why we morphed the opcode before returning it to our caller. + if (slice(insn, 31, 25) == 0x7C && slice(insn, 15, 12) == 0xF + && slice(insn, 22, 22) == 0 && slice(insn, 20, 20) == 1 + && slice(insn, 19, 16) == 15) { + unsigned morphed = T2Morph2PLDLiteral(unmorphed); + if (morphed != unmorphed) + return morphed; + } // One last check for NEON/VFP instructions. if ((op1 == 1 || op1 == 3) && slice(op2, 6, 6) == 1) @@ -375,21 +412,23 @@ bool ARMDisassembler::getInstruction(MCInst &MI, Size = 4; DEBUG({ - errs() << "Opcode=" << Opcode << " Name=" << ARMUtils::OpcodeName(Opcode) + errs() << "\nOpcode=" << Opcode << " Name=" <<ARMUtils::OpcodeName(Opcode) << " Format=" << stringForARMFormat(Format) << '(' << (int)Format << ")\n"; showBitVector(errs(), insn); }); - ARMBasicMCBuilder *Builder = CreateMCBuilder(Opcode, Format); + OwningPtr<ARMBasicMCBuilder> Builder(CreateMCBuilder(Opcode, Format)); if (!Builder) return false; + Builder->setupBuilderForSymbolicDisassembly(getLLVMOpInfoCallback(), + getDisInfoBlock(), getMCContext(), + Address); + if (!Builder->Build(MI, insn)) return false; - delete Builder; - return true; } @@ -398,7 +437,7 @@ bool ThumbDisassembler::getInstruction(MCInst &MI, const MemoryObject &Region, uint64_t Address, raw_ostream &os) const { - // The Thumb instruction stream is a sequence of halhwords. + // The Thumb instruction stream is a sequence of halfwords. // This represents the first halfword as well as the machine instruction // passed to decodeThumbInstruction(). For 16-bit Thumb instruction, the top @@ -463,17 +502,19 @@ bool ThumbDisassembler::getInstruction(MCInst &MI, showBitVector(errs(), insn); }); - ARMBasicMCBuilder *Builder = CreateMCBuilder(Opcode, Format); + OwningPtr<ARMBasicMCBuilder> Builder(CreateMCBuilder(Opcode, Format)); if (!Builder) return false; Builder->SetSession(const_cast<Session *>(&SO)); + Builder->setupBuilderForSymbolicDisassembly(getLLVMOpInfoCallback(), + getDisInfoBlock(), getMCContext(), + Address); + if (!Builder->Build(MI, insn)) return false; - delete Builder; - return true; } diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp index bac68dd..642829c 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp @@ -17,6 +17,7 @@ #include "ARMDisassemblerCore.h" #include "ARMAddressingModes.h" +#include "ARMMCExpr.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -82,10 +83,28 @@ const char *ARMUtils::OpcodeName(unsigned Opcode) { // FIXME: Auto-gened? static unsigned getRegisterEnum(BO B, unsigned RegClassID, unsigned RawRegister) { - // For this purpose, we can treat rGPR as if it were GPR. 
-  if (RegClassID == ARM::rGPRRegClassID) RegClassID = ARM::GPRRegClassID;
+  if (RegClassID == ARM::rGPRRegClassID) {
+    // Check for the register numbers 13 and 15, which are not permitted for
+    // many Thumb register specifiers.
+    if (RawRegister == 13 || RawRegister == 15) {
+      B->SetErr(-1);
+      return 0;
+    }
+    // For this purpose, we can treat rGPR as if it were GPR.
+    RegClassID = ARM::GPRRegClassID;
+  }
   // See also decodeNEONRd(), decodeNEONRn(), decodeNEONRm().
+  // A7.3 register encoding
+  // Qd -> bit[12] == 0
+  // Qn -> bit[16] == 0
+  // Qm -> bit[0] == 0
+  //
+  // If one of these bits is 1, the instruction is UNDEFINED.
+  if (RegClassID == ARM::QPRRegClassID && slice(RawRegister, 0, 0) == 1) {
+    B->SetErr(-1);
+    return 0;
+  }
   unsigned RegNum =
     RegClassID == ARM::QPRRegClassID ? RawRegister >> 1 : RawRegister;
@@ -497,14 +516,66 @@ static bool DisassemblePseudo(MCInst &MI, unsigned Opcode, uint32_t insn,
   return false;
 }

+// A8.6.94 MLA
+// if d == 15 || n == 15 || m == 15 || a == 15 then UNPREDICTABLE;
+//
+// A8.6.105 MUL
+// if d == 15 || n == 15 || m == 15 then UNPREDICTABLE;
+//
+// A8.6.246 UMULL
+// if dLo == 15 || dHi == 15 || n == 15 || m == 15 then UNPREDICTABLE;
+// if dHi == dLo then UNPREDICTABLE;
+static bool BadRegsMulFrm(unsigned Opcode, uint32_t insn) {
+  unsigned R19_16 = slice(insn, 19, 16);
+  unsigned R15_12 = slice(insn, 15, 12);
+  unsigned R11_8 = slice(insn, 11, 8);
+  unsigned R3_0 = slice(insn, 3, 0);
+  switch (Opcode) {
+  default:
+    // Did we miss an opcode?
+    DEBUG(errs() << "BadRegsMulFrm: unexpected opcode!");
+    return false;
+  case ARM::MLA: case ARM::MLS: case ARM::SMLABB: case ARM::SMLABT:
+  case ARM::SMLATB: case ARM::SMLATT: case ARM::SMLAWB: case ARM::SMLAWT:
+  case ARM::SMMLA: case ARM::SMMLAR: case ARM::SMMLS: case ARM::SMMLSR:
+  case ARM::USADA8:
+    if (R19_16 == 15 || R15_12 == 15 || R11_8 == 15 || R3_0 == 15)
+      return true;
+    return false;
+  case ARM::MUL: case ARM::SMMUL: case ARM::SMMULR:
+  case ARM::SMULBB: case ARM::SMULBT: case ARM::SMULTB: case ARM::SMULTT:
+  case ARM::SMULWB: case ARM::SMULWT: case ARM::SMUAD: case ARM::SMUADX:
+  // A8.6.167 SMLAD & A8.6.172 SMLSD
+  case ARM::SMLAD: case ARM::SMLADX: case ARM::SMLSD: case ARM::SMLSDX:
+  case ARM::USAD8:
+    if (R19_16 == 15 || R11_8 == 15 || R3_0 == 15)
+      return true;
+    return false;
+  case ARM::SMLAL: case ARM::SMULL: case ARM::UMAAL: case ARM::UMLAL:
+  case ARM::UMULL:
+  case ARM::SMLALBB: case ARM::SMLALBT: case ARM::SMLALTB: case ARM::SMLALTT:
+  case ARM::SMLALD: case ARM::SMLALDX: case ARM::SMLSLD: case ARM::SMLSLDX:
+    if (R19_16 == 15 || R15_12 == 15 || R11_8 == 15 || R3_0 == 15)
+      return true;
+    if (R19_16 == R15_12)
+      return true;
+    return false;
+  }
+}

 // Multiply Instructions.
-// MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB, SMLAWT, SMMLA, SMMLS:
+// MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB, SMLAWT, SMMLA, SMMLAR,
+// SMMLS, SMMLSR, SMLAD, SMLADX, SMLSD, SMLSDX, and USADA8 (for convenience):
 // Rd{19-16} Rn{3-0} Rm{11-8} Ra{15-12}
+// But note that register checking for {SMLAD, SMLADX, SMLSD, SMLSDX} is
+// only for {d, n, m}.
// -// MUL, SMMUL, SMULBB, SMULBT, SMULTB, SMULTT, SMULWB, SMULWT: +// MUL, SMMUL, SMMULR, SMULBB, SMULBT, SMULTB, SMULTT, SMULWB, SMULWT, SMUAD, +// SMUADX, and USAD8 (for convenience): // Rd{19-16} Rn{3-0} Rm{11-8} // -// SMLAL, SMULL, UMAAL, UMLAL, UMULL, SMLALBB, SMLALBT, SMLALTB, SMLALTT: +// SMLAL, SMULL, UMAAL, UMLAL, UMULL, SMLALBB, SMLALBT, SMLALTB, SMLALTT, +// SMLALD, SMLADLX, SMLSLD, SMLSLDX: // RdLo{15-12} RdHi{19-16} Rn{3-0} Rm{11-8} // // The mapping of the multiply registers to the "regular" ARM registers, where @@ -531,6 +602,10 @@ static bool DisassembleMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, && OpInfo[2].RegClass == ARM::GPRRegClassID && "Expect three register operands"); + // Sanity check for the register encodings. + if (BadRegsMulFrm(Opcode, insn)) + return false; + // Instructions with two destination registers have RdLo{15-12} first. if (NumDefs == 2) { assert(NumOps >= 4 && OpInfo[3].RegClass == ARM::GPRRegClassID && @@ -618,18 +693,38 @@ static inline unsigned GetCopOpc(uint32_t insn) { static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - assert(NumOps >= 5 && "Num of operands >= 5 for coprocessor instr"); + assert(NumOps >= 4 && "Num of operands >= 4 for coprocessor instr"); unsigned &OpIdx = NumOpsAdded; + // A8.6.92 + // if coproc == '101x' then SEE "Advanced SIMD and VFP" + // But since the special instructions have more explicit encoding bits + // specified, if coproc == 10 or 11, we should reject it as invalid. + unsigned coproc = GetCoprocessor(insn); + if ((Opcode == ARM::MCR || Opcode == ARM::MCRR || + Opcode == ARM::MRC || Opcode == ARM::MRRC) && + (coproc == 10 || coproc == 11)) { + DEBUG(errs() << "Encoding error: coproc == 10 or 11 for MCR[R]/MR[R]C\n"); + return false; + } + bool OneCopOpc = (Opcode == ARM::MCRR || Opcode == ARM::MCRR2 || Opcode == ARM::MRRC || Opcode == ARM::MRRC2); + // CDP/CDP2 has no GPR operand; the opc1 operand is also wider (Inst{23-20}). bool NoGPR = (Opcode == ARM::CDP || Opcode == ARM::CDP2); bool LdStCop = LdStCopOpcode(Opcode); + bool RtOut = (Opcode == ARM::MRC || Opcode == ARM::MRC2); OpIdx = 0; - MI.addOperand(MCOperand::CreateImm(GetCoprocessor(insn))); + if (RtOut) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + } + MI.addOperand(MCOperand::CreateImm(coproc)); + ++OpIdx; if (LdStCop) { // Unindex if P:W = 0b00 --> _OPTION variant @@ -639,26 +734,34 @@ static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)))); + OpIdx += 2; if (PW) { MI.addOperand(MCOperand::CreateReg(0)); ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; + const TargetInstrDesc &TID = ARMInsts[Opcode]; + unsigned IndexMode = + (TID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; unsigned Offset = ARM_AM::getAM2Opc(AddrOpcode, slice(insn, 7, 0) << 2, - ARM_AM::no_shift); + ARM_AM::no_shift, IndexMode); MI.addOperand(MCOperand::CreateImm(Offset)); - OpIdx = 5; + OpIdx += 2; } else { MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 0))); - OpIdx = 4; + ++OpIdx; } } else { MI.addOperand(MCOperand::CreateImm(OneCopOpc ? GetCopOpc(insn) : GetCopOpc1(insn, NoGPR))); + ++OpIdx; - MI.addOperand(NoGPR ? 
MCOperand::CreateImm(decodeRd(insn)) - : MCOperand::CreateReg( - getRegisterEnum(B, ARM::GPRRegClassID, - decodeRd(insn)))); + if (!RtOut) { + MI.addOperand(NoGPR ? MCOperand::CreateImm(decodeRd(insn)) + : MCOperand::CreateReg( + getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + } MI.addOperand(OneCopOpc ? MCOperand::CreateReg( getRegisterEnum(B, ARM::GPRRegClassID, @@ -667,7 +770,7 @@ static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateImm(decodeRm(insn))); - OpIdx = 5; + OpIdx += 2; if (!OneCopOpc) { MI.addOperand(MCOperand::CreateImm(GetCopOpc2(insn))); @@ -679,8 +782,8 @@ static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn, } // Branch Instructions. -// BLr9: SignExtend(Imm24:'00', 32) -// Bcc, BLr9_pred: SignExtend(Imm24:'00', 32) Pred0 Pred1 +// BL: SignExtend(Imm24:'00', 32) +// Bcc, BL_pred: SignExtend(Imm24:'00', 32) Pred0 Pred1 // SMC: ZeroExtend(imm4, 32) // SVC: ZeroExtend(Imm24, 32) // @@ -735,6 +838,11 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // MSRi take a mask, followed by one so_imm operand. The mask contains the // R Bit in bit 4, and the special register fields in bits 3-0. if (Opcode == ARM::MSRi) { + // A5.2.11 MSR (immediate), and hints & B6.1.6 MSR (immediate) + // The hints instructions have more specific encodings, so if mask == 0, + // we should reject this as an invalid instruction. + if (slice(insn, 19, 16) == 0) + return false; MI.addOperand(MCOperand::CreateImm(slice(insn, 22, 22) << 4 /* R Bit */ | slice(insn, 19, 16) /* Special Reg */ )); // SOImm is 4-bit rotate amount in bits 11-8 with 8-bit imm in bits 7-0. @@ -760,11 +868,11 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } - assert((Opcode == ARM::Bcc || Opcode == ARM::BLr9 || Opcode == ARM::BLr9_pred + assert((Opcode == ARM::Bcc || Opcode == ARM::BL || Opcode == ARM::BL_pred || Opcode == ARM::SMC || Opcode == ARM::SVC) && "Unexpected Opcode"); - assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Reg operand expected"); + assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Imm operand expected"); int Imm32 = 0; if (Opcode == ARM::SMC) { @@ -778,12 +886,6 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned Imm26 = slice(insn, 23, 0) << 2; //Imm32 = signextend<signed int, 26>(Imm26); Imm32 = SignExtend32<26>(Imm26); - - // When executing an ARM instruction, PC reads as the address of the current - // instruction plus 8. The assembler subtracts 8 from the difference - // between the branch instruction and the target address, disassembler has - // to add 8 to compensate. - Imm32 += 8; } MI.addOperand(MCOperand::CreateImm(Imm32)); @@ -793,7 +895,7 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, } // Misc. Branch Instructions. -// BLXr9, BXr9 +// BLX, BLXi, BX // BX, BX_RET static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -809,8 +911,9 @@ static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, if (Opcode == ARM::BX_RET || Opcode == ARM::MOVPCLR) return true; - // BLXr9 and BX take one GPR reg. - if (Opcode == ARM::BLXr9 || Opcode == ARM::BX) { + // BLX and BX take one GPR reg. 
+ if (Opcode == ARM::BLX || Opcode == ARM::BLX_pred || + Opcode == ARM::BX) { assert(NumOps >= 1 && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && "Reg operand expected"); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -819,6 +922,17 @@ static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } + // BLXi takes imm32 (the PC offset). + if (Opcode == ARM::BLXi) { + assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Imm operand expected"); + // SignExtend(imm24:H:'0', 32) where imm24 = Inst{23-0} and H = Inst{24}. + unsigned Imm26 = slice(insn, 23, 0) << 2 | slice(insn, 24, 24) << 1; + int Imm32 = SignExtend32<26>(Imm26); + MI.addOperand(MCOperand::CreateImm(Imm32)); + OpIdx = 1; + return true; + } + return false; } @@ -837,6 +951,24 @@ static inline bool getBFCInvMask(uint32_t insn, uint32_t &mask) { return true; } +// Standard data-processing instructions allow PC as a register specifier, +// but we should reject other DPFrm instructions with PC as registers. +static bool BadRegsDPFrm(unsigned Opcode, uint32_t insn) { + switch (Opcode) { + default: + // Did we miss an opcode? + if (decodeRd(insn) == 15 || decodeRn(insn) == 15 || decodeRm(insn) == 15) { + DEBUG(errs() << "DPFrm with bad reg specifier(s)\n"); + return true; + } + case ARM::ADCrr: case ARM::ADDSrr: case ARM::ADDrr: case ARM::ANDrr: + case ARM::BICrr: case ARM::CMNzrr: case ARM::CMPrr: case ARM::EORrr: + case ARM::ORRrr: case ARM::RSBrr: case ARM::RSCrr: case ARM::SBCrr: + case ARM::SUBSrr: case ARM::SUBrr: case ARM::TEQrr: case ARM::TSTrr: + return false; + } +} + // A major complication is the fact that some of the saturating add/subtract // operations have Rd Rm Rn, instead of the "normal" Rd Rn Rm. // They are QADD, QDADD, QDSUB, and QSUB. @@ -864,6 +996,10 @@ static bool DisassembleDPFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Special-case handling of BFC/BFI/SBFX/UBFX. if (Opcode == ARM::BFC || Opcode == ARM::BFI) { + // A8.6.17 BFC & A8.6.18 BFI + // Sanity check Rd. + if (decodeRd(insn) == 15) + return false; MI.addOperand(MCOperand::CreateReg(0)); if (Opcode == ARM::BFI) { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -879,6 +1015,9 @@ static bool DisassembleDPFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } if (Opcode == ARM::SBFX || Opcode == ARM::UBFX) { + // Sanity check Rd and Rm. + if (decodeRd(insn) == 15 || decodeRm(insn) == 15) + return false; MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); MI.addOperand(MCOperand::CreateImm(slice(insn, 11, 7))); @@ -915,15 +1054,21 @@ static bool DisassembleDPFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Assert disabled because saturating operations, e.g., A8.6.127 QASX, are // routed here as well. // assert(getIBit(insn) == 0 && "I_Bit != '0' reg/reg form"); + if (BadRegsDPFrm(Opcode, insn)) + return false; MI.addOperand(MCOperand::CreateReg( getRegisterEnum(B, ARM::GPRRegClassID, RmRn? decodeRn(insn) : decodeRm(insn)))); ++OpIdx; } else if (Opcode == ARM::MOVi16 || Opcode == ARM::MOVTi16) { + // These two instructions don't allow d as 15. + if (decodeRd(insn) == 15) + return false; // We have an imm16 = imm4:imm12 (imm4=Inst{19:16}, imm12 = Inst{11:0}). 
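As a worked example of the imm4:imm12 fold performed just below, with hypothetical field values (a standalone C++ sketch, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Hypothetical movw fields: imm4 = Inst{19-16}, imm12 = Inst{11-0}.
      uint32_t imm4 = 0x1, imm12 = 0x234;
      uint32_t Imm16 = (imm4 << 12) | imm12; // same fold as the code below
      assert(Imm16 == 0x1234);               // i.e. movw Rd, #0x1234
    }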
assert(getIBit(insn) == 1 && "I_Bit != '1' reg/imm form"); unsigned Imm16 = slice(insn, 19, 16) << 12 | slice(insn, 11, 0); - MI.addOperand(MCOperand::CreateImm(Imm16)); + if (!B->tryAddingSymbolicOperand(Imm16, 4, MI)) + MI.addOperand(MCOperand::CreateImm(Imm16)); ++OpIdx; } else { // We have a reg/imm form. @@ -992,6 +1137,21 @@ static bool DisassembleDPSoRegFrm(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); if (Rs) { + // If Inst{7} != 0, we should reject this insn as an invalid encoding. + if (slice(insn, 7, 7)) + return false; + + // A8.6.3 ADC (register-shifted register) + // if d == 15 || n == 15 || m == 15 || s == 15 then UNPREDICTABLE; + // + // This also accounts for shift instructions (register) where, fortunately, + // Inst{19-16} = 0b0000. + // A8.6.89 LSL (register) + // if d == 15 || n == 15 || m == 15 then UNPREDICTABLE; + if (decodeRd(insn) == 15 || decodeRn(insn) == 15 || + decodeRm(insn) == 15 || decodeRs(insn) == 15) + return false; + // Register-controlled shifts: [Rm, Rs, shift]. MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRs(insn)))); @@ -1015,6 +1175,71 @@ static bool DisassembleDPSoRegFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } +static bool BadRegsLdStFrm(unsigned Opcode, uint32_t insn, bool Store, bool WBack, + bool Imm) { + const StringRef Name = ARMInsts[Opcode].Name; + unsigned Rt = decodeRd(insn); + unsigned Rn = decodeRn(insn); + unsigned Rm = decodeRm(insn); + unsigned P = getPBit(insn); + unsigned W = getWBit(insn); + + if (Store) { + // Only STR (immediate, register) allows PC as the source. + if (Name.startswith("STRB") && Rt == 15) { + DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n"); + return true; + } + if (WBack && (Rn == 15 || Rn == Rt)) { + DEBUG(errs() << "if wback && (n == 15 || n == t) then UNPREDICTABLE\n"); + return true; + } + if (!Imm && Rm == 15) { + DEBUG(errs() << "if m == 15 then UNPREDICTABLE\n"); + return true; + } + } else { + // Only LDR (immediate, register) allows PC as the destination. + if (Name.startswith("LDRB") && Rt == 15) { + DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n"); + return true; + } + if (Imm) { + // Immediate + if (Rn == 15) { + // The literal form must be in offset mode; it's an encoding error + // otherwise. + if (!(P == 1 && W == 0)) { + DEBUG(errs() << "Ld literal form with !(P == 1 && W == 0)\n"); + return true; + } + // LDRB (literal) does not allow PC as the destination. + if (Opcode != ARM::LDRi12 && Rt == 15) { + DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n"); + return true; + } + } else { + // Write back while Rn == Rt does not make sense. 
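The writeback screens in this helper reduce to one small predicate; a standalone restatement (the function name is illustrative, the logic mirrors BadRegsLdStFrm):

    #include <cassert>

    // UNPREDICTABLE when writing the address back into the PC, or into the
    // same register that is being transferred.
    static bool badWriteback(bool WBack, unsigned Rn, unsigned Rt) {
      return WBack && (Rn == 15 || Rn == Rt);
    }

    int main() {
      assert(badWriteback(true, 1, 1));   // e.g. ldr r1, [r1], #4
      assert(badWriteback(true, 15, 0));  // writeback into the PC
      assert(!badWriteback(false, 1, 1)); // no writeback, no conflict
    }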
+ if (WBack && (Rn == Rt)) { + DEBUG(errs() << "if wback && n == t then UNPREDICTABLE\n"); + return true; + } + } + } else { + // Register + if (Rm == 15) { + DEBUG(errs() << "if m == 15 then UNPREDICTABLE\n"); + return true; + } + if (WBack && (Rn == 15 || Rn == Rt)) { + DEBUG(errs() << "if wback && (n == 15 || n == t) then UNPREDICTABLE\n"); + return true; + } + } + } + return false; +} + static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, bool isStore, BO B) { @@ -1077,19 +1302,41 @@ static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, if (OpIdx + 1 >= NumOps) return false; - assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && - (OpInfo[OpIdx+1].RegClass < 0) && - "Expect 1 reg operand followed by 1 imm operand"); + if (BadRegsLdStFrm(Opcode, insn, isStore, isPrePost, getIBit(insn)==0)) + return false; ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; + unsigned IndexMode = + (TID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; if (getIBit(insn) == 0) { - MI.addOperand(MCOperand::CreateReg(0)); + // For pre- and post-indexed case, add a reg0 operand (Addressing Mode #2). + // Otherwise, skip the reg operand since for addrmode_imm12, Rn has already + // been populated. + if (isPrePost) { + MI.addOperand(MCOperand::CreateReg(0)); + OpIdx += 1; + } - // Disassemble the 12-bit immediate offset. unsigned Imm12 = slice(insn, 11, 0); - unsigned Offset = ARM_AM::getAM2Opc(AddrOpcode, Imm12, ARM_AM::no_shift); - MI.addOperand(MCOperand::CreateImm(Offset)); + if (Opcode == ARM::LDRBi12 || Opcode == ARM::LDRi12 || + Opcode == ARM::STRBi12 || Opcode == ARM::STRi12) { + // Disassemble the 12-bit immediate offset, which is the second operand in + // $addrmode_imm12 => (ops GPR:$base, i32imm:$offsimm). + int Offset = AddrOpcode == ARM_AM::add ? 1 * Imm12 : -1 * Imm12; + MI.addOperand(MCOperand::CreateImm(Offset)); + } else { + // Disassemble the 12-bit immediate offset, which is the second operand in + // $am2offset => (ops GPR, i32imm). + unsigned Offset = ARM_AM::getAM2Opc(AddrOpcode, Imm12, ARM_AM::no_shift, + IndexMode); + MI.addOperand(MCOperand::CreateImm(Offset)); + } + OpIdx += 1; } else { + // If Inst{25} = 1 and Inst{4} != 0, we should reject this as invalid. + if (slice(insn,4,4) == 1) + return false; + // Disassemble the offset reg (Rm), shift type, and immediate shift length. MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); @@ -1101,9 +1348,9 @@ static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // A8.4.1. Possible rrx or shift amount of 32... getImmShiftSE(ShOp, ShImm); MI.addOperand(MCOperand::CreateImm( - ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp))); + ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp, IndexMode))); + OpIdx += 2; } - OpIdx += 2; return true; } @@ -1125,7 +1372,7 @@ static bool HasDualReg(unsigned Opcode) { case ARM::LDRD: case ARM::LDRD_PRE: case ARM::LDRD_POST: case ARM::STRD: case ARM::STRD_PRE: case ARM::STRD_POST: return true; - } + } } static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, @@ -1153,8 +1400,6 @@ static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, ++OpIdx; } - bool DualReg = HasDualReg(Opcode); - // Disassemble the dst/src operand. 
if (OpIdx >= NumOps) return false; @@ -1165,8 +1410,8 @@ static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, decodeRd(insn)))); ++OpIdx; - // Fill in LDRD and STRD's second operand. - if (DualReg) { + // Fill in LDRD and STRD's second operand Rt operand. + if (HasDualReg(Opcode)) { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRd(insn) + 1))); ++OpIdx; @@ -1188,7 +1433,7 @@ static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && "Reg operand expected"); assert((!isPrePost || (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)) - && "Index mode or tied_to operand expected"); + && "Offset mode or tied_to operand expected"); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)))); ++OpIdx; @@ -1204,19 +1449,22 @@ static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, "Expect 1 reg operand followed by 1 imm operand"); ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; + unsigned IndexMode = + (TID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; if (getAM3IBit(insn) == 1) { MI.addOperand(MCOperand::CreateReg(0)); // Disassemble the 8-bit immediate offset. unsigned Imm4H = (insn >> ARMII::ImmHiShift) & 0xF; unsigned Imm4L = insn & 0xF; - unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, (Imm4H << 4) | Imm4L); + unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, (Imm4H << 4) | Imm4L, + IndexMode); MI.addOperand(MCOperand::CreateImm(Offset)); } else { // Disassemble the offset reg (Rm). MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); - unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, 0); + unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, 0, IndexMode); MI.addOperand(MCOperand::CreateImm(Offset)); } OpIdx += 2; @@ -1236,13 +1484,13 @@ static bool DisassembleStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, } // The algorithm for disassembly of LdStMulFrm is different from others because -// it explicitly populates the two predicate operands after operand 0 (the base) -// and operand 1 (the AM4 mode imm). After operand 3, we need to populate the -// reglist with each affected register encoded as an MCOperand. +// it explicitly populates the two predicate operands after the base register. +// After that, we need to populate the reglist with each affected register +// encoded as an MCOperand. static bool DisassembleLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - assert(NumOps >= 5 && "LdStMulFrm expects NumOps >= 5"); + assert(NumOps >= 4 && "LdStMulFrm expects NumOps >= 4"); NumOpsAdded = 0; unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)); @@ -1260,8 +1508,10 @@ static bool DisassembleLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(Base)); // Handling the two predicate operands before the reglist. - int64_t CondVal = insn >> ARMII::CondShift; - MI.addOperand(MCOperand::CreateImm(CondVal == 0xF ? 
0xE : CondVal)); + int64_t CondVal = getCondField(insn); + if (CondVal == 0xF) + return false; + MI.addOperand(MCOperand::CreateImm(CondVal)); MI.addOperand(MCOperand::CreateReg(ARM::CPSR)); NumOpsAdded += 3; @@ -1352,6 +1602,12 @@ static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::GPRRegClassID; + // Sanity check the registers, which should not be 15. + if (decodeRd(insn) == 15 || decodeRm(insn) == 15) + return false; + if (ThreeReg && decodeRn(insn) == 15) + return false; + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRd(insn)))); ++OpIdx; @@ -1376,7 +1632,7 @@ static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, ARM_AM::ShiftOpc Opc = ARM_AM::no_shift; if (Opcode == ARM::PKHBT) Opc = ARM_AM::lsl; - else if (Opcode == ARM::PKHBT) + else if (Opcode == ARM::PKHTB) Opc = ARM_AM::asr; getImmShiftSE(Opc, ShiftAmt); MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(Opc, ShiftAmt))); @@ -1391,6 +1647,11 @@ static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, static bool DisassembleSatFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + // A8.6.183 SSAT + // if d == 15 || n == 15 then UNPREDICTABLE; + if (decodeRd(insn) == 15 || decodeRm(insn) == 15) + return false; + const TargetInstrDesc &TID = ARMInsts[Opcode]; NumOpsAdded = TID.getNumOperands() - 2; // ignore predicate operands @@ -1429,6 +1690,11 @@ static bool DisassembleSatFrm(MCInst &MI, unsigned Opcode, uint32_t insn, static bool DisassembleExtFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + // A8.6.220 SXTAB + // if d == 15 || m == 15 then UNPREDICTABLE; + if (decodeRd(insn) == 15 || decodeRm(insn) == 15) + return false; + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; unsigned &OpIdx = NumOpsAdded; @@ -1611,7 +1877,7 @@ static bool DisassembleVFPBinaryFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // A8.6.295 vcvt (floating-point <-> integer) // Int to FP: VSITOD, VSITOS, VUITOD, VUITOS // FP to Int: VTOSI[Z|R]D, VTOSI[Z|R]S, VTOUI[Z|R]D, VTOUI[Z|R]S -// +// // A8.6.297 vcvt (floating-point and fixed-point) // Dd|Sd Dd|Sd(TIED_TO) #fbits(= 16|32 - UInt(imm4:i)) static bool DisassembleVFPConv1Frm(MCInst &MI, unsigned Opcode, uint32_t insn, @@ -1800,15 +2066,14 @@ static bool DisassembleVFPLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, } // VFP Load/Store Multiple Instructions. -// This is similar to the algorithm for LDM/STM in that operand 0 (the base) and -// operand 1 (the AM4 mode imm) is followed by two predicate operands. It is -// followed by a reglist of either DPR(s) or SPR(s). +// We have an optional write back reg, the base, and two predicate operands. +// It is then followed by a reglist of either DPR(s) or SPR(s). // // VLDMD[_UPD], VLDMS[_UPD], VSTMD[_UPD], VSTMS[_UPD] static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - assert(NumOps >= 5 && "VFPLdStMulFrm expects NumOps >= 5"); + assert(NumOps >= 4 && "VFPLdStMulFrm expects NumOps >= 4"); unsigned &OpIdx = NumOpsAdded; @@ -1827,25 +2092,18 @@ static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(Base)); - // Next comes the AM4 Opcode. 
- ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn)); - // Must be either "ia" or "db" submode. - if (SubMode != ARM_AM::ia && SubMode != ARM_AM::db) { - DEBUG(errs() << "Illegal addressing mode 4 sub-mode!\n"); - return false; - } - MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode))); - // Handling the two predicate operands before the reglist. - int64_t CondVal = insn >> ARMII::CondShift; - MI.addOperand(MCOperand::CreateImm(CondVal == 0xF ? 0xE : CondVal)); + int64_t CondVal = getCondField(insn); + if (CondVal == 0xF) + return false; + MI.addOperand(MCOperand::CreateImm(CondVal)); MI.addOperand(MCOperand::CreateReg(ARM::CPSR)); - OpIdx += 4; + OpIdx += 3; - bool isSPVFP = (Opcode == ARM::VLDMSIA || Opcode == ARM::VLDMSDB || + bool isSPVFP = (Opcode == ARM::VLDMSIA || Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMSDB_UPD || - Opcode == ARM::VSTMSIA || Opcode == ARM::VSTMSDB || + Opcode == ARM::VSTMSIA || Opcode == ARM::VSTMSIA_UPD || Opcode == ARM::VSTMSDB_UPD); unsigned RegClassID = isSPVFP ? ARM::SPRRegClassID : ARM::DPRRegClassID; @@ -1855,6 +2113,11 @@ static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Fill the variadic part of reglist. unsigned char Imm8 = insn & 0xFF; unsigned Regs = isSPVFP ? Imm8 : Imm8/2; + + // Apply some sanity checks before proceeding. + if (Regs == 0 || (RegD + Regs) > 32 || (!isSPVFP && Regs > 16)) + return false; + for (unsigned i = 0; i < Regs; ++i) { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, RegD + i))); @@ -2136,7 +2399,7 @@ static unsigned decodeN3VImm(uint32_t insn) { // Correctly set VLD*/VST*'s TIED_TO GPR, as the asm printer needs it. static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, bool Store, bool DblSpaced, - BO B) { + unsigned alignment, BO B) { const TargetInstrDesc &TID = ARMInsts[Opcode]; const TargetOperandInfo *OpInfo = TID.OpInfo; @@ -2180,9 +2443,10 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected"); + // addrmode6 := (ops GPR:$addr, i32imm) MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, Rn))); - MI.addOperand(MCOperand::CreateImm(0)); // Alignment ignored? + MI.addOperand(MCOperand::CreateImm(alignment)); // Alignment OpIdx += 2; if (WB) { @@ -2230,9 +2494,10 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected"); + // addrmode6 := (ops GPR:$addr, i32imm) MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, Rn))); - MI.addOperand(MCOperand::CreateImm(0)); // Alignment ignored? + MI.addOperand(MCOperand::CreateImm(alignment)); // Alignment OpIdx += 2; if (WB) { @@ -2263,6 +2528,92 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } +// A8.6.308, A8.6.311, A8.6.314, A8.6.317. 
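Align4OneLaneInst below walks all four element counts; as a taste of the tables it encodes, here is a condensed standalone sketch of just the elem == 1 (VLD1/VST1 single-lane, A8.6.308) rules, reporting alignment in bits (illustrative only):

    #include <cassert>

    // Returns false for index_align encodings the ARM ARM marks UNDEFINED;
    // alignment == 0 means standard (unaligned) access.
    static bool vld1LaneAlign(unsigned size, unsigned index_align,
                              unsigned &alignment) {
      alignment = 0;
      switch (size) {
      case 0:                                  // 8-bit element
        return (index_align & 1) == 0;         // index_align<0> must be 0
      case 1: {                                // 16-bit element
        unsigned bits = index_align & 3;
        if (bits != 0 && bits != 1) return false;
        if (bits == 1) alignment = 16;
        return true;
      }
      case 2: {                                // 32-bit element
        unsigned bits = index_align & 7;
        if (bits != 0 && bits != 3) return false;
        if (bits == 3) alignment = 32;
        return true;
      }
      default: return false;
      }
    }

    int main() {
      unsigned a;
      assert(vld1LaneAlign(1, 1, a) && a == 16); // vld1.16 {d0[0]}, [r0:16]
      assert(!vld1LaneAlign(2, 1, a));           // UNDEFINED index_align
    }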
+static bool Align4OneLaneInst(unsigned elem, unsigned size,
+                              unsigned index_align, unsigned &alignment) {
+  unsigned bits = 0;
+  switch (elem) {
+  default:
+    return false;
+  case 1:
+    // A8.6.308
+    if (size == 0)
+      return slice(index_align, 0, 0) == 0;
+    else if (size == 1) {
+      bits = slice(index_align, 1, 0);
+      if (bits != 0 && bits != 1)
+        return false;
+      if (bits == 1)
+        alignment = 16;
+      return true;
+    } else if (size == 2) {
+      bits = slice(index_align, 2, 0);
+      if (bits != 0 && bits != 3)
+        return false;
+      if (bits == 3)
+        alignment = 32;
+      return true;
+    }
+    return true;
+  case 2:
+    // A8.6.311
+    if (size == 0) {
+      if (slice(index_align, 0, 0) == 1)
+        alignment = 16;
+      return true;
+    } else if (size == 1) {
+      if (slice(index_align, 0, 0) == 1)
+        alignment = 32;
+      return true;
+    } else if (size == 2) {
+      if (slice(index_align, 1, 1) != 0)
+        return false;
+      if (slice(index_align, 0, 0) == 1)
+        alignment = 64;
+      return true;
+    }
+    return true;
+  case 3:
+    // A8.6.314
+    if (size == 0) {
+      if (slice(index_align, 0, 0) != 0)
+        return false;
+      return true;
+    } else if (size == 1) {
+      if (slice(index_align, 0, 0) != 0)
+        return false;
+      return true;
+    } else if (size == 2) {
+      if (slice(index_align, 1, 0) != 0)
+        return false;
+      return true;
+    }
+    return true;
+  case 4:
+    // A8.6.317
+    if (size == 0) {
+      if (slice(index_align, 0, 0) == 1)
+        alignment = 32;
+      return true;
+    } else if (size == 1) {
+      if (slice(index_align, 0, 0) == 1)
+        alignment = 64;
+      return true;
+    } else if (size == 2) {
+      bits = slice(index_align, 1, 0);
+      if (bits == 3)
+        return false;
+      if (bits == 1)
+        alignment = 64;
+      else if (bits == 2)
+        alignment = 128;
+      return true;
+    }
+    return true;
+  }
+}
+
 // A7.7
 // If L (Inst{21}) == 0, store instructions.
 // Find out about double-spaced-ness of the Opcode and pass it on to
@@ -2272,11 +2623,33 @@ static bool DisassembleNLdSt(MCInst &MI, unsigned Opcode, uint32_t insn,
   const StringRef Name = ARMInsts[Opcode].Name;
   bool DblSpaced = false;
+  // 0 represents standard alignment, i.e., unaligned data access.
+  unsigned alignment = 0;
+
+  unsigned elem = 0; // legal values: {1, 2, 3, 4}
+  if (Name.startswith("VST1") || Name.startswith("VLD1"))
+    elem = 1;
+
+  if (Name.startswith("VST2") || Name.startswith("VLD2"))
+    elem = 2;
+
+  if (Name.startswith("VST3") || Name.startswith("VLD3"))
+    elem = 3;
+
+  if (Name.startswith("VST4") || Name.startswith("VLD4"))
+    elem = 4;

   if (Name.find("LN") != std::string::npos) {
     // To one lane instructions.
     // See, for example, 8.6.317 VLD4 (single 4-element structure to one lane).

+    // Utility function takes number of elements, size, and index_align.
+    if (!Align4OneLaneInst(elem,
+                           slice(insn, 11, 10),
+                           slice(insn, 7, 4),
+                           alignment))
+      return false;
+
     // <size> == 16 && Inst{5} == 1 --> DblSpaced = true
     if (Name.endswith("16") || Name.endswith("16_UPD"))
       DblSpaced = slice(insn, 5, 5) == 1;
@@ -2284,30 +2657,102 @@ static bool DisassembleNLdSt(MCInst &MI, unsigned Opcode, uint32_t insn,
     // <size> == 32 && Inst{6} == 1 --> DblSpaced = true
     if (Name.endswith("32") || Name.endswith("32_UPD"))
       DblSpaced = slice(insn, 6, 6) == 1;
-
+  } else if (Name.find("DUP") != std::string::npos) {
+    // Single element (or structure) to all lanes.
+    // Inst{9-8} encodes the number of element(s) in the structure, with:
+    // 0b00 (VLD1DUP) (for this, the a bit makes sense only for data size 16
+    //                 and 32)
+ // 0b01 (VLD2DUP) + // 0b10 (VLD3DUP) (for this, a bit must be encoded as 0) + // 0b11 (VLD4DUP) + // + // Inst{7-6} encodes the data size, with: + // 0b00 => 8, 0b01 => 16, 0b10 => 32 + // + // Inst{4} (the a bit) encodes the align action (0: standard alignment) + unsigned elem = slice(insn, 9, 8) + 1; + unsigned a = slice(insn, 4, 4); + if (elem != 3) { + // 0b11 is not a valid encoding for Inst{7-6}. + if (slice(insn, 7, 6) == 3) + return false; + unsigned data_size = 8 << slice(insn, 7, 6); + // For VLD1DUP, a bit makes sense only for data size of 16 and 32. + if (a && data_size == 8) + return false; + + // Now we can calculate the alignment! + if (a) + alignment = elem * data_size; + } else { + if (a) { + // A8.6.315 VLD3 (single 3-element structure to all lanes) + // The a bit must be encoded as 0. + return false; + } + } } else { // Multiple n-element structures with type encoded as Inst{11-8}. // See, for example, A8.6.316 VLD4 (multiple 4-element structures). - // n == 2 && type == 0b1001 -> DblSpaced = true - if (Name.startswith("VST2") || Name.startswith("VLD2")) - DblSpaced = slice(insn, 11, 8) == 9; - - // n == 3 && type == 0b0101 -> DblSpaced = true - if (Name.startswith("VST3") || Name.startswith("VLD3")) - DblSpaced = slice(insn, 11, 8) == 5; - - // n == 4 && type == 0b0001 -> DblSpaced = true - if (Name.startswith("VST4") || Name.startswith("VLD4")) - DblSpaced = slice(insn, 11, 8) == 1; - + // Inst{5-4} encodes alignment. + unsigned align = slice(insn, 5, 4); + switch (align) { + default: + break; + case 1: + alignment = 64; break; + case 2: + alignment = 128; break; + case 3: + alignment = 256; break; + } + + unsigned type = slice(insn, 11, 8); + // Reject UNDEFINED instructions based on type and align. + // Plus set DblSpaced flag where appropriate. 
+ switch (elem) { + default: + break; + case 1: + // n == 1 + // A8.6.307 & A8.6.391 + if ((type == 7 && slice(align, 1, 1) == 1) || + (type == 10 && align == 3) || + (type == 6 && slice(align, 1, 1) == 1)) + return false; + break; + case 2: + // n == 2 && type == 0b1001 -> DblSpaced = true + // A8.6.310 & A8.6.393 + if ((type == 8 || type == 9) && align == 3) + return false; + DblSpaced = (type == 9); + break; + case 3: + // n == 3 && type == 0b0101 -> DblSpaced = true + // A8.6.313 & A8.6.395 + if (slice(insn, 7, 6) == 3 || slice(align, 1, 1) == 1) + return false; + DblSpaced = (type == 5); + break; + case 4: + // n == 4 && type == 0b0001 -> DblSpaced = true + // A8.6.316 & A8.6.397 + if (slice(insn, 7, 6) == 3) + return false; + DblSpaced = (type == 1); + break; + } } return DisassembleNLdSt0(MI, Opcode, insn, NumOps, NumOpsAdded, - slice(insn, 21, 21) == 0, DblSpaced, B); + slice(insn, 21, 21) == 0, DblSpaced, alignment/8, B); } // VMOV (immediate) // Qd/Dd imm +// VBIC (immediate) +// VORR (immediate) +// Qd/Dd imm src(=Qd/Dd) static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -2334,12 +2779,20 @@ static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, case ARM::VMOVv8i16: case ARM::VMVNv4i16: case ARM::VMVNv8i16: + case ARM::VBICiv4i16: + case ARM::VBICiv8i16: + case ARM::VORRiv4i16: + case ARM::VORRiv8i16: esize = ESize16; break; case ARM::VMOVv2i32: case ARM::VMOVv4i32: case ARM::VMVNv2i32: case ARM::VMVNv4i32: + case ARM::VBICiv2i32: + case ARM::VBICiv4i32: + case ARM::VORRiv2i32: + case ARM::VORRiv4i32: esize = ESize32; break; case ARM::VMOVv1i64: @@ -2347,7 +2800,7 @@ static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, esize = ESize64; break; default: - assert(0 && "Unreachable code!"); + assert(0 && "Unexpected opcode!"); return false; } @@ -2356,6 +2809,16 @@ static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, MI.addOperand(MCOperand::CreateImm(decodeN1VImm(insn, esize))); NumOpsAdded = 2; + + // VBIC/VORRiv*i* variants have an extra $src = $Vd to be filled in. + if (NumOps >= 3 && + (OpInfo[2].RegClass == ARM::DPRRegClassID || + OpInfo[2].RegClass == ARM::QPRRegClassID)) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[0].RegClass, + decodeNEONRd(insn)))); + NumOpsAdded += 1; + } + return true; } @@ -2376,7 +2839,7 @@ enum N2VFlag { // // Vector Move Long: // Qd Dm -// +// // Vector Move Narrow: // Dd Qm // @@ -2518,7 +2981,7 @@ static bool DisassembleNVectorShift(MCInst &MI, unsigned Opcode, uint32_t insn, assert(OpInfo[OpIdx].RegClass < 0 && "Imm operand expected"); // Add the imm operand. - + // VSHLL has maximum shift count as the imm, inferred from its size. unsigned Imm; switch (Opcode) { @@ -2631,7 +3094,7 @@ static bool DisassembleNVdVnVmOptImm(MCInst &MI, unsigned Opcode, uint32_t insn, // N3RegFrm. if (Opcode == ARM::VMOVDneon || Opcode == ARM::VMOVQ) return true; - + // Dm = Inst{5:3-0} => NEON Rm // or // Dm is restricted to D0-D7 if size is 16, D0-D15 otherwise @@ -2770,7 +3233,7 @@ static bool DisassembleNGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, ElemSize esize = Opcode == ARM::VGETLNi32 ? ESize32 : ((Opcode == ARM::VGETLNs16 || Opcode == ARM::VGETLNu16) ? 
ESize16 - : ESize32); + : ESize8); // Rt = Inst{15-12} => ARM Rd MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -2852,17 +3315,6 @@ static bool DisassembleNDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } -// A8.6.41 DMB -// A8.6.42 DSB -// A8.6.49 ISB -static inline bool MemBarrierInstr(uint32_t insn) { - unsigned op7_4 = slice(insn, 7, 4); - if (slice(insn, 31, 8) == 0xf57ff0 && (op7_4 >= 4 && op7_4 <= 6)) - return true; - - return false; -} - static inline bool PreLoadOpcode(unsigned Opcode) { switch(Opcode) { case ARM::PLDi12: case ARM::PLDrs: @@ -2878,8 +3330,8 @@ static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { // Preload Data/Instruction requires either 2 or 3 operands. - // PLDi, PLDWi, PLIi: addrmode_imm12 - // PLDr[a|m], PLDWr[a|m], PLIr[a|m]: ldst_so_reg + // PLDi12, PLDWi12, PLIi12: addrmode_imm12 + // PLDrs, PLDWrs, PLIrs: ldst_so_reg MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)))); @@ -2888,10 +3340,19 @@ static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn, || Opcode == ARM::PLIi12) { unsigned Imm12 = slice(insn, 11, 0); bool Negative = getUBit(insn) == 0; + + // A8.6.118 PLD (literal) PLDWi12 with Rn=PC is transformed to PLDi12. + if (Opcode == ARM::PLDWi12 && slice(insn, 19, 16) == 0xF) { + DEBUG(errs() << "Rn == '1111': PLDWi12 morphed to PLDi12\n"); + MI.setOpcode(ARM::PLDi12); + } + // -0 is represented specially. All other values are as normal. + int Offset = Negative ? -1 * Imm12 : Imm12; if (Imm12 == 0 && Negative) - Imm12 = INT32_MIN; - MI.addOperand(MCOperand::CreateImm(Imm12)); + Offset = INT32_MIN; + + MI.addOperand(MCOperand::CreateImm(Offset)); NumOpsAdded = 2; } else { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -2917,14 +3378,20 @@ static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn, static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - if (MemBarrierInstr(insn)) { - // DMBsy, DSBsy, and ISBsy instructions have zero operand and are taken care - // of within the generic ARMBasicMCBuilder::BuildIt() method. - // + if (Opcode == ARM::DMB || Opcode == ARM::DSB) { // Inst{3-0} encodes the memory barrier option for the variants. - MI.addOperand(MCOperand::CreateImm(slice(insn, 3, 0))); - NumOpsAdded = 1; - return true; + unsigned opt = slice(insn, 3, 0); + switch (opt) { + case ARM_MB::SY: case ARM_MB::ST: + case ARM_MB::ISH: case ARM_MB::ISHST: + case ARM_MB::NSH: case ARM_MB::NSHST: + case ARM_MB::OSH: case ARM_MB::OSHST: + MI.addOperand(MCOperand::CreateImm(opt)); + NumOpsAdded = 1; + return true; + default: + return false; + } } switch (Opcode) { @@ -2936,6 +3403,11 @@ static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, case ARM::WFI: case ARM::SEV: return true; + case ARM::SWP: + case ARM::SWPB: + // SWP, SWPB: Rd Rm Rn + // Delegate to DisassembleLdStExFrm().... + return DisassembleLdStExFrm(MI, Opcode, insn, NumOps, NumOpsAdded, B); default: break; } @@ -2950,20 +3422,32 @@ static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // opcodes which match the same real instruction. This is needed since there's // no current handling of optional arguments. Fix here when a better handling // of optional arguments is implemented. 
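The CPS cases below screen the imod field (Inst{19-18}) before building operands; a standalone restatement of that screen (function name illustrative), assuming the usual ARMv7 reading that imod 0b10 enables and 0b11 disables interrupts:

    #include <cassert>
    #include <cstdint>

    static bool validCPSImod(uint32_t insn) {
      unsigned imod = (insn >> 18) & 3; // Inst{19-18}
      return imod == 2 || imod == 3;    // only enable/disable survive
    }

    int main() {
      assert(!validCPSImod(0u << 18)); // imod = 0b00: rejected below
      assert(!validCPSImod(1u << 18)); // imod = 0b01: rejected below
      assert(validCPSImod(2u << 18));  // cpsie-style encodings
      assert(validCPSImod(3u << 18));  // cpsid-style encodings
    }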
- if (Opcode == ARM::CPS3p) {
+ if (Opcode == ARM::CPS3p) { // M = 1
+ // Reject the impossible imod values by returning false:
+ // 1. (imod=0b01)
+ //
+ // In addition, AsmPrinter cannot handle imod=0b00, and (imod=0b00,M=1,
+ // iflags!=0) is an invalid combination anyway, so imod=0b00 is rejected
+ // here as well.
+ if (slice(insn, 19, 18) == 0 || slice(insn, 19, 18) == 1)
+ return false;
MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 18))); // imod
MI.addOperand(MCOperand::CreateImm(slice(insn, 8, 6))); // iflags
MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode
NumOpsAdded = 3;
return true;
}
- if (Opcode == ARM::CPS2p) {
+ if (Opcode == ARM::CPS2p) { // mode = 0, M = 0
+ // Reject the impossible imod values by returning false:
+ // 1. (imod=0b00,M=0)
+ // 2. (imod=0b01)
+ if (slice(insn, 19, 18) == 0 || slice(insn, 19, 18) == 1)
+ return false;
MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 18))); // imod
MI.addOperand(MCOperand::CreateImm(slice(insn, 8, 6))); // iflags
NumOpsAdded = 2;
return true;
}
- if (Opcode == ARM::CPS1p) {
+ if (Opcode == ARM::CPS1p) { // imod = 0, iflags = 0, M = 1
MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode
NumOpsAdded = 1;
return true;
}
@@ -3142,7 +3626,7 @@ bool ARMBasicMCBuilder::DoPredicateOperands(MCInst& MI, unsigned Opcode,
return false;
}
-
+
/// TryPredicateAndSBitModifier - TryPredicateAndSBitModifier tries to process
/// the possible Predicate and SBitModifier, to build the remaining MCOperand
/// constituents.
@@ -3154,6 +3638,7 @@ bool ARMBasicMCBuilder::TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode,
const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
const std::string &Name = ARMInsts[Opcode].Name;
unsigned Idx = MI.getNumOperands();
+ uint64_t TSFlags = ARMInsts[Opcode].TSFlags;
// First, we check whether this instr specifies the PredicateOperand through
// a pair of TargetOperandInfos with isPredicate() property.
@@ -3173,14 +3658,23 @@ bool ARMBasicMCBuilder::TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode,
// like ARM.
//
// A8.6.16 B
- if (Name == "t2Bcc")
- MI.addOperand(MCOperand::CreateImm(CondCode(slice(insn, 25, 22))));
- else if (Name == "tBcc")
- MI.addOperand(MCOperand::CreateImm(CondCode(slice(insn, 11, 8))));
- else
+ // Check for undefined encodings.
+ unsigned cond;
+ if (Name == "t2Bcc") {
+ if ((cond = slice(insn, 25, 22)) >= 14)
+ return false;
+ MI.addOperand(MCOperand::CreateImm(CondCode(cond)));
+ } else if (Name == "tBcc") {
+ if ((cond = slice(insn, 11, 8)) == 14)
+ return false;
+ MI.addOperand(MCOperand::CreateImm(CondCode(cond)));
+ } else
MI.addOperand(MCOperand::CreateImm(ARMCC::AL));
} else {
// ARM instructions get their condition field from Inst{31-28}.
+ // We should reject Inst{31-28} = 0b1111 as an invalid encoding.
+ if (!isNEONDomain(TSFlags) && getCondField(insn) == 0xF)
+ return false;
MI.addOperand(MCOperand::CreateImm(CondCode(getCondField(insn))));
}
}
@@ -3243,3 +3737,84 @@ ARMBasicMCBuilder *llvm::CreateMCBuilder(unsigned Opcode, ARMFormat Format) {
return new ARMBasicMCBuilder(Opcode, Format,
ARMInsts[Opcode].getNumOperands());
}
+
+/// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic
+/// operand in place of the immediate Value in the MCInst. The immediate
+/// Value has had any PC adjustment made by the caller. If the getOpInfo()
+/// function was set as part of the setupBuilderForSymbolicDisassembly() call
+/// then that function is called to get any symbolic information at the
+/// builder's Address for this instruction. If that returns non-zero then the
+/// symbolic information it returns is used to create an MCExpr and that is
+/// added as an operand to the MCInst. This function returns true if it adds
+/// an operand to the MCInst and false otherwise.
+bool ARMBasicMCBuilder::tryAddingSymbolicOperand(uint64_t Value,
+ uint64_t InstSize,
+ MCInst &MI) {
+ if (!GetOpInfo)
+ return false;
+
+ struct LLVMOpInfo1 SymbolicOp;
+ SymbolicOp.Value = Value;
+ if (!GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp))
+ return false;
+
+ const MCExpr *Add = NULL;
+ if (SymbolicOp.AddSymbol.Present) {
+ if (SymbolicOp.AddSymbol.Name) {
+ StringRef Name(SymbolicOp.AddSymbol.Name);
+ MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+ Add = MCSymbolRefExpr::Create(Sym, *Ctx);
+ } else {
+ Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx);
+ }
+ }
+
+ const MCExpr *Sub = NULL;
+ if (SymbolicOp.SubtractSymbol.Present) {
+ if (SymbolicOp.SubtractSymbol.Name) {
+ StringRef Name(SymbolicOp.SubtractSymbol.Name);
+ MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+ Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
+ } else {
+ Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx);
+ }
+ }
+
+ const MCExpr *Off = NULL;
+ if (SymbolicOp.Value != 0)
+ Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
+
+ const MCExpr *Expr;
+ if (Sub) {
+ const MCExpr *LHS;
+ if (Add)
+ LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
+ else
+ LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
+ if (Off != 0)
+ Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
+ else
+ Expr = LHS;
+ } else if (Add) {
+ if (Off != 0)
+ Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
+ else
+ Expr = Add;
+ } else {
+ if (Off != 0)
+ Expr = Off;
+ else
+ Expr = MCConstantExpr::Create(0, *Ctx);
+ }
+
+ if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_HI16)
+ MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateUpper16(Expr, *Ctx)));
+ else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_LO16)
+ MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateLower16(Expr, *Ctx)));
+ else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_None)
+ MI.addOperand(MCOperand::CreateExpr(Expr));
+ else
+ assert(0 && "bad SymbolicOp.VariantKind");
+
+ return true;
+}
diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
index 9c30d33..a7ba141 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
+++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
@@ -22,12 +22,17 @@
#define ARMDISASSEMBLERCORE_H
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm-c/Disassembler.h"
#include "ARMBaseInstrInfo.h"
#include "ARMRegisterInfo.h"
#include "ARMDisassembler.h"
namespace llvm {
+class MCContext;
class ARMUtils {
public:
@@ -134,6 +139,31 @@ static inline void setSlice(unsigned &Bits, unsigned From, unsigned To,
Bits |= (Val & Mask) << To;
}
+// Return an integer result equal to the number of bits of x that are ones.
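+// For example, BitCount(0xF0F0F0F0ULL) == 16. A sketch of the intended use,
+// mirroring the register-list check in the Thumb LDM/STM decoder below:
+//   if (BitCount(slice(insn, 7, 0)) < 1) return false; // empty list: UNPREDICTABLE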
+static inline uint32_t
+BitCount (uint64_t x)
+{
+ // c accumulates the total bits set in x
+ uint32_t c;
+ for (c = 0; x; ++c)
+ {
+ x &= x - 1; // clear the least significant bit set
+ }
+ return c;
+}
+
+static inline bool
+BitIsSet (const uint64_t value, const uint64_t bit)
+{
+ return (value & (1ull << bit)) != 0;
+}
+
+static inline bool
+BitIsClear (const uint64_t value, const uint64_t bit)
+{
+ return (value & (1ull << bit)) == 0;
+}
+
/// Various utilities for checking the target specific flags.
/// A unary data processing instruction doesn't have an Rn operand.
@@ -141,6 +171,12 @@ static inline bool isUnaryDP(uint64_t TSFlags) {
return (TSFlags & ARMII::UnaryDP);
}
+/// A NEON Domain instruction has cond field (Inst{31-28}) as 0b1111.
+static inline bool isNEONDomain(uint64_t TSFlags) {
+ return (TSFlags & ARMII::DomainNEON) ||
+ (TSFlags & ARMII::DomainNEONA8);
+}
+
/// This four-bit field describes the addressing mode used.
/// See also ARMBaseInstrInfo.h.
static inline unsigned getAddrMode(uint64_t TSFlags) {
@@ -196,7 +232,7 @@ private:
public:
ARMBasicMCBuilder(ARMBasicMCBuilder &B)
: Opcode(B.Opcode), Format(B.Format), NumOps(B.NumOps), Disasm(B.Disasm),
- SP(B.SP) {
+ SP(B.SP), GetOpInfo(0), DisInfo(0), Ctx(0) {
Err = 0;
}
@@ -255,6 +291,44 @@ private:
assert(SP);
return slice(SP->ITState, 7, 4);
}
+
+private:
+ //
+ // Hooks for symbolic disassembly via the public 'C' interface.
+ //
+ // The function to get the symbolic information for operands.
+ LLVMOpInfoCallback GetOpInfo;
+ // The pointer to the block of symbolic information for the above callback.
+ void *DisInfo;
+ // The assembly context for creating symbols and MCExprs in place of
+ // immediate operands when there is symbolic information.
+ MCContext *Ctx;
+ // The address of the instruction being disassembled.
+ uint64_t Address;
+
+public:
+ void setupBuilderForSymbolicDisassembly(LLVMOpInfoCallback getOpInfo,
+ void *disInfo, MCContext *ctx,
+ uint64_t address) {
+ GetOpInfo = getOpInfo;
+ DisInfo = disInfo;
+ Ctx = ctx;
+ Address = address;
+ }
+
+ uint64_t getBuilderAddress() const { return Address; }
+
+ /// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a
+ /// symbolic operand in place of the immediate Value in the MCInst. The
+ /// immediate Value has had any PC adjustment made by the caller. If the
+ /// getOpInfo() function was set as part of the
+ /// setupBuilderForSymbolicDisassembly() call then that function is called to
+ /// get any symbolic information at the builder's Address for this
+ /// instruction. If that returns non-zero then the symbolic information it
+ /// returns is used to create an MCExpr and that is added as an operand to
+ /// the MCInst. This function returns true if it adds an operand to the
+ /// MCInst and false otherwise.
+ bool tryAddingSymbolicOperand(uint64_t Value, uint64_t InstSize, MCInst &MI);
+
};
} // namespace llvm
diff --git a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
index 23372e0..8d39982 100644
--- a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
+++ b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
@@ -108,6 +108,8 @@ static inline bool IsGPR(unsigned RegClass) {
// Utilities for 32-bit Thumb instructions.
+static inline bool BadReg(uint32_t n) { return n == 13 || n == 15; }
+
// Extract imm4: Inst{19-16}.
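// (e.g. slice(0x000A0000, 19, 16) == 0xA, so getImm4() below yields 0xA;
// each of these field helpers is a thin wrapper around slice().)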
static inline unsigned getImm4(uint32_t insn) { return slice(insn, 19, 16); @@ -398,9 +400,17 @@ static bool DisassembleThumb1General(MCInst &MI, unsigned Opcode, uint32_t insn, assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); - MI.addOperand(MCOperand::CreateImm(UseRt ? getT1Imm8(insn) - : (Imm3 ? getT1Imm3(insn) - : getT1Imm5(insn)))); + unsigned Imm = 0; + if (UseRt) + Imm = getT1Imm8(insn); + else if (Imm3) + Imm = getT1Imm3(insn); + else { + Imm = getT1Imm5(insn); + ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 12, 11)); + getImmShiftSE(ShOp, Imm); + } + MI.addOperand(MCOperand::CreateImm(Imm)); } ++OpIdx; @@ -469,6 +479,7 @@ static bool DisassembleThumb1DP(MCInst &MI, unsigned Opcode, uint32_t insn, // tBX_RET: 0 operand // tBX_RET_vararg: Rm // tBLXr_r9: Rm +// tBRIND: Rm static bool DisassembleThumb1Special(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -476,11 +487,17 @@ static bool DisassembleThumb1Special(MCInst &MI, unsigned Opcode, uint32_t insn, if (NumOps == 0) return true; - // BX/BLX has 1 reg operand: Rm. - if (NumOps == 1) { + // BX/BLX/tBRIND (indirect branch, i.e, mov pc, Rm) has 1 reg operand: Rm. + if (Opcode==ARM::tBLXr_r9 || Opcode==ARM::tBX_Rm || Opcode==ARM::tBRIND) { + if (Opcode != ARM::tBRIND) { + // Handling the two predicate operands before the reg operand. + if (!B->DoPredicateOperands(MI, Opcode, insn, NumOps)) + return false; + NumOpsAdded += 2; + } MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, getT1Rm(insn)))); - NumOpsAdded = 1; + NumOpsAdded += 1; return true; } @@ -598,7 +615,7 @@ static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode, // A6.2.4 Load/store single data item // -// Load/Store Register (reg|imm): tRd tRn imm5 tRm +// Load/Store Register (reg|imm): tRd tRn imm5|tRm // Load Register Signed Byte|Halfword: tRd tRn tRm static bool DisassembleThumb1LdSt(unsigned opA, MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -607,11 +624,6 @@ static bool DisassembleThumb1LdSt(unsigned opA, MCInst &MI, unsigned Opcode, const TargetOperandInfo *OpInfo = TID.OpInfo; unsigned &OpIdx = NumOpsAdded; - // Table A6-5 16-bit Thumb Load/store instructions - // opA = 0b0101 for STR/LDR (register) and friends. - // Otherwise, we have STR/LDR (immediate) and friends. - bool Imm5 = (opA != 5); - assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && OpInfo[1].RegClass == ARM::tGPRRegClassID @@ -624,28 +636,28 @@ static bool DisassembleThumb1LdSt(unsigned opA, MCInst &MI, unsigned Opcode, getT1tRn(insn)))); OpIdx = 2; - // We have either { imm5, tRm } or { tRm } remaining. - // Process the imm5 first. Note that STR/LDR (register) should skip the imm5 - // offset operand for t_addrmode_s[1|2|4]. + // We have either { imm5 } or { tRm } remaining. + // Note that STR/LDR (register) should skip the imm5 offset operand for + // t_addrmode_s[1|2|4]. assert(OpIdx < NumOps && "More operands expected"); if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { - - MI.addOperand(MCOperand::CreateImm(Imm5 ? getT1Imm5(insn) : 0)); + // Table A6-5 16-bit Thumb Load/store instructions + // opA = 0b0101 for STR/LDR (register) and friends. + // Otherwise, we have STR/LDR (immediate) and friends. 
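+ // For instance, "str r0, [r1, #4]" takes this imm5 path (opA != 5), while
+ // "str r0, [r1, r2]" arrives with opA == 5 and is decoded through the tRm
+ // branch below; both mnemonics are assumed examples, not from the TD files.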
+ assert(opA != 5 && "Immediate operand expected for this opcode"); + MI.addOperand(MCOperand::CreateImm(getT1Imm5(insn))); + ++OpIdx; + } else { + // The next reg operand is tRm, the offset. + assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID + && "Thumb reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRm(insn)))); ++OpIdx; } - - // The next reg operand is tRm, the offset. - assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID - && "Thumb reg operand expected"); - MI.addOperand(MCOperand::CreateReg( - Imm5 ? 0 - : getRegisterEnum(B, ARM::tGPRRegClassID, - getT1tRm(insn)))); - ++OpIdx; - return true; } @@ -895,6 +907,10 @@ static bool DisassembleThumb1LdStMul(bool Ld, MCInst &MI, unsigned Opcode, } unsigned RegListBits = slice(insn, 7, 0); + if (BitCount(RegListBits) < 1) { + DEBUG(errs() << "if BitCount(registers) < 1 then UNPREDICTABLE\n"); + return false; + } // Fill the variadic part of reglist. for (unsigned i = 0; i < 8; ++i) @@ -945,6 +961,11 @@ static bool DisassembleThumb1CondBr(MCInst &MI, unsigned Opcode, uint32_t insn, : (int)Imm8)); // Predicate operands by ARMBasicMCBuilder::TryPredicateAndSBitModifier(). + // But note that for tBcc, if cond = '1110' then UNDEFINED. + if (Opcode == ARM::tBcc && slice(insn, 11, 8) == 14) { + DEBUG(errs() << "if cond = '1110' then UNDEFINED\n"); + return false; + } NumOpsAdded = 1; return true; @@ -965,11 +986,7 @@ static bool DisassembleThumb1Br(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned Imm11 = getT1Imm11(insn); - // When executing a Thumb instruction, PC reads as the address of the current - // instruction plus 4. The assembler subtracts 4 from the difference between - // the branch instruction and the target address, disassembler has to add 4 to - // to compensate. 
- MI.addOperand(MCOperand::CreateImm(SignExtend32<12>(Imm11 << 1) + 4)); + MI.addOperand(MCOperand::CreateImm(SignExtend32<12>(Imm11 << 1))); NumOpsAdded = 1; @@ -1129,8 +1146,12 @@ static bool DisassembleThumb2SRS(MCInst &MI, unsigned Opcode, uint32_t insn, // t2RFE[IA|DB]W/t2RFE[IA|DB]: Rn static bool DisassembleThumb2RFE(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRn(insn)))); + unsigned Rn = decodeRn(insn); + if (Rn == 15) { + DEBUG(errs() << "if n == 15 then UNPREDICTABLE\n"); + return false; + } + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,ARM::GPRRegClassID,Rn))); NumOpsAdded = 1; return true; } @@ -1149,7 +1170,7 @@ static bool DisassembleThumb2LdStMul(MCInst &MI, unsigned Opcode, uint32_t insn, Opcode == ARM::t2STMIA || Opcode == ARM::t2STMIA_UPD || Opcode == ARM::t2STMDB || Opcode == ARM::t2STMDB_UPD) && "Unexpected opcode"); - assert(NumOps >= 5 && "Thumb2 LdStMul expects NumOps >= 5"); + assert(NumOps >= 4 && "Thumb2 LdStMul expects NumOps >= 4"); NumOpsAdded = 0; @@ -1203,45 +1224,79 @@ static bool DisassembleThumb2LdStEx(MCInst &MI, unsigned Opcode, uint32_t insn, OpIdx = 0; assert(NumOps >= 2 - && OpInfo[0].RegClass == ARM::GPRRegClassID - && OpInfo[1].RegClass == ARM::GPRRegClassID + && OpInfo[0].RegClass > 0 + && OpInfo[1].RegClass > 0 && "Expect >=2 operands and first two as reg operands"); bool isStore = (ARM::t2STREX <= Opcode && Opcode <= ARM::t2STREXH); bool isSW = (Opcode == ARM::t2LDREX || Opcode == ARM::t2STREX); bool isDW = (Opcode == ARM::t2LDREXD || Opcode == ARM::t2STREXD); + unsigned Rt = decodeRd(insn); + unsigned Rt2 = decodeRs(insn); // But note that this is Rd for t2STREX. + unsigned Rd = decodeRm(insn); + unsigned Rn = decodeRn(insn); + + // Some sanity checking first. + if (isStore) { + // if d == n || d == t then UNPREDICTABLE + // if d == n || d == t || d == t2 then UNPREDICTABLE + if (isDW) { + if (Rd == Rn || Rd == Rt || Rd == Rt2) { + DEBUG(errs() << "if d == n || d == t || d == t2 then UNPREDICTABLE\n"); + return false; + } + } else { + if (isSW) { + if (Rt2 == Rn || Rt2 == Rt) { + DEBUG(errs() << "if d == n || d == t then UNPREDICTABLE\n"); + return false; + } + } else { + if (Rd == Rn || Rd == Rt) { + DEBUG(errs() << "if d == n || d == t then UNPREDICTABLE\n"); + return false; + } + } + } + } else { + // Load + // A8.6.71 LDREXD + // if t == t2 then UNPREDICTABLE + if (isDW && Rt == Rt2) { + DEBUG(errs() << "if t == t2 then UNPREDICTABLE\n"); + return false; + } + } + // Add the destination operand for store. if (isStore) { MI.addOperand(MCOperand::CreateReg( - getRegisterEnum(B, ARM::GPRRegClassID, - isSW ? decodeRs(insn) : decodeRm(insn)))); + getRegisterEnum(B, OpInfo[OpIdx].RegClass, + isSW ? Rt2 : Rd))); ++OpIdx; } // Source operand for store and destination operand for load. - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRd(insn)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, + Rt))); ++OpIdx; // Thumb2 doubleword complication: with an extra source/destination operand. if (isDW) { - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRs(insn)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass, + Rt2))); ++OpIdx; } // Finally add the pointer operand. 
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
+ MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
+ Rn)));
++OpIdx;
return true;
}
-// LLVM, as of Jan-05-2010, does not output <Rt2>, i.e., Rs, in the asm.
-// Whereas the ARM Arch. Manual does not require that t2 = t+1 like in ARM ISA.
-//
// t2LDRDi8: Rd Rs Rn imm8s4 (offset mode)
// t2LDRDpci: Rd Rs imm8s4 (Not decoded, prefer the generic t2LDRDi8 version)
// t2STRDi8: Rd Rs Rn imm8s4 (offset mode)
@@ -1255,18 +1310,50 @@ static bool DisassembleThumb2LdStDual(MCInst &MI, unsigned Opcode,
if (!OpInfo) return false;
assert(NumOps >= 4
- && OpInfo[0].RegClass == ARM::GPRRegClassID
- && OpInfo[1].RegClass == ARM::GPRRegClassID
- && OpInfo[2].RegClass == ARM::GPRRegClassID
+ && OpInfo[0].RegClass > 0
+ && OpInfo[0].RegClass == OpInfo[1].RegClass
+ && OpInfo[2].RegClass > 0
&& OpInfo[3].RegClass < 0
&& "Expect >= 4 operands and first 3 as reg operands");
+ // Thumb allows for specifying Rt and Rt2, unlike ARM (which has Rt2==Rt+1).
+ unsigned Rt = decodeRd(insn);
+ unsigned Rt2 = decodeRs(insn);
+ unsigned Rn = decodeRn(insn);
+
+ // Some sanity checking first.
+
+ // A8.6.67 LDRD (literal) has its W bit as (0).
+ if (Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2LDRD_PRE || Opcode == ARM::t2LDRD_POST) {
+ if (Rn == 15 && slice(insn, 21, 21) != 0)
+ return false;
+ } else {
+ // For Dual Store, PC cannot be used as the base register.
+ if (Rn == 15) {
+ DEBUG(errs() << "if n == 15 then UNPREDICTABLE\n");
+ return false;
+ }
+ }
+ if (Rt == Rt2) {
+ DEBUG(errs() << "if t == t2 then UNPREDICTABLE\n");
+ return false;
+ }
+ if (Opcode != ARM::t2LDRDi8 && Opcode != ARM::t2STRDi8) {
+ if (Rn == Rt || Rn == Rt2) {
+ DEBUG(errs() << "if wback && (n == t || n == t2) then UNPREDICTABLE\n");
+ return false;
+ }
+ }
+
// Add the <Rt> <Rt2> operands.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
+ unsigned RegClassPair = OpInfo[0].RegClass;
+ unsigned RegClassBase = OpInfo[2].RegClass;
+
+ MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassPair,
decodeRd(insn))));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
+ MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassPair,
decodeRs(insn))));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
+ MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassBase,
decodeRn(insn))));
// Finally add (+/-)imm8*4, depending on the U bit.
@@ -1394,9 +1481,12 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn,
if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
&& !OpInfo[OpIdx].isOptionalDef()) {
- if (Thumb2ShiftOpcode(Opcode))
- MI.addOperand(MCOperand::CreateImm(getShiftAmtBits(insn)));
- else {
+ if (Thumb2ShiftOpcode(Opcode)) {
+ unsigned Imm = getShiftAmtBits(insn);
+ ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 5, 4));
+ getImmShiftSE(ShOp, Imm);
+ MI.addOperand(MCOperand::CreateImm(Imm));
+ } else {
// Build the constant shift specifier operand.
unsigned bits2 = getShiftTypeBits(insn); unsigned imm5 = getShiftAmtBits(insn); @@ -1421,7 +1511,8 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; unsigned &OpIdx = NumOpsAdded; OpIdx = 0; @@ -1448,8 +1539,15 @@ static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, DEBUG(errs()<<"Thumb2 encoding error: d==15 for DPModImm 2-reg instr.\n"); return false; } - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID, - decodeRn(insn)))); + int Idx; + if ((Idx = TID.getOperandConstraint(OpIdx, TOI::TIED_TO)) != -1) { + // The reg operand is tied to the first reg operand. + MI.addOperand(MI.getOperand(Idx)); + } else { + // Add second reg operand. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID, + decodeRn(insn)))); + } ++OpIdx; } @@ -1518,7 +1616,7 @@ static bool DisassembleThumb2Sat(MCInst &MI, unsigned Opcode, uint32_t insn, // o t2ADDri12, t2SUBri12: Rs Rn imm12 // o t2LEApcrel (ADR): Rs imm12 // o t2BFC (BFC): Rs Ro(TIED_TO) bf_inv_mask_imm -// o t2BFI (BFI) (Currently not defined in LLVM as of Jan-07-2010) +// o t2BFI (BFI): Rs Ro(TIED_TO) Rn bf_inv_mask_imm // o t2MOVi16: Rs imm16 // o t2MOVTi16: Rs imm16 // o t2SBFX (SBFX): Rs Rn lsb width @@ -1579,9 +1677,10 @@ static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode, if (Opcode == ARM::t2ADDri12 || Opcode == ARM::t2SUBri12 || Opcode == ARM::t2LEApcrel) MI.addOperand(MCOperand::CreateImm(getIImm3Imm8(insn))); - else if (Opcode == ARM::t2MOVi16 || Opcode == ARM::t2MOVTi16) - MI.addOperand(MCOperand::CreateImm(getImm16(insn))); - else if (Opcode == ARM::t2BFC || Opcode == ARM::t2BFI) { + else if (Opcode == ARM::t2MOVi16 || Opcode == ARM::t2MOVTi16) { + if (!B->tryAddingSymbolicOperand(getImm16(insn), 4, MI)) + MI.addOperand(MCOperand::CreateImm(getImm16(insn))); + } else if (Opcode == ARM::t2BFC || Opcode == ARM::t2BFI) { uint32_t mask = 0; if (getBitfieldInvMask(insn, mask)) MI.addOperand(MCOperand::CreateImm(mask)); @@ -1625,8 +1724,7 @@ static inline bool t2MiscCtrlInstr(uint32_t insn) { // A8.6.26 // t2BXJ -> Rn // -// Miscellaneous control: t2DMBsy (and its t2DMB variants), -// t2DSBsy (and its t2DSB varianst), t2ISBsy, t2CLREX +// Miscellaneous control: // -> no operand (except pred-imm pred-ccr for CLREX, memory barrier variants) // // Hint: t2NOP, t2YIELD, t2WFE, t2WFI, t2SEV @@ -1643,6 +1741,22 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode, if (NumOps == 0) return true; + if (Opcode == ARM::t2DMB || Opcode == ARM::t2DSB) { + // Inst{3-0} encodes the memory barrier option for the variants. + unsigned opt = slice(insn, 3, 0); + switch (opt) { + case ARM_MB::SY: case ARM_MB::ST: + case ARM_MB::ISH: case ARM_MB::ISHST: + case ARM_MB::NSH: case ARM_MB::NSHST: + case ARM_MB::OSH: case ARM_MB::OSHST: + MI.addOperand(MCOperand::CreateImm(opt)); + NumOpsAdded = 1; + return true; + default: + return false; + } + } + if (t2MiscCtrlInstr(insn)) return true; @@ -1719,6 +1833,17 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode, return true; } + // Some instructions have predicate operands first before the immediate. 
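+ // (Concretely, the tBL/tBLXi forms handled next: DoPredicateOperands() fills
+ // their two predicate slots first, which is why NumOpsAdded is incremented
+ // rather than assigned when the branch offset is added further down.)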
+ if (Opcode == ARM::tBLXi_r9 || Opcode == ARM::tBLr9) {
+ // Handling the two predicate operands before the imm operand.
+ if (B->DoPredicateOperands(MI, Opcode, insn, NumOps))
+ NumOpsAdded += 2;
+ else {
+ DEBUG(errs() << "Expected predicate operands not found.\n");
+ return false;
+ }
+ }
+
// Add the imm operand.
int Offset = 0;
@@ -1739,13 +1864,12 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode,
Offset = decodeImm32_BLX(insn);
break;
}
- // When executing a Thumb instruction, PC reads as the address of the current
- // instruction plus 4. The assembler subtracts 4 from the difference between
- // the branch instruction and the target address, disassembler has to add 4
- // to compensate.
- MI.addOperand(MCOperand::CreateImm(Offset + 4));
- NumOpsAdded = 1;
+ if (!B->tryAddingSymbolicOperand(Offset + B->getBuilderAddress() + 4, 4, MI))
+ MI.addOperand(MCOperand::CreateImm(Offset));
+
+ // This is an increment as some predicate operands may have been added first.
+ NumOpsAdded += 1;
return true;
}
@@ -1787,7 +1911,7 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn,
decodeRn(insn))));
++OpIdx;
- if (OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) {
+ if (OpInfo[OpIdx].RegClass == ARM::rGPRRegClassID) {
MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
decodeRm(insn))));
} else {
assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
&& !OpInfo[OpIdx].isOptionalDef()
&& "Pure imm operand expected");
int Offset = 0;
- if (slice(insn, 19, 16) == 0xFF) {
- bool Negative = slice(insn, 23, 23) == 0;
- unsigned Imm12 = getImm12(insn);
- Offset = Negative ? -1 - Imm12 : 1 * Imm12;
- } else if (Opcode == ARM::t2PLDi8 || Opcode == ARM::t2PLDWi8 ||
- Opcode == ARM::t2PLIi8) {
+ if (Opcode == ARM::t2PLDi8 || Opcode == ARM::t2PLDWi8 ||
+ Opcode == ARM::t2PLIi8) {
// A8.6.117 Encoding T2: add = FALSE
unsigned Imm8 = getImm8(insn);
- Offset = -1 - Imm8;
+ Offset = -1 * Imm8;
+ } else {
+ // The i12 forms. See, for example, A8.6.117 Encoding T1.
+ // Note that currently t2PLDi12 also handles the previously named t2PLDpci
+ // opcode, that's why we use decodeImm12(insn) which returns +/- imm12.
Offset = decodeImm12(insn);
+ }
MI.addOperand(MCOperand::CreateImm(Offset));
}
++OpIdx;
@@ -1820,6 +1944,87 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn,
return true;
}
+static bool BadRegsThumb2LdSt(unsigned Opcode, uint32_t insn, bool Load,
+ unsigned R0, unsigned R1, unsigned R2, bool UseRm, bool WB) {
+
+ // Inst{22-21} encodes the data item transferred for load/store.
+ // For single word, it is encoded as 0b10.
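+ // Spelled out, as read off the checks below: Inst{22-21} == 0b00 selects
+ // byte, 0b01 halfword, and 0b10 word.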
+ bool Word = (slice(insn, 22, 21) == 2); + bool Half = (slice(insn, 22, 21) == 1); + bool Byte = (slice(insn, 22, 21) == 0); + + if (UseRm && BadReg(R2)) { + DEBUG(errs() << "if BadReg(m) then UNPREDICTABLE\n"); + return true; + } + + if (Load) { + if (!Word && R0 == 13) { + DEBUG(errs() << "if t == 13 then UNPREDICTABLE\n"); + return true; + } + if (Byte) { + if (WB && R0 == 15 && slice(insn, 10, 8) == 3) { + // A8.6.78 LDRSB (immediate) Encoding T2 (errata markup 8.0) + DEBUG(errs() << "if t == 15 && PUW == '011' then UNPREDICTABLE\n"); + return true; + } + } + // A6.3.8 Load halfword, memory hints + if (Half) { + if (WB) { + if (R0 == R1) { + // A8.6.82 LDRSH (immediate) Encoding T2 + DEBUG(errs() << "if WB && n == t then UNPREDICTABLE\n"); + return true; + } + if (R0 == 15 && slice(insn, 10, 8) == 3) { + // A8.6.82 LDRSH (immediate) Encoding T2 (errata markup 8.0) + DEBUG(errs() << "if t == 15 && PUW == '011' then UNPREDICTABLE\n"); + return true; + } + } else { + if (Opcode == ARM::t2LDRHi8 || Opcode == ARM::t2LDRSHi8) { + if (R0 == 15 && slice(insn, 10, 8) == 4) { + // A8.6.82 LDRSH (immediate) Encoding T2 + DEBUG(errs() << "if Rt == '1111' and PUW == '100' then SEE" + << " \"Unallocated memory hints\"\n"); + return true; + } + } else { + if (R0 == 15) { + // A8.6.82 LDRSH (immediate) Encoding T1 + DEBUG(errs() << "if Rt == '1111' then SEE" + << " \"Unallocated memory hints\"\n"); + return true; + } + } + } + } + } else { + if (WB && R0 == R1) { + DEBUG(errs() << "if wback && n == t then UNPREDICTABLE\n"); + return true; + } + if ((WB && R0 == 15) || (!WB && R1 == 15)) { + DEBUG(errs() << "if Rn == '1111' then UNDEFINED\n"); + return true; + } + if (Word) { + if ((WB && R1 == 15) || (!WB && R0 == 15)) { + DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n"); + return true; + } + } else { + if ((WB && BadReg(R1)) || (!WB && BadReg(R0))) { + DEBUG(errs() << "if BadReg(t) then UNPREDICTABLE\n"); + return true; + } + } + } + return false; +} + // A6.3.10 Store single data item // A6.3.9 Load byte, memory hints // A6.3.8 Load halfword, memory hints @@ -1865,16 +2070,16 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, OpIdx = 0; assert(NumOps >= 3 && - OpInfo[0].RegClass == ARM::GPRRegClassID && - OpInfo[1].RegClass == ARM::GPRRegClassID && + OpInfo[0].RegClass > 0 && + OpInfo[1].RegClass > 0 && "Expect >= 3 operands and first two as reg operands"); - bool ThreeReg = (OpInfo[2].RegClass == ARM::GPRRegClassID); + bool ThreeReg = (OpInfo[2].RegClass > 0); bool TIED_TO = ThreeReg && TID.getOperandConstraint(2, TOI::TIED_TO) != -1; bool Imm12 = !ThreeReg && slice(insn, 23, 23) == 1; // ARMInstrThumb2.td // Build the register operands, followed by the immediate. - unsigned R0, R1, R2 = 0; + unsigned R0 = 0, R1 = 0, R2 = 0; unsigned Rd = decodeRd(insn); int Imm = 0; @@ -1905,19 +2110,24 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, Imm = decodeImm8(insn); } - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, R0))); ++OpIdx; - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, R1))); ++OpIdx; if (ThreeReg) { - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + // This could be an offset register or a TIED_TO register. 
+ MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass, R2))); ++OpIdx; } + if (BadRegsThumb2LdSt(Opcode, insn, Load, R0, R1, R2, ThreeReg & !TIED_TO, + TIED_TO)) + return false; + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); @@ -1947,25 +2157,25 @@ static bool DisassembleThumb2DPReg(MCInst &MI, unsigned Opcode, uint32_t insn, OpIdx = 0; assert(NumOps >= 2 && - OpInfo[0].RegClass == ARM::rGPRRegClassID && - OpInfo[1].RegClass == ARM::rGPRRegClassID && + OpInfo[0].RegClass > 0 && + OpInfo[1].RegClass > 0 && "Expect >= 2 operands and first two as reg operands"); // Build the register operands, followed by the optional rotation amount. - bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::rGPRRegClassID; + bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass > 0; - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, decodeRs(insn)))); ++OpIdx; if (ThreeReg) { - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass, decodeRn(insn)))); ++OpIdx; } - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, decodeRm(insn)))); ++OpIdx; diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 1499da0..fc2aa75 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -29,6 +29,9 @@ StringRef ARMInstPrinter::getOpcodeName(unsigned Opcode) const { return getInstructionName(Opcode); } +StringRef ARMInstPrinter::getRegName(unsigned RegNo) const { + return getRegisterName(RegNo); +} void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) { unsigned Opcode = MI->getOpcode(); @@ -133,9 +136,10 @@ static void printSOImm(raw_ostream &O, int64_t V, raw_ostream *CommentStream, unsigned Rot = ARM_AM::getSOImmValRot(V); // Print low-level immediate formation info, per - // A5.1.3: "Data-processing operands - Immediate". + // A5.2.3: Data-processing (immediate), and + // A5.2.4: Modified immediate constants in ARM instructions if (Rot) { - O << "#" << Imm << ", " << Rot; + O << "#" << Imm << ", #" << Rot; // Pretty printed version. if (CommentStream) *CommentStream << (int)ARM_AM::rotr32(Imm, Rot) << "\n"; @@ -178,18 +182,16 @@ void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum, } } +//===--------------------------------------------------------------------===// +// Addressing Mode #2 +//===--------------------------------------------------------------------===// -void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, - raw_ostream &O) { +void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(Op); const MCOperand &MO2 = MI->getOperand(Op+1); const MCOperand &MO3 = MI->getOperand(Op+2); - if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. 
- printOperand(MI, Op, O); - return; - } - O << "[" << getRegisterName(MO1.getReg()); if (!MO2.getReg()) { @@ -212,6 +214,50 @@ void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, O << "]"; } +void ARMInstPrinter::printAM2PostIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO3 = MI->getOperand(Op+2); + + O << "[" << getRegisterName(MO1.getReg()) << "], "; + + if (!MO2.getReg()) { + unsigned ImmOffs = ARM_AM::getAM2Offset(MO3.getImm()); + O << '#' + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << ImmOffs; + return; + } + + O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << getRegisterName(MO2.getReg()); + + if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm())) + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm())) + << " #" << ShImm; +} + +void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, O); + return; + } + + const MCOperand &MO3 = MI->getOperand(Op+2); + unsigned IdxMode = ARM_AM::getAM2IdxMode(MO3.getImm()); + + if (IdxMode == ARMII::IndexModePost) { + printAM2PostIndexOp(MI, Op, O); + return; + } + printAM2PreOrOffsetIndexOp(MI, Op, O); +} + void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { @@ -235,11 +281,35 @@ void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI, << " #" << ShImm; } -void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum+1); - const MCOperand &MO3 = MI->getOperand(OpNum+2); +//===--------------------------------------------------------------------===// +// Addressing Mode #3 +//===--------------------------------------------------------------------===// + +void ARMInstPrinter::printAM3PostIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO3 = MI->getOperand(Op+2); + + O << "[" << getRegisterName(MO1.getReg()) << "], "; + + if (MO2.getReg()) { + O << (char)ARM_AM::getAM3Op(MO3.getImm()) + << getRegisterName(MO2.getReg()); + return; + } + + unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()); + O << '#' + << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())) + << ImmOffs; +} + +void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO3 = MI->getOperand(Op+2); O << '[' << getRegisterName(MO1.getReg()); @@ -256,6 +326,18 @@ void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned OpNum, O << ']'; } +void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO3 = MI->getOperand(Op+2); + unsigned IdxMode = ARM_AM::getAM3IdxMode(MO3.getImm()); + + if (IdxMode == ARMII::IndexModePost) { + printAM3PostIndexOp(MI, Op, O); + return; + } + printAM3PreOrOffsetIndexOp(MI, Op, O); +} + void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { @@ -314,6 +396,12 @@ void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, 
unsigned OpNum, O << "]"; } +void ARMInstPrinter::printAddrMode7Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + O << "[" << getRegisterName(MO1.getReg()) << "]"; +} + void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { @@ -414,16 +502,6 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, } } -void ARMInstPrinter::printNegZeroOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - O << '#'; - if (Op.getImm() < 0) - O << '-' << (-Op.getImm() - 1); - else - O << Op.getImm(); -} - void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h index 679d313..b3ac03a 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -17,14 +17,18 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { - class MCOperand; + +class MCOperand; +class TargetMachine; class ARMInstPrinter : public MCInstPrinter { public: - ARMInstPrinter(const MCAsmInfo &MAI) : MCInstPrinter(MAI) {} + ARMInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) + : MCInstPrinter(MAI) {} virtual void printInst(const MCInst *MI, raw_ostream &O); virtual StringRef getOpcodeName(unsigned Opcode) const; + virtual StringRef getRegName(unsigned RegNo) const; static const char *getInstructionName(unsigned Opcode); @@ -38,15 +42,25 @@ public: void printSOImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printSORegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAM2PostIndexOp(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum, + raw_ostream &O); void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAM3PostIndexOp(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum, + raw_ostream &O); void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode6Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode7Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); @@ -87,9 +101,7 @@ public: void printSetendOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printCPSIMod(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printCPSIFlag(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printCPSOptionOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printMSRMaskOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printNegZeroOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); diff --git 
a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp index 9a27e2f..f6d0242 100644 --- a/lib/Target/ARM/MLxExpansionPass.cpp +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -15,11 +15,13 @@ #define DEBUG_TYPE "mlx-expansion" #include "ARM.h" #include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -49,15 +51,17 @@ namespace { const TargetRegisterInfo *TRI; MachineRegisterInfo *MRI; + bool isA9; unsigned MIIdx; MachineInstr* LastMIs[4]; + SmallPtrSet<MachineInstr*, 4> IgnoreStall; void clearStack(); void pushStack(MachineInstr *MI); MachineInstr *getAccDefMI(MachineInstr *MI) const; unsigned getDefReg(MachineInstr *MI) const; bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const; - bool FindMLxHazard(MachineInstr *MI) const; + bool FindMLxHazard(MachineInstr *MI); void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, unsigned MulOpc, unsigned AddSubOpc, bool NegAcc, bool HasLane); @@ -146,7 +150,7 @@ bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const { } -bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const { +bool MLxExpansion::FindMLxHazard(MachineInstr *MI) { if (NumExpand >= ExpandLimit) return false; @@ -154,7 +158,7 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const { return true; MachineInstr *DefMI = getAccDefMI(MI); - if (TII->isFpMLxInstruction(DefMI->getOpcode())) + if (TII->isFpMLxInstruction(DefMI->getOpcode())) { // r0 = vmla // r3 = vmla r0, r1, r2 // takes 16 - 17 cycles @@ -163,24 +167,33 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const { // r4 = vmul r1, r2 // r3 = vadd r0, r4 // takes about 14 - 15 cycles even with vmul stalling for 4 cycles. + IgnoreStall.insert(DefMI); return true; + } + + if (IgnoreStall.count(MI)) + return false; // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall // preserves the in-order retirement of the instructions. // Look at the next few instructions, if *most* of them can cause hazards, // then the scheduler can't *fix* this, we'd better break up the VMLA. + unsigned Limit1 = isA9 ? 1 : 4; + unsigned Limit2 = isA9 ? 1 : 4; for (unsigned i = 1; i <= 4; ++i) { int Idx = ((int)MIIdx - i + 4) % 4; MachineInstr *NextMI = LastMIs[Idx]; if (!NextMI) continue; - if (TII->canCauseFpMLxStall(NextMI->getOpcode())) - return true; + if (TII->canCauseFpMLxStall(NextMI->getOpcode())) { + if (i <= Limit1) + return true; + } // Look for VMLx RAW hazard. 
- if (hasRAWHazard(getDefReg(MI), NextMI)) + if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI)) return true; } @@ -248,6 +261,7 @@ bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) { bool Changed = false; clearStack(); + IgnoreStall.clear(); unsigned Skip = 0; MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend(); @@ -299,6 +313,8 @@ bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) { TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo()); TRI = Fn.getTarget().getRegisterInfo(); MRI = &Fn.getRegInfo(); + const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>(); + isA9 = STI->isCortexA9(); bool Modified = false; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 9fc3fb9..8ba9a27 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -657,3 +657,27 @@ Note that both "tst" and "moveq" are redundant. //===---------------------------------------------------------------------===// +When loading immediate constants with movt/movw, if there are multiple +constants needed with the same low 16 bits, and those values are not live at +the same time, it would be possible to use a single movw instruction, followed +by multiple movt instructions to rewrite the high bits to different values. +For example: + + volatile store i32 -1, i32* inttoptr (i32 1342210076 to i32*), align 4, + !tbaa +!0 + volatile store i32 -1, i32* inttoptr (i32 1342341148 to i32*), align 4, + !tbaa +!0 + +is compiled and optimized to: + + movw r0, #32796 + mov.w r1, #-1 + movt r0, #20480 + str r1, [r0] + movw r0, #32796 @ <= this MOVW is not needed, value is there already + movt r0, #20482 + str r1, [r0] + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 233e165..dee3d27 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -34,13 +34,14 @@ bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const return !MF.getFrameInfo()->hasVarSizedObjects(); } -static void emitSPUpdate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const TargetInstrInfo &TII, DebugLoc dl, - const Thumb1RegisterInfo &MRI, - int NumBytes) { - emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII, - MRI, dl); +static void +emitSPUpdate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetInstrInfo &TII, DebugLoc dl, + const Thumb1RegisterInfo &MRI, + int NumBytes, unsigned MIFlags = MachineInstr::NoFlags) { + emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII, + MRI, MIFlags); } void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { @@ -70,11 +71,13 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { int FramePtrSpillFI = 0; if (VARegSaveSize) - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -VARegSaveSize); + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -VARegSaveSize, + MachineInstr::FrameSetup); if (!AFI->hasStackFrame()) { if (NumBytes != 0) - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes); + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes, + MachineInstr::FrameSetup); return; } @@ -131,7 +134,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { // Adjust FP so it point to the stack slot that contains the previous FP. 
if (hasFP(MF)) { BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) - .addFrameIndex(FramePtrSpillFI).addImm(0); + .addFrameIndex(FramePtrSpillFI).addImm(0) + .setMIFlags(MachineInstr::FrameSetup); if (NumBytes > 7) // If offset is > 7 then sp cannot be adjusted in a single instruction, // try restoring from fp instead. @@ -140,7 +144,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { if (NumBytes) // Insert it after all the callee-save spills. - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes); + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes, + MachineInstr::FrameSetup); if (STI.isTargetELF() && hasFP(MF)) MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - @@ -156,7 +161,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { // to reference locals. if (RegInfo->hasBasePointer(MF)) BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), BasePtr).addReg(ARM::SP); - + // If the frame has variable sized objects then the epilogue must restore // the sp from fp. We can assume there's an FP here since hasFP already // checks for hasVarSizedObjects. @@ -232,8 +237,8 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, if (NumBytes) { assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) && "No scratch register to restore SP from FP!"); - emitThumbRegPlusImmediate(MBB, MBBI, ARM::R4, FramePtr, -NumBytes, - TII, *RegInfo, dl); + emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, + TII, *RegInfo); BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP) .addReg(ARM::R4); } else @@ -307,6 +312,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, MIB.addReg(Reg, getKillRegState(isKill)); } + MIB.setMIFlags(MachineInstr::FrameSetup); return true; } diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h index c592e12..bcfc516 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.h +++ b/lib/Target/ARM/Thumb1FrameLowering.h @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// #ifndef __THUMB_FRAMEINFO_H_ -#define __THUMM_FRAMEINFO_H_ +#define __THUMB_FRAMEINFO_H_ #include "ARM.h" #include "ARMFrameLowering.h" diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index f62a13e..33cefb6 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -31,8 +31,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -48,15 +46,29 @@ Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, : ARMBaseRegisterInfo(tii, sti) { } +const TargetRegisterClass* +Thumb1RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) + const { + if (RC == ARM::tGPRRegisterClass || RC->hasSuperClass(ARM::tGPRRegisterClass)) + return ARM::tGPRRegisterClass; + return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC); +} + +const TargetRegisterClass * +Thumb1RegisterInfo::getPointerRegClass(unsigned Kind) const { + return ARM::tGPRRegisterClass; +} + /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. 
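/// A small worked example of the path below (illustrative, not from the
/// sources): materializing #0x12345 into r2 becomes an "ldr r2, <cpool#N>"
/// (ARM::tLDRpci) against a constant-pool entry created to hold 0x12345.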
-void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - DebugLoc dl, - unsigned DestReg, unsigned SubIdx, - int Val, - ARMCC::CondCodes Pred, - unsigned PredReg) const { +void +Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); MachineConstantPool *ConstantPool = MF.getConstantPool(); const Constant *C = ConstantInt::get( @@ -64,8 +76,9 @@ void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci)) - .addReg(DestReg, getDefRegState(true), SubIdx) - .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg); + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg) + .setMIFlags(MIFlags); } @@ -76,11 +89,12 @@ void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, static void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, bool CanChangeCC, const TargetInstrInfo &TII, const ARMBaseRegisterInfo& MRI, - DebugLoc dl) { + unsigned MIFlags = MachineInstr::NoFlags) { MachineFunction &MF = *MBB.getParent(); bool isHigh = !isARMLowRegister(DestReg) || (BaseReg != 0 && !isARMLowRegister(BaseReg)); @@ -101,14 +115,15 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, if (NumBytes <= 255 && NumBytes >= 0) AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)) - .addImm(NumBytes); + .addImm(NumBytes).setMIFlags(MIFlags); else if (NumBytes < 0 && NumBytes >= -255) { AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)) - .addImm(NumBytes); + .addImm(NumBytes).setMIFlags(MIFlags); AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg)) - .addReg(LdReg, RegState::Kill); + .addReg(LdReg, RegState::Kill).setMIFlags(MIFlags); } else - MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes); + MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes, + ARMCC::AL, 0, MIFlags); // Emit add / sub. int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr); @@ -151,10 +166,11 @@ static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes, /// a destreg = basereg + immediate in Thumb code. void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, const TargetInstrInfo &TII, const ARMBaseRegisterInfo& MRI, - DebugLoc dl) { + unsigned MIFlags) { bool isSub = NumBytes < 0; unsigned Bytes = (unsigned)NumBytes; if (isSub) Bytes = -NumBytes; @@ -211,8 +227,9 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, if (NumMIs > Threshold) { // This will expand into too many instructions. Load the immediate from a // constpool entry. - emitThumbRegPlusImmInReg(MBB, MBBI, DestReg, BaseReg, NumBytes, true, TII, - MRI, dl); + emitThumbRegPlusImmInReg(MBB, MBBI, dl, + DestReg, BaseReg, NumBytes, true, + TII, MRI, MIFlags); return; } @@ -224,11 +241,12 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, Bytes -= ThisVal; const TargetInstrDesc &TID = TII.get(isSub ? 
ARM::tSUBi3 : ARM::tADDi3); const MachineInstrBuilder MIB = - AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg)); + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg).setMIFlags(MIFlags)); AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal)); } else { BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg) - .addReg(BaseReg, RegState::Kill); + .addReg(BaseReg, RegState::Kill) + .setMIFlags(MIFlags); } BaseReg = DestReg; } @@ -243,9 +261,10 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); if (NeedCC) MIB = AddDefaultT1CC(MIB); - MIB .addReg(DestReg).addImm(ThisVal); + MIB.addReg(DestReg).addImm(ThisVal); if (NeedPred) MIB = AddDefaultPred(MIB); + MIB.setMIFlags(MIFlags); } else { bool isKill = BaseReg != ARM::SP; @@ -255,8 +274,9 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal); if (NeedPred) MIB = AddDefaultPred(MIB); - BaseReg = DestReg; + MIB.setMIFlags(MIFlags); + BaseReg = DestReg; if (Opc == ARM::tADDrSPi) { // r4 = add sp, imm // r4 = add r4, imm @@ -274,7 +294,8 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, const TargetInstrDesc &TID = TII.get(ExtraOpc); AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg)) .addReg(DestReg, RegState::Kill) - .addImm(((unsigned)NumBytes) & 3)); + .addImm(((unsigned)NumBytes) & 3) + .setMIFlags(MIFlags)); } } @@ -283,8 +304,8 @@ static void emitSPUpdate(MachineBasicBlock &MBB, const TargetInstrInfo &TII, DebugLoc dl, const Thumb1RegisterInfo &MRI, int NumBytes) { - emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII, - MRI, dl); + emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII, + MRI); } void Thumb1RegisterInfo:: @@ -337,7 +358,7 @@ static void emitThumbConstant(MachineBasicBlock &MBB, DestReg)) .addImm(ThisVal)); if (Imm > 0) - emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII, MRI, dl); + emitThumbRegPlusImmediate(MBB, MBBI, dl, DestReg, DestReg, Imm, TII, MRI); if (isSub) { const TargetInstrDesc &TID = TII.get(ARM::tRSB); AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg)) @@ -430,8 +451,8 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx, // MI would expand into a large number of instructions. Don't try to // simplify the immediate. 
if (NumMIs > 2) {
-    emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII,
-                              *this, dl);
+    emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII,
+                              *this);
    MBB.erase(II);
    return true;
  }
@@ -450,8 +471,8 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
      }
      Offset = (Offset - Mask * Scale);
      MachineBasicBlock::iterator NII = llvm::next(II);
-      emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII,
-                                *this, dl);
+      emitThumbRegPlusImmediate(MBB, NII, dl, DestReg, DestReg, Offset, TII,
+                                *this);
    } else {
      // Translate r0 = add sp, -imm to
      // r0 = -imm (this is then translated into a series of instructions)
@@ -645,15 +666,15 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
    bool UseRR = false;
    if (Opcode == ARM::tRestore) {
      if (FrameReg == ARM::SP)
-        emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
-                                 Offset, false, TII, *this, dl);
+        emitThumbRegPlusImmInReg(MBB, II, dl, TmpReg, FrameReg,
+                                 Offset, false, TII, *this);
      else {
        emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset);
        UseRR = true;
      }
    } else {
-      emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
-                                *this, dl);
+      emitThumbRegPlusImmediate(MBB, II, dl, TmpReg, FrameReg, Offset, TII,
+                                *this);
    }
    MI.setDesc(TII.get(UseRR ? ARM::tLDRr : ARM::tLDRi));
@@ -668,15 +689,15 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
    if (Opcode == ARM::tSpill) {
      if (FrameReg == ARM::SP)
-        emitThumbRegPlusImmInReg(MBB, II, VReg, FrameReg,
-                                 Offset, false, TII, *this, dl);
+        emitThumbRegPlusImmInReg(MBB, II, dl, VReg, FrameReg,
+                                 Offset, false, TII, *this);
      else {
        emitLoadConstPool(MBB, II, dl, VReg, 0, Offset);
        UseRR = true;
      }
    } else
-      emitThumbRegPlusImmediate(MBB, II, VReg, FrameReg, Offset, TII,
-                                *this, dl);
+      emitThumbRegPlusImmediate(MBB, II, dl, VReg, FrameReg, Offset, TII,
+                                *this);
    MI.setDesc(TII.get(UseRR ? ARM::tSTRr : ARM::tSTRi));
    MI.getOperand(i).ChangeToRegister(VReg, false, false, true);
    if (UseRR)
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index 8a87cc5..9060e59 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -28,6 +28,11 @@ struct Thumb1RegisterInfo : public ARMBaseRegisterInfo {
 public:
  Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);

+  const TargetRegisterClass*
+    getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+
+  const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+
  /// emitLoadConstPool - Emits a load from constpool to materialize the
  /// specified immediate.
  void emitLoadConstPool(MachineBasicBlock &MBB,
@@ -35,7 +40,8 @@ public:
                         DebugLoc dl,
                         unsigned DestReg, unsigned SubIdx, int Val,
                         ARMCC::CondCodes Pred = ARMCC::AL,
-                         unsigned PredReg = 0) const;
+                         unsigned PredReg = 0,
+                         unsigned MIFlags = MachineInstr::NoFlags) const;

  /// Code Generation virtual methods...
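The practical payoff of threading MIFlags through all of these emitters is that a caller can stamp one flag onto every instruction a helper expands to. A minimal caller sketch, in hypothetical Thumb1 prologue code (FrameSize, RegInfo, and the use of MachineInstr::FrameSetup here are illustrative assumptions, not lines from this patch):

    // Each tMOVi8/tRSB/tADD/tSUB the helper emits inherits the flag, so
    // later consumers (frame lowering, debug info) can still recognize
    // the whole expanded sequence as prologue code.
    emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP,
                              -(int)FrameSize, TII, *RegInfo,
                              MachineInstr::FrameSetup);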
void eliminateCallFramePseudoInstr(MachineFunction &MF, diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 9b1073b..d169dbb 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -184,7 +184,7 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg, - const ARMBaseInstrInfo &TII) { + const ARMBaseInstrInfo &TII, unsigned MIFlags) { bool isSub = NumBytes < 0; if (isSub) NumBytes = -NumBytes; @@ -198,14 +198,14 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, // Use a movw to materialize the 16-bit constant. BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg) .addImm(NumBytes) - .addImm((unsigned)Pred).addReg(PredReg); + .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags); Fits = true; } else if ((NumBytes & 0xffff) == 0) { // Use a movt to materialize the 32-bit constant. BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg) .addReg(DestReg) .addImm(NumBytes >> 16) - .addImm((unsigned)Pred).addReg(PredReg); + .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags); Fits = true; } @@ -214,12 +214,14 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg) .addReg(BaseReg, RegState::Kill) .addReg(DestReg, RegState::Kill) - .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + .addImm((unsigned)Pred).addReg(PredReg).addReg(0) + .setMIFlags(MIFlags); } else { BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg) .addReg(DestReg, RegState::Kill) .addReg(BaseReg, RegState::Kill) - .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + .addImm((unsigned)Pred).addReg(PredReg).addReg(0) + .setMIFlags(MIFlags); } return; } @@ -230,7 +232,8 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, unsigned Opc = 0; if (DestReg == ARM::SP && BaseReg != ARM::SP) { // mov sp, rn. Note t2MOVr cannot be used. - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr),DestReg).addReg(BaseReg); + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr),DestReg) + .addReg(BaseReg).setMIFlags(MIFlags); BaseReg = ARM::SP; continue; } @@ -243,7 +246,7 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; // FIXME: Fix Thumb1 immediate encoding. 
BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) - .addReg(BaseReg).addImm(ThisVal/4); + .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags); NumBytes = 0; continue; } @@ -283,7 +286,7 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) .addReg(BaseReg, RegState::Kill) - .addImm(ThisVal)); + .addImm(ThisVal)).setMIFlags(MIFlags); if (HasCCOut) AddDefaultCC(MIB); diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp index 099b8f7..355c3bf 100644 --- a/lib/Target/ARM/Thumb2RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp @@ -13,26 +13,15 @@ //===----------------------------------------------------------------------===// #include "ARM.h" -#include "ARMAddressingModes.h" -#include "ARMBaseInstrInfo.h" -#include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" #include "Thumb2InstrInfo.h" #include "Thumb2RegisterInfo.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" -#include "llvm/LLVMContext.h" #include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLocation.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/ErrorHandling.h" using namespace llvm; Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, @@ -42,13 +31,14 @@ Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. 
-void Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - DebugLoc dl, - unsigned DestReg, unsigned SubIdx, - int Val, - ARMCC::CondCodes Pred, - unsigned PredReg) const { +void +Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); MachineConstantPool *ConstantPool = MF.getConstantPool(); const Constant *C = ConstantInt::get( @@ -57,5 +47,6 @@ void Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci)) .addReg(DestReg, getDefRegState(true), SubIdx) - .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0); + .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0) + .setMIFlags(MIFlags); } diff --git a/lib/Target/ARM/Thumb2RegisterInfo.h b/lib/Target/ARM/Thumb2RegisterInfo.h index b3cf2e5..824378a 100644 --- a/lib/Target/ARM/Thumb2RegisterInfo.h +++ b/lib/Target/ARM/Thumb2RegisterInfo.h @@ -35,7 +35,8 @@ public: DebugLoc dl, unsigned DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred = ARMCC::AL, - unsigned PredReg = 0) const; + unsigned PredReg = 0, + unsigned MIFlags = MachineInstr::NoFlags) const; }; } diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index cc8f61c..ce2e966 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -12,6 +12,7 @@ #include "ARMAddressingModes.h" #include "ARMBaseRegisterInfo.h" #include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" #include "Thumb2InstrInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -49,82 +50,86 @@ namespace { // 1 - No cc field. // 2 - Always set CPSR. unsigned PredCC2 : 2; + unsigned PartFlag : 1; // 16-bit instruction does partial flag update unsigned Special : 1; // Needs to be dealt with specially }; static const ReduceEntry ReduceTable[] = { - // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, S - { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0 }, - { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0 }, + // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, PF, S + { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0,0 }, + { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0,0 }, + { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0,0 }, // Note: immediate scale is 4. 
- { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 1 }, - { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 1 }, - { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 1 }, - { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 0 }, - { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 0,1 }, + { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 0,1 }, + { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 0,1 }, + { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 1,0 }, + { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 1,0 }, //FIXME: Disable CMN, as CCodes are backwards from compare expectations - //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0 }, - { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0 }, - { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 1 }, - { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 0 }, + //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0,1 }, + { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 1,0 }, // FIXME: adr.n immediate offset must be multiple of 4. - //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 0 }, - { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 0 }, - { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 }, - { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1 }, + //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 1,0 }, + { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 1,0 }, + { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 1,0 }, + // FIXME: tMOVi8 and tMVN also partially update CPSR but they are less + // likely to cause issue in the loop. As a size / performance workaround, + // they are not marked as such. + { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,0 }, + { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,1 }, // FIXME: Do we need the 16-bit 'S' variant? 
- { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0 }, - { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0 }, - { ARM::t2MOVCCi,0, ARM::tMOVCCi, 0, 8, 0, 1, 0,1, 0 }, - { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0 }, - { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 1 }, - { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0 }, - { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0 }, - { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0 }, - { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0 }, - { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0 }, - { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0,0 }, + { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0,0 }, + { ARM::t2MOVCCi,0, ARM::tMOVCCi, 0, 8, 0, 1, 0,1, 0,0 }, + { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0,0 }, + { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 0,1 }, + { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0,0 }, + { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0,0 }, + { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0,0 }, + { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0,0 }, + { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,0 }, // FIXME: Clean this up after splitting each Thumb load / store opcode // into multiple ones. 
- { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 1 }, - { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 1 }, - { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 1 }, - - { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 1 }, - { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 1 }, - { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 1 }, + { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 0,1 }, + { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + + { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1 }, + { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1 }, + { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1 }, // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent - { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 1 }, - { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 1 }, + { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1 }, + { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1 }, }; class Thumb2SizeReduce : public MachineFunctionPass { @@ -133,6 +138,7 @@ namespace { Thumb2SizeReduce(); const Thumb2InstrInfo *TII; + const ARMSubtarget *STI; virtual bool runOnMachineFunction(MachineFunction &MF); @@ -144,6 +150,8 @@ namespace { /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable. DenseMap<unsigned, unsigned> ReduceOpcodeMap; + bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use); + bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, bool is2Addr, ARMCC::CondCodes Pred, bool LiveCPSR, bool &HasCC, bool &CCDead); @@ -152,19 +160,20 @@ namespace { const ReduceEntry &Entry); bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, - const ReduceEntry &Entry, bool LiveCPSR); + const ReduceEntry &Entry, bool LiveCPSR, + MachineInstr *CPSRDef); /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address /// instruction. bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR); + bool LiveCPSR, MachineInstr *CPSRDef); /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit /// non-two-address instruction. 
bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
                        const ReduceEntry &Entry,
-                        bool LiveCPSR);
+                        bool LiveCPSR, MachineInstr *CPSRDef);

    /// ReduceMBB - Reduce width of instructions in the specified basic block.
    bool ReduceMBB(MachineBasicBlock &MBB);
@@ -187,6 +196,52 @@ static bool HasImplicitCPSRDef(const TargetInstrDesc &TID) {
  return false;
}

+/// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations,
+/// 16-bit instructions with the 's' bit set partially update CPSR. Abort the
+/// transformation to avoid adding a false dependency on the last CPSR-setting
+/// instruction, which hurts the out-of-order engine's ability to do register
+/// renaming.
+/// This function checks if there is a read-after-write dependency between the
+/// last instruction that defines the CPSR and the current instruction. If
+/// there is, then there is no harm done, since the instruction cannot be
+/// retired before the CPSR-setting instruction anyway.
+/// Note: we do not do full dependency analysis here, for the sake of compile
+/// time. We're not looking for cases like:
+///   r0 = muls ...
+///   r1 = add.w r0, ...
+///   ...
+///      = mul.w r1
+/// In this case it would have been ok to narrow the mul.w to muls since there
+/// is an indirect RAW dependency between the muls and the mul.w.
+bool
+Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use) {
+  if (!Def || !STI->avoidCPSRPartialUpdate())
+    return false;
+
+  SmallSet<unsigned, 2> Defs;
+  for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = Def->getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || Reg == ARM::CPSR)
+      continue;
+    Defs.insert(Reg);
+  }
+
+  for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = Use->getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Defs.count(Reg))
+      return false;
+  }
+
+  // No read-after-write dependency. The narrowing will add a false dependency.
+  return true;
+}
+
bool
Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
                                  bool is2Addr, ARMCC::CondCodes Pred,
@@ -410,7 +465,10 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
    MIB.addOperand(MI->getOperand(OpNum));

  // Transfer memoperands.
-  (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  // Transfer MI flags.
+ MIB.setMIFlags(MI->getFlags()); DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); @@ -422,7 +480,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, bool Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR) { + bool LiveCPSR, MachineInstr *CPSRDef) { if (Entry.LowRegs1 && !VerifyLowRegs(MI)) return false; @@ -440,12 +498,12 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, switch (Opc) { default: break; case ARM::t2ADDSri: { - if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) + if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) return true; // fallthrough } case ARM::t2ADDSrr: - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); } } break; @@ -453,13 +511,13 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, case ARM::t2RSBri: case ARM::t2RSBSri: if (MI->getOperand(2).getImm() == 0) - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); break; case ARM::t2MOVi16: // Can convert only 'pure' immediate operands, not immediates obtained as // globals' addresses. if (MI->getOperand(1).isImm()) - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); break; case ARM::t2CMPrr: { // Try to reduce to the lo-reg only version first. Why there are two @@ -468,17 +526,17 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, // are prioritized, but the table assumes a unique entry for each // source insn opcode. So for now, we hack a local entry record to use. static const ReduceEntry NarrowEntry = - { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 1 }; - if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR)) + { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1 }; + if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef)) return true; - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); } case ARM::t2ADDrSPi: { static const ReduceEntry NarrowEntry = - { ARM::t2ADDrSPi,ARM::tADDspi, 0, 7, 0, 1, 0, 1, 0, 1 }; + { ARM::t2ADDrSPi,ARM::tADDspi, 0, 7, 0, 1, 0, 1, 0, 0,1 }; if (MI->getOperand(0).getReg() == ARM::SP) - return ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR); - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); } } return false; @@ -487,7 +545,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, bool Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR) { + bool LiveCPSR, MachineInstr *CPSRDef) { if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr)) return false; @@ -542,6 +600,12 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead)) return false; + // Avoid adding a false dependency on partial flag update by some 16-bit + // instructions which has the 's' bit set. + if (Entry.PartFlag && NewTID.hasOptionalDef() && HasCC && + canAddPseudoFlagDep(CPSRDef, MI)) + return false; + // Add the 16-bit instruction. 
DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID); @@ -563,6 +627,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, MIB.addOperand(MI->getOperand(i)); } + // Transfer MI flags. + MIB.setMIFlags(MI->getFlags()); + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); MBB.erase(MI); @@ -573,7 +640,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, bool Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR) { + bool LiveCPSR, MachineInstr *CPSRDef) { if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit)) return false; @@ -626,6 +693,12 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead)) return false; + // Avoid adding a false dependency on partial flag update by some 16-bit + // instructions which has the 's' bit set. + if (Entry.PartFlag && NewTID.hasOptionalDef() && HasCC && + canAddPseudoFlagDep(CPSRDef, MI)) + return false; + // Add the 16-bit instruction. DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID); @@ -663,6 +736,9 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, if (!TID.isPredicable() && NewTID.isPredicable()) AddDefaultPred(MIB); + // Transfer MI flags. + MIB.setMIFlags(MI->getFlags()); + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); MBB.erase(MI); @@ -670,7 +746,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, return true; } -static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) { +static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) { bool HasDef = false; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); @@ -678,6 +754,8 @@ static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) { continue; if (MO.getReg() != ARM::CPSR) continue; + + DefCPSR = true; if (!MO.isDead()) HasDef = true; } @@ -707,6 +785,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { // Yes, CPSR could be livein. bool LiveCPSR = MBB.isLiveIn(ARM::CPSR); + MachineInstr *CPSRDef = 0; MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end(); MachineBasicBlock::iterator NextMII; @@ -722,7 +801,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { const ReduceEntry &Entry = ReduceTable[OPI->second]; // Ignore "special" cases for now. if (Entry.Special) { - if (ReduceSpecial(MBB, MI, Entry, LiveCPSR)) { + if (ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef)) { Modified = true; MachineBasicBlock::iterator I = prior(NextMII); MI = &*I; @@ -731,7 +810,8 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { } // Try to transform to a 16-bit two-address instruction. - if (Entry.NarrowOpc2 && ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) { + if (Entry.NarrowOpc2 && + ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) { Modified = true; MachineBasicBlock::iterator I = prior(NextMII); MI = &*I; @@ -739,7 +819,8 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { } // Try to transform to a 16-bit non-two-address instruction. 
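For intuition, the hazard that the new PartFlag/CPSRDef plumbing guards against can be sketched as follows (an illustrative instruction sequence, not taken from this patch's tests):

    // A narrowed 16-bit flag-setting instruction updates CPSR only
    // partially, so an out-of-order core such as Cortex-A9 must treat it
    // as a read-modify-write of CPSR:
    //
    //   adds r0, r1, #1    ; writes CPSR
    //   ...                ; nothing here reads r0 or CPSR
    //   lsls r2, r3, #2    ; partial CPSR update -> false dep on the adds
    //
    // If the candidate directly reads one of the CPSR writer's results, it
    // cannot retire earlier anyway, so narrowing costs nothing.
    // canAddPseudoFlagDep(CPSRDef, MI) returns true only when no such read
    // exists, and the reduction is then skipped.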
- if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) { + if (Entry.NarrowOpc1 && + ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef)) { Modified = true; MachineBasicBlock::iterator I = prior(NextMII); MI = &*I; @@ -747,7 +828,14 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { } ProcessNext: - LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR); + bool DefCPSR = false; + LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR); + if (MI->getDesc().isCall()) + // Calls don't really set CPSR. + CPSRDef = 0; + else if (DefCPSR) + // This is the last CPSR defining instruction. + CPSRDef = MI; } return Modified; @@ -756,6 +844,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { const TargetMachine &TM = MF.getTarget(); TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo()); + STI = &TM.getSubtarget<ARMSubtarget>(); bool Modified = false; for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) diff --git a/lib/Target/Alpha/Alpha.td b/lib/Target/Alpha/Alpha.td index 4508eda..ae79c2e 100644 --- a/lib/Target/Alpha/Alpha.td +++ b/lib/Target/Alpha/Alpha.td @@ -21,7 +21,7 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// def FeatureCIX : SubtargetFeature<"cix", "HasCT", "true", - "Enable CIX extentions">; + "Enable CIX extensions">; //===----------------------------------------------------------------------===// // Register File Description diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp index c4f43ab..ee404f0 100644 --- a/lib/Target/Alpha/AlphaISelLowering.cpp +++ b/lib/Target/Alpha/AlphaISelLowering.cpp @@ -296,7 +296,7 @@ AlphaTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain and // flag operands which copy the outgoing args into registers. The InFlag in - // necessary since all emited instructions must be stuck together. + // necessary since all emitted instructions must be stuck together. 
SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td
index 099d715..b201712 100644
--- a/lib/Target/Alpha/AlphaInstrInfo.td
+++ b/lib/Target/Alpha/AlphaInstrInfo.td
@@ -1030,7 +1030,7 @@ def : Pat<(brcond (setune F8RC:$RA, immFPZ), bb:$DISP),
//WMB Mfc 18.4400 Write memory barrier
//MF_FPCR F-P 17.025 Move from FPCR
//MT_FPCR F-P 17.024 Move to FPCR
-//There are in the Multimedia extentions, so let's not use them yet
+//These are in the Multimedia extensions, so let's not use them yet
//def MAXSB8 : OForm<0x1C, 0x3E, "MAXSB8 $RA,$RB,$RC">; //Vector signed byte maximum
//def MAXSW4 : OForm< 0x1C, 0x3F, "MAXSW4 $RA,$RB,$RC">; //Vector signed word maximum
//def MAXUB8 : OForm<0x1C, 0x3C, "MAXUB8 $RA,$RB,$RC">; //Vector unsigned byte maximum
diff --git a/lib/Target/Alpha/README.txt b/lib/Target/Alpha/README.txt
index 9ae1517..cc170e3 100644
--- a/lib/Target/Alpha/README.txt
+++ b/lib/Target/Alpha/README.txt
@@ -33,9 +33,9 @@ add crazy vector instructions (MVI):
(MIN|MAX)(U|S)(B8|W4) min and max, signed and unsigned, byte and word
PKWB, UNPKBW pack/unpack word to byte
PKLB UNPKBL pack/unpack long to byte
-PERR pixel error (sum accross bytes of bytewise abs(i8v8 a - i8v8 b))
+PERR pixel error (sum across bytes of bytewise abs(i8v8 a - i8v8 b))

-cmpbytes bytewise cmpeq of i8v8 a and i8v8 b (not part of MVI extentions)
+cmpbytes bytewise cmpeq of i8v8 a and i8v8 b (not part of MVI extensions)

this has some good examples for other operations that can be synthesised well
from these rather meager vector ops (such as saturating add).
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.cpp b/lib/Target/Blackfin/BlackfinISelLowering.cpp
index 7c80eec..1e1f8c9 100644
--- a/lib/Target/Blackfin/BlackfinISelLowering.cpp
+++ b/lib/Target/Blackfin/BlackfinISelLowering.cpp
@@ -345,7 +345,7 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee,

  // Build a sequence of copy-to-reg nodes chained together with token
  // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
+  // The InFlag is necessary since all emitted instructions must be
  // stuck together.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
index 6c555a3..358d1b3 100644
--- a/lib/Target/CBackend/CBackend.cpp
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -2440,24 +2440,6 @@ void CWriter::visitReturnInst(ReturnInst &I) {
    return;
  }

-  if (I.getNumOperands() > 1) {
-    Out << " {\n";
-    Out << " ";
-    printType(Out, I.getParent()->getParent()->getReturnType());
-    Out << " llvm_cbe_mrv_temp = {\n";
-    for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
-      Out << " ";
-      writeOperand(I.getOperand(i));
-      if (i != e - 1)
-        Out << ",";
-      Out << "\n";
-    }
-    Out << " };\n";
-    Out << " return llvm_cbe_mrv_temp;\n";
-    Out << " }\n";
-    return;
-  }
-
  Out << " return";
  if (I.getNumOperands()) {
    Out << ' ';
diff --git a/lib/Target/CellSPU/SPU64InstrInfo.td b/lib/Target/CellSPU/SPU64InstrInfo.td
index 5ef5716..f340edf 100644
--- a/lib/Target/CellSPU/SPU64InstrInfo.td
+++ b/lib/Target/CellSPU/SPU64InstrInfo.td
@@ -24,7 +24,7 @@
// 5. The code sequences for r64 and v2i64 are probably overly conservative,
//    compared to the code that gcc produces.
//
-// M00$E B!tes Kan be Pretty N@sTi!!!!! (appologies to Monty!)
+// M00$E B!tes Kan be Pretty N@sTi!!!!! (apologies to Monty!) //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // selb instruction definition for i64. Note that the selection mask is diff --git a/lib/Target/CellSPU/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp index 4040461..fd96694 100644 --- a/lib/Target/CellSPU/SPUAsmPrinter.cpp +++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp @@ -182,6 +182,10 @@ namespace { printOp(MI->getOperand(OpNo), O); } + void printHBROperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + printOp(MI->getOperand(OpNo), O); + } + void printPCRelativeOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { // Used to generate a ".-<target>", but it turns out that the assembler // really wants the target. @@ -279,6 +283,9 @@ void SPUAsmPrinter::printOp(const MachineOperand &MO, raw_ostream &O) { } O << *Mang->getSymbol(MO.getGlobal()); return; + case MachineOperand::MO_MCSymbol: + O << *(MO.getMCSymbol()); + return; default: O << "<unknown operand type: " << MO.getType() << ">"; return; diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index d226156..9351ffd 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -321,12 +321,17 @@ SPUDAGToDAGISel::SelectAFormAddr(SDNode *Op, SDValue N, SDValue &Base, // These match the addr256k operand type: EVT OffsVT = MVT::i16; SDValue Zero = CurDAG->getTargetConstant(0, OffsVT); + int64_t val; switch (N.getOpcode()) { case ISD::Constant: + val = dyn_cast<ConstantSDNode>(N.getNode())->getSExtValue(); + Base = CurDAG->getTargetConstant( val , MVT::i32); + Index = Zero; + return true; break; case ISD::ConstantPool: case ISD::GlobalAddress: - report_fatal_error("SPU SelectAFormAddr: Constant/Pool/Global not lowered."); + report_fatal_error("SPU SelectAFormAddr: Pool/Global not lowered."); /*NOTREACHED*/ case ISD::TargetConstant: diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 743a4d7..8668da3 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -705,7 +705,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { offset )); - // Shift the low similarily + // Shift the low similarly // TODO: add SPUISD::SHL_BYTES low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset ); diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index dd48d7b..cf883e2 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -183,14 +183,6 @@ namespace llvm { virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty) const; - - /// After allocating this many registers, the allocator should feel - /// register pressure. The value is a somewhat random guess, based on the - /// number of non callee saved registers in the C calling convention. 
-  virtual unsigned getRegPressureLimit( const TargetRegisterClass *RC,
-                                        MachineFunction &MF) const{
-    return 50;
-  }
};

}
diff --git a/lib/Target/CellSPU/SPUInstrFormats.td b/lib/Target/CellSPU/SPUInstrFormats.td
index 21bc275..bdbe255 100644
--- a/lib/Target/CellSPU/SPUInstrFormats.td
+++ b/lib/Target/CellSPU/SPUInstrFormats.td
@@ -296,3 +296,25 @@ class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
  let Pattern = pattern;
  let Inst{31-0} = 0;
}
+
+//===----------------------------------------------------------------------===//
+// Branch hint formats
+//===----------------------------------------------------------------------===//
+// For hbrr and hbra
+class HBI16Form<bits<7> opcode, dag IOL, string asmstr>
+  : Instruction {
+  field bits<32> Inst;
+  bits<16> i16;
+  bits<9> RO;
+
+  let Namespace = "SPU";
+  let InOperandList = IOL;
+  let OutOperandList = (outs); // no output
+  let AsmString = asmstr;
+  let Itinerary = BranchHints;
+
+  let Inst{0-6} = opcode;
+  let Inst{7-8} = RO{8-7};
+  let Inst{9-24} = i16;
+  let Inst{25-31} = RO{6-0};
+}
diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp
index f9e6c72..080434d 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.cpp
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCContext.h"

using namespace llvm;

@@ -281,9 +282,20 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
  return true;
}

+// Search MBB for branch hint labels and branch hint ops.
+static void removeHBR(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+    if (I->getOpcode() == SPU::HBRA ||
+        I->getOpcode() == SPU::HBR_LABEL)
+      I = MBB.erase(I); // erase() already advances to the next instruction
+    else
+      ++I;
+  }
+}
+
unsigned
SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator I = MBB.end();
+  removeHBR(MBB);
  if (I == MBB.begin())
    return 0;
  --I;
@@ -314,6 +326,23 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
    return 2;
  }

+/** Find the optimal position for a hint branch instruction in a basic block.
+ * This should take into account:
+ *   - the branch hint delays
+ *   - congestion of the memory bus
+ *   - dual-issue scheduling (i.e. avoid insertion of nops)
+ * The current implementation is rather simplistic.
+ */
+static MachineBasicBlock::iterator findHBRPosition(MachineBasicBlock &MBB)
+{
+  MachineBasicBlock::iterator J = MBB.end();
+  for (int i = 0; i < 8; i++) {
+    if (J == MBB.begin())
+      return J;
+    J--;
+  }
+  return J;
+}
+
unsigned
SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                           MachineBasicBlock *FBB,
@@ -324,32 +353,61 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
  assert((Cond.size() == 2 || Cond.size() == 0) &&
         "SPU branch conditions have two components!");

+  MachineInstrBuilder MIB;
+  // TODO: make a more accurate algorithm.
+  bool haveHBR = MBB.size() > 8;
+
+  removeHBR(MBB);
+  MCSymbol *branchLabel = MBB.getParent()->getContext().CreateTempSymbol();
+  // Add a label just before the branch
+  if (haveHBR)
+    MIB = BuildMI(&MBB, DL, get(SPU::HBR_LABEL)).addSym(branchLabel);
+
  // One-way branch.
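For reference, the instruction layout that InsertBranch is building toward looks like this (label names are illustrative, not emitted verbatim):

    //     ...
    //     hbra  .L_br, .L_target  ; hint placed up to 8 instructions early
    //     ...                     ; by findHBRPosition(), giving the SPU
    //   .L_br:                    ; time to prefetch the branch target
    //     br    .L_target         ; the hinted branch itself
    //
    // HBR_LABEL pins the .L_br symbol immediately in front of the branch,
    // so the hbra's first operand can name the exact address of the branch
    // being hinted.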
if (FBB == 0) {
    if (Cond.empty()) {
      // Unconditional branch
-      MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(SPU::BR));
+      MIB = BuildMI(&MBB, DL, get(SPU::BR));
      MIB.addMBB(TBB);

      DEBUG(errs() << "Inserted one-way uncond branch: ");
      DEBUG((*MIB).dump());
+
+      // Basic blocks have just one branch, so it is safe to add the hint to it.
+      if (haveHBR) {
+        MIB = BuildMI(MBB, findHBRPosition(MBB), DL, get(SPU::HBRA));
+        MIB.addSym(branchLabel);
+        MIB.addMBB(TBB);
+      }
    } else {
      // Conditional branch
-      MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
+      MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
      MIB.addReg(Cond[1].getReg()).addMBB(TBB);
+
+      if (haveHBR) {
+        MIB = BuildMI(MBB, findHBRPosition(MBB), DL, get(SPU::HBRA));
+        MIB.addSym(branchLabel);
+        MIB.addMBB(TBB);
+      }
+
      DEBUG(errs() << "Inserted one-way cond branch: ");
      DEBUG((*MIB).dump());
    }
    return 1;
  } else {
-    MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
+    MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
    MachineInstrBuilder MIB2 = BuildMI(&MBB, DL, get(SPU::BR));

    // Two-way Conditional Branch.
    MIB.addReg(Cond[1].getReg()).addMBB(TBB);
    MIB2.addMBB(FBB);
+
+    if (haveHBR) {
+      MIB = BuildMI(MBB, findHBRPosition(MBB), DL, get(SPU::HBRA));
+      MIB.addSym(branchLabel);
+      MIB.addMBB(FBB);
+    }
+
    DEBUG(errs() << "Inserted conditional branch: ");
    DEBUG((*MIB).dump());
    DEBUG(errs() << "part 2: ");
diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td
index 25f6fd0..e103c9b 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@@ -28,6 +28,8 @@ let hasCtrlDep = 1, Defs = [R1], Uses = [R1] in {
  def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm_i32:$amt),
                              "${:comment} ADJCALLSTACKUP",
                              [(callseq_end timm:$amt)]>;
+  def HBR_LABEL : Pseudo<(outs), (ins hbrtarget:$targ),
+                         "$targ:\t${:comment}branch hint target",[ ]>;
}

//===----------------------------------------------------------------------===//
@@ -2013,9 +2015,9 @@ class SHLHInst<dag OOL, dag IOL, list<dag> pattern>:
         RotShiftVec, pattern>;

class SHLHVecInst<ValueType vectype>:
-    SHLHInst<(outs VECREG:$rT), (ins VECREG:$rA, R16C:$rB),
+    SHLHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
             [(set (vectype VECREG:$rT),
-                  (SPUvec_shl (vectype VECREG:$rA), R16C:$rB))]>;
+                  (SPUvec_shl (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;

multiclass ShiftLeftHalfword
{
@@ -2063,9 +2065,9 @@ class SHLInst<dag OOL, dag IOL, list<dag> pattern>:
multiclass ShiftLeftWord
{
  def v4i32:
-      SHLInst<(outs VECREG:$rT), (ins VECREG:$rA, R16C:$rB),
+      SHLInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
             [(set (v4i32 VECREG:$rT),
-                  (SPUvec_shl (v4i32 VECREG:$rA), R16C:$rB))]>;
+                  (SPUvec_shl (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
  def r32:
      SHLInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
              [(set R32C:$rT, (shl R32C:$rA, R32C:$rB))]>;
@@ -2511,19 +2513,11 @@ class ROTHMInst<dag OOL, dag IOL, list<dag> pattern>:
         RotShiftVec, pattern>;

def ROTHMv8i16:
-    ROTHMInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+    ROTHMInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
              [/* see patterns below - $rB must be negated */]>;

-def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), R32C:$rB),
-          (ROTHMv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
-
-def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), R16C:$rB),
-          (ROTHMv8i16 VECREG:$rA,
-                      (SFIr32 (XSHWr16 R16C:$rB), 0))>;
-
-def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), R8C:$rB),
-          (ROTHMv8i16 VECREG:$rA,
-                      (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB) ), 0))>;
+def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)),
+
(ROTHMv8i16 VECREG:$rA, (SFHIvec VECREG:$rB, 0))>; // ROTHM r16 form: Rotate 16-bit quantity to right, zero fill at the left // Note: This instruction doesn't match a pattern because rB must be negated @@ -2584,19 +2578,11 @@ class ROTMInst<dag OOL, dag IOL, list<dag> pattern>: RotShiftVec, pattern>; def ROTMv4i32: - ROTMInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + ROTMInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), [/* see patterns below - $rB must be negated */]>; -def : Pat<(SPUvec_srl (v4i32 VECREG:$rA), R32C:$rB), - (ROTMv4i32 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; - -def : Pat<(SPUvec_srl (v4i32 VECREG:$rA), R16C:$rB), - (ROTMv4i32 VECREG:$rA, - (SFIr32 (XSHWr16 R16C:$rB), 0))>; - -def : Pat<(SPUvec_srl (v4i32 VECREG:$rA), R8C:$rB), - (ROTMv4i32 VECREG:$rA, - (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>; +def : Pat<(SPUvec_srl (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)), + (ROTMv4i32 VECREG:$rA, (SFIvec VECREG:$rB, 0))>; def ROTMr32: ROTMInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB), @@ -2802,20 +2788,12 @@ defm ROTQMBII: RotateMaskQuadByBitsImm; //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ def ROTMAHv8i16: - RRForm<0b01111010000, (outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + RRForm<0b01111010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), "rotmah\t$rT, $rA, $rB", RotShiftVec, [/* see patterns below - $rB must be negated */]>; -def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), R32C:$rB), - (ROTMAHv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; - -def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), R16C:$rB), - (ROTMAHv8i16 VECREG:$rA, - (SFIr32 (XSHWr16 R16C:$rB), 0))>; - -def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), R8C:$rB), - (ROTMAHv8i16 VECREG:$rA, - (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>; +def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)), + (ROTMAHv8i16 VECREG:$rA, (SFHIvec VECREG:$rB, 0))>; def ROTMAHr16: RRForm<0b01111010000, (outs R16C:$rT), (ins R16C:$rA, R32C:$rB), @@ -2857,20 +2835,12 @@ def : Pat<(sra R16C:$rA, (i8 imm:$val)), (ROTMAHIr16 R16C:$rA, (TO_IMM32 uimm7:$val))>; def ROTMAv4i32: - RRForm<0b01011010000, (outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + RRForm<0b01011010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), "rotma\t$rT, $rA, $rB", RotShiftVec, [/* see patterns below - $rB must be negated */]>; -def : Pat<(SPUvec_sra (v4i32 VECREG:$rA), R32C:$rB), - (ROTMAv4i32 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; - -def : Pat<(SPUvec_sra (v4i32 VECREG:$rA), R16C:$rB), - (ROTMAv4i32 VECREG:$rA, - (SFIr32 (XSHWr16 R16C:$rB), 0))>; - -def : Pat<(SPUvec_sra (v4i32 VECREG:$rA), R8C:$rB), - (ROTMAv4i32 VECREG:$rA, - (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>; +def : Pat<(SPUvec_sra (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)), + (ROTMAv4i32 VECREG:$rA, (SFIvec (v4i32 VECREG:$rB), 0))>; def ROTMAr32: RRForm<0b01011010000, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB), @@ -4208,8 +4178,8 @@ def : Pat<(fabs (v4f32 VECREG:$rA)), //===----------------------------------------------------------------------===// // Hint for branch instructions: //===----------------------------------------------------------------------===// - -/* def HBR : SPUInstr<(outs), (ins), "hbr\t" */ +def HBRA : + HBI16Form<0b0001001,(ins hbrtarget:$brinst, brtarget:$btarg), "hbra\t$brinst, $btarg">; //===----------------------------------------------------------------------===// // Execution, Load NOP (execute NOPs belong in even pipeline, load NOPs belong diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h index 641da04..1708c59 100644 --- 
a/lib/Target/CellSPU/SPURegisterInfo.h
+++ b/lib/Target/CellSPU/SPURegisterInfo.h
@@ -46,6 +46,14 @@ namespace llvm {
    virtual const TargetRegisterClass *
    getPointerRegClass(unsigned Kind = 0) const;

+    /// After allocating this many registers, the allocator should feel
+    /// register pressure. The value is a somewhat random guess, based on the
+    /// number of non callee saved registers in the C calling convention.
+    virtual unsigned getRegPressureLimit( const TargetRegisterClass *RC,
+                                          MachineFunction &MF) const{
+      return 50;
+    }
+
    //! Return the array of callee-saved registers
    virtual const unsigned* getCalleeSavedRegs(const MachineFunction *MF) const;

diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 71d6049..797cfd5 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -1348,12 +1348,10 @@ void CppWriter::printInstruction(const Instruction *I,
    const PHINode* phi = cast<PHINode>(I);

    Out << "PHINode* " << iName << " = PHINode::Create("
-        << getCppName(phi->getType()) << ", \"";
+        << getCppName(phi->getType()) << ", "
+        << phi->getNumIncomingValues() << ", \"";
    printEscapedString(phi->getName());
    Out << "\", " << bbname << ");";
-    nl(Out) << iName << "->reserveOperandSpace("
-      << phi->getNumIncomingValues()
-        << ");";
    nl(Out);
    for (unsigned i = 0; i < phi->getNumOperands(); i+=2) {
      Out << iName << "->addIncoming("
diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
index 3379ac2..060a87b 100644
--- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
+++ b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
@@ -57,18 +57,26 @@ static unsigned mblazeBinary2Opcode[] = {
};

static unsigned getRD(uint32_t insn) {
+  if (!MBlazeRegisterInfo::isRegister((insn>>21)&0x1F))
+    return UNSUPPORTED;
  return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>21)&0x1F);
}

static unsigned getRA(uint32_t insn) {
+  if (!MBlazeRegisterInfo::isRegister((insn>>16)&0x1F))
+    return UNSUPPORTED;
  return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>16)&0x1F);
}

static unsigned getRB(uint32_t insn) {
+  if (!MBlazeRegisterInfo::isRegister((insn>>11)&0x1F))
+    return UNSUPPORTED;
  return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>11)&0x1F);
}

static int64_t getRS(uint32_t insn) {
+  if (!MBlazeRegisterInfo::isSpecialRegister(insn&0x3FFF))
+    return UNSUPPORTED;
  return MBlazeRegisterInfo::getSpecialRegisterFromNumbering(insn&0x3FFF);
}

@@ -489,13 +497,14 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
                                        raw_ostream &vStream) const {
  // The machine instruction.
  uint32_t insn;
+  uint64_t read;
  uint8_t bytes[4];

-  // We always consume 4 bytes of data
-  size = 4;
+  // By default we consume 1 byte on failure
+  size = 1;

  // We want to read exactly 4 bytes of data.
-  if (region.readBytes(address, 4, (uint8_t*)bytes, NULL) == -1)
+  if (region.readBytes(address, 4, (uint8_t*)bytes, &read) == -1 || read < 4)
    return false;

  // Encoded as a big-endian 32-bit word in the stream.
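Reporting size = 1 on failure matters to byte-stream consumers that want to resynchronize rather than silently skip a whole word. A sketch of the driver loop this contract enables (Dis, Region, Start, End, handleDecoded, and the exact getInstruction parameter order are assumptions based on the signature above, not code from this patch):

    MCInst Inst;
    uint64_t Size = 0;
    for (uint64_t Addr = Start; Addr < End; Addr += Size) {
      if (Dis.getInstruction(Inst, Size, Region, Addr, nulls()))
        handleDecoded(Inst); // Size == 4: a full word was decoded
      // else Size == 1: advance a single byte and try to realign
    }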
@@ -509,44 +518,63 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, instr.setOpcode(opcode); + unsigned RD = getRD(insn); + unsigned RA = getRA(insn); + unsigned RB = getRB(insn); + unsigned RS = getRS(insn); + uint64_t tsFlags = MBlazeInsts[opcode].TSFlags; switch ((tsFlags & MBlazeII::FormMask)) { - default: llvm_unreachable("unknown instruction encoding"); + default: + return false; case MBlazeII::FRRRR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRB(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RB)); + instr.addOperand(MCOperand::CreateReg(RA)); break; case MBlazeII::FRRR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); - instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FRI: switch (opcode) { - default: llvm_unreachable("unknown instruction encoding"); + default: + return false; case MBlaze::MFS: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(insn&0x3FFF)); break; case MBlaze::MTS: + if (RA == UNSUPPORTED) + return false; instr.addOperand(MCOperand::CreateImm(insn&0x3FFF)); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + instr.addOperand(MCOperand::CreateReg(RA)); break; case MBlaze::MSRSET: case MBlaze::MSRCLR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(insn&0x7FFF)); break; } break; case MBlazeII::FRRI: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); switch (opcode) { default: instr.addOperand(MCOperand::CreateImm(getIMM(insn))); @@ -560,27 +588,37 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, break; case MBlazeII::FCRR: - instr.addOperand(MCOperand::CreateReg(getRA(insn))); - instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RA == UNSUPPORTED || RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RA)); + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FCRI: - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RA)); instr.addOperand(MCOperand::CreateImm(getIMM(insn))); break; case MBlazeII::FRCR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RD == UNSUPPORTED || RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FRCI: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(getIMM(insn))); break; case MBlazeII::FCCR: - 
instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FCCI: @@ -588,33 +626,45 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, break; case MBlazeII::FRRCI: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); instr.addOperand(MCOperand::CreateImm(getSHT(insn))); break; case MBlazeII::FRRC: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); break; case MBlazeII::FRCX: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(getFSL(insn))); break; case MBlazeII::FRCS: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRS(insn))); + if (RD == UNSUPPORTED || RS == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RS)); break; case MBlazeII::FCRCS: - instr.addOperand(MCOperand::CreateReg(getRS(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RS == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RS)); + instr.addOperand(MCOperand::CreateReg(RA)); break; case MBlazeII::FCRCX: - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RA)); instr.addOperand(MCOperand::CreateImm(getFSL(insn))); break; @@ -623,16 +673,23 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, break; case MBlazeII::FCR: - instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FRIR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(getIMM(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + instr.addOperand(MCOperand::CreateReg(RA)); break; } + // We always consume 4 bytes of data on success + size = 4; + return true; } diff --git a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h index bebc6c8..13c4b49 100644 --- a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h +++ b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h @@ -18,11 +18,12 @@ namespace llvm { class MCOperand; + class TargetMachine; class MBlazeInstPrinter : public MCInstPrinter { public: - MBlazeInstPrinter(const MCAsmInfo &MAI) : MCInstPrinter(MAI) { - } + MBlazeInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) + : MCInstPrinter(MAI) {} virtual void printInst(const MCInst *MI, raw_ostream &O); diff --git a/lib/Target/MBlaze/MBlaze.td b/lib/Target/MBlaze/MBlaze.td index 1fa1e4d..1245658 100644 --- a/lib/Target/MBlaze/MBlaze.td +++ b/lib/Target/MBlaze/MBlaze.td @@ -31,49 +31,28 @@ def MBlazeInstrInfo : InstrInfo; // Microblaze Subtarget features // //===----------------------------------------------------------------------===// -def 
FeaturePipe3 : SubtargetFeature<"pipe3", "HasPipe3", "true", - "Implements 3-stage pipeline">; def FeatureBarrel : SubtargetFeature<"barrel", "HasBarrel", "true", "Implements barrel shifter">; def FeatureDiv : SubtargetFeature<"div", "HasDiv", "true", "Implements hardware divider">; def FeatureMul : SubtargetFeature<"mul", "HasMul", "true", "Implements hardware multiplier">; -def FeatureFSL : SubtargetFeature<"fsl", "HasFSL", "true", - "Implements FSL instructions">; -def FeatureEFSL : SubtargetFeature<"efsl", "HasEFSL", "true", - "Implements extended FSL instructions">; -def FeatureMSRSet : SubtargetFeature<"msrset", "HasMSRSet", "true", - "Implements MSR register set and clear">; -def FeatureException : SubtargetFeature<"exception", "HasException", "true", - "Implements hardware exception support">; def FeaturePatCmp : SubtargetFeature<"patcmp", "HasPatCmp", "true", "Implements pattern compare instruction">; def FeatureFPU : SubtargetFeature<"fpu", "HasFPU", "true", "Implements floating point unit">; -def FeatureESR : SubtargetFeature<"esr", "HasESR", "true", - "Implements ESR and EAR registers">; -def FeaturePVR : SubtargetFeature<"pvr", "HasPVR", "true", - "Implements processor version register">; def FeatureMul64 : SubtargetFeature<"mul64", "HasMul64", "true", "Implements multiplier with 64-bit result">; def FeatureSqrt : SubtargetFeature<"sqrt", "HasSqrt", "true", "Implements sqrt and floating point convert">; -def FeatureMMU : SubtargetFeature<"mmu", "HasMMU", "true", - "Implements memory management unit">; //===----------------------------------------------------------------------===// // MBlaze processors supported. //===----------------------------------------------------------------------===// -class Proc<string Name, list<SubtargetFeature> Features> - : Processor<Name, MBlazeGenericItineraries, Features>; - -def : Proc<"v400", []>; -def : Proc<"v500", []>; -def : Proc<"v600", []>; -def : Proc<"v700", []>; -def : Proc<"v710", []>; +def : Processor<"mblaze", MBlazeGenericItineraries, []>; +def : Processor<"mblaze3", MBlazePipe3Itineraries, []>; +def : Processor<"mblaze5", MBlazePipe5Itineraries, []>; //===----------------------------------------------------------------------===// // Instruction Descriptions diff --git a/lib/Target/MBlaze/MBlazeAsmBackend.cpp b/lib/Target/MBlaze/MBlazeAsmBackend.cpp index a4b21af..08f14c3 100644 --- a/lib/Target/MBlaze/MBlazeAsmBackend.cpp +++ b/lib/Target/MBlaze/MBlazeAsmBackend.cpp @@ -150,14 +150,13 @@ void ELFMBlazeAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, TargetAsmBackend *llvm::createMBlazeAsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) assert(0 && "Mac not supported on MBlaze"); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: + + if (TheTriple.isOSWindows()) assert(0 && "Windows not supported on MBlaze"); - default: - return new ELFMBlazeAsmBackend(T, Triple(TT).getOS()); - } + + return new ELFMBlazeAsmBackend(T, TheTriple.getOS()); } diff --git a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp index 0016df5..0f0f60e 100644 --- a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp +++ b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp @@ -319,10 +319,11 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { } static MCInstPrinter *createMBlazeMCInstPrinter(const Target &T, + TargetMachine &TM, unsigned SyntaxVariant, const MCAsmInfo &MAI) { if (SyntaxVariant == 
0) - return new MBlazeInstPrinter(MAI); + return new MBlazeInstPrinter(TM, MAI); return 0; } diff --git a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp index 4399ee2..973e968 100644 --- a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp +++ b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp @@ -77,7 +77,7 @@ static bool hasImmInstruction(MachineBasicBlock::iterator &candidate) { // We must assume that unknown immediate values require more than // 16-bits to represent. - if (mop.isGlobal() || mop.isSymbol()) + if (mop.isGlobal() || mop.isSymbol() || mop.isJTI() || mop.isCPI()) return true; // FIXME: we could probably check to see if the FP value happens diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp index f39826b..21a5988 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.cpp +++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp @@ -274,7 +274,7 @@ MBlazeTargetLowering::EmitCustomShift(MachineInstr *MI, F->insert(It, loop); F->insert(It, finish); - // Update machine-CFG edges by transfering adding all successors and + // Update machine-CFG edges by transferring all successors and // remaining instructions from the current block to the new block which // will contain the Phi node for the select. finish->splice(finish->begin(), MBB, @@ -456,7 +456,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI, F->insert(It, start); F->insert(It, exit); - // Update machine-CFG edges by transfering adding all successors and + // Update machine-CFG edges by transferring all successors and // remaining instructions from the current block to the new block which // will contain the Phi node for the select. exit->splice(exit->begin(), MBB, llvm::next(MachineBasicBlock::iterator(MI)), @@ -778,7 +778,7 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, // Build a sequence of copy-to-reg nodes chained together with token // chain and flag operands which copy the outgoing args into registers. - // The InFlag in necessary since all emited instructions must be + // The InFlag is necessary since all emitted instructions must be // stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { @@ -1103,7 +1103,7 @@ MBlazeTargetLowering::getSingleConstraintMatchWeight( switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); - break;
+ break; case 'd': case 'y': if (type->isIntegerTy()) diff --git a/lib/Target/MBlaze/MBlazeInstrFPU.td b/lib/Target/MBlaze/MBlazeInstrFPU.td index 094de5c..4acdcfd 100644 --- a/lib/Target/MBlaze/MBlazeInstrFPU.td +++ b/lib/Target/MBlaze/MBlazeInstrFPU.td @@ -21,22 +21,22 @@ class LoadFM<bits<6> op, string instr_asm, PatFrag OpNode> : TA<op, 0x000, (outs GPR:$dst), (ins memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(set (f32 GPR:$dst), (OpNode xaddr:$addr))], IILoad>; + [(set (f32 GPR:$dst), (OpNode xaddr:$addr))], IIC_MEMl>; class LoadFMI<bits<6> op, string instr_asm, PatFrag OpNode> : TB<op, (outs GPR:$dst), (ins memri:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(set (f32 GPR:$dst), (OpNode iaddr:$addr))], IILoad>; + [(set (f32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>; class StoreFM<bits<6> op, string instr_asm, PatFrag OpNode> : TA<op, 0x000, (outs), (ins GPR:$dst, memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(OpNode (f32 GPR:$dst), xaddr:$addr)], IIStore>; + [(OpNode (f32 GPR:$dst), xaddr:$addr)], IIC_MEMs>; class StoreFMI<bits<6> op, string instr_asm, PatFrag OpNode> : TB<op, (outs), (ins GPR:$dst, memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(OpNode (f32 GPR:$dst), iaddr:$addr)], IIStore>; + [(OpNode (f32 GPR:$dst), iaddr:$addr)], IIC_MEMs>; class ArithF<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode, InstrItinClass itin> : @@ -56,15 +56,10 @@ class ArithFR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode, !strconcat(instr_asm, " $dst, $c, $b"), [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], itin>; -class LogicF<bits<6> op, string instr_asm> : - TB<op, (outs GPR:$dst), (ins GPR:$b, GPR:$c), - !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; - class LogicFI<bits<6> op, string instr_asm> : TB<op, (outs GPR:$dst), (ins GPR:$b, fimm:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; let rb=0 in { class ArithF2<bits<6> op, bits<11> flags, string instr_asm, @@ -95,10 +90,10 @@ let rb=0 in { //===----------------------------------------------------------------------===// let Predicates=[HasFPU] in { def FORI : LogicFI<0x28, "ori ">; - def FADD : ArithF<0x16, 0x000, "fadd ", fadd, IIAlu>; - def FRSUB : ArithFR<0x16, 0x080, "frsub ", fsub, IIAlu>; - def FMUL : ArithF<0x16, 0x100, "fmul ", fmul, IIAlu>; - def FDIV : ArithF<0x16, 0x180, "fdiv ", fdiv, IIAlu>; + def FADD : ArithF<0x16, 0x000, "fadd ", fadd, IIC_FPU>; + def FRSUB : ArithFR<0x16, 0x080, "frsub ", fsub, IIC_FPU>; + def FMUL : ArithF<0x16, 0x100, "fmul ", fmul, IIC_FPU>; + def FDIV : ArithF<0x16, 0x180, "fdiv ", fdiv, IIC_FPUd>; } let Predicates=[HasFPU], isCodeGenOnly=1 in { @@ -110,19 +105,19 @@ let Predicates=[HasFPU], isCodeGenOnly=1 in { } let Predicates=[HasFPU,HasSqrt] in { - def FLT : ArithIF<0x16, 0x280, "flt ", IIAlu>; - def FINT : ArithFI<0x16, 0x300, "fint ", IIAlu>; - def FSQRT : ArithF2<0x16, 0x380, "fsqrt ", IIAlu>; + def FLT : ArithIF<0x16, 0x280, "flt ", IIC_FPUf>; + def FINT : ArithFI<0x16, 0x300, "fint ", IIC_FPUi>; + def FSQRT : ArithF2<0x16, 0x380, "fsqrt ", IIC_FPUs>; } let isAsCheapAsAMove = 1 in { - def FCMP_UN : CmpFN<0x16, 0x200, "fcmp.un", IIAlu>; - def FCMP_LT : CmpFN<0x16, 0x210, "fcmp.lt", IIAlu>; - def FCMP_EQ : CmpFN<0x16, 0x220, "fcmp.eq", IIAlu>; - def FCMP_LE : CmpFN<0x16, 0x230, "fcmp.le", IIAlu>; - def FCMP_GT : CmpFN<0x16, 0x240, "fcmp.gt", IIAlu>; - def FCMP_NE : CmpFN<0x16, 0x250, "fcmp.ne", IIAlu>; - def FCMP_GE : CmpFN<0x16, 0x260, "fcmp.ge", IIAlu>; + def FCMP_UN : CmpFN<0x16, 0x200, "fcmp.un", 
IIC_FPUc>; + def FCMP_LT : CmpFN<0x16, 0x210, "fcmp.lt", IIC_FPUc>; + def FCMP_EQ : CmpFN<0x16, 0x220, "fcmp.eq", IIC_FPUc>; + def FCMP_LE : CmpFN<0x16, 0x230, "fcmp.le", IIC_FPUc>; + def FCMP_GT : CmpFN<0x16, 0x240, "fcmp.gt", IIC_FPUc>; + def FCMP_NE : CmpFN<0x16, 0x250, "fcmp.ne", IIC_FPUc>; + def FCMP_GE : CmpFN<0x16, 0x260, "fcmp.ge", IIC_FPUc>; } diff --git a/lib/Target/MBlaze/MBlazeInstrFSL.td b/lib/Target/MBlaze/MBlazeInstrFSL.td index 3209845..3082a7e 100644 --- a/lib/Target/MBlaze/MBlazeInstrFSL.td +++ b/lib/Target/MBlaze/MBlazeInstrFSL.td @@ -13,7 +13,7 @@ class FSLGet<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FRCX, (outs GPR:$dst), (ins fslimm:$b), !strconcat(instr_asm, " $dst, $b"), - [(set GPR:$dst, (OpNode immZExt4:$b))],IIAlu> + [(set GPR:$dst, (OpNode immZExt4:$b))],IIC_FSLg> { bits<5> rd; bits<4> fslno; @@ -29,7 +29,7 @@ class FSLGet<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> : class FSLGetD<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FRCR, (outs GPR:$dst), (ins GPR:$b), !strconcat(instr_asm, " $dst, $b"), - [(set GPR:$dst, (OpNode GPR:$b))], IIAlu> + [(set GPR:$dst, (OpNode GPR:$b))], IIC_FSLg> { bits<5> rd; bits<5> rb; @@ -45,7 +45,7 @@ class FSLGetD<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> : class FSLPut<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FCRCX, (outs), (ins GPR:$v, fslimm:$b), !strconcat(instr_asm, " $v, $b"), - [(OpNode GPR:$v, immZExt4:$b)], IIAlu> + [(OpNode GPR:$v, immZExt4:$b)], IIC_FSLp> { bits<5> ra; bits<4> fslno; @@ -61,7 +61,7 @@ class FSLPut<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : class FSLPutD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FCRR, (outs), (ins GPR:$v, GPR:$b), !strconcat(instr_asm, " $v, $b"), - [(OpNode GPR:$v, GPR:$b)], IIAlu> + [(OpNode GPR:$v, GPR:$b)], IIC_FSLp> { bits<5> ra; bits<5> rb; @@ -77,7 +77,7 @@ class FSLPutD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : class FSLPutT<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FCX, (outs), (ins fslimm:$b), !strconcat(instr_asm, " $b"), - [(OpNode immZExt4:$b)], IIAlu> + [(OpNode immZExt4:$b)], IIC_FSLp> { bits<4> fslno; @@ -92,7 +92,7 @@ class FSLPutT<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : class FSLPutTD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FCR, (outs), (ins GPR:$b), !strconcat(instr_asm, " $b"), - [(OpNode GPR:$b)], IIAlu> + [(OpNode GPR:$b)], IIC_FSLp> { bits<5> rb; diff --git a/lib/Target/MBlaze/MBlazeInstrFormats.td b/lib/Target/MBlaze/MBlazeInstrFormats.td index d62574d..54f605f 100644 --- a/lib/Target/MBlaze/MBlazeInstrFormats.td +++ b/lib/Target/MBlaze/MBlazeInstrFormats.td @@ -81,7 +81,7 @@ class MBlazeInst<bits<6> op, Format form, dag outs, dag ins, string asmstr, // Pseudo instruction class //===----------------------------------------------------------------------===// class MBlazePseudo<dag outs, dag ins, string asmstr, list<dag> pattern>: - MBlazeInst<0x0, FPseudo, outs, ins, asmstr, pattern, IIPseudo>; + MBlazeInst<0x0, FPseudo, outs, ins, asmstr, pattern, IIC_Pseudo>; //===----------------------------------------------------------------------===// // Type A instruction class in MBlaze : <|opcode|rd|ra|rb|flags|> diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp index b353dcd..794ebed 100644 --- 
a/lib/Target/MBlaze/MBlazeInstrInfo.cpp +++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp @@ -17,6 +17,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/ScoreboardHazardRecognizer.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "MBlazeGenInstrInfo.inc" diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.h b/lib/Target/MBlaze/MBlazeInstrInfo.h index b7300c1..b717da8 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.h +++ b/lib/Target/MBlaze/MBlazeInstrInfo.h @@ -261,7 +261,6 @@ public: virtual bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; - virtual void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td index 7b8f70a..896e8ea 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.td +++ b/lib/Target/MBlaze/MBlazeInstrInfo.td @@ -47,22 +47,22 @@ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_MBCallSeqEnd, //===----------------------------------------------------------------------===// // MBlaze Instruction Predicate Definitions. //===----------------------------------------------------------------------===// -def HasPipe3 : Predicate<"Subtarget.hasPipe3()">; +// def HasPipe3 : Predicate<"Subtarget.hasPipe3()">; def HasBarrel : Predicate<"Subtarget.hasBarrel()">; -def NoBarrel : Predicate<"!Subtarget.hasBarrel()">; +// def NoBarrel : Predicate<"!Subtarget.hasBarrel()">; def HasDiv : Predicate<"Subtarget.hasDiv()">; def HasMul : Predicate<"Subtarget.hasMul()">; -def HasFSL : Predicate<"Subtarget.hasFSL()">; -def HasEFSL : Predicate<"Subtarget.hasEFSL()">; -def HasMSRSet : Predicate<"Subtarget.hasMSRSet()">; -def HasException : Predicate<"Subtarget.hasException()">; +// def HasFSL : Predicate<"Subtarget.hasFSL()">; +// def HasEFSL : Predicate<"Subtarget.hasEFSL()">; +// def HasMSRSet : Predicate<"Subtarget.hasMSRSet()">; +// def HasException : Predicate<"Subtarget.hasException()">; def HasPatCmp : Predicate<"Subtarget.hasPatCmp()">; def HasFPU : Predicate<"Subtarget.hasFPU()">; -def HasESR : Predicate<"Subtarget.hasESR()">; -def HasPVR : Predicate<"Subtarget.hasPVR()">; +// def HasESR : Predicate<"Subtarget.hasESR()">; +// def HasPVR : Predicate<"Subtarget.hasPVR()">; def HasMul64 : Predicate<"Subtarget.hasMul64()">; def HasSqrt : Predicate<"Subtarget.hasSqrt()">; -def HasMMU : Predicate<"Subtarget.hasMMU()">; +// def HasMMU : Predicate<"Subtarget.hasMMU()">; //===----------------------------------------------------------------------===// // MBlaze Operand, Complex Patterns and Transformations Definitions. 
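The ScoreboardHazardRecognizer.h include added to MBlazeInstrInfo.cpp above is the hook through which the itineraries defined later in this patch reach the scheduler. A minimal sketch of the usual wiring in backends of this vintage follows; the helper name is hypothetical and not code from this patch, and the ScoreboardHazardRecognizer constructor is the era's generic API:

  // Hypothetical helper, not patch code: builds a scoreboard-based hazard
  // recognizer from a subtarget's itinerary tables. The scoreboard replays
  // each instruction's InstrStage list (IF, ID, EX, ...) and delays issue
  // whenever a functional unit would be double-booked.
  #include "llvm/CodeGen/ScheduleDAG.h"
  #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
  #include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
  #include "llvm/Target/TargetInstrItineraries.h"

  static llvm::ScheduleHazardRecognizer *
  createMBlazeHazardRecognizer(const llvm::InstrItineraryData *Itins,
                               const llvm::ScheduleDAG *DAG) {
    // An empty itinerary (the generic "mblaze" CPU) makes the scoreboard a
    // no-op, which is why scheduling is keyed off the selected CPU below.
    return new llvm::ScoreboardHazardRecognizer(Itins, DAG);
  }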
@@ -170,18 +170,18 @@ class ArithI<bits<6> op, string instr_asm, SDNode OpNode, Operand Od, PatLeaf imm_type> : TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIAlu>; + [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIC_ALU>; class ArithI32<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> : TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; class ShiftI<bits<6> op, bits<2> flags, string instr_asm, SDNode OpNode, Operand Od, PatLeaf imm_type> : SHT<op, flags, (outs GPR:$dst), (ins GPR:$b, Od:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIAlu>; + [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIC_SHT>; class ArithR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode, InstrItinClass itin> : @@ -193,7 +193,7 @@ class ArithRI<bits<6> op, string instr_asm, SDNode OpNode, Operand Od, PatLeaf imm_type> : TBR<op, (outs GPR:$dst), (ins Od:$b, GPR:$c), !strconcat(instr_asm, " $dst, $c, $b"), - [(set GPR:$dst, (OpNode imm_type:$b, GPR:$c))], IIAlu>; + [(set GPR:$dst, (OpNode imm_type:$b, GPR:$c))], IIC_ALU>; class ArithN<bits<6> op, bits<11> flags, string instr_asm, InstrItinClass itin> : @@ -204,7 +204,7 @@ class ArithN<bits<6> op, bits<11> flags, string instr_asm, class ArithNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> : TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; class ArithRN<bits<6> op, bits<11> flags, string instr_asm, InstrItinClass itin> : @@ -215,7 +215,7 @@ class ArithRN<bits<6> op, bits<11> flags, string instr_asm, class ArithRNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> : TBR<op, (outs GPR:$dst), (ins Od:$c, GPR:$b), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; //===----------------------------------------------------------------------===// // Misc Arithmetic Instructions @@ -224,23 +224,23 @@ class ArithRNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> : class Logic<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode> : TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], IIAlu>; + [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], IIC_ALU>; class LogicI<bits<6> op, string instr_asm, SDNode OpNode> : TB<op, (outs GPR:$dst), (ins GPR:$b, uimm16:$c), !strconcat(instr_asm, " $dst, $b, $c"), [(set GPR:$dst, (OpNode GPR:$b, immZExt16:$c))], - IIAlu>; + IIC_ALU>; class LogicI32<bits<6> op, string instr_asm> : TB<op, (outs GPR:$dst), (ins GPR:$b, uimm16:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; class PatCmp<bits<6> op, bits<11> flags, string instr_asm> : TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; //===----------------------------------------------------------------------===// // Memory Access Instructions @@ -248,22 +248,22 @@ class PatCmp<bits<6> op, bits<11> flags, string instr_asm> : class LoadM<bits<6> op, bits<11> flags, string instr_asm> : TA<op, flags, (outs GPR:$dst), (ins memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [], IILoad>; + [], IIC_MEMl>; class LoadMI<bits<6> op, string instr_asm, PatFrag OpNode> : TB<op, (outs GPR:$dst), (ins memri:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(set (i32 GPR:$dst), (OpNode 
iaddr:$addr))], IILoad>; + [(set (i32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>; class StoreM<bits<6> op, bits<11> flags, string instr_asm> : TA<op, flags, (outs), (ins GPR:$dst, memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [], IIStore>; + [], IIC_MEMs>; class StoreMI<bits<6> op, string instr_asm, PatFrag OpNode> : TB<op, (outs), (ins GPR:$dst, memri:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(OpNode (i32 GPR:$dst), iaddr:$addr)], IIStore>; + [(OpNode (i32 GPR:$dst), iaddr:$addr)], IIC_MEMs>; //===----------------------------------------------------------------------===// // Branch Instructions @@ -271,7 +271,7 @@ class StoreMI<bits<6> op, string instr_asm, PatFrag OpNode> : class Branch<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : TA<op, flags, (outs), (ins GPR:$target), !strconcat(instr_asm, " $target"), - [], IIBranch> { + [], IIC_BR> { let rd = 0x0; let ra = br; let Form = FCCR; @@ -280,7 +280,7 @@ class Branch<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : class BranchI<bits<6> op, bits<5> br, string instr_asm> : TB<op, (outs), (ins brtarget:$target), !strconcat(instr_asm, " $target"), - [], IIBranch> { + [], IIC_BR> { let rd = 0; let ra = br; let Form = FCCI; @@ -292,7 +292,7 @@ class BranchI<bits<6> op, bits<5> br, string instr_asm> : class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : TA<op, flags, (outs), (ins GPR:$link, GPR:$target, variable_ops), !strconcat(instr_asm, " $link, $target"), - [], IIBranch> { + [], IIC_BRl> { let ra = br; let Form = FRCR; } @@ -300,7 +300,7 @@ class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : class BranchLI<bits<6> op, bits<5> br, string instr_asm> : TB<op, (outs), (ins GPR:$link, calltarget:$target, variable_ops), !strconcat(instr_asm, " $link, $target"), - [], IIBranch> { + [], IIC_BRl> { let ra = br; let Form = FRCI; } @@ -312,7 +312,7 @@ class BranchC<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : TA<op, flags, (outs), (ins GPR:$a, GPR:$b), !strconcat(instr_asm, " $a, $b"), - [], IIBranch> { + [], IIC_BRc> { let rd = br; let Form = FCRR; } @@ -320,7 +320,7 @@ class BranchC<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : class BranchCI<bits<6> op, bits<5> br, string instr_asm> : TB<op, (outs), (ins GPR:$a, brtarget:$offset), !strconcat(instr_asm, " $a, $offset"), - [], IIBranch> { + [], IIC_BRc> { let rd = br; let Form = FCRI; } @@ -330,71 +330,74 @@ class BranchCI<bits<6> op, bits<5> br, string instr_asm> : //===----------------------------------------------------------------------===// let isCommutable = 1, isAsCheapAsAMove = 1 in { - def ADDK : Arith<0x04, 0x000, "addk ", add, IIAlu>; + def ADDK : Arith<0x04, 0x000, "addk ", add, IIC_ALU>; def AND : Logic<0x21, 0x000, "and ", and>; def OR : Logic<0x20, 0x000, "or ", or>; def XOR : Logic<0x22, 0x000, "xor ", xor>; - def PCMPBF : PatCmp<0x20, 0x400, "pcmpbf ">; - def PCMPEQ : PatCmp<0x22, 0x400, "pcmpeq ">; - def PCMPNE : PatCmp<0x23, 0x400, "pcmpne ">; + + let Predicates=[HasPatCmp] in { + def PCMPBF : PatCmp<0x20, 0x400, "pcmpbf ">; + def PCMPEQ : PatCmp<0x22, 0x400, "pcmpeq ">; + def PCMPNE : PatCmp<0x23, 0x400, "pcmpne ">; + } let Defs = [CARRY] in { - def ADD : Arith<0x00, 0x000, "add ", addc, IIAlu>; + def ADD : Arith<0x00, 0x000, "add ", addc, IIC_ALU>; let Uses = [CARRY] in { - def ADDC : Arith<0x02, 0x000, "addc ", adde, IIAlu>; + def ADDC : Arith<0x02, 0x000, "addc ", adde, IIC_ALU>; } } let Uses = [CARRY] in { - def ADDKC : ArithN<0x06, 0x000, "addkc ", IIAlu>; + def 
ADDKC : ArithN<0x06, 0x000, "addkc ", IIC_ALU>; } } let isAsCheapAsAMove = 1 in { - def ANDN : ArithN<0x23, 0x000, "andn ", IIAlu>; - def CMP : ArithN<0x05, 0x001, "cmp ", IIAlu>; - def CMPU : ArithN<0x05, 0x003, "cmpu ", IIAlu>; - def RSUBK : ArithR<0x05, 0x000, "rsubk ", sub, IIAlu>; + def ANDN : ArithN<0x23, 0x000, "andn ", IIC_ALU>; + def CMP : ArithN<0x05, 0x001, "cmp ", IIC_ALU>; + def CMPU : ArithN<0x05, 0x003, "cmpu ", IIC_ALU>; + def RSUBK : ArithR<0x05, 0x000, "rsubk ", sub, IIC_ALU>; let Defs = [CARRY] in { - def RSUB : ArithR<0x01, 0x000, "rsub ", subc, IIAlu>; + def RSUB : ArithR<0x01, 0x000, "rsub ", subc, IIC_ALU>; let Uses = [CARRY] in { - def RSUBC : ArithR<0x03, 0x000, "rsubc ", sube, IIAlu>; + def RSUBC : ArithR<0x03, 0x000, "rsubc ", sube, IIC_ALU>; } } let Uses = [CARRY] in { - def RSUBKC : ArithRN<0x07, 0x000, "rsubkc ", IIAlu>; + def RSUBKC : ArithRN<0x07, 0x000, "rsubkc ", IIC_ALU>; } } let isCommutable = 1, Predicates=[HasMul] in { - def MUL : Arith<0x10, 0x000, "mul ", mul, IIAlu>; + def MUL : Arith<0x10, 0x000, "mul ", mul, IIC_ALUm>; } let isCommutable = 1, Predicates=[HasMul,HasMul64] in { - def MULH : Arith<0x10, 0x001, "mulh ", mulhs, IIAlu>; - def MULHU : Arith<0x10, 0x003, "mulhu ", mulhu, IIAlu>; + def MULH : Arith<0x10, 0x001, "mulh ", mulhs, IIC_ALUm>; + def MULHU : Arith<0x10, 0x003, "mulhu ", mulhu, IIC_ALUm>; } let Predicates=[HasMul,HasMul64] in { - def MULHSU : ArithN<0x10, 0x002, "mulhsu ", IIAlu>; + def MULHSU : ArithN<0x10, 0x002, "mulhsu ", IIC_ALUm>; } let Predicates=[HasBarrel] in { - def BSRL : Arith<0x11, 0x000, "bsrl ", srl, IIAlu>; - def BSRA : Arith<0x11, 0x200, "bsra ", sra, IIAlu>; - def BSLL : Arith<0x11, 0x400, "bsll ", shl, IIAlu>; + def BSRL : Arith<0x11, 0x000, "bsrl ", srl, IIC_SHT>; + def BSRA : Arith<0x11, 0x200, "bsra ", sra, IIC_SHT>; + def BSLL : Arith<0x11, 0x400, "bsll ", shl, IIC_SHT>; def BSRLI : ShiftI<0x19, 0x0, "bsrli ", srl, uimm5, immZExt5>; def BSRAI : ShiftI<0x19, 0x1, "bsrai ", sra, uimm5, immZExt5>; def BSLLI : ShiftI<0x19, 0x2, "bslli ", shl, uimm5, immZExt5>; } let Predicates=[HasDiv] in { - def IDIV : ArithR<0x12, 0x000, "idiv ", sdiv, IIAlu>; - def IDIVU : ArithR<0x12, 0x002, "idivu ", udiv, IIAlu>; + def IDIV : ArithR<0x12, 0x000, "idiv ", sdiv, IIC_ALUd>; + def IDIVU : ArithR<0x12, 0x002, "idivu ", udiv, IIC_ALUd>; } //===----------------------------------------------------------------------===// @@ -552,7 +555,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, def RTSD : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm), "rtsd $target, $imm", [], - IIBranch>; + IIC_BR>; } let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, @@ -560,7 +563,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, def RTID : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm), "rtid $target, $imm", [], - IIBranch>; + IIC_BR>; } let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, @@ -568,7 +571,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, def RTBD : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm), "rtbd $target, $imm", [], - IIBranch>; + IIC_BR>; } let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, @@ -576,7 +579,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, def RTED : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm), "rted $target, $imm", [], - IIBranch>; + IIC_BR>; } //===----------------------------------------------------------------------===// @@ -584,7 +587,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, 
//===----------------------------------------------------------------------===// let neverHasSideEffects = 1 in { - def NOP : MBlazeInst< 0x20, FC, (outs), (ins), "nop ", [], IIAlu>; + def NOP : MBlazeInst< 0x20, FC, (outs), (ins), "nop ", [], IIC_ALU>; } let usesCustomInserter = 1 in { @@ -611,17 +614,17 @@ let usesCustomInserter = 1 in { let rb = 0 in { def SEXT16 : TA<0x24, 0x061, (outs GPR:$dst), (ins GPR:$src), - "sext16 $dst, $src", [], IIAlu>; + "sext16 $dst, $src", [], IIC_ALU>; def SEXT8 : TA<0x24, 0x060, (outs GPR:$dst), (ins GPR:$src), - "sext8 $dst, $src", [], IIAlu>; + "sext8 $dst, $src", [], IIC_ALU>; let Defs = [CARRY] in { def SRL : TA<0x24, 0x041, (outs GPR:$dst), (ins GPR:$src), - "srl $dst, $src", [], IIAlu>; + "srl $dst, $src", [], IIC_ALU>; def SRA : TA<0x24, 0x001, (outs GPR:$dst), (ins GPR:$src), - "sra $dst, $src", [], IIAlu>; + "sra $dst, $src", [], IIC_ALU>; let Uses = [CARRY] in { def SRC : TA<0x24, 0x021, (outs GPR:$dst), (ins GPR:$src), - "src $dst, $src", [], IIAlu>; + "src $dst, $src", [], IIC_ALU>; } } } @@ -637,36 +640,36 @@ let isCodeGenOnly=1 in { //===----------------------------------------------------------------------===// let Form=FRCS in { def MFS : SPC<0x25, 0x2, (outs GPR:$dst), (ins SPR:$src), - "mfs $dst, $src", [], IIAlu>; + "mfs $dst, $src", [], IIC_ALU>; } let Form=FCRCS in { def MTS : SPC<0x25, 0x3, (outs SPR:$dst), (ins GPR:$src), - "mts $dst, $src", [], IIAlu>; + "mts $dst, $src", [], IIC_ALU>; } def MSRSET : MSR<0x25, 0x20, (outs GPR:$dst), (ins uimm15:$set), - "msrset $dst, $set", [], IIAlu>; + "msrset $dst, $set", [], IIC_ALU>; def MSRCLR : MSR<0x25, 0x22, (outs GPR:$dst), (ins uimm15:$clr), - "msrclr $dst, $clr", [], IIAlu>; + "msrclr $dst, $clr", [], IIC_ALU>; let rd=0x0, Form=FCRR in { def WDC : TA<0x24, 0x64, (outs), (ins GPR:$a, GPR:$b), - "wdc $a, $b", [], IIAlu>; + "wdc $a, $b", [], IIC_WDC>; def WDCF : TA<0x24, 0x74, (outs), (ins GPR:$a, GPR:$b), - "wdc.flush $a, $b", [], IIAlu>; + "wdc.flush $a, $b", [], IIC_WDC>; def WDCC : TA<0x24, 0x66, (outs), (ins GPR:$a, GPR:$b), - "wdc.clear $a, $b", [], IIAlu>; + "wdc.clear $a, $b", [], IIC_WDC>; def WIC : TA<0x24, 0x68, (outs), (ins GPR:$a, GPR:$b), - "wic $a, $b", [], IIAlu>; + "wic $a, $b", [], IIC_WDC>; } def BRK : BranchL<0x26, 0x0C, 0x000, "brk ">; def BRKI : BranchLI<0x2E, 0x0C, "brki ">; def IMM : MBlazeInst<0x2C, FCCI, (outs), (ins simm16:$imm), - "imm $imm", [], IIAlu>; + "imm $imm", [], IIC_ALU>; //===----------------------------------------------------------------------===// // Pseudo instructions for atomic operations @@ -848,11 +851,6 @@ def : Pat<(MBWrapper tconstpool:$in), (ORI (i32 R0), tconstpool:$in)>; // Misc instructions def : Pat<(and (i32 GPR:$lh), (not (i32 GPR:$rh))),(ANDN GPR:$lh, GPR:$rh)>; -// Arithmetic with immediates -def : Pat<(add (i32 GPR:$in), imm:$imm),(ADDIK GPR:$in, imm:$imm)>; -def : Pat<(or (i32 GPR:$in), imm:$imm),(ORI GPR:$in, imm:$imm)>; -def : Pat<(xor (i32 GPR:$in), imm:$imm),(XORI GPR:$in, imm:$imm)>; - // Convert any extend loads into zero extend loads def : Pat<(extloadi8 iaddr:$src), (i32 (LBUI iaddr:$src))>; def : Pat<(extloadi16 iaddr:$src), (i32 (LHUI iaddr:$src))>; diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp index fa9140d..ed8511d 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp @@ -181,6 +181,26 @@ unsigned MBlazeRegisterInfo::getSpecialRegisterFromNumbering(unsigned Reg) { return 0; // Not reached } +bool 
MBlazeRegisterInfo::isRegister(unsigned Reg) { + return Reg <= 31; +} + +bool MBlazeRegisterInfo::isSpecialRegister(unsigned Reg) { + switch (Reg) { + case 0x0000 : case 0x0001 : case 0x0003 : case 0x0005 : + case 0x0007 : case 0x000B : case 0x000D : case 0x1000 : + case 0x1001 : case 0x1002 : case 0x1003 : case 0x1004 : + case 0x2000 : case 0x2001 : case 0x2002 : case 0x2003 : + case 0x2004 : case 0x2005 : case 0x2006 : case 0x2007 : + case 0x2008 : case 0x2009 : case 0x200A : case 0x200B : + return true; + + default: + return false; + } + return false; // Not reached +} + unsigned MBlazeRegisterInfo::getPICCallReg() { return MBlaze::R20; } diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h index 839536d..69ec5aa 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.h +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h @@ -45,6 +45,8 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo { static unsigned getRegisterNumbering(unsigned RegEnum); static unsigned getRegisterFromNumbering(unsigned RegEnum); static unsigned getSpecialRegisterFromNumbering(unsigned RegEnum); + static bool isRegister(unsigned RegEnum); + static bool isSpecialRegister(unsigned RegEnum); /// Get PIC indirect call register static unsigned getPICCallReg(); diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.td b/lib/Target/MBlaze/MBlazeRegisterInfo.td index fbefb22..1a695a7 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.td +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.td @@ -85,18 +85,19 @@ let Namespace = "MBlaze" in { def RTLBX : MBlazeSPRReg<0x1002, "rtlbx">, DwarfRegNum<[41]>; def RTLBLO : MBlazeSPRReg<0x1003, "rtlblo">, DwarfRegNum<[42]>; def RTLBHI : MBlazeSPRReg<0x1004, "rtlbhi">, DwarfRegNum<[43]>; - def RPVR0 : MBlazeSPRReg<0x2000, "rpvr0">, DwarfRegNum<[44]>; - def RPVR1 : MBlazeSPRReg<0x2001, "rpvr1">, DwarfRegNum<[45]>; - def RPVR2 : MBlazeSPRReg<0x2002, "rpvr2">, DwarfRegNum<[46]>; - def RPVR3 : MBlazeSPRReg<0x2003, "rpvr3">, DwarfRegNum<[47]>; - def RPVR4 : MBlazeSPRReg<0x2004, "rpvr4">, DwarfRegNum<[48]>; - def RPVR5 : MBlazeSPRReg<0x2005, "rpvr5">, DwarfRegNum<[49]>; - def RPVR6 : MBlazeSPRReg<0x2006, "rpvr6">, DwarfRegNum<[50]>; - def RPVR7 : MBlazeSPRReg<0x2007, "rpvr7">, DwarfRegNum<[51]>; - def RPVR8 : MBlazeSPRReg<0x2008, "rpvr8">, DwarfRegNum<[52]>; - def RPVR9 : MBlazeSPRReg<0x2009, "rpvr9">, DwarfRegNum<[53]>; - def RPVR10 : MBlazeSPRReg<0x200A, "rpvr10">, DwarfRegNum<[54]>; - def RPVR11 : MBlazeSPRReg<0x200B, "rpvr11">, DwarfRegNum<[55]>; + def RTLBSX : MBlazeSPRReg<0x1004, "rtlbsx">, DwarfRegNum<[44]>; + def RPVR0 : MBlazeSPRReg<0x2000, "rpvr0">, DwarfRegNum<[45]>; + def RPVR1 : MBlazeSPRReg<0x2001, "rpvr1">, DwarfRegNum<[46]>; + def RPVR2 : MBlazeSPRReg<0x2002, "rpvr2">, DwarfRegNum<[47]>; + def RPVR3 : MBlazeSPRReg<0x2003, "rpvr3">, DwarfRegNum<[48]>; + def RPVR4 : MBlazeSPRReg<0x2004, "rpvr4">, DwarfRegNum<[49]>; + def RPVR5 : MBlazeSPRReg<0x2005, "rpvr5">, DwarfRegNum<[50]>; + def RPVR6 : MBlazeSPRReg<0x2006, "rpvr6">, DwarfRegNum<[51]>; + def RPVR7 : MBlazeSPRReg<0x2007, "rpvr7">, DwarfRegNum<[52]>; + def RPVR8 : MBlazeSPRReg<0x2008, "rpvr8">, DwarfRegNum<[53]>; + def RPVR9 : MBlazeSPRReg<0x2009, "rpvr9">, DwarfRegNum<[54]>; + def RPVR10 : MBlazeSPRReg<0x200A, "rpvr10">, DwarfRegNum<[55]>; + def RPVR11 : MBlazeSPRReg<0x200B, "rpvr11">, DwarfRegNum<[56]>; // The carry bit. 
In the Microblaze this is really bit 29 of the // MSR register but this is the only bit of that register that we diff --git a/lib/Target/MBlaze/MBlazeSchedule.td b/lib/Target/MBlaze/MBlazeSchedule.td index ac4d98c..4662f25 100644 --- a/lib/Target/MBlaze/MBlazeSchedule.td +++ b/lib/Target/MBlaze/MBlazeSchedule.td @@ -8,57 +8,48 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Functional units across MBlaze chips sets. Based on GCC/MBlaze backend files. +// MBlaze functional units. //===----------------------------------------------------------------------===// -def ALU : FuncUnit; -def IMULDIV : FuncUnit; +def IF : FuncUnit; +def ID : FuncUnit; +def EX : FuncUnit; +def MA : FuncUnit; +def WB : FuncUnit; //===----------------------------------------------------------------------===// // Instruction Itinerary classes used for MBlaze //===----------------------------------------------------------------------===// -def IIAlu : InstrItinClass; -def IILoad : InstrItinClass; -def IIStore : InstrItinClass; -def IIXfer : InstrItinClass; -def IIBranch : InstrItinClass; -def IIHiLo : InstrItinClass; -def IIImul : InstrItinClass; -def IIIdiv : InstrItinClass; -def IIFcvt : InstrItinClass; -def IIFmove : InstrItinClass; -def IIFcmp : InstrItinClass; -def IIFadd : InstrItinClass; -def IIFmulSingle : InstrItinClass; -def IIFmulDouble : InstrItinClass; -def IIFdivSingle : InstrItinClass; -def IIFdivDouble : InstrItinClass; -def IIFsqrtSingle : InstrItinClass; -def IIFsqrtDouble : InstrItinClass; -def IIFrecipFsqrtStep : InstrItinClass; -def IIPseudo : InstrItinClass; +def IIC_ALU : InstrItinClass; +def IIC_ALUm : InstrItinClass; +def IIC_ALUd : InstrItinClass; +def IIC_SHT : InstrItinClass; +def IIC_FSLg : InstrItinClass; +def IIC_FSLp : InstrItinClass; +def IIC_MEMs : InstrItinClass; +def IIC_MEMl : InstrItinClass; +def IIC_FPU : InstrItinClass; +def IIC_FPUd : InstrItinClass; +def IIC_FPUf : InstrItinClass; +def IIC_FPUi : InstrItinClass; +def IIC_FPUs : InstrItinClass; +def IIC_FPUc : InstrItinClass; +def IIC_BR : InstrItinClass; +def IIC_BRc : InstrItinClass; +def IIC_BRl : InstrItinClass; +def IIC_WDC : InstrItinClass; +def IIC_Pseudo : InstrItinClass; //===----------------------------------------------------------------------===// -// MBlaze Generic instruction itineraries. +// MBlaze generic instruction itineraries. 
//===----------------------------------------------------------------------===// -def MBlazeGenericItineraries : ProcessorItineraries< - [ALU, IMULDIV], [], [ - InstrItinData<IIAlu , [InstrStage<1, [ALU]>]>, - InstrItinData<IILoad , [InstrStage<3, [ALU]>]>, - InstrItinData<IIStore , [InstrStage<1, [ALU]>]>, - InstrItinData<IIXfer , [InstrStage<2, [ALU]>]>, - InstrItinData<IIBranch , [InstrStage<1, [ALU]>]>, - InstrItinData<IIHiLo , [InstrStage<1, [IMULDIV]>]>, - InstrItinData<IIImul , [InstrStage<17, [IMULDIV]>]>, - InstrItinData<IIIdiv , [InstrStage<38, [IMULDIV]>]>, - InstrItinData<IIFcvt , [InstrStage<1, [ALU]>]>, - InstrItinData<IIFmove , [InstrStage<2, [ALU]>]>, - InstrItinData<IIFcmp , [InstrStage<3, [ALU]>]>, - InstrItinData<IIFadd , [InstrStage<4, [ALU]>]>, - InstrItinData<IIFmulSingle , [InstrStage<7, [ALU]>]>, - InstrItinData<IIFmulDouble , [InstrStage<8, [ALU]>]>, - InstrItinData<IIFdivSingle , [InstrStage<23, [ALU]>]>, - InstrItinData<IIFdivDouble , [InstrStage<36, [ALU]>]>, - InstrItinData<IIFsqrtSingle , [InstrStage<54, [ALU]>]>, - InstrItinData<IIFsqrtDouble , [InstrStage<12, [ALU]>]>, - InstrItinData<IIFrecipFsqrtStep , [InstrStage<5, [ALU]>]> -]>; +def MBlazeGenericItineraries : ProcessorItineraries<[], [], []>; + +//===----------------------------------------------------------------------===// +// MBlaze instruction itineraries for three stage pipeline. +//===----------------------------------------------------------------------===// +include "MBlazeSchedule3.td" + +//===----------------------------------------------------------------------===// +// MBlaze instruction itineraries for five stage pipeline. +//===----------------------------------------------------------------------===// +include "MBlazeSchedule5.td" diff --git a/lib/Target/MBlaze/MBlazeSchedule3.td b/lib/Target/MBlaze/MBlazeSchedule3.td new file mode 100644 index 0000000..ccbf99d --- /dev/null +++ b/lib/Target/MBlaze/MBlazeSchedule3.td @@ -0,0 +1,236 @@ +//===- MBlazeSchedule3.td - MBlaze Scheduling Definitions --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MBlaze instruction itineraries for the three stage pipeline. +//===----------------------------------------------------------------------===// +def MBlazePipe3Itineraries : ProcessorItineraries< + [IF,ID,EX], [], [ + + // ALU instruction with one destination register and either two register + // source operands or one register source operand and one immediate operand. + // The instruction takes one cycle to execute in each of the stages. The + // two source operands are read during the decode stage and the result is + // ready after the execute stage. + InstrItinData< IIC_ALU, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]>], // one cycle in execute stage + [ 2 // result ready after two cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // ALU multiply instruction with one destination register and either two + // register source operands or one register source operand and one immediate + // operand. 
The instruction takes one cycle to execute in each of the + // pipeline stages except the execute stage, which takes three cycles. The + // two source operands are read during the decode stage and the result is + // ready after the execute stage. + InstrItinData< IIC_ALUm, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<3,[EX]>], // three cycles in execute stage + [ 4 // result ready after four cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // ALU divide instruction with one destination register and two register source + // operands. The instruction takes one cycle to execute in each of the pipeline + // stages except the execute stage, which takes 34 cycles. The two + // source operands are read during the decode stage and the result is ready + // after the execute stage. + InstrItinData< IIC_ALUd, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<34,[EX]>], // 34 cycles in execute stage + [ 35 // result ready after 35 cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Shift instruction with one destination register and either two register + // source operands or one register source operand and one immediate operand. + // The instruction takes one cycle to execute in each of the pipeline stages + // except the execute stage, which takes two cycles. The two source operands + // are read during the decode stage and the result is ready after the execute + // stage. + InstrItinData< IIC_SHT, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 3 // result ready after three cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Branch instruction with one source operand register. The instruction takes + // one cycle to execute in each of the pipeline stages. The source operand is + // read during the decode stage. + InstrItinData< IIC_BR, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]>], // one cycle in execute stage + [ 1 ]>, // first operand read after one cycle + + // Conditional branch instruction with two source operand registers. The + // instruction takes one cycle to execute in each of the pipeline stages. The + // two source operands are read during the decode stage. + InstrItinData< IIC_BRc, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]>], // one cycle in execute stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Branch and link instruction with one destination register and one source + // operand register. The instruction takes one cycle to execute in each of + // the pipeline stages. The source operand is read during the decode stage + // and the destination register is ready after the execute stage. + InstrItinData< IIC_BRl, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]>], // one cycle in execute stage + [ 2 // result ready after two cycles + , 1 ]>, // first operand read after one cycle + + // Cache control instruction with two source operand registers. 
The + // instruction takes one cycle to execute in each of the pipeline stages + // except the execute stage, which takes two cycles. The source + // operands are read during the decode stage. + InstrItinData< IIC_WDC, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Floating point instruction with one destination register and two source + // operand registers. The instruction takes one cycle to execute in each of + // the pipeline stages except the execute stage, which takes six cycles. The + // source operands are read during the decode stage and the results are ready + // after the execute stage. + InstrItinData< IIC_FPU, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<6,[EX]>], // six cycles in execute stage + [ 7 // result ready after seven cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Floating point divide instruction with one destination register and two + // source operand registers. The instruction takes one cycle to execute in + // each of the pipeline stages except the execute stage, which takes 30 + // cycles. The source operands are read during the decode stage and the + // results are ready after the execute stage. + InstrItinData< IIC_FPUd, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<30,[EX]>], // 30 cycles in execute stage + [ 31 // result ready after 31 cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Convert floating point to integer instruction with one destination + // register and one source operand register. The instruction takes one cycle + // to execute in each of the pipeline stages except the execute stage, + // which takes seven cycles. The source operands are read during the decode + // stage and the results are ready after the execute stage. + InstrItinData< IIC_FPUi, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<7,[EX]>], // seven cycles in execute stage + [ 8 // result ready after eight cycles + , 1 ]>, // first operand read after one cycle + + // Convert integer to floating point instruction with one destination + // register and one source operand register. The instruction takes one cycle + // to execute in each of the pipeline stages except the execute stage, + // which takes six cycles. The source operands are read during the decode + // stage and the results are ready after the execute stage. + InstrItinData< IIC_FPUf, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<6,[EX]>], // six cycles in execute stage + [ 7 // result ready after seven cycles + , 1 ]>, // first operand read after one cycle + + // Floating point square root instruction with one destination register and + // one source operand register. The instruction takes one cycle to execute in + // each of the pipeline stages except the execute stage, which takes 29 + // cycles. The source operands are read during the decode stage and the + // results are ready after the execute stage. 
+ InstrItinData< IIC_FPUs, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<29,[EX]>], // 29 cycles in execute stage + [ 30 // result ready after 30 cycles + , 1 ]>, // first operand read after one cycle + + // Floating point comparison instruction with one destination register and + // two source operand registers. The instruction takes one cycle to execute + // in each of the pipeline stages except the execute stage, which takes three + // cycles. The source operands are read during the decode stage and the + // results are ready after the execute stage. + InstrItinData< IIC_FPUc, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<3,[EX]>], // three cycles in execute stage + [ 4 // result ready after four cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // FSL get instruction with one register or immediate source operand and one + // destination register. The instruction takes one cycle to execute in each + // of the pipeline stages except the execute stage, which takes two cycles. + // The one source operand is read during the decode stage and the result is + // ready after the execute stage. + InstrItinData< IIC_FSLg, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 3 // result ready after three cycles + , 1 ]>, // first operand read after one cycle + + // FSL put instruction with either two register source operands or one + // register source operand and one immediate operand. There is no result + // produced by the instruction. The instruction takes one cycle to execute in + // each of the pipeline stages except the execute stage, which takes two + // cycles. The two source operands are read during the decode stage. + InstrItinData< IIC_FSLp, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Memory store instruction with either three register source operands or two + // register source operands and one immediate operand. There is no result + // produced by the instruction. The instruction takes one cycle to execute in + // each of the pipeline stages except the execute stage, which takes two + // cycles. All of the source operands are read during the decode stage. + InstrItinData< IIC_MEMs, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 1 // first operand read after one cycle + , 1 // second operand read after one cycle + , 1 ]>, // third operand read after one cycle + + // Memory load instruction with one destination register and either two + // register source operands or one register source operand and one immediate + // operand. The instruction takes one cycle to execute in each of the + // pipeline stages except the execute stage, which takes two cycles. All of + // the source operands are read during the decode stage and the result is + // ready after the execute stage. 
+ InstrItinData< IIC_MEMl, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 3 // result ready after three cycles + , 1 // first operand read after one cycle + , 1 ]> // second operand read after one cycle +]>; diff --git a/lib/Target/MBlaze/MBlazeSchedule5.td b/lib/Target/MBlaze/MBlazeSchedule5.td new file mode 100644 index 0000000..fa88766 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeSchedule5.td @@ -0,0 +1,267 @@ +//===- MBlazeSchedule5.td - MBlaze Scheduling Definitions --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MBlaze instruction itineraries for the five stage pipeline. +//===----------------------------------------------------------------------===// +def MBlazePipe5Itineraries : ProcessorItineraries< + [IF,ID,EX,MA,WB], [], [ + + // ALU instruction with one destination register and either two register + // source operands or one register source operand and one immediate operand. + // The instruction takes one cycle to execute in each of the stages. The + // two source operands are read during the decode stage and the result is + // ready after the execute stage. + InstrItinData< IIC_ALU, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 2 // result ready after two cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // ALU multiply instruction with one destination register and either two + // register source operands or one register source operand and one immediate + // operand. The instruction takes one cycle to execute in each of the + // pipeline stages. The two source operands are read during the decode stage + // and the result is ready after the execute stage. + InstrItinData< IIC_ALUm, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 2 // result ready after two cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // ALU divide instruction with one destination register and two register source + // operands. The instruction takes one cycle to execute in each of the pipeline + // stages except the memory access stage, which takes 31 cycles. The two + // source operands are read during the decode stage and the result is ready + // after the memory access stage. 
+ InstrItinData< IIC_ALUd, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<31,[MA]> // 31 cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 33 // result ready after 33 cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Shift instruction with one destination register and either two register + // source operands or one register source operand and one immediate operand. + // The instruction takes one cycle to execute in each of the pipeline stages. + // The two source operands are read during the decode stage and the result is + // ready after the memory access stage. + InstrItinData< IIC_SHT, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 3 // result ready after three cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Branch instruction with one source operand register. The instruction takes + // one cycle to execute in each of the pipeline stages. The source operand is + // read during the decode stage. + InstrItinData< IIC_BR, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 ]>, // first operand read after one cycle + + // Conditional branch instruction with two source operand registers. The + // instruction takes one cycle to execute in each of the pipeline stages. The + // two source operands are read during the decode stage. + InstrItinData< IIC_BRc, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Branch and link instruction with one destination register and one source + // operand register. The instruction takes one cycle to execute in each of + // the pipeline stages. The source operand is read during the decode stage + // and the destination register is ready after the writeback stage. + InstrItinData< IIC_BRl, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 4 // result ready after four cycles + , 1 ]>, // first operand read after one cycle + + // Cache control instruction with two source operand registers. The + // instruction takes one cycle to execute in each of the pipeline stages + // except the memory access stage, which takes two cycles. The source + // operands are read during the decode stage. 
+ InstrItinData< IIC_WDC, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<2,[MA]> // two cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Floating point instruction with one destination register and two source + // operand registers. The instruction takes one cycle to execute in each of + // the pipeline stages except the memory access stage, which takes two + // cycles. The source operands are read during the decode stage and the + // results are ready after the writeback stage. + InstrItinData< IIC_FPU, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<2,[MA]> // two cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 5 // result ready after five cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Floating point divide instruction with one destination register and two + // source operand registers. The instruction takes one cycle to execute in + // each of the pipeline stages except the memory access stage, which takes 26 + // cycles. The source operands are read during the decode stage and the + // results are ready after the writeback stage. + InstrItinData< IIC_FPUd, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<26,[MA]> // 26 cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 29 // result ready after 29 cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Convert floating point to integer instruction with one destination + // register and one source operand register. The instruction takes one cycle + // to execute in each of the pipeline stages except the memory access stage, + // which takes three cycles. The source operands are read during the decode + // stage and the results are ready after the writeback stage. + InstrItinData< IIC_FPUi, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<3,[MA]> // three cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 6 // result ready after six cycles + , 1 ]>, // first operand read after one cycle + + // Convert integer to floating point instruction with one destination + // register and one source operand register. The instruction takes one cycle + // to execute in each of the pipeline stages except the memory access stage, + // which takes two cycles. The source operands are read during the decode + // stage and the results are ready after the writeback stage. 
+ InstrItinData< IIC_FPUf, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<2,[MA]> // two cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 5 // result ready after five cycles + , 1 ]>, // first operand read after one cycle + + // Floating point square root instruction with one destination register and + // one source operand register. The instruction takes one cycle to execute in + // each of the pipeline stages except the memory access stage, which takes 25 + // cycles. The source operands are read during the decode stage and the + // results are ready after the writeback stage. + InstrItinData< IIC_FPUs, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<25,[MA]> // 25 cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 28 // result ready after 28 cycles + , 1 ]>, // first operand read after one cycle + + // Floating point comparison instruction with one destination register and + // two source operand registers. The instruction takes one cycle to execute + // in each of the pipeline stages. The source operands are read during the + // decode stage and the results are ready after the execute stage. + InstrItinData< IIC_FPUc, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 2 // result ready after two cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // FSL get instruction with one register or immediate source operand and one + // destination register. The instruction takes one cycle to execute in each + // of the pipeline stages. The one source operand is read during the decode + // stage and the result is ready after the execute stage. + InstrItinData< IIC_FSLg, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 2 // result ready after two cycles + , 1 ]>, // first operand read after one cycle + + // FSL put instruction with either two register source operands or one + // register source operand and one immediate operand. There is no result + // produced by the instruction. The instruction takes one cycle to execute in + // each of the pipeline stages. The two source operands are read during the + // decode stage. + InstrItinData< IIC_FSLp, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Memory store instruction with either three register source operands or two + // register source operands and one immediate operand. There is no result + // produced by the instruction. The instruction takes one cycle to execute in + // each of the pipeline stages. 
All of the source operands are read during + // the decode stage. + InstrItinData< IIC_MEMs, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 // first operand read after one cycle + , 1 // second operand read after one cycle + , 1 ]>, // third operand read after one cycle + + // Memory load instruction with one destination register and either two + // register source operands or one register source operand and one immediate + // operand. The instruction takes one cycle to execute in each of the + // pipeline stages. All of the source operands are read during the decode + // stage and the result is ready after the writeback stage. + InstrItinData< IIC_MEMl, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 4 // result ready after four cycles + , 1 // second operand read after one cycle + , 1 ]> // third operand read after one cycle +]>; diff --git a/lib/Target/MBlaze/MBlazeSubtarget.cpp b/lib/Target/MBlaze/MBlazeSubtarget.cpp index 3440521..a80744a 100644 --- a/lib/Target/MBlaze/MBlazeSubtarget.cpp +++ b/lib/Target/MBlaze/MBlazeSubtarget.cpp @@ -13,19 +13,39 @@ #include "MBlazeSubtarget.h" #include "MBlaze.h" +#include "MBlazeRegisterInfo.h" #include "MBlazeGenSubtarget.inc" #include "llvm/Support/CommandLine.h" using namespace llvm; MBlazeSubtarget::MBlazeSubtarget(const std::string &TT, const std::string &FS): - HasPipe3(false), HasBarrel(false), HasDiv(false), HasMul(false), - HasFSL(false), HasEFSL(false), HasMSRSet(false), HasException(false), - HasPatCmp(false), HasFPU(false), HasESR(false), HasPVR(false), - HasMul64(false), HasSqrt(false), HasMMU(false) + HasBarrel(false), HasDiv(false), HasMul(false), HasPatCmp(false), + HasFPU(false), HasMul64(false), HasSqrt(false) { - std::string CPU = "v400"; - MBlazeArchVersion = V400; - // Parse features string. - ParseSubtargetFeatures(FS, CPU); + std::string CPU = "mblaze"; + CPU = ParseSubtargetFeatures(FS, CPU); + + // Only use instruction scheduling if the selected CPU has an instruction + // itinerary (the default CPU is the only one that doesn't). 
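
(A minimal consumer-side sketch, not part of this patch: how the latency
numbers encoded by the itinerary entries above can be read back through this
era's InstrItineraryData interface. dumpItinClass is a hypothetical helper,
and the exact API may differ between revisions.)

  #include "llvm/Target/TargetInstrItineraries.h"
  #include "llvm/Support/raw_ostream.h"

  // Print the scheduling data for one itinerary class, e.g. IIC_FPU above:
  // stage latency 1+1+1+2+1 = 6, operand 0 (the def) ready after 5 cycles,
  // operands 1 and 2 read after 1 cycle.
  static void dumpItinClass(const llvm::InstrItineraryData &Itins,
                            unsigned ItinClass, unsigned NumOps) {
    if (Itins.isEmpty())
      return;                      // no itinerary for this subtarget
    llvm::errs() << "stage latency: "
                 << Itins.getStageLatency(ItinClass) << "\n";
    for (unsigned i = 0; i != NumOps; ++i)
      llvm::errs() << "  operand " << i << " cycle: "
                   << Itins.getOperandCycle(ItinClass, i) << "\n";
  }
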
+  HasItin = CPU != "mblaze";
+  DEBUG(dbgs() << "CPU " << CPU << "(" << HasItin << ")\n");
+
+  // Compute the issue width of the MBlaze itineraries.
+  computeIssueWidth();
+}
+
+void MBlazeSubtarget::computeIssueWidth() {
+  InstrItins.IssueWidth = 1;
+}
+
+bool MBlazeSubtarget::
+enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                      TargetSubtarget::AntiDepBreakMode& Mode,
+                      RegClassVector& CriticalPathRCs) const {
+  Mode = TargetSubtarget::ANTIDEP_CRITICAL;
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(&MBlaze::GPRRegClass);
+  return HasItin && OptLevel >= CodeGenOpt::Default;
 }
+
diff --git a/lib/Target/MBlaze/MBlazeSubtarget.h b/lib/Target/MBlaze/MBlazeSubtarget.h
index bebb3f7..2255b28 100644
--- a/lib/Target/MBlaze/MBlazeSubtarget.h
+++ b/lib/Target/MBlaze/MBlazeSubtarget.h
@@ -24,29 +24,14 @@ namespace llvm {
 class MBlazeSubtarget : public TargetSubtarget {
 
 protected:
-
-  enum MBlazeArchEnum {
-    V400, V500, V600, V700, V710
-  };
-
-  // MBlaze architecture version
-  MBlazeArchEnum MBlazeArchVersion;
-
-  bool HasPipe3;
   bool HasBarrel;
   bool HasDiv;
   bool HasMul;
-  bool HasFSL;
-  bool HasEFSL;
-  bool HasMSRSet;
-  bool HasException;
   bool HasPatCmp;
   bool HasFPU;
-  bool HasESR;
-  bool HasPVR;
   bool HasMul64;
   bool HasSqrt;
-  bool HasMMU;
+  bool HasItin;
 
   InstrItineraryData InstrItins;
 
@@ -61,18 +46,26 @@ public:
   std::string ParseSubtargetFeatures(const std::string &FS,
                                      const std::string &CPU);
 
+  /// Compute the maximum number of issues per cycle for the
+  /// MBlaze scheduling itineraries.
+  void computeIssueWidth();
+
+  /// enablePostRAScheduler - True at 'More' optimization.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtarget::AntiDepBreakMode& Mode,
+                             RegClassVector& CriticalPathRCs) const;
+
+  /// getInstrItineraryData - Return the instruction itineraries based on
+  /// the subtarget.
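
(A caller-side sketch, not part of this patch, of how the post-RA scheduler
of this era consults the hook declared above; simplified from
llvm/lib/CodeGen/PostRASchedulerList.cpp, so the surrounding plumbing is
approximate.)

  llvm::TargetSubtarget::AntiDepBreakMode Mode;
  llvm::SmallVector<llvm::TargetRegisterClass*, 2> CriticalPathRCs;
  const MBlazeSubtarget &ST = TM.getSubtarget<MBlazeSubtarget>();
  if (ST.enablePostRAScheduler(OptLevel, Mode, CriticalPathRCs)) {
    // Post-RA scheduling runs only when a specific -mcpu with an itinerary
    // was selected (HasItin) and optimization is at CodeGenOpt::Default or
    // higher; anti-dependence breaking is restricted to the GPR critical
    // path.
  }
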
+ const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + + bool hasItin() const { return HasItin; } + bool hasPCMP() const { return HasPatCmp; } bool hasFPU() const { return HasFPU; } bool hasSqrt() const { return HasSqrt; } bool hasMul() const { return HasMul; } bool hasMul64() const { return HasMul64; } bool hasDiv() const { return HasDiv; } bool hasBarrel() const { return HasBarrel; } - - bool isV400() const { return MBlazeArchVersion == V400; } - bool isV500() const { return MBlazeArchVersion == V500; } - bool isV600() const { return MBlazeArchVersion == V600; } - bool isV700() const { return MBlazeArchVersion == V700; } - bool isV710() const { return MBlazeArchVersion == V710; } }; } // End llvm namespace diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp index cd949e1..df34a83 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp +++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp @@ -36,19 +36,18 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, bool RelaxAll, bool NoExecStack) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: + + if (TheTriple.isOSDarwin()) { llvm_unreachable("MBlaze does not support Darwin MACH-O format"); return NULL; - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: + } + + if (TheTriple.isOSWindows()) { llvm_unreachable("MBlaze does not support Windows COFF format"); return NULL; - default: - return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, - NoExecStack); } + + return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack); } @@ -87,7 +86,8 @@ MBlazeTargetMachine(const Target &T, const std::string &TT, DataLayout("E-p:32:32:32-i8:8:8-i16:16:16"), InstrInfo(*this), FrameLowering(Subtarget), - TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this) { + TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this), + InstrItins(Subtarget.getInstrItineraryData()) { if (getRelocationModel() == Reloc::Default) { setRelocationModel(Reloc::Static); } diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.h b/lib/Target/MBlaze/MBlazeTargetMachine.h index 45ad078..48ce37a 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.h +++ b/lib/Target/MBlaze/MBlazeTargetMachine.h @@ -38,13 +38,18 @@ namespace llvm { MBlazeSelectionDAGInfo TSInfo; MBlazeIntrinsicInfo IntrinsicInfo; MBlazeELFWriterInfo ELFWriterInfo; + InstrItineraryData InstrItins; + public: MBlazeTargetMachine(const Target &T, const std::string &TT, - const std::string &FS); + const std::string &FS); virtual const MBlazeInstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const InstrItineraryData *getInstrItineraryData() const + { return &InstrItins; } + virtual const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; } diff --git a/lib/Target/MBlaze/TODO b/lib/Target/MBlaze/TODO index 2e613eb..317d7c0 100644 --- a/lib/Target/MBlaze/TODO +++ b/lib/Target/MBlaze/TODO @@ -9,8 +9,6 @@ needs to be examined more closely: - The stack layout needs to be examined to make sure it meets the standard, especially in regards to var arg functions. - - The processor itineraries are copied from a different backend - and need to be updated to model the MicroBlaze correctly. - Look at the MBlazeGenFastISel.inc stuff and make use of it if appropriate. @@ -18,9 +16,6 @@ There are a few things that need to be looked at: - There are some instructions that are not generated by the backend and have not been tested as far as the parser is concerned. 
- - The assembly parser does not use any MicroBlaze specific directives. + - The assembly parser does not use many MicroBlaze specific directives. I should investigate if there are MicroBlaze specific directive and, if there are, add them. - - The instruction MFS and MTS use special names for some of the - special registers that can be accessed. These special register - names should be parsed by the assembly parser. diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h index f0e1ce2..63860dc 100644 --- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h +++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h @@ -18,11 +18,12 @@ namespace llvm { class MCOperand; + class TargetMachine; class MSP430InstPrinter : public MCInstPrinter { public: - MSP430InstPrinter(const MCAsmInfo &MAI) : MCInstPrinter(MAI) { - } + MSP430InstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) + : MCInstPrinter(MAI) {} virtual void printInst(const MCInst *MI, raw_ostream &O); diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp index a1a7f44..5264d68 100644 --- a/lib/Target/MSP430/MSP430AsmPrinter.cpp +++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp @@ -164,10 +164,11 @@ void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) { } static MCInstPrinter *createMSP430MCInstPrinter(const Target &T, + TargetMachine &TM, unsigned SyntaxVariant, const MCAsmInfo &MAI) { if (SyntaxVariant == 0) - return new MSP430InstPrinter(MAI); + return new MSP430InstPrinter(TM, MAI); return 0; } diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index a95d59c..006785b 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -515,7 +515,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain and // flag operands which copy the outgoing args into registers. The InFlag in - // necessary since all emited instructions must be stuck together. + // necessary since all emitted instructions must be stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt index 26df1a0..8939b0a 100644 --- a/lib/Target/Mips/CMakeLists.txt +++ b/lib/Target/Mips/CMakeLists.txt @@ -13,6 +13,7 @@ tablegen(MipsGenSubtarget.inc -gen-subtarget) add_llvm_target(MipsCodeGen MipsAsmPrinter.cpp MipsDelaySlotFiller.cpp + MipsExpandPseudo.cpp MipsInstrInfo.cpp MipsISelDAGToDAG.cpp MipsISelLowering.cpp diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h index a9ab050..05b4c5a 100644 --- a/lib/Target/Mips/Mips.h +++ b/lib/Target/Mips/Mips.h @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file contains the entry points for global functions defined in +// This file contains the entry points for global functions defined in // the LLVM Mips back-end. 
// //===----------------------------------------------------------------------===// @@ -25,6 +25,7 @@ namespace llvm { FunctionPass *createMipsISelDag(MipsTargetMachine &TM); FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM); + FunctionPass *createMipsExpandPseudoPass(MipsTargetMachine &TM); extern Target TheMipsTarget; extern Target TheMipselTarget; diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index 3e6437b..b79016d 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -59,7 +59,7 @@ def FeatureMips1 : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1", def FeatureMips2 : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2", "Mips2 ISA Support">; def FeatureMips32 : SubtargetFeature<"mips32", "MipsArchVersion", "Mips32", - "Mips32 ISA Support", + "Mips32 ISA Support", [FeatureCondMov, FeatureBitCount]>; def FeatureMips32r2 : SubtargetFeature<"mips32r2", "MipsArchVersion", "Mips32r2", "Mips32r2 ISA Support", @@ -81,7 +81,7 @@ def : Proc<"r6000", [FeatureMips2]>; def : Proc<"4ke", [FeatureMips32r2]>; -// Allegrex is a 32bit subset of r4000, both for interger and fp registers, +// Allegrex is a 32bit subset of r4000, both for integer and fp registers, // but much more similar to Mips2 than Mips3. It also contains some of // Mips32/Mips32r2 instructions and a custom vector fpu processor. def : Proc<"allegrex", [FeatureMips2, FeatureSingleFloat, FeatureEABI, diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index bd28a9b..502f744 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -30,7 +30,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegistry.h" @@ -53,14 +53,14 @@ namespace { return "Mips Assembly Printer"; } - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O); void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O); void printUnsignedImm(const MachineInstr *MI, int opNum, raw_ostream &O); - void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, + void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier = 0); - void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, + void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier = 0); void printSavedRegsBitmask(raw_ostream &O); void printHex32(unsigned int Value, raw_ostream &O); @@ -77,7 +77,8 @@ namespace { } virtual void EmitFunctionBodyStart(); virtual void EmitFunctionBodyEnd(); - virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const; + virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock* + MBB) const; static const char *getRegisterName(unsigned RegNo); virtual void EmitFunctionEntryLabel(); @@ -94,12 +95,12 @@ namespace { // -- Frame directive "frame Stackpointer, Stacksize, RARegister" // Describe the stack frame. // -// -- Mask directives "(f)mask bitmask, offset" +// -- Mask directives "(f)mask bitmask, offset" // Tells the assembler which registers are saved and where. 
-// bitmask - contain a little endian bitset indicating which registers are -// saved on function prologue (e.g. with a 0x80000000 mask, the +// bitmask - contain a little endian bitset indicating which registers are +// saved on function prologue (e.g. with a 0x80000000 mask, the // assembler knows the register 31 (RA) is saved at prologue. -// offset - the position before stack pointer subtraction indicating where +// offset - the position before stack pointer subtraction indicating where // the first saved register on prologue is located. (e.g. with a // // Consider the following function prologue: @@ -110,9 +111,9 @@ namespace { // sw $ra, 40($sp) // sw $fp, 36($sp) // -// With a 0xc0000000 mask, the assembler knows the register 31 (RA) and -// 30 (FP) are saved at prologue. As the save order on prologue is from -// left to right, RA is saved first. A -8 offset means that after the +// With a 0xc0000000 mask, the assembler knows the register 31 (RA) and +// 30 (FP) are saved at prologue. As the save order on prologue is from +// left to right, RA is saved first. A -8 offset means that after the // stack pointer subtration, the first register in the mask (RA) will be // saved at address 48-8=40. // @@ -122,7 +123,7 @@ namespace { // Mask directives //===----------------------------------------------------------------------===// -// Create a bitmask with all callee saved registers for CPU or Floating Point +// Create a bitmask with all callee saved registers for CPU or Floating Point // registers. For CPU registers consider RA, GP and FP for saving if necessary. void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) { const TargetFrameLowering *TFI = TM.getFrameLowering(); @@ -168,7 +169,7 @@ void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) { // Print a 32 bit hex number with all numbers. void MipsAsmPrinter::printHex32(unsigned Value, raw_ostream &O) { O << "0x"; - for (int i = 7; i >= 0; i--) + for (int i = 7; i >= 0; i--) O << utohexstr((Value & (0xF << (i*4))) >> (i*4)); } @@ -191,9 +192,9 @@ void MipsAsmPrinter::emitFrameDirective() { } /// Emit Set directives. -const char *MipsAsmPrinter::getCurrentABIString() const { +const char *MipsAsmPrinter::getCurrentABIString() const { switch (Subtarget->getTargetABI()) { - case MipsSubtarget::O32: return "abi32"; + case MipsSubtarget::O32: return "abi32"; case MipsSubtarget::O64: return "abiO64"; case MipsSubtarget::N32: return "abiN32"; case MipsSubtarget::N64: return "abi64"; @@ -203,7 +204,7 @@ const char *MipsAsmPrinter::getCurrentABIString() const { llvm_unreachable("Unknown Mips ABI"); return NULL; -} +} void MipsAsmPrinter::EmitFunctionEntryLabel() { OutStreamer.EmitRawText("\t.ent\t" + Twine(CurrentFnSym->getName())); @@ -214,7 +215,7 @@ void MipsAsmPrinter::EmitFunctionEntryLabel() { /// the first basic block in the function. void MipsAsmPrinter::EmitFunctionBodyStart() { emitFrameDirective(); - + SmallString<128> Str; raw_svector_ostream OS(Str); printSavedRegsBitmask(OS); @@ -226,7 +227,7 @@ void MipsAsmPrinter::EmitFunctionBodyStart() { void MipsAsmPrinter::EmitFunctionBodyEnd() { // There are instruction for this macros, but they must // always be at the function end, and we can't emit and - // break with BB logic. + // break with BB logic. 
OutStreamer.EmitRawText(StringRef("\t.set\tmacro")); OutStreamer.EmitRawText(StringRef("\t.set\treorder")); OutStreamer.EmitRawText("\t.end\t" + Twine(CurrentFnSym->getName())); @@ -236,8 +237,8 @@ void MipsAsmPrinter::EmitFunctionBodyEnd() { /// isBlockOnlyReachableByFallthough - Return true if the basic block has /// exactly one predecessor and the control transfer mechanism between /// the predecessor and this block is a fall-through. -bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) - const { +bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* + MBB) const { // The predecessor has to be immediately before this block. const MachineBasicBlock *Pred = *MBB->pred_begin(); @@ -246,16 +247,41 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock * if (const BasicBlock *bb = Pred->getBasicBlock()) if (isa<SwitchInst>(bb->getTerminator())) return false; + + // If this is a landing pad, it isn't a fall through. If it has no preds, + // then nothing falls through to it. + if (MBB->isLandingPad() || MBB->pred_empty()) + return false; + + // If there isn't exactly one predecessor, it can't be a fall through. + MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI; + ++PI2; + + if (PI2 != MBB->pred_end()) + return false; + + // The predecessor has to be immediately before this block. + if (!Pred->isLayoutSuccessor(MBB)) + return false; + + // If the block is completely empty, then it definitely does fall through. + if (Pred->empty()) + return true; - return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB); + // Otherwise, check the last instruction. + // Check if the last terminator is an unconditional branch. + MachineBasicBlock::const_iterator I = Pred->end(); + while (I != Pred->begin() && !(--I)->getDesc().isTerminator()) ; + + return !I->getDesc().isBarrier(); } // Print out an operand for an inline asm expression. -bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, +bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? - if (ExtraCode && ExtraCode[0]) + if (ExtraCode && ExtraCode[0]) return true; // Unknown modifier. 
printOperand(MI, OpNo, O); @@ -273,22 +299,9 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, switch(MO.getTargetFlags()) { case MipsII::MO_GPREL: O << "%gp_rel("; break; case MipsII::MO_GOT_CALL: O << "%call16("; break; - case MipsII::MO_GOT: { - const MachineOperand &LastMO = MI->getOperand(opNum-1); - bool LastMOIsGP = LastMO.getType() == MachineOperand::MO_Register - && LastMO.getReg() == Mips::GP; - if (MI->getOpcode() == Mips::LW || LastMOIsGP) - O << "%got("; - else - O << "%lo("; - break; - } - case MipsII::MO_ABS_HILO: - if (MI->getOpcode() == Mips::LUi) - O << "%hi("; - else - O << "%lo("; - break; + case MipsII::MO_GOT: O << "%got("; break; + case MipsII::MO_ABS_HI: O << "%hi("; break; + case MipsII::MO_ABS_LO: O << "%lo("; break; } switch (MO.getType()) { @@ -308,6 +321,12 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, O << *Mang->getSymbol(MO.getGlobal()); break; + case MachineOperand::MO_BlockAddress: { + MCSymbol* BA = GetBlockAddressSymbol(MO.getBlockAddress()); + O << BA->getName(); + break; + } + case MachineOperand::MO_ExternalSymbol: O << *GetExternalSymbolSymbol(MO.getSymbolName()); break; @@ -323,7 +342,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, if (MO.getOffset()) O << "+" << MO.getOffset(); break; - + default: llvm_unreachable("<unknown operand type>"); } @@ -336,7 +355,7 @@ void MipsAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum, const MachineOperand &MO = MI->getOperand(opNum); if (MO.isImm()) O << (unsigned short int)MO.getImm(); - else + else printOperand(MI, opNum, O); } @@ -352,8 +371,8 @@ printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, return; } - // Load/Store memory operands -- imm($reg) - // If PIC target the target is loaded as the + // Load/Store memory operands -- imm($reg) + // If PIC target the target is loaded as the // pattern lw $25,%call16($28) printOperand(MI, opNum, O); O << "("; @@ -365,12 +384,12 @@ void MipsAsmPrinter:: printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier) { const MachineOperand& MO = MI->getOperand(opNum); - O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm()); + O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm()); } void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { // FIXME: Use SwitchSection. - + // Tell the assembler which ABI we are using OutStreamer.EmitRawText("\t.section .mdebug." + Twine(getCurrentABIString())); @@ -383,11 +402,11 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { } // return to previous section - OutStreamer.EmitRawText(StringRef("\t.previous")); + OutStreamer.EmitRawText(StringRef("\t.previous")); } // Force static initialization. -extern "C" void LLVMInitializeMipsAsmPrinter() { +extern "C" void LLVMInitializeMipsAsmPrinter() { RegisterAsmPrinter<MipsAsmPrinter> X(TheMipsTarget); RegisterAsmPrinter<MipsAsmPrinter> Y(TheMipselTarget); } diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td index 8f313ef..57aeb1d 100644 --- a/lib/Target/Mips/MipsCallingConv.td +++ b/lib/Target/Mips/MipsCallingConv.td @@ -1,23 +1,23 @@ //===- MipsCallingConv.td - Calling Conventions for Mips ---*- tablegen -*-===// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // This describes the calling conventions for Mips architecture. 
//===----------------------------------------------------------------------===// /// CCIfSubtarget - Match if the current subtarget has a feature F. -class CCIfSubtarget<string F, CCAction A>: +class CCIfSubtarget<string F, CCAction A>: CCIf<!strconcat("State.getTarget().getSubtarget<MipsSubtarget>().", F), A>; //===----------------------------------------------------------------------===// // Mips O32 Calling Convention //===----------------------------------------------------------------------===// -// Only the return rules are defined here for O32. The rules for argument +// Only the return rules are defined here for O32. The rules for argument // passing are defined in MipsISelLowering.cpp. def RetCC_MipsO32 : CallingConv<[ // i32 are returned in registers V0, V1 @@ -41,15 +41,15 @@ def CC_MipsEABI : CallingConv<[ // Integer arguments are passed in integer registers. CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>, - // Single fp arguments are passed in pairs within 32-bit mode - CCIfType<[f32], CCIfSubtarget<"isSingleFloat()", + // Single fp arguments are passed in pairs within 32-bit mode + CCIfType<[f32], CCIfSubtarget<"isSingleFloat()", CCAssignToReg<[F12, F13, F14, F15, F16, F17, F18, F19]>>>, - CCIfType<[f32], CCIfSubtarget<"isNotSingleFloat()", + CCIfType<[f32], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[F12, F14, F16, F18]>>>, - // The first 4 doubl fp arguments are passed in single fp registers. - CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", + // The first 4 double fp arguments are passed in single fp registers. + CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D6, D7, D8, D9]>>>, // Integer values get stored in stack slots that are 4 bytes in diff --git a/lib/Target/Mips/MipsExpandPseudo.cpp b/lib/Target/Mips/MipsExpandPseudo.cpp new file mode 100644 index 0000000..4423f51 --- /dev/null +++ b/lib/Target/Mips/MipsExpandPseudo.cpp @@ -0,0 +1,117 @@ +//===-- MipsExpandPseudo.cpp - Expand pseudo instructions ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands pseudo instructions into target instructions after register +// allocation but before post-RA scheduling. 
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-expand-pseudo"
+
+#include "Mips.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+namespace {
+  struct MipsExpandPseudo : public MachineFunctionPass {
+
+    TargetMachine &TM;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+    MipsExpandPseudo(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+    virtual const char *getPassName() const {
+      return "Mips PseudoInstrs Expansion";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F);
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+  private:
+    void ExpandBuildPairF64(MachineBasicBlock&, MachineBasicBlock::iterator);
+    void ExpandExtractElementF64(MachineBasicBlock&,
+                                 MachineBasicBlock::iterator);
+  };
+  char MipsExpandPseudo::ID = 0;
+} // end of anonymous namespace
+
+bool MipsExpandPseudo::runOnMachineFunction(MachineFunction& F) {
+  bool Changed = false;
+
+  for (MachineFunction::iterator I = F.begin(); I != F.end(); ++I)
+    Changed |= runOnMachineBasicBlock(*I);
+
+  return Changed;
+}
+
+bool MipsExpandPseudo::runOnMachineBasicBlock(MachineBasicBlock& MBB) {
+
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
+    const TargetInstrDesc& Tid = I->getDesc();
+
+    switch(Tid.getOpcode()) {
+    default:
+      ++I;
+      continue;
+    case Mips::BuildPairF64:
+      ExpandBuildPairF64(MBB, I);
+      break;
+    case Mips::ExtractElementF64:
+      ExpandExtractElementF64(MBB, I);
+      break;
+    }
+
+    // delete original instr
+    MBB.erase(I++);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+void MipsExpandPseudo::ExpandBuildPairF64(MachineBasicBlock& MBB,
+                                          MachineBasicBlock::iterator I) {
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
+  const TargetInstrDesc& Mtc1Tdd = TII->get(Mips::MTC1);
+  DebugLoc dl = I->getDebugLoc();
+  const unsigned* SubReg =
+    TM.getRegisterInfo()->getSubRegisters(DstReg);
+
+  // mtc1 Lo, $fp
+  // mtc1 Hi, $fp + 1
+  BuildMI(MBB, I, dl, Mtc1Tdd, *SubReg).addReg(LoReg);
+  BuildMI(MBB, I, dl, Mtc1Tdd, *(SubReg + 1)).addReg(HiReg);
+}
+
+void MipsExpandPseudo::ExpandExtractElementF64(MachineBasicBlock& MBB,
+                                               MachineBasicBlock::iterator I) {
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned SrcReg = I->getOperand(1).getReg();
+  unsigned N = I->getOperand(2).getImm();
+  const TargetInstrDesc& Mfc1Tdd = TII->get(Mips::MFC1);
+  DebugLoc dl = I->getDebugLoc();
+  const unsigned* SubReg = TM.getRegisterInfo()->getSubRegisters(SrcReg);
+
+  BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(*(SubReg + N));
+}
+
+/// createMipsExpandPseudoPass - Returns a pass that expands pseudo
+/// instrs into real instrs.
+FunctionPass *llvm::createMipsExpandPseudoPass(MipsTargetMachine &tm) {
+  return new MipsExpandPseudo(tm);
+}
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index 87a097a..21e3314 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -203,6 +203,46 @@ void MipsFrameLowering::adjustMipsStackFrame(MachineFunction &MF) const {
   MipsFI->setFPUTopSavedRegOff(TopFPUSavedRegOff-StackOffset);
 }
 
+
+// Expand a pair of register and immediate if the immediate doesn't fit in the
+// 16-bit offset field.
+// e.g.
+// if OrigImm = 0x10000, OrigReg = $sp:
+// generate the following sequence of instrs:
+//   lui   $at, hi(0x10000)
+//   addu  $at, $sp, $at
+//
+// (NewReg, NewImm) = ($at, lo(0x10000))
+// return true
+static bool expandRegLargeImmPair(unsigned OrigReg, int OrigImm,
+                                  unsigned& NewReg, int& NewImm,
+                                  MachineBasicBlock& MBB,
+                                  MachineBasicBlock::iterator I) {
+  // OrigImm fits in the 16-bit field
+  if (OrigImm < 0x8000 && OrigImm >= -0x8000) {
+    NewReg = OrigReg;
+    NewImm = OrigImm;
+    return false;
+  }
+
+  MachineFunction* MF = MBB.getParent();
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  DebugLoc DL = I->getDebugLoc();
+  int ImmLo = OrigImm & 0xffff;
+  int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) +
+              ((OrigImm & 0x8000) != 0);
+
+  // FIXME: change this when mips goes MC.
+  BuildMI(MBB, I, DL, TII->get(Mips::NOAT));
+  BuildMI(MBB, I, DL, TII->get(Mips::LUi), Mips::AT).addImm(ImmHi);
+  BuildMI(MBB, I, DL, TII->get(Mips::ADDu), Mips::AT).addReg(OrigReg)
+    .addReg(Mips::AT);
+  NewReg = Mips::AT;
+  NewImm = ImmLo;
+
+  return true;
+}
+
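
(A standalone illustration, not part of the patch, of the hi/lo arithmetic
above: ADDiu/LW/SW sign-extend their 16-bit immediate, so when bit 15 of the
offset is set the high half must be incremented by one to compensate.)

  #include <cassert>
  #include <cstdio>

  int main() {
    int OrigImm = 0x18000;        // does not fit in a signed 16-bit field
    int ImmLo = OrigImm & 0xffff; // 0x8000, reads back as -32768
    int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) +
                ((OrigImm & 0x8000) != 0); // 0x1 + 1 = 0x2
    // lui $at, 0x2 ; addu $at, $sp, $at ; ... -32768($at)
    assert((ImmHi << 16) + (short)ImmLo == OrigImm); // round-trips exactly
    printf("hi=0x%x lo=0x%x\n", ImmHi, ImmLo);
    return 0;
  }
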
 void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -214,6 +254,9 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   bool isPIC = (MF.getTarget().getRelocationModel() == Reloc::PIC_);
+  unsigned NewReg = 0;
+  int NewImm = 0;
+  bool ATUsed;
 
   // Get the right frame order for Mips.
   adjustMipsStackFrame(MF);
@@ -236,22 +279,40 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
   BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO));
 
   // Adjust stack : addi sp, sp, (-imm)
+  ATUsed = expandRegLargeImmPair(Mips::SP, -StackSize, NewReg, NewImm, MBB,
+                                 MBBI);
   BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP)
-    .addReg(Mips::SP).addImm(-StackSize);
+    .addReg(NewReg).addImm(NewImm);
+
+  // FIXME: change this when mips goes MC.
+  if (ATUsed)
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
 
-  // Save the return address only if the function isnt a leaf one.
+  // Save the return address only if the function isn't a leaf one.
   // sw $ra, stack_loc($sp)
   if (MFI->adjustsStack()) {
+    ATUsed = expandRegLargeImmPair(Mips::SP, RAOffset, NewReg, NewImm, MBB,
+                                   MBBI);
     BuildMI(MBB, MBBI, dl, TII.get(Mips::SW))
-      .addReg(Mips::RA).addImm(RAOffset).addReg(Mips::SP);
+      .addReg(Mips::RA).addImm(NewImm).addReg(NewReg);
+
+    // FIXME: change this when mips goes MC.
+    if (ATUsed)
+      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
   }
 
   // if framepointer enabled, save it and set it
   // to point to the stack pointer
   if (hasFP(MF)) {
     // sw $fp,stack_loc($sp)
+    ATUsed = expandRegLargeImmPair(Mips::SP, FPOffset, NewReg, NewImm, MBB,
+                                   MBBI);
     BuildMI(MBB, MBBI, dl, TII.get(Mips::SW))
-      .addReg(Mips::FP).addImm(FPOffset).addReg(Mips::SP);
+      .addReg(Mips::FP).addImm(NewImm).addReg(NewReg);
+
+    // FIXME: change this when mips goes MC.
+    if (ATUsed)
+      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
 
     // move $fp, $sp
     BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::FP)
@@ -280,6 +341,10 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
   int FPOffset = MipsFI->getFPStackOffset();
   int RAOffset = MipsFI->getRAStackOffset();
 
+  unsigned NewReg = 0;
+  int NewImm = 0;
+  bool ATUsed = false;
+
   // if framepointer enabled, restore it and restore the
   // stack pointer
   if (hasFP(MF)) {
@@ -288,21 +353,39 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
       .addReg(Mips::FP).addReg(Mips::ZERO);
 
     // lw $fp,stack_loc($sp)
+    ATUsed = expandRegLargeImmPair(Mips::SP, FPOffset, NewReg, NewImm, MBB,
+                                   MBBI);
     BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::FP)
-      .addImm(FPOffset).addReg(Mips::SP);
+      .addImm(NewImm).addReg(NewReg);
+
+    // FIXME: change this when mips goes MC.
+    if (ATUsed)
+      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
   }
 
-  // Restore the return address only if the function isnt a leaf one.
+  // Restore the return address only if the function isn't a leaf one.
   // lw $ra, stack_loc($sp)
   if (MFI->adjustsStack()) {
+    ATUsed = expandRegLargeImmPair(Mips::SP, RAOffset, NewReg, NewImm, MBB,
+                                   MBBI);
     BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::RA)
-      .addImm(RAOffset).addReg(Mips::SP);
+      .addImm(NewImm).addReg(NewReg);
+
+    // FIXME: change this when mips goes MC.
+    if (ATUsed)
+      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
   }
 
   // adjust stack : insert addi sp, sp, (imm)
   if (NumBytes) {
+    ATUsed = expandRegLargeImmPair(Mips::SP, NumBytes, NewReg, NewImm, MBB,
+                                   MBBI);
     BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP)
-      .addReg(Mips::SP).addImm(NumBytes);
+      .addReg(NewReg).addImm(NewImm);
+
+    // FIXME: change this when mips goes MC.
+    if (ATUsed)
+      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
   }
 }
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index a8426c1..34647df 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ALPHA_FRAMEINFO_H
-#define ALPHA_FRAMEINFO_H
+#ifndef MIPS_FRAMEINFO_H
+#define MIPS_FRAMEINFO_H
 
 #include "Mips.h"
 #include "MipsSubtarget.h"
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 755e04d..0382964 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -52,19 +52,19 @@ class MipsDAGToDAGISel : public SelectionDAGISel {
   /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
   /// make the right decision when generating code for different targets.
   const MipsSubtarget &Subtarget;
-  
+
 public:
   explicit MipsDAGToDAGISel(MipsTargetMachine &tm) :
   SelectionDAGISel(tm),
   TM(tm), Subtarget(tm.getSubtarget<MipsSubtarget>()) {}
-  
+
   // Pass Name
   virtual const char *getPassName() const {
     return "MIPS DAG->DAG Pattern Instruction Selection";
-  }
-  
-private: 
+  }
+
+private:
   // Include the pieces autogenerated from the target description.
#include "MipsGenDAGISel.inc" @@ -116,12 +116,14 @@ SelectAddr(SDValue Addr, SDValue &Offset, SDValue &Base) { Offset = CurDAG->getTargetConstant(0, MVT::i32); return true; } - + // on PIC code Load GA if (TM.getRelocationModel() == Reloc::PIC_) { - if ((Addr.getOpcode() == ISD::TargetGlobalAddress) || - (Addr.getOpcode() == ISD::TargetConstantPool) || - (Addr.getOpcode() == ISD::TargetJumpTable)){ + if ((Addr.getOpcode() == ISD::TargetGlobalAddress) || + (Addr.getOpcode() == ISD::TargetConstantPool) || + (Addr.getOpcode() == ISD::TargetJumpTable) || + (Addr.getOpcode() == ISD::TargetBlockAddress) || + (Addr.getOpcode() == ISD::TargetExternalSymbol)) { Base = CurDAG->getRegister(Mips::GP, MVT::i32); Offset = Addr; return true; @@ -130,8 +132,8 @@ SelectAddr(SDValue Addr, SDValue &Offset, SDValue &Base) { if ((Addr.getOpcode() == ISD::TargetExternalSymbol || Addr.getOpcode() == ISD::TargetGlobalAddress)) return false; - } - + } + // Operand is a result from an ADD. if (Addr.getOpcode() == ISD::ADD) { if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { @@ -158,10 +160,10 @@ SelectAddr(SDValue Addr, SDValue &Offset, SDValue &Base) { // Generate: // lui $2, %hi($CPI1_0) // lwc1 $f0, %lo($CPI1_0)($2) - if ((Addr.getOperand(0).getOpcode() == MipsISD::Hi || + if ((Addr.getOperand(0).getOpcode() == MipsISD::Hi || Addr.getOperand(0).getOpcode() == ISD::LOAD) && Addr.getOperand(1).getOpcode() == MipsISD::Lo) { - SDValue LoVal = Addr.getOperand(1); + SDValue LoVal = Addr.getOperand(1); if (dyn_cast<ConstantPoolSDNode>(LoVal.getOperand(0))) { Base = Addr.getOperand(0); Offset = LoVal.getOperand(0); @@ -176,7 +178,7 @@ SelectAddr(SDValue Addr, SDValue &Offset, SDValue &Base) { } SDNode *MipsDAGToDAGISel::SelectLoadFp64(SDNode *N) { - MVT::SimpleValueType NVT = + MVT::SimpleValueType NVT = N->getValueType(0).getSimpleVT().SimpleTy; if (!Subtarget.isMips1() || NVT != MVT::f64) @@ -199,14 +201,14 @@ SDNode *MipsDAGToDAGISel::SelectLoadFp64(SDNode *N) { MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); DebugLoc dl = N->getDebugLoc(); - // The second load should start after for 4 bytes. + // The second load should start after for 4 bytes. 
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Offset0)) Offset1 = CurDAG->getTargetConstant(C->getSExtValue()+4, MVT::i32); else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Offset0)) - Offset1 = CurDAG->getTargetConstantPool(CP->getConstVal(), - MVT::i32, - CP->getAlignment(), - CP->getOffset()+4, + Offset1 = CurDAG->getTargetConstantPool(CP->getConstVal(), + MVT::i32, + CP->getAlignment(), + CP->getOffset()+4, CP->getTargetFlags()); else return NULL; @@ -220,16 +222,16 @@ SDNode *MipsDAGToDAGISel::SelectLoadFp64(SDNode *N) { // Generate: // lwc $f0, X($3) // lwc $f1, X+4($3) - SDNode *LD0 = CurDAG->getMachineNode(Mips::LWC1, dl, MVT::f32, + SDNode *LD0 = CurDAG->getMachineNode(Mips::LWC1, dl, MVT::f32, MVT::Other, Offset0, Base, Chain); SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, NVT), 0); - SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::sub_fpeven, dl, + SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::sub_fpeven, dl, MVT::f64, Undef, SDValue(LD0, 0)); SDNode *LD1 = CurDAG->getMachineNode(Mips::LWC1, dl, MVT::f32, MVT::Other, Offset1, Base, SDValue(LD0, 1)); - SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::sub_fpodd, dl, + SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::sub_fpodd, dl, MVT::f64, I0, SDValue(LD1, 0)); ReplaceUses(SDValue(N, 0), I1); @@ -241,7 +243,7 @@ SDNode *MipsDAGToDAGISel::SelectLoadFp64(SDNode *N) { SDNode *MipsDAGToDAGISel::SelectStoreFp64(SDNode *N) { - if (!Subtarget.isMips1() || + if (!Subtarget.isMips1() || N->getOperand(1).getValueType() != MVT::f64) return NULL; @@ -265,12 +267,12 @@ SDNode *MipsDAGToDAGISel::SelectStoreFp64(SDNode *N) { DebugLoc dl = N->getDebugLoc(); // Get the even and odd part from the f64 register - SDValue FPOdd = CurDAG->getTargetExtractSubreg(Mips::sub_fpodd, + SDValue FPOdd = CurDAG->getTargetExtractSubreg(Mips::sub_fpodd, dl, MVT::f32, N1); SDValue FPEven = CurDAG->getTargetExtractSubreg(Mips::sub_fpeven, dl, MVT::f32, N1); - // The second store should start after for 4 bytes. + // The second store should start after for 4 bytes. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Offset0)) Offset1 = CurDAG->getTargetConstant(C->getSExtValue()+4, MVT::i32); else @@ -315,26 +317,26 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { } /// - // Instruction Selection not handled by the auto-generated + // Instruction Selection not handled by the auto-generated // tablegen selection should be handled here. 
- /// + /// switch(Opcode) { default: break; - case ISD::SUBE: + case ISD::SUBE: case ISD::ADDE: { SDValue InFlag = Node->getOperand(2), CmpLHS; unsigned Opc = InFlag.getOpcode(); (void)Opc; - assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) || - (Opc == ISD::SUBC || Opc == ISD::SUBE)) && + assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) || + (Opc == ISD::SUBC || Opc == ISD::SUBE)) && "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn"); unsigned MOp; if (Opcode == ISD::ADDE) { CmpLHS = InFlag.getValue(0); MOp = Mips::ADDu; - } else { + } else { CmpLHS = InFlag.getOperand(0); MOp = Mips::SUBu; } @@ -346,7 +348,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { EVT VT = LHS.getValueType(); SDNode *Carry = CurDAG->getMachineNode(Mips::SLTu, dl, VT, Ops, 2); - SDNode *AddCarry = CurDAG->getMachineNode(Mips::ADDu, dl, VT, + SDNode *AddCarry = CurDAG->getMachineNode(Mips::ADDu, dl, VT, SDValue(Carry,0), RHS); return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, @@ -356,36 +358,34 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { /// Mul/Div with two results case ISD::SDIVREM: case ISD::UDIVREM: + break; case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: { SDValue Op1 = Node->getOperand(0); SDValue Op2 = Node->getOperand(1); unsigned Op; - if (Opcode == ISD::UMUL_LOHI || Opcode == ISD::SMUL_LOHI) - Op = (Opcode == ISD::UMUL_LOHI ? Mips::MULTu : Mips::MULT); - else - Op = (Opcode == ISD::UDIVREM ? Mips::DIVu : Mips::DIV); + Op = (Opcode == ISD::UMUL_LOHI ? Mips::MULTu : Mips::MULT); - SDNode *MulDiv = CurDAG->getMachineNode(Op, dl, MVT::Glue, Op1, Op2); + SDNode *Mul = CurDAG->getMachineNode(Op, dl, MVT::Glue, Op1, Op2); - SDValue InFlag = SDValue(MulDiv, 0); - SDNode *Lo = CurDAG->getMachineNode(Mips::MFLO, dl, MVT::i32, + SDValue InFlag = SDValue(Mul, 0); + SDNode *Lo = CurDAG->getMachineNode(Mips::MFLO, dl, MVT::i32, MVT::Glue, InFlag); InFlag = SDValue(Lo,1); SDNode *Hi = CurDAG->getMachineNode(Mips::MFHI, dl, MVT::i32, InFlag); - if (!SDValue(Node, 0).use_empty()) + if (!SDValue(Node, 0).use_empty()) ReplaceUses(SDValue(Node, 0), SDValue(Lo,0)); - if (!SDValue(Node, 1).use_empty()) + if (!SDValue(Node, 1).use_empty()) ReplaceUses(SDValue(Node, 1), SDValue(Hi,0)); return NULL; } /// Special Muls - case ISD::MUL: + case ISD::MUL: if (Subtarget.isMips32()) break; case ISD::MULHS: @@ -394,7 +394,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { SDValue MulOp2 = Node->getOperand(1); unsigned MulOp = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT); - SDNode *MulNode = CurDAG->getMachineNode(MulOp, dl, + SDNode *MulNode = CurDAG->getMachineNode(MulOp, dl, MVT::Glue, MulOp1, MulOp2); SDValue InFlag = SDValue(MulNode, 0); @@ -408,24 +408,9 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { /// Div/Rem operations case ISD::SREM: case ISD::UREM: - case ISD::SDIV: - case ISD::UDIV: { - SDValue Op1 = Node->getOperand(0); - SDValue Op2 = Node->getOperand(1); - - unsigned Op, MOp; - if (Opcode == ISD::SDIV || Opcode == ISD::UDIV) { - Op = (Opcode == ISD::SDIV ? Mips::DIV : Mips::DIVu); - MOp = Mips::MFLO; - } else { - Op = (Opcode == ISD::SREM ? Mips::DIV : Mips::DIVu); - MOp = Mips::MFHI; - } - SDNode *Node = CurDAG->getMachineNode(Op, dl, MVT::Glue, Op1, Op2); - - SDValue InFlag = SDValue(Node, 0); - return CurDAG->getMachineNode(MOp, dl, MVT::i32, InFlag); - } + case ISD::SDIV: + case ISD::UDIV: + break; // Get target GOT address. 
case ISD::GLOBAL_OFFSET_TABLE: @@ -433,15 +418,15 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { case ISD::ConstantFP: { ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node); - if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) { - SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) { + SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, Mips::ZERO, MVT::i32); SDValue Undef = SDValue( CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::f64), 0); SDNode *MTC = CurDAG->getMachineNode(Mips::MTC1, dl, MVT::f32, Zero); - SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::sub_fpeven, dl, + SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::sub_fpeven, dl, MVT::f64, Undef, SDValue(MTC, 0)); - SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::sub_fpodd, dl, + SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::sub_fpodd, dl, MVT::f64, I0, SDValue(MTC, 0)); ReplaceUses(SDValue(Node, 0), I1); return I1.getNode(); @@ -460,61 +445,6 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { return ResNode; // Other cases are autogenerated. break; - - /// Handle direct and indirect calls when using PIC. On PIC, when - /// GOT is smaller than about 64k (small code) the GA target is - /// loaded with only one instruction. Otherwise GA's target must - /// be loaded with 3 instructions. - case MipsISD::JmpLink: { - if (TM.getRelocationModel() == Reloc::PIC_) { - unsigned LastOpNum = Node->getNumOperands()-1; - - SDValue Chain = Node->getOperand(0); - SDValue Callee = Node->getOperand(1); - SDValue InFlag; - - // Skip the incomming flag if present - if (Node->getOperand(LastOpNum).getValueType() == MVT::Glue) - LastOpNum--; - - if ( (isa<GlobalAddressSDNode>(Callee)) || - (isa<ExternalSymbolSDNode>(Callee)) ) - { - /// Direct call for global addresses and external symbols - SDValue GPReg = CurDAG->getRegister(Mips::GP, MVT::i32); - - // Use load to get GOT target - SDValue Ops[] = { Callee, GPReg, Chain }; - SDValue Load = SDValue(CurDAG->getMachineNode(Mips::LW, dl, MVT::i32, - MVT::Other, Ops, 3), 0); - Chain = Load.getValue(1); - - // Call target must be on T9 - Chain = CurDAG->getCopyToReg(Chain, dl, Mips::T9, Load, InFlag); - } else - /// Indirect call - Chain = CurDAG->getCopyToReg(Chain, dl, Mips::T9, Callee, InFlag); - - // Map the JmpLink operands to JALR - SDVTList NodeTys = CurDAG->getVTList(MVT::Other, MVT::Glue); - SmallVector<SDValue, 8> Ops; - Ops.push_back(CurDAG->getRegister(Mips::T9, MVT::i32)); - - for (unsigned i = 2, e = LastOpNum+1; i != e; ++i) - Ops.push_back(Node->getOperand(i)); - Ops.push_back(Chain); - Ops.push_back(Chain.getValue(1)); - - // Emit Jump and Link Register - SDNode *ResNode = CurDAG->getMachineNode(Mips::JALR, dl, NodeTys, - &Ops[0], Ops.size()); - - // Replace Chain and InFlag - ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); - ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 1)); - return ResNode; - } - } } // Select the default instruction @@ -529,7 +459,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { return ResNode; } -/// createMipsISelDag - This pass converts a legalized DAG into a +/// createMipsISelDag - This pass converts a legalized DAG into a /// MIPS-specific DAG, ready for instruction scheduling. 
FunctionPass *llvm::createMipsISelDag(MipsTargetMachine &TM) { return new MipsDAGToDAGISel(TM); diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 1d7a1c0..1f1220f 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -41,15 +41,19 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::Lo : return "MipsISD::Lo"; case MipsISD::GPRel : return "MipsISD::GPRel"; case MipsISD::Ret : return "MipsISD::Ret"; - case MipsISD::SelectCC : return "MipsISD::SelectCC"; - case MipsISD::FPSelectCC : return "MipsISD::FPSelectCC"; case MipsISD::FPBrcond : return "MipsISD::FPBrcond"; case MipsISD::FPCmp : return "MipsISD::FPCmp"; + case MipsISD::CMovFP_T : return "MipsISD::CMovFP_T"; + case MipsISD::CMovFP_F : return "MipsISD::CMovFP_F"; case MipsISD::FPRound : return "MipsISD::FPRound"; case MipsISD::MAdd : return "MipsISD::MAdd"; case MipsISD::MAddu : return "MipsISD::MAddu"; case MipsISD::MSub : return "MipsISD::MSub"; case MipsISD::MSubu : return "MipsISD::MSubu"; + case MipsISD::DivRem : return "MipsISD::DivRem"; + case MipsISD::DivRemU : return "MipsISD::DivRemU"; + case MipsISD::BuildPairF64: return "MipsISD::BuildPairF64"; + case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64"; default : return NULL; } } @@ -89,25 +93,22 @@ MipsTargetLowering(MipsTargetMachine &TM) // Mips Custom Operations setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::BlockAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::JumpTable, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); - setOperationAction(ISD::SETCC, MVT::f32, Custom); - setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::VASTART, MVT::Other, Custom); - - // We custom lower AND/OR to handle the case where the DAG contain 'ands/ors' - // with operands comming from setcc fp comparions. This is necessary since - // the result from these setcc are in a flag registers (FCR31). - setOperationAction(ISD::AND, MVT::i32, Custom); - setOperationAction(ISD::OR, MVT::i32, Custom); + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); // Operations not directly supported by Mips. 
setOperationAction(ISD::BR_JT, MVT::Other, Expand); @@ -129,7 +130,9 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FPOWI, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); setOperationAction(ISD::FLOG, MVT::f32, Expand); @@ -139,6 +142,10 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + // Use the default for now setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); @@ -160,6 +167,9 @@ MipsTargetLowering(MipsTargetMachine &TM) setTargetDAGCombine(ISD::ADDE); setTargetDAGCombine(ISD::SUBE); + setTargetDAGCombine(ISD::SDIVREM); + setTargetDAGCombine(ISD::UDIVREM); + setTargetDAGCombine(ISD::SETCC); setStackPointerRegisterToSaveRestore(Mips::SP); computeRegisterProperties(); @@ -181,7 +191,7 @@ unsigned MipsTargetLowering::getFunctionAlignment(const Function *) const { // multHi/Lo: product of multiplication // Lo0: initial value of Lo register // Hi0: initial value of Hi register -// Return true if mattern matching was successful. +// Return true if pattern matching was successful. static bool SelectMadd(SDNode* ADDENode, SelectionDAG* CurDAG) { // ADDENode's second operand must be a flag output of an ADDC node in order // for the matching to be successful. @@ -255,7 +265,7 @@ static bool SelectMadd(SDNode* ADDENode, SelectionDAG* CurDAG) { // multHi/Lo: product of multiplication // Lo0: initial value of Lo register // Hi0: initial value of Hi register -// Return true if mattern matching was successful. +// Return true if pattern matching was successful. static bool SelectMsub(SDNode* SUBENode, SelectionDAG* CurDAG) { // SUBENode's second operand must be a flag output of an SUBC node in order // for the matching to be successful. @@ -346,6 +356,130 @@ static SDValue PerformSUBECombine(SDNode *N, SelectionDAG& DAG, return SDValue(); } +static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget* Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + unsigned opc = N->getOpcode() == ISD::SDIVREM ? 
MipsISD::DivRem : + MipsISD::DivRemU; + DebugLoc dl = N->getDebugLoc(); + + SDValue DivRem = DAG.getNode(opc, dl, MVT::Glue, + N->getOperand(0), N->getOperand(1)); + SDValue InChain = DAG.getEntryNode(); + SDValue InGlue = DivRem; + + // insert MFLO + if (N->hasAnyUseOfValue(0)) { + SDValue CopyFromLo = DAG.getCopyFromReg(InChain, dl, Mips::LO, MVT::i32, + InGlue); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyFromLo); + InChain = CopyFromLo.getValue(1); + InGlue = CopyFromLo.getValue(2); + } + + // insert MFHI + if (N->hasAnyUseOfValue(1)) { + SDValue CopyFromHi = DAG.getCopyFromReg(InChain, dl, + Mips::HI, MVT::i32, InGlue); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), CopyFromHi); + } + + return SDValue(); +} + +static Mips::CondCode FPCondCCodeToFCC(ISD::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Unknown fp condition code!"); + case ISD::SETEQ: + case ISD::SETOEQ: return Mips::FCOND_OEQ; + case ISD::SETUNE: return Mips::FCOND_UNE; + case ISD::SETLT: + case ISD::SETOLT: return Mips::FCOND_OLT; + case ISD::SETGT: + case ISD::SETOGT: return Mips::FCOND_OGT; + case ISD::SETLE: + case ISD::SETOLE: return Mips::FCOND_OLE; + case ISD::SETGE: + case ISD::SETOGE: return Mips::FCOND_OGE; + case ISD::SETULT: return Mips::FCOND_ULT; + case ISD::SETULE: return Mips::FCOND_ULE; + case ISD::SETUGT: return Mips::FCOND_UGT; + case ISD::SETUGE: return Mips::FCOND_UGE; + case ISD::SETUO: return Mips::FCOND_UN; + case ISD::SETO: return Mips::FCOND_OR; + case ISD::SETNE: + case ISD::SETONE: return Mips::FCOND_ONE; + case ISD::SETUEQ: return Mips::FCOND_UEQ; + } +} + + +// Returns true if condition code has to be inverted. +static bool InvertFPCondCode(Mips::CondCode CC) { + if (CC >= Mips::FCOND_F && CC <= Mips::FCOND_NGT) + return false; + + if (CC >= Mips::FCOND_T && CC <= Mips::FCOND_GT) + return true; + + assert(false && "Illegal Condition Code"); + return false; +} + +// Creates and returns an FPCmp node from a setcc node. +// Returns Op if setcc is not a floating point comparison. +static SDValue CreateFPCmp(SelectionDAG& DAG, const SDValue& Op) { + // must be a SETCC node + if (Op.getOpcode() != ISD::SETCC) + return Op; + + SDValue LHS = Op.getOperand(0); + + if (!LHS.getValueType().isFloatingPoint()) + return Op; + + SDValue RHS = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + + // Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of + // node if necessary. + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + + return DAG.getNode(MipsISD::FPCmp, dl, MVT::Glue, LHS, RHS, + DAG.getConstant(FPCondCCodeToFCC(CC), MVT::i32)); +} + +// Creates and returns a CMovFPT/F node. +static SDValue CreateCMovFP(SelectionDAG& DAG, SDValue Cond, SDValue True, + SDValue False, DebugLoc DL) { + bool invert = InvertFPCondCode((Mips::CondCode) + cast<ConstantSDNode>(Cond.getOperand(2)) + ->getSExtValue()); + + return DAG.getNode((invert ? 
MipsISD::CMovFP_F : MipsISD::CMovFP_T), DL, + True.getValueType(), True, False, Cond); +} + +static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG& DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget* Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue Cond = CreateFPCmp(DAG, SDValue(N, 0)); + + if (Cond.getOpcode() != MipsISD::FPCmp) + return SDValue(); + + SDValue True = DAG.getConstant(1, MVT::i32); + SDValue False = DAG.getConstant(0, MVT::i32); + + return CreateCMovFP(DAG, Cond, True, False, N->getDebugLoc()); +} + SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -357,6 +491,11 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return PerformADDECombine(N, DAG, DCI, Subtarget); case ISD::SUBE: return PerformSUBECombine(N, DAG, DCI, Subtarget); + case ISD::SDIVREM: + case ISD::UDIVREM: + return PerformDivRemCombine(N, DAG, DCI, Subtarget); + case ISD::SETCC: + return PerformSETCCCombine(N, DAG, DCI, Subtarget); } return SDValue(); @@ -367,17 +506,15 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { - case ISD::AND: return LowerANDOR(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); - case ISD::OR: return LowerANDOR(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); - case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); } return SDValue(); @@ -410,122 +547,110 @@ static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) { return Mips::BRANCH_INVALID; } -static unsigned FPBranchCodeToOpc(Mips::FPBranchCode BC) { - switch(BC) { - default: - llvm_unreachable("Unknown branch code"); - case Mips::BRANCH_T : return Mips::BC1T; - case Mips::BRANCH_F : return Mips::BC1F; - case Mips::BRANCH_TL : return Mips::BC1TL; - case Mips::BRANCH_FL : return Mips::BC1FL; - } -} - -static Mips::CondCode FPCondCCodeToFCC(ISD::CondCode CC) { - switch (CC) { - default: llvm_unreachable("Unknown fp condition code!"); - case ISD::SETEQ: - case ISD::SETOEQ: return Mips::FCOND_EQ; - case ISD::SETUNE: return Mips::FCOND_OGL; - case ISD::SETLT: - case ISD::SETOLT: return Mips::FCOND_OLT; - case ISD::SETGT: - case ISD::SETOGT: return Mips::FCOND_OGT; - case ISD::SETLE: - case ISD::SETOLE: return Mips::FCOND_OLE; - case ISD::SETGE: - case ISD::SETOGE: return Mips::FCOND_OGE; - case ISD::SETULT: return Mips::FCOND_ULT; - case ISD::SETULE: return Mips::FCOND_ULE; - case ISD::SETUGT: return Mips::FCOND_UGT; - case ISD::SETUGE: return Mips::FCOND_UGE; - case ISD::SETUO: return Mips::FCOND_UN; - case ISD::SETO: return Mips::FCOND_OR; - case ISD::SETNE: - case ISD::SETONE: return Mips::FCOND_NEQ; - case ISD::SETUEQ: return Mips::FCOND_UEQ; - } -} - MachineBasicBlock * MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { + // There is no need to expand CMov instructions if target has + // conditional moves. 
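
(A worked illustration, not from the patch: what the SETCC combine above
produces for a floating-point compare whose i32 result is used directly.
Node spelling follows this file; the instructions finally selected depend
on the subtarget.)

  // C source:
  //   int isLess(float a, float b) { return a < b; }
  // The (setcc a, b, setolt) node is rewritten, conceptually, into:
  //   %flag = MipsISD::FPCmp a, b, FCOND_OLT   // sets the FP condition flag
  //   %res  = MipsISD::CMovFP_T 1, 0, %flag    // movt-style pick of 1 or 0
  // so an i32-valued fp comparison becomes a compare plus a conditional
  // move of the constants 1/0 rather than a branch sequence.
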
+ if (Subtarget->hasCondMov()) + return BB; + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); bool isFPCmp = false; DebugLoc dl = MI->getDebugLoc(); + unsigned Opc; switch (MI->getOpcode()) { default: assert(false && "Unexpected instr type to insert"); - case Mips::Select_FCC: - case Mips::Select_FCC_S32: - case Mips::Select_FCC_D32: - isFPCmp = true; // FALL THROUGH - case Mips::Select_CC: - case Mips::Select_CC_S32: - case Mips::Select_CC_D32: { - // To "insert" a SELECT_CC instruction, we actually have to insert the - // diamond control-flow pattern. The incoming instruction knows the - // destination vreg to set, the condition code register to branch on, the - // true/false values to select between, and a branch opcode to use. - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; - - // thisMBB: - // ... - // TrueVal = ... - // setcc r1, r2, r3 - // bNE r1, r0, copy1MBB - // fallthrough --> copy0MBB - MachineBasicBlock *thisMBB = BB; - MachineFunction *F = BB->getParent(); - MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(It, copy0MBB); - F->insert(It, sinkMBB); - - // Transfer the remainder of BB and its successor edges to sinkMBB. - sinkMBB->splice(sinkMBB->begin(), BB, - llvm::next(MachineBasicBlock::iterator(MI)), - BB->end()); - sinkMBB->transferSuccessorsAndUpdatePHIs(BB); - - // Next, add the true and fallthrough blocks as its successors. - BB->addSuccessor(copy0MBB); - BB->addSuccessor(sinkMBB); - - // Emit the right instruction according to the type of the operands compared - if (isFPCmp) { - // Find the condiction code present in the setcc operation. - Mips::CondCode CC = (Mips::CondCode)MI->getOperand(4).getImm(); - // Get the branch opcode from the branch code. - unsigned Opc = FPBranchCodeToOpc(GetFPBranchCodeFromCond(CC)); - BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB); - } else - BuildMI(BB, dl, TII->get(Mips::BNE)).addReg(MI->getOperand(1).getReg()) - .addReg(Mips::ZERO).addMBB(sinkMBB); - - // copy0MBB: - // %FalseValue = ... - // # fallthrough to sinkMBB - BB = copy0MBB; - - // Update machine-CFG edges - BB->addSuccessor(sinkMBB); - - // sinkMBB: - // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ] - // ... - BB = sinkMBB; + case Mips::MOVT: + case Mips::MOVT_S: + case Mips::MOVT_D: + isFPCmp = true; + Opc = Mips::BC1F; + break; + case Mips::MOVF: + case Mips::MOVF_S: + case Mips::MOVF_D: + isFPCmp = true; + Opc = Mips::BC1T; + break; + case Mips::MOVZ_I: + case Mips::MOVZ_S: + case Mips::MOVZ_D: + Opc = Mips::BNE; + break; + case Mips::MOVN_I: + case Mips::MOVN_S: + case Mips::MOVN_D: + Opc = Mips::BEQ; + break; + } + + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... 
+ // setcc r1, r2, r3 + // bNE r1, r0, copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Next, add the true and fallthrough blocks as its successors. + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + // Emit the right instruction according to the type of the operands compared + if (isFPCmp) + BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB); + else + BuildMI(BB, dl, TII->get(Opc)).addReg(MI->getOperand(2).getReg()) + .addReg(Mips::ZERO).addMBB(sinkMBB); + + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ] + // ... + BB = sinkMBB; + + if (isFPCmp) BuildMI(*BB, BB->begin(), dl, TII->get(Mips::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB) - .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB); + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB); + else + BuildMI(*BB, BB->begin(), dl, + TII->get(Mips::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(3).getReg()).addMBB(thisMBB) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB); - MI->eraseFromParent(); // The pseudo instruction is gone now. - return BB; - } - } + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; } //===----------------------------------------------------------------------===// @@ -590,27 +715,6 @@ LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const } SDValue MipsTargetLowering:: -LowerANDOR(SDValue Op, SelectionDAG &DAG) const -{ - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); - - if (LHS.getOpcode() != MipsISD::FPCmp || RHS.getOpcode() != MipsISD::FPCmp) - return Op; - - SDValue True = DAG.getConstant(1, MVT::i32); - SDValue False = DAG.getConstant(0, MVT::i32); - - SDValue LSEL = DAG.getNode(MipsISD::FPSelectCC, dl, True.getValueType(), - LHS, True, False, LHS.getOperand(2)); - SDValue RSEL = DAG.getNode(MipsISD::FPSelectCC, dl, True.getValueType(), - RHS, True, False, RHS.getOperand(2)); - - return DAG.getNode(Op.getOpcode(), dl, MVT::i32, LSEL, RSEL); -} - -SDValue MipsTargetLowering:: LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // The first operand is the chain, the second is the condition, the third is @@ -619,58 +723,32 @@ LowerBRCOND(SDValue Op, SelectionDAG &DAG) const SDValue Dest = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); - if (Op.getOperand(1).getOpcode() != MipsISD::FPCmp) + SDValue CondRes = CreateFPCmp(DAG, Op.getOperand(1)); + + // Return if flag is not set by a floating point comparison. 
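For targets without conditional moves, the custom inserter above expands the pseudo into the branch-over diamond its comments describe. A minimal C++ model of the resulting control flow for a MOVN-style select (illustrative only; the function and value names are mine, not the patch's):

#include <cstdio>

// dst = (cond != 0) ? T : F, built as thisMBB -> {copy0MBB} -> sinkMBB.
static int movnDiamond(int T, int cond, int F) {
  int Result = F;   // value reaching the PHI from thisMBB
  if (cond != 0)    // "beq $cond, $zero, sinkMBB" not taken: fall through
    Result = T;     // copy0MBB supplies the other PHI input
  return Result;    // sinkMBB: PHI merges the two predecessors
}

int main() {
  printf("%d %d\n", movnDiamond(10, 1, 20), movnDiamond(10, 0, 20)); // 10 20
  return 0;
}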
+ if (CondRes.getOpcode() != MipsISD::FPCmp) return Op; - SDValue CondRes = Op.getOperand(1); SDValue CCNode = CondRes.getOperand(2); Mips::CondCode CC = (Mips::CondCode)cast<ConstantSDNode>(CCNode)->getZExtValue(); SDValue BrCode = DAG.getConstant(GetFPBranchCodeFromCond(CC), MVT::i32); return DAG.getNode(MipsISD::FPBrcond, dl, Op.getValueType(), Chain, BrCode, - Dest, CondRes); -} - -SDValue MipsTargetLowering:: -LowerSETCC(SDValue Op, SelectionDAG &DAG) const -{ - // The operands to this are the left and right operands to compare (ops #0, - // and #1) and the condition code to compare them with (op #2) as a - // CondCodeSDNode. - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); - - ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); - - return DAG.getNode(MipsISD::FPCmp, dl, Op.getValueType(), LHS, RHS, - DAG.getConstant(FPCondCCodeToFCC(CC), MVT::i32)); + Dest, CondRes); } SDValue MipsTargetLowering:: LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - SDValue Cond = Op.getOperand(0); - SDValue True = Op.getOperand(1); - SDValue False = Op.getOperand(2); - DebugLoc dl = Op.getDebugLoc(); + SDValue Cond = CreateFPCmp(DAG, Op.getOperand(0)); - // if the incomming condition comes from a integer compare, the select - // operation must be SelectCC or a conditional move if the subtarget - // supports it. - if (Cond.getOpcode() != MipsISD::FPCmp) { - if (Subtarget->hasCondMov() && !True.getValueType().isFloatingPoint()) - return Op; - return DAG.getNode(MipsISD::SelectCC, dl, True.getValueType(), - Cond, True, False); - } + // Return if flag is not set by a floating point comparison. + if (Cond.getOpcode() != MipsISD::FPCmp) + return Op; - // if the incomming condition comes from fpcmp, the select - // operation must use FPSelectCC. - SDValue CCNode = Cond.getOperand(2); - return DAG.getNode(MipsISD::FPSelectCC, dl, True.getValueType(), - Cond, True, False, CCNode); + return CreateCMovFP(DAG, Cond, Op.getOperand(1), Op.getOperand(2), + Op.getDebugLoc()); } SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, @@ -693,12 +771,13 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, return DAG.getNode(ISD::ADD, dl, MVT::i32, GOT, GPRelNode); } // %hi/%lo relocation - SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, - MipsII::MO_ABS_HILO); - SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, &GA, 1); - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GA); + SDValue GAHi = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + MipsII::MO_ABS_HI); + SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + MipsII::MO_ABS_LO); + SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, &GAHi, 1); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo); return DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo); - } else { SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, MipsII::MO_GOT); @@ -707,9 +786,12 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, false, false, 0); // On functions and global targets not internal linked only // a load from got/GP is necessary for PIC to work. 
- if (!GV->hasLocalLinkage() || isa<Function>(GV)) + if (!GV->hasInternalLinkage() && + (!GV->hasLocalLinkage() || isa<Function>(GV))) return ResNode; - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GA); + SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + MipsII::MO_ABS_LO); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo); return DAG.getNode(ISD::ADD, dl, MVT::i32, ResNode, Lo); } @@ -717,6 +799,34 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, return SDValue(0,0); } +SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + // FIXME there isn't actually debug info here + DebugLoc dl = Op.getDebugLoc(); + + if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { + // %hi/%lo relocation + SDValue BAHi = DAG.getBlockAddress(BA, MVT::i32, true, + MipsII::MO_ABS_HI); + SDValue BALo = DAG.getBlockAddress(BA, MVT::i32, true, + MipsII::MO_ABS_LO); + SDValue Hi = DAG.getNode(MipsISD::Hi, dl, MVT::i32, BAHi); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALo); + return DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, Lo); + } + + SDValue BAGOTOffset = DAG.getBlockAddress(BA, MVT::i32, true, + MipsII::MO_GOT); + SDValue BALOOffset = DAG.getBlockAddress(BA, MVT::i32, true, + MipsII::MO_ABS_LO); + SDValue Load = DAG.getLoad(MVT::i32, dl, + DAG.getEntryNode(), BAGOTOffset, + MachinePointerInfo(), false, false, 0); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALOOffset); + return DAG.getNode(ISD::ADD, dl, MVT::i32, Load, Lo); +} + SDValue MipsTargetLowering:: LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { @@ -732,7 +842,7 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG) const // FIXME there isn't actually debug info here DebugLoc dl = Op.getDebugLoc(); bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_; - unsigned char OpFlag = IsPIC ? MipsII::MO_GOT : MipsII::MO_ABS_HILO; + unsigned char OpFlag = IsPIC ? MipsII::MO_GOT : MipsII::MO_ABS_HI; EVT PtrVT = Op.getValueType(); JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); @@ -747,7 +857,9 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG) const MachinePointerInfo(), false, false, 0); - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, JTI); + SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + MipsII::MO_ABS_LO); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, JTILo); ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo); return ResNode; @@ -764,7 +876,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const // gp_rel relocation // FIXME: we should reference the constant pool using small data sections, - // but the asm printer currently doens't support this feature without + // but the asm printer currently doesn't support this feature without // hacking it. This feature should come soon so we can uncomment the // stuff below. 
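The %hi/%lo pairs used throughout these lowerings (LowerGlobalAddress, LowerBlockAddress, LowerJumpTable, LowerConstantPool) follow the standard MIPS convention: lui materializes the high half and addiu adds a sign-extended low half, so %hi must round up when bit 15 of the address is set. A small self-contained check, illustrative rather than part of the patch:

#include <cstdint>
#include <cstdio>

static uint32_t hi16(uint32_t a) { return (a + 0x8000) >> 16; }    // %hi
static int16_t  lo16(uint32_t a) { return (int16_t)(a & 0xFFFF); } // %lo

int main() {
  uint32_t addr = 0x1000FFFC; // bit 15 set, forces the %hi round-up
  // lui $t0, %hi(addr) ; addiu $t0, $t0, %lo(addr)
  uint32_t got = (hi16(addr) << 16) + (int32_t)lo16(addr);
  printf("%08x %08x\n", addr, got); // both print 1000fffc
  return 0;
}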
//if (IsInSmallSection(C->getType())) { @@ -773,18 +885,22 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const // ResNode = DAG.getNode(ISD::ADD, MVT::i32, GOT, GPRelNode); if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { - SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), - N->getOffset(), MipsII::MO_ABS_HILO); - SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, MVT::i32, CP); - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CP); + SDValue CPHi = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), + N->getOffset(), MipsII::MO_ABS_HI); + SDValue CPLo = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), + N->getOffset(), MipsII::MO_ABS_LO); + SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, MVT::i32, CPHi); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CPLo); ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo); } else { SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), - N->getOffset(), MipsII::MO_GOT); + N->getOffset(), MipsII::MO_GOT); SDValue Load = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(), false, false, 0); - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CP); + SDValue CPLo = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), + N->getOffset(), MipsII::MO_ABS_LO); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CPLo); ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, Load, Lo); } @@ -937,46 +1053,28 @@ static bool CC_MipsO32_VarArgs(unsigned ValNo, MVT ValVT, LocInfo = CCValAssign::AExt; } - if (ValVT == MVT::i32 || ValVT == MVT::f32) { - if (unsigned Reg = State.AllocateReg(IntRegs, IntRegsSize)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, MVT::i32, LocInfo)); - return false; - } - unsigned Off = State.AllocateStack(4, 4); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Off, LocVT, LocInfo)); - return false; - } - - unsigned UnallocIntReg = State.getFirstUnallocated(IntRegs, IntRegsSize); - if (ValVT == MVT::f64) { - if (IntRegs[UnallocIntReg] == (unsigned (Mips::A1))) { - // A1 can't be used anymore, because 64 bit arguments - // must be aligned when copied back to the caller stack - State.AllocateReg(IntRegs, IntRegsSize); - UnallocIntReg++; - } - - if (IntRegs[UnallocIntReg] == (unsigned (Mips::A0)) || - IntRegs[UnallocIntReg] == (unsigned (Mips::A2))) { - unsigned Reg = State.AllocateReg(IntRegs, IntRegsSize); - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, MVT::i32, LocInfo)); - // Shadow the next register so it can be used - // later to get the other 32bit part. - State.AllocateReg(IntRegs, IntRegsSize); - return false; - } + unsigned Reg; - // Register is shadowed to preserve alignment, and the - // argument goes to a stack location. 
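A compact model of the O32 rule that the rewritten CC_MipsO32_VarArgs below encodes: i32/f32 take the next register of A0-A3, while f64 needs an even/odd pair, so a leading A1 or A3 is discarded and the pair's second register is shadowed. Illustrative only; the enum and helpers are mine, not the patch's:

#include <cstdio>

enum { A0, A1, A2, A3, NumArgRegs };
static int Next = A0;

static int allocReg() { return Next < NumArgRegs ? Next++ : -1; }

static void passF64(const char *Name) {
  int Reg = allocReg();
  if (Reg == A1 || Reg == A3) // a 64-bit pair can't start on an odd reg
    Reg = allocReg();
  allocReg();                 // shadow the second half of the pair
  if (Reg == A0 || Reg == A2)
    printf("%s -> A%d/A%d\n", Name, Reg, Reg + 1);
  else
    printf("%s -> stack (8-byte aligned slot)\n", Name);
}

int main() {
  allocReg();         // an i32 takes A0
  passF64("double1"); // skips A1, lands in A2/A3
  passF64("double2"); // registers exhausted -> stack
  return 0;
}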
-    if (UnallocIntReg != IntRegsSize)
-      State.AllocateReg(IntRegs, IntRegsSize);
+  if (ValVT == MVT::i32 || ValVT == MVT::f32) {
+    Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    LocVT = MVT::i32;
+  } else if (ValVT == MVT::f64) {
+    Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    if (Reg == Mips::A1 || Reg == Mips::A3)
+      Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    State.AllocateReg(IntRegs, IntRegsSize);
+    LocVT = MVT::i32;
+  } else
+    llvm_unreachable("Cannot handle this ValVT.");
-    unsigned Off = State.AllocateStack(8, 8);
-    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Off, LocVT, LocInfo));
-    return false;
-  }
+  if (!Reg) {
+    unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
+    unsigned Offset = State.AllocateStack(SizeInBytes, SizeInBytes);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  } else
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-  return true; // CC didn't match
+  return false; // CC must always match
 }
 //===----------------------------------------------------------------------===//
@@ -1043,11 +1141,12 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     if (VA.getValVT() == MVT::f32 && VA.getLocVT() == MVT::i32)
       Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
     if (VA.getValVT() == MVT::f64 && VA.getLocVT() == MVT::i32) {
-      Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
-      SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Arg,
-                               DAG.getConstant(0, getPointerTy()));
-      SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Arg,
-                               DAG.getConstant(1, getPointerTy()));
+      SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                               Arg, DAG.getConstant(0, MVT::i32));
+      SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                               Arg, DAG.getConstant(1, MVT::i32));
+      if (!Subtarget->isLittle())
+        std::swap(Lo, Hi);
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
       RegsToPass.push_back(std::make_pair(VA.getLocReg()+1, Hi));
       continue;
@@ -1100,7 +1199,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
+  // The InFlag is necessary since all emitted instructions must be
   // stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
@@ -1113,12 +1212,52 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
   // node so that legalize doesn't hack it.
   unsigned char OpFlag = IsPIC ? MipsII::MO_GOT_CALL : MipsII::MO_NO_FLAG;
-  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
-    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
-                                        getPointerTy(), 0, OpFlag);
-  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+  bool LoadSymAddr = false;
+  SDValue CalleeLo;
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    if (IsPIC && G->getGlobal()->hasInternalLinkage()) {
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+                                          getPointerTy(), 0, MipsII::MO_GOT);
+      CalleeLo = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy(),
+                                            0, MipsII::MO_ABS_LO);
+    } else {
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+                                          getPointerTy(), 0, OpFlag);
+    }
+
+    LoadSymAddr = true;
+  }
+  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), OpFlag);
+    LoadSymAddr = true;
+  }
+
+  // Create nodes that load address of callee and copy it to T9
+  if (IsPIC) {
+    if (LoadSymAddr) {
+      // Load callee address
+      SDValue LoadValue = DAG.getLoad(MVT::i32, dl, Chain, Callee,
+                                      MachinePointerInfo::getGOT(),
+                                      false, false, 0);
+
+      // Use GOT+LO if callee has internal linkage.
+      if (CalleeLo.getNode()) {
+        SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CalleeLo);
+        Callee = DAG.getNode(ISD::ADD, dl, MVT::i32, LoadValue, Lo);
+      } else
+        Callee = LoadValue;
+
+      // Use chain output from LoadValue
+      Chain = LoadValue.getValue(1);
+    }
+
+    // copy to T9
+    Chain = DAG.getCopyToReg(Chain, dl, Mips::T9, Callee, SDValue(0, 0));
+    InFlag = Chain.getValue(1);
+    Callee = DAG.getRegister(Mips::T9, MVT::i32);
+  }
   // MipsJmpLink = #chain, #target_address, #opt_in_flags...
   //             = Chain, Callee, Reg#1, Reg#2, ...
@@ -1143,7 +1282,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   // Create a stack location to hold GP when PIC is used. This stack
   // location is used on function prologue to save GP and also after all
-  // emited CALL's to restore GP.
+  // emitted CALLs to restore GP.
   if (IsPIC) {
     // Function can have an arbitrary number of calls, so
     // hold the LastArgStackLoc with the biggest offset.
@@ -1218,18 +1357,18 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 /// and generate load operations for arguments places on the stack.
 SDValue
 MipsTargetLowering::LowerFormalArguments(SDValue Chain,
-                                         CallingConv::ID CallConv, bool isVarArg,
-                                         const SmallVectorImpl<ISD::InputArg>
-                                         &Ins,
-                                         DebugLoc dl, SelectionDAG &DAG,
-                                         SmallVectorImpl<SDValue> &InVals)
+                                         CallingConv::ID CallConv,
+                                         bool isVarArg,
+                                         const SmallVectorImpl<ISD::InputArg>
+                                         &Ins,
+                                         DebugLoc dl, SelectionDAG &DAG,
+                                         SmallVectorImpl<SDValue> &InVals)
                                           const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-  unsigned StackReg = MF.getTarget().getRegisterInfo()->getFrameRegister(MF);
   MipsFI->setVarArgsFrameIndex(0);
   // Used with vargs to acumulate store chains.
@@ -1249,9 +1388,9 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
   else
     CCInfo.AnalyzeFormalArguments(Ins, CC_Mips);
-  SDValue StackPtr;
-
   unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ?
0 : 16); + unsigned LastStackArgEndOffset = 0; + EVT LastRegArgValVT; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -1260,6 +1399,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); ArgRegEnd = VA.getLocReg(); + LastRegArgValVT = VA.getValVT(); TargetRegisterClass *RC = 0; if (RegVT == MVT::i32) @@ -1300,8 +1440,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, unsigned Reg2 = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg()+1, RC); SDValue ArgValue2 = DAG.getCopyFromReg(Chain, dl, Reg2, RegVT); - SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, ArgValue2, ArgValue); - ArgValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Pair); + if (!Subtarget->isLittle()) + std::swap(ArgValue, ArgValue2); + ArgValue = DAG.getNode(MipsISD::BuildPairF64, dl, MVT::f64, + ArgValue, ArgValue2); } } @@ -1321,10 +1463,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // used instead of a direct negative address (which is recorded to // be used on emitPrologue) to avoid mis-calc of the first stack // offset on PEI::calculateFrameObjectOffsets. - // Arguments are always 32-bit. - unsigned ArgSize = VA.getLocVT().getSizeInBits()/8; + unsigned ArgSize = VA.getValVT().getSizeInBits()/8; + LastStackArgEndOffset = FirstStackArgLoc + VA.getLocMemOffset() + ArgSize; int FI = MFI->CreateFixedObject(ArgSize, 0, true); - MipsFI->recordLoadArgsFI(FI, -(ArgSize+ + MipsFI->recordLoadArgsFI(FI, -(4 + (FirstStackArgLoc + VA.getLocMemOffset()))); // Create load nodes to retrieve arguments from the stack @@ -1351,29 +1493,52 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // To meet ABI, when VARARGS are passed on registers, the registers // must have their values written to the caller stack frame. If the last // argument was placed in the stack, there's no need to save any register. - if ((isVarArg) && (Subtarget->isABI_O32() && ArgRegEnd)) { - if (StackPtr.getNode() == 0) - StackPtr = DAG.getRegister(StackReg, getPointerTy()); - - // The last register argument that must be saved is Mips::A3 - TargetRegisterClass *RC = Mips::CPURegsRegisterClass; - unsigned StackLoc = ArgLocs.size()-1; - - for (++ArgRegEnd; ArgRegEnd <= Mips::A3; ++ArgRegEnd, ++StackLoc) { - unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC); - SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32); - - int FI = MFI->CreateFixedObject(4, 0, true); - MipsFI->recordStoreVarArgsFI(FI, -(4+(StackLoc*4))); - SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy()); - OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff, - MachinePointerInfo(), - false, false, 0)); - - // Record the frame index of the first variable argument - // which is a value necessary to VASTART. - if (!MipsFI->getVarArgsFrameIndex()) + if (isVarArg && Subtarget->isABI_O32()) { + if (ArgRegEnd) { + // Last named formal argument is passed in register. + + // The last register argument that must be saved is Mips::A3 + TargetRegisterClass *RC = Mips::CPURegsRegisterClass; + if (LastRegArgValVT == MVT::f64) + ArgRegEnd++; + + if (ArgRegEnd < Mips::A3) { + // Both the last named formal argument and the first variable + // argument are passed in registers. 
+ for (++ArgRegEnd; ArgRegEnd <= Mips::A3; ++ArgRegEnd) { + unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC); + SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32); + + int FI = MFI->CreateFixedObject(4, 0, true); + MipsFI->recordStoreVarArgsFI(FI, -(4+(ArgRegEnd-Mips::A0)*4)); + SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy()); + OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff, + MachinePointerInfo(), + false, false, 0)); + + // Record the frame index of the first variable argument + // which is a value necessary to VASTART. + if (!MipsFI->getVarArgsFrameIndex()) { + MFI->setObjectAlignment(FI, 4); + MipsFI->setVarArgsFrameIndex(FI); + } + } + } else { + // Last named formal argument is in register Mips::A3, and the first + // variable argument is on stack. Record the frame index of the first + // variable argument. + int FI = MFI->CreateFixedObject(4, 0, true); + MFI->setObjectAlignment(FI, 4); + MipsFI->recordStoreVarArgsFI(FI, -20); MipsFI->setVarArgsFrameIndex(FI); + } + } else { + // Last named formal argument and all the variable arguments are passed + // on stack. Record the frame index of the first variable argument. + int FI = MFI->CreateFixedObject(4, 0, true); + MFI->setObjectAlignment(FI, 4); + MipsFI->recordStoreVarArgsFI(FI, -(4+LastStackArgEndOffset)); + MipsFI->setVarArgsFrameIndex(FI); } } diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 9d6b9f3..e4d0c3d 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -31,45 +31,50 @@ namespace llvm { // Get the Higher 16 bits from a 32-bit immediate // No relation with Mips Hi register - Hi, + Hi, // Get the Lower 16 bits from a 32-bit immediate // No relation with Mips Lo register - Lo, + Lo, // Handle gp_rel (small data/bss sections) relocation. GPRel, - // Select CC Pseudo Instruction - SelectCC, - - // Floating Point Select CC Pseudo Instruction - FPSelectCC, - // Floating Point Branch Conditional FPBrcond, // Floating Point Compare FPCmp, + // Floating Point Conditional Moves + CMovFP_T, + CMovFP_F, + // Floating Point Rounding FPRound, - // Return + // Return Ret, // MAdd/Sub nodes MAdd, MAddu, MSub, - MSubu + MSubu, + + // DivRem(u) + DivRem, + DivRemU, + + BuildPairF64, + ExtractElementF64 }; } //===--------------------------------------------------------------------===// // TargetLowering Implementation //===--------------------------------------------------------------------===// - + class MipsTargetLowering : public TargetLowering { public: explicit MipsTargetLowering(MipsTargetMachine &TM); @@ -77,7 +82,7 @@ namespace llvm { /// LowerOperation - Provide custom lowering hooks for some operations. virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - /// getTargetNodeName - This method returns the name of a target specific + /// getTargetNodeName - This method returns the name of a target specific // DAG node. virtual const char *getTargetNodeName(unsigned Opcode) const; @@ -87,7 +92,7 @@ namespace llvm { /// getFunctionAlignment - Return the Log2 alignment of this function. 
virtual unsigned getFunctionAlignment(const Function *F) const; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; private: // Subtarget Info const MipsSubtarget *Subtarget; @@ -101,16 +106,15 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; // Lower Operand specifics - SDValue LowerANDOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; virtual SDValue @@ -149,7 +153,7 @@ namespace llvm { ConstraintWeight getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const; - std::pair<unsigned, const TargetRegisterClass*> + std::pair<unsigned, const TargetRegisterClass*> getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 977e0df..a86c5c7 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -24,19 +24,28 @@ //===----------------------------------------------------------------------===// // Floating Point Compare and Branch -def SDT_MipsFPBrcond : SDTypeProfile<0, 3, [SDTCisSameAs<0, 2>, SDTCisInt<0>, - SDTCisVT<1, OtherVT>]>; -def SDT_MipsFPCmp : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, - SDTCisSameAs<1, 2>, SDTCisFP<1>, - SDTCisInt<3>]>; -def SDT_MipsFPSelectCC : SDTypeProfile<1, 4, [SDTCisInt<1>, SDTCisInt<4>, - SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>; - +def SDT_MipsFPBrcond : SDTypeProfile<0, 2, [SDTCisInt<0>, + SDTCisVT<1, OtherVT>]>; +def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<1>, + SDTCisInt<2>]>; +def SDT_MipsCMovFP : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, + SDTCisSameAs<1, 2>]>; +def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, + SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>]>; +def SDT_MipsExtractElementF64 : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, f64>, + SDTCisVT<0, i32>]>; + +def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp, [SDNPOutGlue]>; +def MipsCMovFP_T : SDNode<"MipsISD::CMovFP_T", SDT_MipsCMovFP, [SDNPInGlue]>; +def MipsCMovFP_F : SDNode<"MipsISD::CMovFP_F", SDT_MipsCMovFP, [SDNPInGlue]>; def MipsFPRound : SDNode<"MipsISD::FPRound", SDTFPRoundOp, [SDNPOptInGlue]>; -def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond, - [SDNPHasChain]>; -def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp>; -def MipsFPSelectCC : SDNode<"MipsISD::FPSelectCC", SDT_MipsFPSelectCC>; +def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond, + [SDNPHasChain, SDNPOptInGlue]>; +def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>; +def MipsExtractElementF64 : SDNode<"MipsISD::ExtractElementF64", + SDT_MipsExtractElementF64>; // Operand for printing out a condition code. 
let PrintMethod = "printFCCOperand" in @@ -54,7 +63,7 @@ def IsNotMipsI : Predicate<"!Subtarget.isMips1()">; //===----------------------------------------------------------------------===// // Instruction Class Templates // -// A set of multiclasses is used to address the register usage. +// A set of multiclasses is used to address the register usage. // // S32 - single precision in 16 32bit even fp registers // single precision in 32 32bit fp registers in SingleOnly mode @@ -65,7 +74,7 @@ def IsNotMipsI : Predicate<"!Subtarget.isMips1()">; // Only S32 and D32 are supported right now. //===----------------------------------------------------------------------===// -multiclass FFR1_1<bits<6> funct, string asmstr> +multiclass FFR1_1<bits<6> funct, string asmstr> { def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs), !strconcat(asmstr, ".s $fd, $fs"), []>; @@ -74,31 +83,31 @@ multiclass FFR1_1<bits<6> funct, string asmstr> !strconcat(asmstr, ".d $fd, $fs"), []>, Requires<[In32BitMode]>; } -multiclass FFR1_2<bits<6> funct, string asmstr, SDNode FOp> +multiclass FFR1_2<bits<6> funct, string asmstr, SDNode FOp> { def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs), - !strconcat(asmstr, ".s $fd, $fs"), + !strconcat(asmstr, ".s $fd, $fs"), [(set FGR32:$fd, (FOp FGR32:$fs))]>; def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs), - !strconcat(asmstr, ".d $fd, $fs"), + !strconcat(asmstr, ".d $fd, $fs"), [(set AFGR64:$fd, (FOp AFGR64:$fs))]>, Requires<[In32BitMode]>; } -class FFR1_3<bits<6> funct, bits<5> fmt, RegisterClass RcSrc, - RegisterClass RcDst, string asmstr>: - FFR<0x11, funct, fmt, (outs RcSrc:$fd), (ins RcDst:$fs), - !strconcat(asmstr, " $fd, $fs"), []>; +class FFR1_3<bits<6> funct, bits<5> fmt, RegisterClass RcSrc, + RegisterClass RcDst, string asmstr>: + FFR<0x11, funct, fmt, (outs RcSrc:$fd), (ins RcDst:$fs), + !strconcat(asmstr, " $fd, $fs"), []>; multiclass FFR1_4<bits<6> funct, string asmstr, SDNode FOp> { - def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), - (ins FGR32:$fs, FGR32:$ft), + def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), + (ins FGR32:$fs, FGR32:$ft), !strconcat(asmstr, ".s $fd, $fs, $ft"), [(set FGR32:$fd, (FOp FGR32:$fs, FGR32:$ft))]>; - def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), - (ins AFGR64:$fs, AFGR64:$ft), + def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), + (ins AFGR64:$fs, AFGR64:$ft), !strconcat(asmstr, ".d $fd, $fs, $ft"), [(set AFGR64:$fd, (FOp AFGR64:$fs, AFGR64:$ft))]>, Requires<[In32BitMode]>; @@ -115,8 +124,8 @@ let ft = 0 in { defm TRUNC_W : FFR1_1<0b001101, "trunc.w">; defm CVTW : FFR1_1<0b100100, "cvt.w">; - defm FABS : FFR1_2<0b000101, "abs", fabs>; - defm FNEG : FFR1_2<0b000111, "neg", fneg>; + defm FABS : FFR1_2<0b000101, "abs", fabs>; + defm FNEG : FFR1_2<0b000111, "neg", fneg>; defm FSQRT : FFR1_2<0b000100, "sqrt", fsqrt>; /// Convert to Single Precison @@ -140,23 +149,23 @@ let ft = 0 in { def TRUNC_LD : FFR1_3<0b001001, 0x1, AFGR64, AFGR64, "trunc.l">; /// Convert to long signed integer - def CVTL_S : FFR1_3<0b100101, 0x0, FGR32, FGR32, "cvt.l">; - def CVTL_D : FFR1_3<0b100101, 0x1, AFGR64, AFGR64, "cvt.l">; - - /// Convert to Double Precison - def CVTD_S32 : FFR1_3<0b100001, 0x0, AFGR64, FGR32, "cvt.d.s">; - def CVTD_W32 : FFR1_3<0b100001, 0x2, AFGR64, FGR32, "cvt.d.w">; - def CVTD_L32 : FFR1_3<0b100001, 0x3, AFGR64, AFGR64, "cvt.d.l">; - + def CVTL_S : FFR1_3<0b100101, 0x0, FGR32, FGR32, "cvt.l">; + def CVTL_D : FFR1_3<0b100101, 0x1, AFGR64, AFGR64, "cvt.l">; + + /// Convert to 
Double Precison + def CVTD_S32 : FFR1_3<0b100001, 0x0, AFGR64, FGR32, "cvt.d.s">; + def CVTD_W32 : FFR1_3<0b100001, 0x2, AFGR64, FGR32, "cvt.d.w">; + def CVTD_L32 : FFR1_3<0b100001, 0x3, AFGR64, AFGR64, "cvt.d.l">; + /// Convert to Single Precison def CVTS_D32 : FFR1_3<0b100000, 0x1, FGR32, AFGR64, "cvt.s.d">; - def CVTS_L32 : FFR1_3<0b100000, 0x3, FGR32, AFGR64, "cvt.s.l">; + def CVTS_L32 : FFR1_3<0b100000, 0x3, FGR32, AFGR64, "cvt.s.l">; } } // The odd-numbered registers are only referenced when doing loads, // stores, and moves between floating-point and integer registers. -// When defining instructions, we reference all 32-bit registers, +// When defining instructions, we reference all 32-bit registers, // regardless of register aliasing. let fd = 0 in { /// Move Control Registers From/To CPU Registers @@ -165,7 +174,7 @@ let fd = 0 in { def CTC1 : FFR<0x11, 0x0, 0x6, (outs CCR:$rt), (ins CPURegs:$fs), "ctc1 $fs, $rt", []>; - + def MFC1 : FFR<0x11, 0x00, 0x00, (outs CPURegs:$rt), (ins FGR32:$fs), "mfc1 $rt, $fs", []>; @@ -180,18 +189,18 @@ def FMOV_D32 : FFR<0x11, 0b000110, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs), /// Floating Point Memory Instructions let Predicates = [IsNotSingleFloat, IsNotMipsI] in { - def LDC1 : FFI<0b110101, (outs AFGR64:$ft), (ins mem:$addr), + def LDC1 : FFI<0b110101, (outs AFGR64:$ft), (ins mem:$addr), "ldc1 $ft, $addr", [(set AFGR64:$ft, (load addr:$addr))]>; - def SDC1 : FFI<0b111101, (outs), (ins AFGR64:$ft, mem:$addr), + def SDC1 : FFI<0b111101, (outs), (ins AFGR64:$ft, mem:$addr), "sdc1 $ft, $addr", [(store AFGR64:$ft, addr:$addr)]>; } -// LWC1 and SWC1 can always be emited with odd registers. +// LWC1 and SWC1 can always be emitted with odd registers. def LWC1 : FFI<0b110001, (outs FGR32:$ft), (ins mem:$addr), "lwc1 $ft, $addr", - [(set FGR32:$ft, (load addr:$addr))]>; + [(set FGR32:$ft, (load addr:$addr))]>; def SWC1 : FFI<0b111001, (outs), (ins FGR32:$ft, mem:$addr), "swc1 $ft, $addr", - [(store FGR32:$ft, addr:$addr)]>; + [(store FGR32:$ft, addr:$addr)]>; /// Floating-point Aritmetic defm FADD : FFR1_4<0x10, "add", fadd>; @@ -202,7 +211,7 @@ defm FSUB : FFR1_4<0x01, "sub", fsub>; //===----------------------------------------------------------------------===// // Floating Point Branch Codes //===----------------------------------------------------------------------===// -// Mips branch codes. These correspond to condcode in MipsInstrInfo.h. +// Mips branch codes. These correspond to condcode in MipsInstrInfo.h. // They must be kept in synch. 
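Context for the FCOND_* renames in this patch (FCOND_EQ becoming FCOND_OEQ, FCOND_NEQ becoming FCOND_ONE, and so on): the codes split into ordered predicates, which are false when either operand is NaN, and unordered ones, which are true. A small illustrative check, not part of the patch:

#include <cmath>
#include <cstdio>

// Ordered predicates (c.olt.d etc.) are false if an operand is NaN;
// unordered ones (c.ult.d etc.) are true. bc1t/bc1f then branch on the
// FCR31 flag the compare sets.
static bool olt(double a, double b) {
  return !std::isnan(a) && !std::isnan(b) && a < b;
}
static bool ult(double a, double b) {
  return std::isnan(a) || std::isnan(b) || a < b;
}

int main() {
  double n = std::nan("");
  printf("olt(1,2)=%d ult(1,2)=%d\n", olt(1, 2), ult(1, 2)); // 1 1
  printf("olt(n,2)=%d ult(n,2)=%d\n", olt(n, 2), ult(n, 2)); // 0 1
  return 0;
}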
def MIPS_BRANCH_F : PatLeaf<(i32 0)>; def MIPS_BRANCH_T : PatLeaf<(i32 1)>; @@ -210,11 +219,11 @@ def MIPS_BRANCH_FL : PatLeaf<(i32 2)>; def MIPS_BRANCH_TL : PatLeaf<(i32 3)>; /// Floating Point Branch of False/True (Likely) -let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in { - class FBRANCH<PatLeaf op, string asmstr> : FFI<0x11, (outs), +let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in + class FBRANCH<PatLeaf op, string asmstr> : FFI<0x11, (outs), (ins brtarget:$dst), !strconcat(asmstr, " $dst"), - [(MipsFPBrcond op, bb:$dst, FCR31)]>; -} + [(MipsFPBrcond op, bb:$dst)]>; + def BC1F : FBRANCH<MIPS_BRANCH_F, "bc1f">; def BC1T : FBRANCH<MIPS_BRANCH_T, "bc1t">; def BC1FL : FBRANCH<MIPS_BRANCH_FL, "bc1fl">; @@ -223,11 +232,11 @@ def BC1TL : FBRANCH<MIPS_BRANCH_TL, "bc1tl">; //===----------------------------------------------------------------------===// // Floating Point Flag Conditions //===----------------------------------------------------------------------===// -// Mips condition codes. They must correspond to condcode in MipsInstrInfo.h. +// Mips condition codes. They must correspond to condcode in MipsInstrInfo.h. // They must be kept in synch. def MIPS_FCOND_F : PatLeaf<(i32 0)>; def MIPS_FCOND_UN : PatLeaf<(i32 1)>; -def MIPS_FCOND_EQ : PatLeaf<(i32 2)>; +def MIPS_FCOND_OEQ : PatLeaf<(i32 2)>; def MIPS_FCOND_UEQ : PatLeaf<(i32 3)>; def MIPS_FCOND_OLT : PatLeaf<(i32 4)>; def MIPS_FCOND_ULT : PatLeaf<(i32 5)>; @@ -245,44 +254,90 @@ def MIPS_FCOND_NGT : PatLeaf<(i32 15)>; /// Floating Point Compare let hasDelaySlot = 1, Defs=[FCR31] in { def FCMP_S32 : FCC<0x0, (outs), (ins FGR32:$fs, FGR32:$ft, condcode:$cc), - "c.$cc.s $fs, $ft", - [(set FCR31, (MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc))]>; - + "c.$cc.s $fs, $ft", + [(MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc)]>; + def FCMP_D32 : FCC<0x1, (outs), (ins AFGR64:$fs, AFGR64:$ft, condcode:$cc), - "c.$cc.d $fs, $ft", - [(set FCR31, (MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc))]>, - Requires<[In32BitMode]>; + "c.$cc.d $fs, $ft", + [(MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc)]>, + Requires<[In32BitMode]>; } -//===----------------------------------------------------------------------===// -// Floating Point Pseudo-Instructions -//===----------------------------------------------------------------------===// -// For some explanation, see Select_CC at MipsInstrInfo.td. We also embedd a -// condiciton code to enable easy handling by the Custom Inserter. -let usesCustomInserter = 1, Uses=[FCR31] in { - class PseudoFPSelCC<RegisterClass RC, string asmstr> : - MipsPseudo<(outs RC:$dst), - (ins CPURegs:$CmpRes, RC:$T, RC:$F, condcode:$cc), asmstr, - [(set RC:$dst, (MipsFPSelectCC CPURegs:$CmpRes, RC:$T, RC:$F, - imm:$cc))]>; +// Conditional moves: +// These instructions are expanded in +// MipsISelLowering::EmitInstrWithCustomInserter if target does not have +// conditional move instructions. 
+// flag:int, data:float +let usesCustomInserter = 1, Constraints = "$F = $dst" in +class CondMovIntFP<RegisterClass RC, bits<5> fmt, bits<6> func, + string instr_asm> : + FFR<0x11, func, fmt, (outs RC:$dst), (ins RC:$T, CPURegs:$cond, RC:$F), + !strconcat(instr_asm, "\t$dst, $T, $cond"), []>; + +def MOVZ_S : CondMovIntFP<FGR32, 16, 18, "movz.s">; +def MOVN_S : CondMovIntFP<FGR32, 16, 19, "movn.s">; + +let Predicates = [In32BitMode] in { + def MOVZ_D : CondMovIntFP<AFGR64, 17, 18, "movz.d">; + def MOVN_D : CondMovIntFP<AFGR64, 17, 19, "movn.d">; } -// The values to be selected are fp but the condition test is with integers. -def Select_CC_S32 : PseudoSelCC<FGR32, "# MipsSelect_CC_S32_f32">; -def Select_CC_D32 : PseudoSelCC<AFGR64, "# MipsSelect_CC_D32_f32">, - Requires<[In32BitMode]>; +defm : MovzPats<FGR32, MOVZ_S>; +defm : MovnPats<FGR32, MOVN_S>; + +let Predicates = [In32BitMode] in { + defm : MovzPats<AFGR64, MOVZ_D>; + defm : MovnPats<AFGR64, MOVN_D>; +} -// The values to be selected are int but the condition test is done with fp. -def Select_FCC : PseudoFPSelCC<CPURegs, "# MipsSelect_FCC">; +let usesCustomInserter = 1, Uses = [FCR31], Constraints = "$F = $dst" in { +// flag:float, data:int +class CondMovFPInt<SDNode cmov, bits<1> tf, string instr_asm> : + FCMOV<tf, (outs CPURegs:$dst), (ins CPURegs:$T, CPURegs:$F), + !strconcat(instr_asm, "\t$dst, $T, $$fcc0"), + [(set CPURegs:$dst, (cmov CPURegs:$T, CPURegs:$F))]>; + +// flag:float, data:float +class CondMovFPFP<RegisterClass RC, SDNode cmov, bits<5> fmt, bits<1> tf, + string instr_asm> : + FFCMOV<fmt, tf, (outs RC:$dst), (ins RC:$T, RC:$F), + !strconcat(instr_asm, "\t$dst, $T, $$fcc0"), + [(set RC:$dst, (cmov RC:$T, RC:$F))]>; +} -// The values to be selected and the condition test is done with fp. -def Select_FCC_S32 : PseudoFPSelCC<FGR32, "# MipsSelect_FCC_S32_f32">; -def Select_FCC_D32 : PseudoFPSelCC<AFGR64, "# MipsSelect_FCC_D32_f32">, - Requires<[In32BitMode]>; +def MOVT : CondMovFPInt<MipsCMovFP_T, 1, "movt">; +def MOVF : CondMovFPInt<MipsCMovFP_F, 0, "movf">; +def MOVT_S : CondMovFPFP<FGR32, MipsCMovFP_T, 16, 1, "movt.s">; +def MOVF_S : CondMovFPFP<FGR32, MipsCMovFP_F, 16, 0, "movf.s">; -def MOVCCRToCCR : MipsPseudo<(outs CCR:$dst), (ins CCR:$src), - "# MOVCCRToCCR", []>; +let Predicates = [In32BitMode] in { + def MOVT_D : CondMovFPFP<AFGR64, MipsCMovFP_T, 17, 1, "movt.d">; + def MOVF_D : CondMovFPFP<AFGR64, MipsCMovFP_F, 17, 0, "movf.d">; +} + +//===----------------------------------------------------------------------===// +// Floating Point Pseudo-Instructions +//===----------------------------------------------------------------------===// +def MOVCCRToCCR : MipsPseudo<(outs CCR:$dst), (ins CCR:$src), + "# MOVCCRToCCR", []>; + +// This pseudo instr gets expanded into 2 mtc1 instrs after register +// allocation. +def BuildPairF64 : + MipsPseudo<(outs AFGR64:$dst), + (ins CPURegs:$lo, CPURegs:$hi), "", + [(set AFGR64:$dst, (MipsBuildPairF64 CPURegs:$lo, CPURegs:$hi))]>; + +// This pseudo instr gets expanded into 2 mfc1 instrs after register +// allocation. +// if n is 0, lower part of src is extracted. +// if n is 1, higher part of src is extracted. 
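What the BuildPairF64/ExtractElementF64 pseudos in this hunk model, in plain C++: an AFGR64 double lives in an even/odd pair of 32-bit FPU registers, built with two mtc1 and taken apart with two mfc1; which word is the low half depends on endianness, hence the std::swap(Lo, Hi) on big-endian in LowerCall earlier. Illustrative sketch, not the patch's code:

#include <cstdint>
#include <cstdio>
#include <cstring>

static double buildPairF64(uint32_t lo, uint32_t hi) {
  uint64_t bits = ((uint64_t)hi << 32) | lo; // mtc1 lo, $f12 ; mtc1 hi, $f13
  double d;
  std::memcpy(&d, &bits, sizeof d);
  return d;
}

static uint32_t extractElementF64(double d, int n) { // n==0 -> lo, n==1 -> hi
  uint64_t bits;
  std::memcpy(&bits, &d, sizeof bits);
  return (uint32_t)(n ? bits >> 32 : bits);
}

int main() {
  double d = buildPairF64(0, 0x3FF00000); // IEEE-754 words of 1.0
  printf("%g %08x\n", d, extractElementF64(d, 1)); // 1 3ff00000
  return 0;
}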
+def ExtractElementF64 : + MipsPseudo<(outs CPURegs:$dst), + (ins AFGR64:$src, i32imm:$n), "", + [(set CPURegs:$dst, + (MipsExtractElementF64 AFGR64:$src, imm:$n))]>; //===----------------------------------------------------------------------===// // Floating Point Patterns @@ -306,7 +361,7 @@ def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S32 FGR32:$src))>; def : Pat<(i32 (bitconvert FGR32:$src)), (MFC1 FGR32:$src)>; def : Pat<(f32 (bitconvert CPURegs:$src)), (MTC1 CPURegs:$src)>; -let Predicates = [In32BitMode] in { +let Predicates = [In32BitMode] in { def : Pat<(f32 (fround AFGR64:$src)), (CVTS_D32 AFGR64:$src)>; def : Pat<(f64 (fextend FGR32:$src)), (CVTD_S32 FGR32:$src)>; } diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index 98ae2fa..9dfcdfb 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -22,8 +22,8 @@ //===----------------------------------------------------------------------===// // Generic Mips Format -class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern, - InstrItinClass itin>: Instruction +class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin>: Instruction { field bits<32> Inst; @@ -32,8 +32,8 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern, bits<6> opcode; // Top 5 bits are the 'opcode' field - let Inst{31-26} = opcode; - + let Inst{31-26} = opcode; + dag OutOperandList = outs; dag InOperandList = ins; @@ -52,7 +52,7 @@ class MipsPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>: class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst<outs, ins, asmstr, pattern, itin> + MipsInst<outs, ins, asmstr, pattern, itin> { bits<5> rd; bits<5> rs; @@ -64,7 +64,7 @@ class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr, let funct = _funct; let Inst{25-21} = rs; - let Inst{20-16} = rt; + let Inst{20-16} = rt; let Inst{15-11} = rd; let Inst{10-6} = shamt; let Inst{5-0} = funct; @@ -75,7 +75,7 @@ class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr, //===----------------------------------------------------------------------===// class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, - InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin> + InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin> { bits<5> rt; bits<5> rs; @@ -84,7 +84,7 @@ class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, let opcode = op; let Inst{25-21} = rs; - let Inst{20-16} = rt; + let Inst{20-16} = rt; let Inst{15-0} = imm16; } @@ -93,12 +93,12 @@ class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, //===----------------------------------------------------------------------===// class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, - InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin> + InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin> { bits<26> addr; let opcode = op; - + let Inst{25-0} = addr; } @@ -119,9 +119,9 @@ class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, // Format FR instruction class in Mips : <|opcode|fmt|ft|fs|fd|funct|> //===----------------------------------------------------------------------===// -class FFR<bits<6> op, bits<6> _funct, bits<5> _fmt, dag outs, dag ins, - string asmstr, list<dag> pattern> : - MipsInst<outs, ins, asmstr, pattern, NoItinerary> +class FFR<bits<6> op, 
bits<6> _funct, bits<5> _fmt, dag outs, dag ins, + string asmstr, list<dag> pattern> : + MipsInst<outs, ins, asmstr, pattern, NoItinerary> { bits<5> fd; bits<5> fs; @@ -134,7 +134,7 @@ class FFR<bits<6> op, bits<6> _funct, bits<5> _fmt, dag outs, dag ins, let fmt = _fmt; let Inst{25-21} = fmt; - let Inst{20-16} = ft; + let Inst{20-16} = ft; let Inst{15-11} = fs; let Inst{10-6} = fd; let Inst{5-0} = funct; @@ -144,8 +144,8 @@ class FFR<bits<6> op, bits<6> _funct, bits<5> _fmt, dag outs, dag ins, // Format FI instruction class in Mips : <|opcode|base|ft|immediate|> //===----------------------------------------------------------------------===// -class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>: - MipsInst<outs, ins, asmstr, pattern, NoItinerary> +class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>: + MipsInst<outs, ins, asmstr, pattern, NoItinerary> { bits<5> ft; bits<5> base; @@ -154,7 +154,7 @@ class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>: let opcode = op; let Inst{25-21} = base; - let Inst{20-16} = ft; + let Inst{20-16} = ft; let Inst{15-0} = imm16; } @@ -162,8 +162,8 @@ class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>: // Compare instruction class in Mips : <|010001|fmt|ft|fs|0000011|condcode|> //===----------------------------------------------------------------------===// -class FCC<bits<5> _fmt, dag outs, dag ins, string asmstr, list<dag> pattern> : - MipsInst<outs, ins, asmstr, pattern, NoItinerary> +class FCC<bits<5> _fmt, dag outs, dag ins, string asmstr, list<dag> pattern> : + MipsInst<outs, ins, asmstr, pattern, NoItinerary> { bits<5> fs; bits<5> ft; @@ -174,9 +174,54 @@ class FCC<bits<5> _fmt, dag outs, dag ins, string asmstr, list<dag> pattern> : let fmt = _fmt; let Inst{25-21} = fmt; - let Inst{20-16} = ft; + let Inst{20-16} = ft; let Inst{15-11} = fs; let Inst{10-6} = 0; let Inst{5-4} = 0b11; let Inst{3-0} = cc; } + + +class FCMOV<bits<1> _tf, dag outs, dag ins, string asmstr, + list<dag> pattern> : + MipsInst<outs, ins, asmstr, pattern, NoItinerary> +{ + bits<5> rd; + bits<5> rs; + bits<3> N; + bits<1> tf; + + let opcode = 0; + let tf = _tf; + + let Inst{25-21} = rs; + let Inst{20-18} = N; + let Inst{17} = 0; + let Inst{16} = tf; + let Inst{15-11} = rd; + let Inst{10-6} = 0; + let Inst{5-0} = 1; +} + +class FFCMOV<bits<5> _fmt, bits<1> _tf, dag outs, dag ins, string asmstr, + list<dag> pattern> : + MipsInst<outs, ins, asmstr, pattern, NoItinerary> +{ + bits<5> fd; + bits<5> fs; + bits<3> N; + bits<5> fmt; + bits<1> tf; + + let opcode = 17; + let fmt = _fmt; + let tf = _tf; + + let Inst{25-21} = fmt; + let Inst{20-18} = N; + let Inst{17} = 0; + let Inst{16} = tf; + let Inst{15-11} = fs; + let Inst{10-6} = fd; + let Inst{5-0} = 17; +}
\ No newline at end of file diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index aaf307b..be044fa 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -36,7 +36,7 @@ static bool isZeroImm(const MachineOperand &op) { /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than loading from the stack slot. unsigned MipsInstrInfo:: -isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const +isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { if ((MI->getOpcode() == Mips::LW) || (MI->getOpcode() == Mips::LWC1) || (MI->getOpcode() == Mips::LDC1)) { @@ -57,7 +57,7 @@ isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than storing to the stack slot. unsigned MipsInstrInfo:: -isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const +isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const { if ((MI->getOpcode() == Mips::SW) || (MI->getOpcode() == Mips::SWC1) || (MI->getOpcode() == Mips::SDC1)) { @@ -74,7 +74,7 @@ isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const /// insertNoop - If data hazard condition is found insert the target nop /// instruction. void MipsInstrInfo:: -insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const +insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { DebugLoc DL; BuildMI(MBB, MI, DL, get(Mips::NOP)); @@ -136,7 +136,7 @@ copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(KillSrc)); return; } - + if (Mips::AFGR64RegClass.contains(DestReg, SrcReg)) { BuildMI(MBB, I, DL, get(Mips::FMOV_D32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -153,13 +153,13 @@ copyPhysReg(MachineBasicBlock &MBB, void MipsInstrInfo:: storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned SrcReg, bool isKill, int FI, + unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == Mips::CPURegsRegisterClass) + if (RC == Mips::CPURegsRegisterClass) BuildMI(MBB, I, DL, get(Mips::SW)).addReg(SrcReg, getKillRegState(isKill)) .addImm(0).addFrameIndex(FI); else if (RC == Mips::FGR32RegisterClass) @@ -171,7 +171,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addReg(SrcReg, getKillRegState(isKill)) .addImm(0).addFrameIndex(FI); } else { - const TargetRegisterInfo *TRI = + const TargetRegisterInfo *TRI = MBB.getParent()->getTarget().getRegisterInfo(); const unsigned *SubSet = TRI->getSubRegisters(SrcReg); BuildMI(MBB, I, DL, get(Mips::SWC1)) @@ -189,12 +189,12 @@ void MipsInstrInfo:: loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const + const TargetRegisterInfo *TRI) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == Mips::CPURegsRegisterClass) + if (RC == Mips::CPURegsRegisterClass) BuildMI(MBB, I, DL, get(Mips::LW), DestReg).addImm(0).addFrameIndex(FI); else if (RC == Mips::FGR32RegisterClass) BuildMI(MBB, I, DL, get(Mips::LWC1), DestReg).addImm(0).addFrameIndex(FI); @@ -202,7 +202,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, if (!TM.getSubtarget<MipsSubtarget>().isMips1()) { BuildMI(MBB, I, DL, 
get(Mips::LDC1), DestReg).addImm(0).addFrameIndex(FI); } else { - const TargetRegisterInfo *TRI = + const TargetRegisterInfo *TRI = MBB.getParent()->getTarget().getRegisterInfo(); const unsigned *SubSet = TRI->getSubRegisters(DestReg); BuildMI(MBB, I, DL, get(Mips::LWC1), SubSet[0]) @@ -218,281 +218,202 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // Branch Analysis //===----------------------------------------------------------------------===// -/// GetCondFromBranchOpc - Return the Mips CC that matches -/// the correspondent Branch instruction opcode. -static Mips::CondCode GetCondFromBranchOpc(unsigned BrOpc) -{ - switch (BrOpc) { - default: return Mips::COND_INVALID; - case Mips::BEQ : return Mips::COND_E; - case Mips::BNE : return Mips::COND_NE; - case Mips::BGTZ : return Mips::COND_GZ; - case Mips::BGEZ : return Mips::COND_GEZ; - case Mips::BLTZ : return Mips::COND_LZ; - case Mips::BLEZ : return Mips::COND_LEZ; - - // We dont do fp branch analysis yet! - case Mips::BC1T : - case Mips::BC1F : return Mips::COND_INVALID; - } +static unsigned GetAnalyzableBrOpc(unsigned Opc) { + return (Opc == Mips::BEQ || Opc == Mips::BNE || Opc == Mips::BGTZ || + Opc == Mips::BGEZ || Opc == Mips::BLTZ || Opc == Mips::BLEZ || + Opc == Mips::BC1T || Opc == Mips::BC1F || Opc == Mips::J) ? Opc : 0; } -/// GetCondBranchFromCond - Return the Branch instruction -/// opcode that matches the cc. -unsigned Mips::GetCondBranchFromCond(Mips::CondCode CC) +/// GetOppositeBranchOpc - Return the inverse of the specified +/// opcode, e.g. turning BEQ to BNE. +unsigned Mips::GetOppositeBranchOpc(unsigned Opc) { - switch (CC) { - default: llvm_unreachable("Illegal condition code!"); - case Mips::COND_E : return Mips::BEQ; - case Mips::COND_NE : return Mips::BNE; - case Mips::COND_GZ : return Mips::BGTZ; - case Mips::COND_GEZ : return Mips::BGEZ; - case Mips::COND_LZ : return Mips::BLTZ; - case Mips::COND_LEZ : return Mips::BLEZ; - - case Mips::FCOND_F: - case Mips::FCOND_UN: - case Mips::FCOND_EQ: - case Mips::FCOND_UEQ: - case Mips::FCOND_OLT: - case Mips::FCOND_ULT: - case Mips::FCOND_OLE: - case Mips::FCOND_ULE: - case Mips::FCOND_SF: - case Mips::FCOND_NGLE: - case Mips::FCOND_SEQ: - case Mips::FCOND_NGL: - case Mips::FCOND_LT: - case Mips::FCOND_NGE: - case Mips::FCOND_LE: - case Mips::FCOND_NGT: return Mips::BC1T; - - case Mips::FCOND_T: - case Mips::FCOND_OR: - case Mips::FCOND_NEQ: - case Mips::FCOND_OGL: - case Mips::FCOND_UGE: - case Mips::FCOND_OGE: - case Mips::FCOND_UGT: - case Mips::FCOND_OGT: - case Mips::FCOND_ST: - case Mips::FCOND_GLE: - case Mips::FCOND_SNE: - case Mips::FCOND_GL: - case Mips::FCOND_NLT: - case Mips::FCOND_GE: - case Mips::FCOND_NLE: - case Mips::FCOND_GT: return Mips::BC1F; + switch (Opc) { + default: llvm_unreachable("Illegal opcode!"); + case Mips::BEQ : return Mips::BNE; + case Mips::BNE : return Mips::BEQ; + case Mips::BGTZ : return Mips::BLEZ; + case Mips::BGEZ : return Mips::BLTZ; + case Mips::BLTZ : return Mips::BGEZ; + case Mips::BLEZ : return Mips::BGTZ; + case Mips::BC1T : return Mips::BC1F; + case Mips::BC1F : return Mips::BC1T; } } -/// GetOppositeBranchCondition - Return the inverse of the specified -/// condition, e.g. turning COND_E to COND_NE. 
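The inversion table in GetOppositeBranchOpc above is easy to get wrong for the compare-with-zero branches: the opposite of BGTZ is BLEZ, not BLTZ. A standalone mirror of the table plus a round-trip check (illustrative, not part of the patch):

#include <cassert>
#include <cstdio>

enum Opc { BEQ, BNE, BGTZ, BGEZ, BLTZ, BLEZ, BC1T, BC1F };

static Opc opposite(Opc O) {
  switch (O) {
  case BEQ:  return BNE;   case BNE:  return BEQ;
  case BGTZ: return BLEZ;  case BLEZ: return BGTZ;
  case BGEZ: return BLTZ;  case BLTZ: return BGEZ;
  case BC1T: return BC1F;  case BC1F: return BC1T;
  }
  return BEQ; // unreachable
}

int main() {
  for (int O = BEQ; O <= BC1F; ++O) // inverting twice is the identity
    assert(opposite(opposite((Opc)O)) == (Opc)O);
  printf("inversion table is an involution\n");
  return 0;
}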
-Mips::CondCode Mips::GetOppositeBranchCondition(Mips::CondCode CC) -{ - switch (CC) { - default: llvm_unreachable("Illegal condition code!"); - case Mips::COND_E : return Mips::COND_NE; - case Mips::COND_NE : return Mips::COND_E; - case Mips::COND_GZ : return Mips::COND_LEZ; - case Mips::COND_GEZ : return Mips::COND_LZ; - case Mips::COND_LZ : return Mips::COND_GEZ; - case Mips::COND_LEZ : return Mips::COND_GZ; - case Mips::FCOND_F : return Mips::FCOND_T; - case Mips::FCOND_UN : return Mips::FCOND_OR; - case Mips::FCOND_EQ : return Mips::FCOND_NEQ; - case Mips::FCOND_UEQ: return Mips::FCOND_OGL; - case Mips::FCOND_OLT: return Mips::FCOND_UGE; - case Mips::FCOND_ULT: return Mips::FCOND_OGE; - case Mips::FCOND_OLE: return Mips::FCOND_UGT; - case Mips::FCOND_ULE: return Mips::FCOND_OGT; - case Mips::FCOND_SF: return Mips::FCOND_ST; - case Mips::FCOND_NGLE:return Mips::FCOND_GLE; - case Mips::FCOND_SEQ: return Mips::FCOND_SNE; - case Mips::FCOND_NGL: return Mips::FCOND_GL; - case Mips::FCOND_LT: return Mips::FCOND_NLT; - case Mips::FCOND_NGE: return Mips::FCOND_GE; - case Mips::FCOND_LE: return Mips::FCOND_NLE; - case Mips::FCOND_NGT: return Mips::FCOND_GT; - } +static void AnalyzeCondBr(const MachineInstr* Inst, unsigned Opc, + MachineBasicBlock *&BB, + SmallVectorImpl<MachineOperand>& Cond) { + assert(GetAnalyzableBrOpc(Opc) && "Not an analyzable branch"); + int NumOp = Inst->getNumExplicitOperands(); + + // for both int and fp branches, the last explicit operand is the + // MBB. + BB = Inst->getOperand(NumOp-1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(Opc)); + + for (int i=0; i<NumOp-1; i++) + Cond.push_back(Inst->getOperand(i)); } -bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, +bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const + bool AllowModify) const { - // If the block has no terminators, it just falls into the block after it. - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) + MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend(); + + // Skip all the debug instructions. + while (I != REnd && I->isDebugValue()) + ++I; + + if (I == REnd || !isUnpredicatedTerminator(&*I)) { + // If this block ends with no branches (it just falls through to its succ) + // just return false, leaving TBB/FBB null. + TBB = FBB = NULL; return false; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return false; - --I; } - if (!isUnpredicatedTerminator(I)) - return false; - - // Get the last instruction in the block. - MachineInstr *LastInst = I; - - // If there is only one terminator instruction, process it. + + MachineInstr *LastInst = &*I; unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - if (!LastInst->getDesc().isBranch()) + + // Not an analyzable branch (must be an indirect jump). + if (!GetAnalyzableBrOpc(LastOpc)) + return true; + + // Get the second to last instruction in the block. + unsigned SecondLastOpc = 0; + MachineInstr *SecondLastInst = NULL; + + if (++I != REnd) { + SecondLastInst = &*I; + SecondLastOpc = GetAnalyzableBrOpc(SecondLastInst->getOpcode()); + + // Not an analyzable branch (must be an indirect jump). + if (isUnpredicatedTerminator(SecondLastInst) && !SecondLastOpc) return true; + } + // If there is only one terminator instruction, process it. 
+ if (!SecondLastOpc) { // Unconditional branch if (LastOpc == Mips::J) { TBB = LastInst->getOperand(0).getMBB(); return false; } - Mips::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode()); - if (BranchCode == Mips::COND_INVALID) - return true; // Can't handle indirect branch. - // Conditional branch - // Block ends with fall-through condbranch. - if (LastOpc != Mips::COND_INVALID) { - int LastNumOp = LastInst->getNumOperands(); - - TBB = LastInst->getOperand(LastNumOp-1).getMBB(); - Cond.push_back(MachineOperand::CreateImm(BranchCode)); - - for (int i=0; i<LastNumOp-1; i++) { - Cond.push_back(LastInst->getOperand(i)); - } - - return false; - } + AnalyzeCondBr(LastInst, LastOpc, TBB, Cond); + return false; } - - // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; - + + // If we reached here, there are two branches. // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + if (++I != REnd && isUnpredicatedTerminator(&*I)) return true; - // If the block ends with Mips::J and a Mips::BNE/Mips::BEQ, handle it. - unsigned SecondLastOpc = SecondLastInst->getOpcode(); - Mips::CondCode BranchCode = GetCondFromBranchOpc(SecondLastOpc); + // If second to last instruction is an unconditional branch, + // analyze it and remove the last instruction. + if (SecondLastOpc == Mips::J) { + // Return if the last instruction cannot be removed. + if (!AllowModify) + return true; - if (BranchCode != Mips::COND_INVALID && LastOpc == Mips::J) { - int SecondNumOp = SecondLastInst->getNumOperands(); + TBB = SecondLastInst->getOperand(0).getMBB(); + LastInst->eraseFromParent(); + return false; + } - TBB = SecondLastInst->getOperand(SecondNumOp-1).getMBB(); - Cond.push_back(MachineOperand::CreateImm(BranchCode)); + // Conditional branch followed by an unconditional branch. + // The last one must be unconditional. + if (LastOpc != Mips::J) + return true; - for (int i=0; i<SecondNumOp-1; i++) { - Cond.push_back(SecondLastInst->getOperand(i)); - } + AnalyzeCondBr(SecondLastInst, SecondLastOpc, TBB, Cond); + FBB = LastInst->getOperand(0).getMBB(); - FBB = LastInst->getOperand(0).getMBB(); - return false; - } + return false; +} - // If the block ends with two unconditional branches, handle it. The last - // one is not executed, so remove it. - if ((SecondLastOpc == Mips::J) && (LastOpc == Mips::J)) { - TBB = SecondLastInst->getOperand(0).getMBB(); - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return false; - } +void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, DebugLoc DL, + const SmallVectorImpl<MachineOperand>& Cond) + const { + unsigned Opc = Cond[0].getImm(); + const TargetInstrDesc &TID = get(Opc); + MachineInstrBuilder MIB = BuildMI(&MBB, DL, TID); - // Otherwise, can't handle this. - return true; + for (unsigned i = 1; i < Cond.size(); ++i) + MIB.addReg(Cond[i].getReg()); + + MIB.addMBB(TBB); } unsigned MipsInstrInfo:: -InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, +InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - assert((Cond.size() == 3 || Cond.size() == 2 || Cond.size() == 0) && - "Mips branch conditions can have two|three components!"); - if (FBB == 0) { // One way branch. 
- if (Cond.empty()) { - // Unconditional branch? - BuildMI(&MBB, DL, get(Mips::J)).addMBB(TBB); - } else { - // Conditional branch. - unsigned Opc = GetCondBranchFromCond((Mips::CondCode)Cond[0].getImm()); - const TargetInstrDesc &TID = get(Opc); - - if (TID.getNumOperands() == 3) - BuildMI(&MBB, DL, TID).addReg(Cond[1].getReg()) - .addReg(Cond[2].getReg()) - .addMBB(TBB); - else - BuildMI(&MBB, DL, TID).addReg(Cond[1].getReg()) - .addMBB(TBB); - - } - return 1; - } - - // Two-way Conditional branch. - unsigned Opc = GetCondBranchFromCond((Mips::CondCode)Cond[0].getImm()); - const TargetInstrDesc &TID = get(Opc); + // # of condition operands: + // Unconditional branches: 0 + // Floating point branches: 1 (opc) + // Int BranchZero: 2 (opc, reg) + // Int Branch: 3 (opc, reg0, reg1) + assert((Cond.size() <= 3) && + "# of Mips branch conditions must be <= 3!"); - if (TID.getNumOperands() == 3) - BuildMI(&MBB, DL, TID).addReg(Cond[1].getReg()).addReg(Cond[2].getReg()) - .addMBB(TBB); - else - BuildMI(&MBB, DL, TID).addReg(Cond[1].getReg()).addMBB(TBB); + // Two-way Conditional branch. + if (FBB) { + BuildCondBr(MBB, TBB, DL, Cond); + BuildMI(&MBB, DL, get(Mips::J)).addMBB(FBB); + return 2; + } - BuildMI(&MBB, DL, get(Mips::J)).addMBB(FBB); - return 2; + // One way branch. + // Unconditional branch. + if (Cond.empty()) + BuildMI(&MBB, DL, get(Mips::J)).addMBB(TBB); + else // Conditional branch. + BuildCondBr(MBB, TBB, DL, Cond); + return 1; } unsigned MipsInstrInfo:: -RemoveBranch(MachineBasicBlock &MBB) const +RemoveBranch(MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) return 0; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return 0; - --I; - } - if (I->getOpcode() != Mips::J && - GetCondFromBranchOpc(I->getOpcode()) == Mips::COND_INVALID) - return 0; - - // Remove the branch. - I->eraseFromParent(); - - I = MBB.end(); - - if (I == MBB.begin()) return 1; - --I; - if (GetCondFromBranchOpc(I->getOpcode()) == Mips::COND_INVALID) - return 1; - - // Remove the branch. - I->eraseFromParent(); - return 2; + MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend(); + MachineBasicBlock::reverse_iterator FirstBr; + unsigned removed; + + // Skip all the debug instructions. + while (I != REnd && I->isDebugValue()) + ++I; + + FirstBr = I; + + // Up to 2 branches are removed. + // Note that indirect branches are not removed. + for(removed = 0; I != REnd && removed < 2; ++I, ++removed) + if (!GetAnalyzableBrOpc(I->getOpcode())) + break; + + MBB.erase(I.base(), FirstBr.base()); + + return removed; } -/// ReverseBranchCondition - Return the inverse opcode of the +/// ReverseBranchCondition - Return the inverse opcode of the /// specified Branch instruction. 
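// The rewritten RemoveBranch above scans from the end of the block with a
// reverse iterator and erases everything it recognized with one half-open
// erase. The subtle part is reverse_iterator::base(), which points one
// element *past* the one the reverse iterator refers to, exactly what a
// forward erase(first, last) expects. A toy model over std::list (debug
// values omitted; the branch test is an invented stand-in):
#include <cassert>
#include <list>

static bool isAnalyzableBranch(int op) { return op >= 100; }

static unsigned removeBranches(std::list<int> &block) {
  auto I = block.rbegin(), REnd = block.rend();
  auto FirstBr = I;
  unsigned removed = 0;
  for (; I != REnd && removed < 2; ++I, ++removed)
    if (!isAnalyzableBranch(*I))
      break;
  // I now refers to the first instruction to keep, so I.base() is the
  // first to erase; FirstBr.base() is one past the last branch.
  block.erase(I.base(), FirstBr.base());
  return removed;
}

int main() {
  std::list<int> block = {1, 2, /*cond br*/ 100, /*uncond br*/ 101};
  assert(removeBranches(block) == 2 && block.size() == 2);
  return 0;
}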
bool MipsInstrInfo:: -ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const +ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { - assert( (Cond.size() == 3 || Cond.size() == 2) && + assert( (Cond.size() && Cond.size() <= 3) && "Invalid Mips branch condition!"); - Cond[0].setImm(GetOppositeBranchCondition((Mips::CondCode)Cond[0].getImm())); + Cond[0].setImm(Mips::GetOppositeBranchOpc(Cond[0].getImm())); return false; } diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index 52a3d39..5fdbf1f 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -37,7 +37,7 @@ namespace Mips { // To be used with float branch True FCOND_F, FCOND_UN, - FCOND_EQ, + FCOND_OEQ, FCOND_UEQ, FCOND_OLT, FCOND_ULT, @@ -57,8 +57,8 @@ namespace Mips { // above ones, but are used with a branch False; FCOND_T, FCOND_OR, - FCOND_NEQ, - FCOND_OGL, + FCOND_UNE, + FCOND_ONE, FCOND_UGE, FCOND_OGE, FCOND_UGT, @@ -70,27 +70,15 @@ namespace Mips { FCOND_NLT, FCOND_GE, FCOND_NLE, - FCOND_GT, - - // Only integer conditions - COND_E, - COND_GZ, - COND_GEZ, - COND_LZ, - COND_LEZ, - COND_NE, - COND_INVALID + FCOND_GT }; - - // Turn condition code into conditional branch opcode. - unsigned GetCondBranchFromCond(CondCode CC); - /// GetOppositeBranchCondition - Return the inverse of the specified cond, - /// e.g. turning COND_E to COND_NE. - CondCode GetOppositeBranchCondition(Mips::CondCode CC); + /// GetOppositeBranchOpc - Return the inverse of the specified + /// opcode, e.g. turning BEQ to BNE. + unsigned GetOppositeBranchOpc(unsigned Opc); /// MipsCCToString - Map each FP condition code to its string - inline static const char *MipsFCCToString(Mips::CondCode CC) + inline static const char *MipsFCCToString(Mips::CondCode CC) { switch (CC) { default: llvm_unreachable("Unknown condition code"); @@ -98,10 +86,10 @@ namespace Mips { case FCOND_T: return "f"; case FCOND_UN: case FCOND_OR: return "un"; - case FCOND_EQ: - case FCOND_NEQ: return "eq"; + case FCOND_OEQ: + case FCOND_UNE: return "eq"; case FCOND_UEQ: - case FCOND_OGL: return "ueq"; + case FCOND_ONE: return "ueq"; case FCOND_OLT: case FCOND_UGE: return "olt"; case FCOND_ULT: @@ -121,11 +109,11 @@ namespace Mips { case FCOND_LT: case FCOND_NLT: return "lt"; case FCOND_NGE: - case FCOND_GE: return "ge"; + case FCOND_GE: return "nge"; case FCOND_LE: - case FCOND_NLE: return "nle"; + case FCOND_NLE: return "le"; case FCOND_NGT: - case FCOND_GT: return "gt"; + case FCOND_GT: return "ngt"; } } } @@ -138,27 +126,27 @@ namespace MipsII { enum TOF { //===------------------------------------------------------------------===// // Mips Specific MachineOperand flags. - + MO_NO_FLAG, /// MO_GOT - Represents the offset into the global offset table at which /// the address the relocation entry symbol resides during execution. MO_GOT, - /// MO_GOT_CALL - Represents the offset into the global offset table at - /// which the address of a call site relocation entry symbol resides + /// MO_GOT_CALL - Represents the offset into the global offset table at + /// which the address of a call site relocation entry symbol resides /// during execution. This is different from the above since this flag /// can only be present in call instructions. MO_GOT_CALL, - /// MO_GPREL - Represents the offset from the current gp value to be used + /// MO_GPREL - Represents the offset from the current gp value to be used /// for the relocatable object file being produced. 
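// The condition-code renaming above (FCOND_EQ to FCOND_OEQ, FCOND_NEQ to
// FCOND_UNE, FCOND_OGL to FCOND_ONE) makes the ordered/unordered split
// explicit: an O-predicate is false whenever an operand is NaN, while its
// U-complement is then true; the second half of the enum is the complement
// of the first and is emitted with bc1f instead of bc1t. A plain C++
// illustration of the semantics (not compiler code):
#include <cassert>
#include <cmath>

static bool oeq(double a, double b) { return a == b; }      // ordered eq
static bool une(double a, double b) { return !(a == b); }   // complement
static bool ueq(double a, double b) {
  return std::isnan(a) || std::isnan(b) || a == b;          // unordered eq
}
static bool one(double a, double b) { return !ueq(a, b); }  // complement

int main() {
  double n = std::nan("");
  assert(!oeq(n, 1.0) && une(n, 1.0)); // NaN makes the ordered form false
  assert(ueq(n, 1.0) && !one(n, 1.0)); // and the unordered form true
  return 0;
}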
MO_GPREL, - /// MO_ABS_HILO - Represents the hi or low part of an absolute symbol - /// address. - MO_ABS_HILO - + /// MO_ABS_HI/LO - Represents the hi or low part of an absolute symbol + /// address. + MO_ABS_HI, + MO_ABS_LO }; } @@ -181,7 +169,7 @@ public: /// any side effects other than loading from the stack slot. virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; - + /// isStoreToStackSlot - If the specified machine instruction is a direct /// store to a stack slot, return the virtual or physical register number of /// the source reg along with the FrameIndex of the loaded stack slot. If @@ -189,13 +177,19 @@ public: /// any side effects other than storing to the stack slot. virtual unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const; - + /// Branch Analysis virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const; virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + +private: + void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL, + const SmallVectorImpl<MachineOperand>& Cond) const; + +public: virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, @@ -220,7 +214,7 @@ public: bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; /// Insert nop instruction when hazard condition is found - virtual void insertNoop(MachineBasicBlock &MBB, + virtual void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const; /// getGlobalBaseReg - Return a virtual register initialized with the diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index b70266a..19b9c35 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -19,18 +19,19 @@ include "MipsInstrFormats.td" def SDT_MipsRet : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_MipsJmpLink : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; -def SDT_MipsSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, - SDTCisSameAs<2, 3>, SDTCisInt<1>]>; def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, - SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>, - SDTCisInt<4>]>; + SDTCisSameAs<1, 2>, + SDTCisSameAs<3, 4>, + SDTCisInt<4>]>; def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; -def SDT_MipsMAddMSub : SDTypeProfile<0, 4, +def SDT_MipsMAddMSub : SDTypeProfile<0, 4, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, - SDTCisSameAs<1, 2>, + SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3>]>; - +def SDT_MipsDivRem : SDTypeProfile<0, 2, + [SDTCisVT<0, i32>, + SDTCisSameAs<0, 1>]>; // Call def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink, @@ -54,9 +55,6 @@ def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart, def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -// Select Condition Code -def MipsSelectCC : SDNode<"MipsISD::SelectCC", SDT_MipsSelectCC>; - // MAdd*/MSub* nodes def MipsMAdd : SDNode<"MipsISD::MAdd", SDT_MipsMAddMSub, [SDNPOptInGlue, SDNPOutGlue]>; @@ -67,6 +65,12 @@ def MipsMSub : SDNode<"MipsISD::MSub", SDT_MipsMAddMSub, def MipsMSubu : SDNode<"MipsISD::MSubu", SDT_MipsMAddMSub, [SDNPOptInGlue, SDNPOutGlue]>; +// DivRem(u) nodes +def MipsDivRem : SDNode<"MipsISD::DivRem", SDT_MipsDivRem, + [SDNPOutGlue]>; +def MipsDivRemU : 
SDNode<"MipsISD::DivRemU", SDT_MipsDivRem, + [SDNPOutGlue]>; + //===----------------------------------------------------------------------===// // Mips Instruction Predicate Definitions. //===----------------------------------------------------------------------===// @@ -165,7 +169,7 @@ class ArithOverflowI<bits<6> op, string instr_asm, SDNode OpNode, let rd = 0, shamt = 0, Defs = [HI, LO], Uses = [HI, LO] in class MArithR<bits<6> func, string instr_asm, SDNode op> : FR<0x1c, func, (outs), (ins CPURegs:$rs, CPURegs:$rt), - !strconcat(instr_asm, "\t$rs, $rt"), + !strconcat(instr_asm, "\t$rs, $rt"), [(op CPURegs:$rs, CPURegs:$rt, LO, HI)], IIImul>; // Logical @@ -185,7 +189,7 @@ class LogicNOR<bits<6> op, bits<6> func, string instr_asm>: [(set CPURegs:$dst, (not (or CPURegs:$b, CPURegs:$c)))], IIAlu>; // Shifts -class LogicR_shift_rotate_imm<bits<6> func, bits<5> _rs, string instr_asm, +class LogicR_shift_rotate_imm<bits<6> func, bits<5> _rs, string instr_asm, SDNode OpNode>: FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$b, shamt:$c), !strconcat(instr_asm, "\t$dst, $b, $c"), @@ -193,7 +197,7 @@ class LogicR_shift_rotate_imm<bits<6> func, bits<5> _rs, string instr_asm, let rs = _rs; } -class LogicR_shift_rotate_reg<bits<6> func, bits<5> _shamt, string instr_asm, +class LogicR_shift_rotate_reg<bits<6> func, bits<5> _shamt, string instr_asm, SDNode OpNode>: FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$c, CPURegs:$b), !strconcat(instr_asm, "\t$dst, $b, $c"), @@ -283,9 +287,16 @@ let isCall=1, hasDelaySlot=1, } // Mul, Div -class MulDiv<bits<6> func, string instr_asm, InstrItinClass itin>: - FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b), - !strconcat(instr_asm, "\t$a, $b"), [], itin>; +let Defs = [HI, LO] in { + class Mul<bits<6> func, string instr_asm, InstrItinClass itin>: + FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b), + !strconcat(instr_asm, "\t$a, $b"), [], itin>; + + class Div<SDNode op, bits<6> func, string instr_asm, InstrItinClass itin>: + FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b), + !strconcat(instr_asm, "\t$$zero, $a, $b"), + [(op CPURegs:$a, CPURegs:$b)], itin>; +} // Move from Hi/Lo class MoveFromLOHI<bits<6> func, string instr_asm>: @@ -348,6 +359,11 @@ def REORDER : MipsPseudo<(outs), (ins), ".set\treorder", []>; def NOMACRO : MipsPseudo<(outs), (ins), ".set\tnomacro", []>; def NOREORDER : MipsPseudo<(outs), (ins), ".set\tnoreorder", []>; +// These macros are inserted to prevent GAS from complaining +// when using the AT register. +def NOAT : MipsPseudo<(outs), (ins), ".set\tnoat", []>; +def ATMACRO : MipsPseudo<(outs), (ins), ".set\tat", []>; + // When handling PIC code the assembler needs .cpload and .cprestore // directives. If the real instructions corresponding these directives // are used, we have the same behavior, but get also a bunch of warnings @@ -355,18 +371,6 @@ def NOREORDER : MipsPseudo<(outs), (ins), ".set\tnoreorder", []>; def CPLOAD : MipsPseudo<(outs), (ins CPURegs:$picreg), ".cpload\t$picreg", []>; def CPRESTORE : MipsPseudo<(outs), (ins uimm16:$loc), ".cprestore\t$loc\n", []>; -// The supported Mips ISAs dont have any instruction close to the SELECT_CC -// operation. 
The solution is to create a Mips pseudo SELECT_CC instruction -// (MipsSelectCC), use LowerSELECT_CC to generate this instruction and finally -// replace it for real supported nodes into EmitInstrWithCustomInserter -let usesCustomInserter = 1 in { - class PseudoSelCC<RegisterClass RC, string asmstr>: - MipsPseudo<(outs RC:$dst), (ins CPURegs:$CmpRes, RC:$T, RC:$F), asmstr, - [(set RC:$dst, (MipsSelectCC CPURegs:$CmpRes, RC:$T, RC:$F))]>; -} - -def Select_CC : PseudoSelCC<CPURegs, "# MipsSelect_CC_i32">; - //===----------------------------------------------------------------------===// // Instruction definition //===----------------------------------------------------------------------===// @@ -447,12 +451,10 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, "jr\t$target", [(MipsRet CPURegs:$target)], IIBranch>; /// Multiply and Divide Instructions. -let Defs = [HI, LO] in { - def MULT : MulDiv<0x18, "mult", IIImul>; - def MULTu : MulDiv<0x19, "multu", IIImul>; - def DIV : MulDiv<0x1a, "div", IIIdiv>; - def DIVu : MulDiv<0x1b, "divu", IIIdiv>; -} +def MULT : Mul<0x18, "mult", IIImul>; +def MULTu : Mul<0x19, "multu", IIImul>; +def SDIV : Div<MipsDivRem, 0x1a, "div", IIIdiv>; +def UDIV : Div<MipsDivRemU, 0x1b, "divu", IIIdiv>; let Defs = [HI] in def MTHI : MoveToLOHI<0x11, "mthi">; @@ -489,10 +491,19 @@ let Predicates = [HasSwap] in { def MIPS_CMOV_ZERO : PatLeaf<(i32 0)>; def MIPS_CMOV_NZERO : PatLeaf<(i32 1)>; -let Predicates = [HasCondMov], Constraints = "$F = $dst" in { - def MOVN : CondMov<0x0a, "movn", MIPS_CMOV_NZERO>; - def MOVZ : CondMov<0x0b, "movz", MIPS_CMOV_ZERO>; -} +// Conditional moves: +// These instructions are expanded in +// MipsISelLowering::EmitInstrWithCustomInserter if target does not have +// conditional move instructions. +// flag:int, data:int +let usesCustomInserter = 1, shamt = 0, Constraints = "$F = $dst" in + class CondMovIntInt<bits<6> funct, string instr_asm> : + FR<0, funct, (outs CPURegs:$dst), + (ins CPURegs:$T, CPURegs:$cond, CPURegs:$F), + !strconcat(instr_asm, "\t$dst, $T, $cond"), [], NoItinerary>; + +def MOVZ_I : CondMovIntInt<0x0a, "movz">; +def MOVN_I : CondMovIntInt<0x0b, "movn">; /// No operation let addr=0 in @@ -533,7 +544,7 @@ def : Pat<(subc CPURegs:$lhs, CPURegs:$rhs), (SUBu CPURegs:$lhs, CPURegs:$rhs)>; def : Pat<(addc CPURegs:$lhs, CPURegs:$rhs), (ADDu CPURegs:$lhs, CPURegs:$rhs)>; -def : Pat<(addc CPURegs:$src, imm:$imm), +def : Pat<(addc CPURegs:$src, immSExt16:$imm), (ADDiu CPURegs:$src, imm:$imm)>; // Call @@ -546,8 +557,11 @@ def : Pat<(MipsJmpLink (i32 texternalsym:$dst)), // hi/lo relocs def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; +def : Pat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>; def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)), (ADDiu CPURegs:$hi, tglobaladdr:$lo)>; +def : Pat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)), + (ADDiu CPURegs:$hi, tblockaddress:$lo)>; def : Pat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>; def : Pat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)), @@ -599,33 +613,43 @@ def : Pat<(brcond CPURegs:$cond, bb:$dst), (BNE CPURegs:$cond, ZERO, bb:$dst)>; // select patterns -def : Pat<(select (setge CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, (SLT CPURegs:$lhs, CPURegs:$rhs))>; -def : Pat<(select (setuge CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, (SLTu CPURegs:$lhs, CPURegs:$rhs))>; -def : Pat<(select (setge CPURegs:$lhs, immSExt16:$rhs), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, 
(SLTi CPURegs:$lhs, immSExt16:$rhs))>; -def : Pat<(select (setuge CPURegs:$lh, immSExt16:$rh), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, (SLTiu CPURegs:$lh, immSExt16:$rh))>; - -def : Pat<(select (setle CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, (SLT CPURegs:$rhs, CPURegs:$lhs))>; -def : Pat<(select (setule CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, (SLTu CPURegs:$rhs, CPURegs:$lhs))>; - -def : Pat<(select (seteq CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, (XOR CPURegs:$lhs, CPURegs:$rhs))>; -def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F), - (MOVN CPURegs:$F, CPURegs:$T, (XOR CPURegs:$lhs, CPURegs:$rhs))>; - -def : Pat<(select CPURegs:$cond, CPURegs:$T, CPURegs:$F), - (MOVN CPURegs:$F, CPURegs:$T, CPURegs:$cond)>; +multiclass MovzPats<RegisterClass RC, Instruction MOVZInst> { + def : Pat<(select (setge CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F), + (MOVZInst RC:$T, (SLT CPURegs:$lhs, CPURegs:$rhs), RC:$F)>; + def : Pat<(select (setuge CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F), + (MOVZInst RC:$T, (SLTu CPURegs:$lhs, CPURegs:$rhs), RC:$F)>; + def : Pat<(select (setge CPURegs:$lhs, immSExt16:$rhs), RC:$T, RC:$F), + (MOVZInst RC:$T, (SLTi CPURegs:$lhs, immSExt16:$rhs), RC:$F)>; + def : Pat<(select (setuge CPURegs:$lh, immSExt16:$rh), RC:$T, RC:$F), + (MOVZInst RC:$T, (SLTiu CPURegs:$lh, immSExt16:$rh), RC:$F)>; + def : Pat<(select (setle CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F), + (MOVZInst RC:$T, (SLT CPURegs:$rhs, CPURegs:$lhs), RC:$F)>; + def : Pat<(select (setule CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F), + (MOVZInst RC:$T, (SLTu CPURegs:$rhs, CPURegs:$lhs), RC:$F)>; + def : Pat<(select (seteq CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F), + (MOVZInst RC:$T, (XOR CPURegs:$lhs, CPURegs:$rhs), RC:$F)>; + def : Pat<(select (seteq CPURegs:$lhs, 0), RC:$T, RC:$F), + (MOVZInst RC:$T, CPURegs:$lhs, RC:$F)>; +} + +multiclass MovnPats<RegisterClass RC, Instruction MOVNInst> { + def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F), + (MOVNInst RC:$T, (XOR CPURegs:$lhs, CPURegs:$rhs), RC:$F)>; + def : Pat<(select CPURegs:$cond, RC:$T, RC:$F), + (MOVNInst RC:$T, CPURegs:$cond, RC:$F)>; + def : Pat<(select (setne CPURegs:$lhs, 0), RC:$T, RC:$F), + (MOVNInst RC:$T, CPURegs:$lhs, RC:$F)>; +} + +defm : MovzPats<CPURegs, MOVZ_I>; +defm : MovnPats<CPURegs, MOVN_I>; // select patterns with got access -def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), - (i32 tglobaladdr:$T), CPURegs:$F), - (MOVN CPURegs:$F, (ADDiu GP, tglobaladdr:$T), - (XOR CPURegs:$lhs, CPURegs:$rhs))>; +let AddedComplexity = 10 in + def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), + (i32 tglobaladdr:$T), CPURegs:$F), + (MOVN_I CPURegs:$F, (ADDiu GP, tglobaladdr:$T), + (XOR CPURegs:$lhs, CPURegs:$rhs))>; // setcc patterns def : Pat<(seteq CPURegs:$lhs, CPURegs:$rhs), diff --git a/lib/Target/Mips/MipsMCAsmInfo.h b/lib/Target/Mips/MipsMCAsmInfo.h index 15a867e..41b7192 100644 --- a/lib/Target/Mips/MipsMCAsmInfo.h +++ b/lib/Target/Mips/MipsMCAsmInfo.h @@ -19,7 +19,7 @@ namespace llvm { class Target; - + class MipsMCAsmInfo : public MCAsmInfo { public: explicit MipsMCAsmInfo(const Target &T, StringRef TT); diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index 3719e58..c09b129 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -38,7 +38,7 @@ using namespace llvm; 
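// The MovzPats/MovnPats multiclasses above reduce an integer select to a
// 0/1-producing compare plus a conditional move whose destination is tied
// to the false operand. A standalone restatement of the trick (slt and
// movz are C++ stand-ins for the MIPS instructions):
#include <cassert>
#include <cstdint>

static int32_t slt(int32_t a, int32_t b) { return a < b; }

// movz rd, rs, rt: rd = (rt == 0) ? rs : rd. Pre-loading rd with the
// false value F turns this into a full select.
static int32_t movz(int32_t rd, int32_t rs, int32_t rt) {
  return rt == 0 ? rs : rd;
}

// select (setge a, b), T, F  ==>  movz T, (slt a, b), F
static int32_t select_ge(int32_t a, int32_t b, int32_t T, int32_t F) {
  return movz(/*rd=*/F, /*rs=*/T, /*rt=*/slt(a, b));
}

int main() {
  assert(select_ge(5, 3, 111, 222) == 111); // 5 >= 3 picks the true value
  assert(select_ge(2, 3, 111, 222) == 222); // 2 <  3 picks the false value
  return 0;
}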
-MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST, +MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST, const TargetInstrInfo &tii) : MipsGenRegisterInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), Subtarget(ST), TII(tii) {} @@ -46,7 +46,7 @@ MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST, /// getRegisterNumbering - Given the enum value for some register, e.g. /// Mips::RA, return the number that it corresponds to (e.g. 31). unsigned MipsRegisterInfo:: -getRegisterNumbering(unsigned RegEnum) +getRegisterNumbering(unsigned RegEnum) { switch (RegEnum) { case Mips::ZERO : case Mips::F0 : case Mips::D0 : return 0; @@ -82,30 +82,30 @@ getRegisterNumbering(unsigned RegEnum) case Mips::FP : case Mips::F30: case Mips::D15: return 30; case Mips::RA : case Mips::F31: return 31; default: llvm_unreachable("Unknown register number!"); - } + } return 0; // Not reached } unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; } //===----------------------------------------------------------------------===// -// Callee Saved Registers methods +// Callee Saved Registers methods //===----------------------------------------------------------------------===// /// Mips Callee Saved Registers const unsigned* MipsRegisterInfo:: -getCalleeSavedRegs(const MachineFunction *MF) const +getCalleeSavedRegs(const MachineFunction *MF) const { // Mips callee-save register range is $16-$23, $f20-$f30 static const unsigned SingleFloatOnlyCalleeSavedRegs[] = { - Mips::S0, Mips::S1, Mips::S2, Mips::S3, + Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5, Mips::S6, Mips::S7, - Mips::F20, Mips::F21, Mips::F22, Mips::F23, Mips::F24, Mips::F25, + Mips::F20, Mips::F21, Mips::F22, Mips::F23, Mips::F24, Mips::F25, Mips::F26, Mips::F27, Mips::F28, Mips::F29, Mips::F30, 0 }; static const unsigned BitMode32CalleeSavedRegs[] = { - Mips::S0, Mips::S1, Mips::S2, Mips::S3, + Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5, Mips::S6, Mips::S7, Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30, 0 }; @@ -132,11 +132,11 @@ getReservedRegs(const MachineFunction &MF) const { if (!Subtarget.isSingleFloat()) for (unsigned FReg=(Mips::F0)+1; FReg < Mips::F30; FReg+=2) Reserved.set(FReg); - + return Reserved; } -// This function eliminate ADJCALLSTACKDOWN, +// This function eliminate ADJCALLSTACKDOWN, // ADJCALLSTACKUP pseudo instructions void MipsRegisterInfo:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, @@ -157,7 +157,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned i = 0; while (!MI.getOperand(i).isFI()) { ++i; - assert(i < MI.getNumOperands() && + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); } @@ -179,8 +179,43 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n"); - MI.getOperand(i-1).ChangeToImmediate(Offset); - MI.getOperand(i).ChangeToRegister(getFrameRegister(MF), false); + unsigned NewReg = 0; + int NewImm = 0; + MachineBasicBlock &MBB = *MI.getParent(); + bool ATUsed; + unsigned OrigReg = getFrameRegister(MF); + int OrigImm = Offset; + +// OrigImm fits in the 16-bit field + if (OrigImm < 0x8000 && OrigImm >= -0x8000) { + NewReg = OrigReg; + NewImm = OrigImm; + ATUsed = false; + } + else { + const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); + DebugLoc DL = II->getDebugLoc(); + int ImmLo = OrigImm & 0xffff; + int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) + + ((OrigImm & 0x8000) != 0); 
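// A self-checking restatement of the arithmetic just above (standalone
// C++, not compiler code): the low half stays in the memory instruction,
// where it is sign-extended, so the high half that goes into lui must
// absorb a carry whenever bit 15 of the offset is set.
#include <cassert>
#include <cstdint>

static void splitOffset(int32_t orig, int32_t &hi, int32_t &lo) {
  lo = orig & 0xffff;                                    // 16-bit field
  hi = ((uint32_t)orig >> 16) + ((orig & 0x8000) != 0);  // lui operand
}

int main() {
  for (int32_t orig : {0x12348000, 0x1234ffff, -0x7000123, 0x7fff7fff}) {
    int32_t hi, lo;
    splitOffset(orig, hi, lo);
    int32_t sextLo = (int16_t)lo;  // what the hardware does with the field
    assert((int32_t)((uint32_t)hi << 16) + sextLo == orig);
  }
  return 0;
}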
+ + // FIXME: change this when mips goes MC". + BuildMI(MBB, II, DL, TII->get(Mips::NOAT)); + BuildMI(MBB, II, DL, TII->get(Mips::LUi), Mips::AT).addImm(ImmHi); + BuildMI(MBB, II, DL, TII->get(Mips::ADDu), Mips::AT).addReg(OrigReg) + .addReg(Mips::AT); + NewReg = Mips::AT; + NewImm = ImmLo; + + ATUsed = true; + } + + // FIXME: change this when mips goes MC". + if (ATUsed) + BuildMI(MBB, ++II, MI.getDebugLoc(), TII.get(Mips::ATMACRO)); + + MI.getOperand(i).ChangeToRegister(NewReg, false); + MI.getOperand(i-1).ChangeToImmediate(NewImm); } void MipsRegisterInfo:: diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h index a7f4bf9..767359f 100644 --- a/lib/Target/Mips/MipsRegisterInfo.h +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -26,7 +26,7 @@ class Type; struct MipsRegisterInfo : public MipsGenRegisterInfo { const MipsSubtarget &Subtarget; const TargetInstrInfo &TII; - + MipsRegisterInfo(const MipsSubtarget &Subtarget, const TargetInstrInfo &tii); /// getRegisterNumbering - Given the enum value for some register, e.g. diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index 60efe31..9f9cae7 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -17,7 +17,7 @@ class MipsReg<string n> : Register<n> { let Namespace = "Mips"; } -class MipsRegWithSubRegs<string n, list<Register> subregs> +class MipsRegWithSubRegs<string n, list<Register> subregs> : RegisterWithSubRegs<n, subregs> { field bits<5> Num; let Namespace = "Mips"; @@ -83,7 +83,7 @@ let Namespace = "Mips" in { def SP : MipsGPRReg< 29, "SP">, DwarfRegNum<[29]>; def FP : MipsGPRReg< 30, "FP">, DwarfRegNum<[30]>; def RA : MipsGPRReg< 31, "RA">, DwarfRegNum<[31]>; - + /// Mips Single point precision FPU Registers def F0 : FPR< 0, "F0">, DwarfRegNum<[32]>; def F1 : FPR< 1, "F1">, DwarfRegNum<[33]>; @@ -117,7 +117,7 @@ let Namespace = "Mips" in { def F29 : FPR<29, "F29">, DwarfRegNum<[61]>; def F30 : FPR<30, "F30">, DwarfRegNum<[62]>; def F31 : FPR<31, "F31">, DwarfRegNum<[63]>; - + /// Mips Double point precision FPU Registers (aliased /// with the single precision to hold 64 bit values) def D0 : AFPR< 0, "F0", [F0, F1]>, DwarfRegNum<[32]>; @@ -149,11 +149,11 @@ let Namespace = "Mips" in { // Register Classes //===----------------------------------------------------------------------===// -def CPURegs : RegisterClass<"Mips", [i32], 32, +def CPURegs : RegisterClass<"Mips", [i32], 32, // Return Values and Arguments [V0, V1, A0, A1, A2, A3, // Not preserved across procedure calls - T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, + T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, // Callee save S0, S1, S2, S3, S4, S5, S6, S7, // Reserved @@ -173,16 +173,16 @@ def CPURegs : RegisterClass<"Mips", [i32], 32, // 64bit fp: // * FGR64 - 32 64-bit registers -// * AFGR64 - 16 32-bit even registers (32-bit FP Mode) +// * AFGR64 - 16 32-bit even registers (32-bit FP Mode) // // 32bit fp: // * FGR32 - 16 32-bit even registers // * FGR32 - 32 32-bit registers (single float only mode) -def FGR32 : RegisterClass<"Mips", [f32], 32, +def FGR32 : RegisterClass<"Mips", [f32], 32, // Return Values and Arguments [F0, F1, F2, F3, F12, F13, F14, F15, // Not preserved across procedure calls - F4, F5, F6, F7, F8, F9, F10, F11, F16, F17, F18, F19, + F4, F5, F6, F7, F8, F9, F10, F11, F16, F17, F18, F19, // Callee save F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, // Reserved @@ -195,17 +195,17 @@ def FGR32 : RegisterClass<"Mips", [f32], 32, let MethodBodies = [{ static const 
unsigned MIPS_FGR32[] = { - Mips::F0, Mips::F1, Mips::F2, Mips::F3, Mips::F12, Mips::F13, - Mips::F14, Mips::F15, Mips::F4, Mips::F5, Mips::F6, Mips::F7, - Mips::F8, Mips::F9, Mips::F10, Mips::F11, Mips::F16, Mips::F17, - Mips::F18, Mips::F19, Mips::F20, Mips::F21, Mips::F22, Mips::F23, - Mips::F24, Mips::F25, Mips::F26, Mips::F27, Mips::F28, Mips::F29, + Mips::F0, Mips::F1, Mips::F2, Mips::F3, Mips::F12, Mips::F13, + Mips::F14, Mips::F15, Mips::F4, Mips::F5, Mips::F6, Mips::F7, + Mips::F8, Mips::F9, Mips::F10, Mips::F11, Mips::F16, Mips::F17, + Mips::F18, Mips::F19, Mips::F20, Mips::F21, Mips::F22, Mips::F23, + Mips::F24, Mips::F25, Mips::F26, Mips::F27, Mips::F28, Mips::F29, Mips::F30 }; static const unsigned MIPS_SVR4_FGR32[] = { - Mips::F0, Mips::F2, Mips::F12, Mips::F14, Mips::F4, - Mips::F6, Mips::F8, Mips::F10, Mips::F16, Mips::F18, + Mips::F0, Mips::F2, Mips::F12, Mips::F14, Mips::F4, + Mips::F6, Mips::F8, Mips::F10, Mips::F16, Mips::F18, Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30, }; @@ -217,7 +217,7 @@ def FGR32 : RegisterClass<"Mips", [f32], 32, if (Subtarget.isSingleFloat()) return MIPS_FGR32; else - return MIPS_SVR4_FGR32; + return MIPS_SVR4_FGR32; } FGR32Class::iterator @@ -233,11 +233,11 @@ def FGR32 : RegisterClass<"Mips", [f32], 32, }]; } -def AFGR64 : RegisterClass<"Mips", [f64], 64, +def AFGR64 : RegisterClass<"Mips", [f64], 64, // Return Values and Arguments [D0, D1, D6, D7, // Not preserved across procedure calls - D2, D3, D4, D5, D8, D9, + D2, D3, D4, D5, D8, D9, // Callee save D10, D11, D12, D13, D14, // Reserved diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td index 49ca5d1..00be8ee 100644 --- a/lib/Target/Mips/MipsSchedule.td +++ b/lib/Target/Mips/MipsSchedule.td @@ -14,7 +14,7 @@ def ALU : FuncUnit; def IMULDIV : FuncUnit; //===----------------------------------------------------------------------===// -// Instruction Itinerary classes used for Mips +// Instruction Itinerary classes used for Mips //===----------------------------------------------------------------------===// def IIAlu : InstrItinClass; def IILoad : InstrItinClass; diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index db114da..70747f5d 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -17,7 +17,7 @@ using namespace llvm; MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &FS, - bool little) : + bool little) : MipsArchVersion(Mips1), MipsABI(O32), IsLittle(little), IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false), IsLinux(true), HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false), HasMinMax(false), @@ -33,7 +33,7 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &FS, if (TT.find("linux") == std::string::npos) IsLinux = false; - // When only the target triple is specified and is + // When only the target triple is specified and is // a allegrex target, set the features. 
We also match // big and little endian allegrex cores (dont really // know if a big one exists) diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index e4f4b33..096bbed 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -26,7 +26,7 @@ class MipsSubtarget : public TargetSubtarget { public: enum MipsABIEnum { O32, O64, N32, N64, EABI - }; + }; protected: @@ -34,10 +34,10 @@ protected: Mips1, Mips2, Mips3, Mips4, Mips32, Mips32r2 }; - // Mips architecture version + // Mips architecture version MipsArchEnum MipsArchVersion; - // Mips supported ABIs + // Mips supported ABIs MipsABIEnum MipsABI; // IsLittle - The target is Little Endian @@ -61,14 +61,14 @@ protected: bool IsLinux; /// Features related to the presence of specific instructions. - + // HasSEInReg - SEB and SEH (signext in register) instructions. bool HasSEInReg; // HasCondMov - Conditional mov (MOVZ, MOVN) instructions. bool HasCondMov; - // HasMulDivAdd - Multiply add and sub (MADD, MADDu, MSUB, MSUBu) + // HasMulDivAdd - Multiply add and sub (MADD, MADDu, MSUB, MSUBu) // instructions. bool HasMulDivAdd; @@ -93,14 +93,14 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. MipsSubtarget(const std::string &TT, const std::string &FS, bool little); - - /// ParseSubtargetFeatures - Parses features string setting specified + + /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. std::string ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); bool isMips1() const { return MipsArchVersion == Mips1; } - bool isMips32() const { return MipsArchVersion >= Mips32; } + bool isMips32() const { return MipsArchVersion >= Mips32; } bool isMips32r2() const { return MipsArchVersion == Mips32r2; } bool isLittle() const { return IsLittle; } diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 7a2dd1f..53190b4 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -75,3 +75,9 @@ addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel) PM.add(createMipsDelaySlotFillerPass(*this)); return true; } + +bool MipsTargetMachine:: +addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { + PM.add(createMipsExpandPseudoPass(*this)); + return true; +} diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 43ab798..badb652 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -63,6 +63,7 @@ namespace llvm { CodeGenOpt::Level OptLevel); virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPostRegAlloc(PassManagerBase &, CodeGenOpt::Level); }; /// MipselTargetMachine - Mipsel target machine. diff --git a/lib/Target/Mips/MipsTargetObjectFile.h b/lib/Target/Mips/MipsTargetObjectFile.h index 237b160..c394a9d 100644 --- a/lib/Target/Mips/MipsTargetObjectFile.h +++ b/lib/Target/Mips/MipsTargetObjectFile.h @@ -18,22 +18,22 @@ namespace llvm { const MCSection *SmallDataSection; const MCSection *SmallBSSSection; public: - + void Initialize(MCContext &Ctx, const TargetMachine &TM); - + /// IsGlobalInSmallSection - Return true if this global address should be /// placed into small data/bss section. 
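// A sketch of the usual policy behind this predicate: placement is purely
// size based, so that qualifying objects can be reached with a 16-bit
// offset from $gp. The 8-byte threshold here is an assumption mirroring
// the conventional -G default, not a value taken from this file.
#include <cstddef>
#include <cstdio>

static const std::size_t SmallDataThreshold = 8; // assumed -G default

static const char *selectSection(std::size_t size, bool isZeroInit) {
  if (size > 0 && size <= SmallDataThreshold)
    return isZeroInit ? ".sbss" : ".sdata"; // gp-relative small sections
  return isZeroInit ? ".bss" : ".data";
}

int main() {
  std::printf("%s\n", selectSection(sizeof(int), false)); // .sdata
  std::printf("%s\n", selectSection(64, true));           // .bss
  return 0;
}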
bool IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, SectionKind Kind)const; bool IsGlobalInSmallSection(const GlobalValue *GV, - const TargetMachine &TM) const; - + const TargetMachine &TM) const; + const MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler *Mang, const TargetMachine &TM) const; - + // TODO: Classify globals as mips wishes. }; } // end namespace llvm diff --git a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp index cc3d61e..a8d6fe9 100644 --- a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp +++ b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp @@ -14,7 +14,7 @@ using namespace llvm; Target llvm::TheMipsTarget, llvm::TheMipselTarget; -extern "C" void LLVMInitializeMipsTargetInfo() { +extern "C" void LLVMInitializeMipsTargetInfo() { RegisterTarget<Triple::mips> X(TheMipsTarget, "mips", "Mips"); RegisterTarget<Triple::mipsel> Y(TheMipselTarget, "mipsel", "Mipsel"); diff --git a/lib/Target/PTX/PTX.h b/lib/Target/PTX/PTX.h index 19385ba..ec2be92 100644 --- a/lib/Target/PTX/PTX.h +++ b/lib/Target/PTX/PTX.h @@ -29,6 +29,11 @@ namespace llvm { PARAMETER = 3, SHARED = 4 }; + + enum Predicate { + PRED_NORMAL = 0, + PRED_NEGATE = 1 + }; } // namespace PTX FunctionPass *createPTXISelDag(PTXTargetMachine &TM, @@ -37,7 +42,8 @@ namespace llvm { FunctionPass *createPTXMFInfoExtract(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel); - extern Target ThePTXTarget; + extern Target ThePTX32Target; + extern Target ThePTX64Target; } // namespace llvm; // Defines symbolic names for PTX registers. diff --git a/lib/Target/PTX/PTX.td b/lib/Target/PTX/PTX.td index 8b1a1b1..ae8326e 100644 --- a/lib/Target/PTX/PTX.td +++ b/lib/Target/PTX/PTX.td @@ -19,8 +19,35 @@ include "llvm/Target/Target.td" // Subtarget Features. //===----------------------------------------------------------------------===// -def FeatureSM20 : SubtargetFeature<"sm20", "is_sm20", "true", - "Enable sm_20 target architecture">; +//===- Architectural Features ---------------------------------------------===// + +def FeatureDouble : SubtargetFeature<"double", "SupportsDouble", "true", + "Do not demote .f64 to .f32">; + +//===- PTX Version --------------------------------------------------------===// + +def FeaturePTX20 : SubtargetFeature<"ptx20", "PTXVersion", "PTX_VERSION_2_0", + "Use PTX Language Version 2.0", + []>; + +def FeaturePTX21 : SubtargetFeature<"ptx21", "PTXVersion", "PTX_VERSION_2_1", + "Use PTX Language Version 2.1", + [FeaturePTX20]>; + +def FeaturePTX22 : SubtargetFeature<"ptx22", "PTXVersion", "PTX_VERSION_2_2", + "Use PTX Language Version 2.2", + [FeaturePTX21]>; + +//===- PTX Shader Model ---------------------------------------------------===// + +def FeatureSM10 : SubtargetFeature<"sm10", "PTXShaderModel", "PTX_SM_1_0", + "Enable Shader Model 1.0 compliance">; +def FeatureSM13 : SubtargetFeature<"sm13", "PTXShaderModel", "PTX_SM_1_3", + "Enable Shader Model 1.3 compliance", + [FeatureSM10, FeatureDouble]>; +def FeatureSM20 : SubtargetFeature<"sm20", "PTXShaderModel", "PTX_SM_2_0", + "Enable Shader Model 2.0 compliance", + [FeatureSM13]>; //===----------------------------------------------------------------------===// // PTX supported processors. 
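// The feature definitions above lean on SubtargetFeature's implied-feature
// lists: asking for sm20 transitively enables sm13, sm10 and double
// support, and ptx22 pulls in ptx21 and ptx20. A minimal model of that
// transitive closure (names invented; this is not the TableGen machinery):
#include <map>
#include <set>
#include <string>
#include <vector>

static const std::map<std::string, std::vector<std::string>> Implies = {
  {"ptx21", {"ptx20"}},
  {"ptx22", {"ptx21"}},
  {"sm13", {"sm10", "double"}},
  {"sm20", {"sm13"}},
};

static void enable(const std::string &f, std::set<std::string> &on) {
  if (!on.insert(f).second)
    return; // already enabled, avoid re-walking
  auto it = Implies.find(f);
  if (it != Implies.end())
    for (const std::string &dep : it->second)
      enable(dep, on); // recurse into implied features
}

int main() {
  std::set<std::string> on;
  enable("sm20", on);  // yields sm20, sm13, sm10, double
  enable("ptx22", on); // yields ptx22, ptx21, ptx20
  return on.count("double") == 1 ? 0 : 1;
}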
diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp index a605997..29c4781 100644 --- a/lib/Target/PTX/PTXAsmPrinter.cpp +++ b/lib/Target/PTX/PTXAsmPrinter.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Target/Mangler.h" @@ -37,13 +38,6 @@ using namespace llvm; -static cl::opt<std::string> -OptPTXVersion("ptx-version", cl::desc("Set PTX version"), cl::init("1.4")); - -static cl::opt<std::string> -OptPTXTarget("ptx-target", cl::desc("Set GPU target (comma-separated list)"), - cl::init("sm_10")); - namespace { class PTXAsmPrinter : public AsmPrinter { public: @@ -68,6 +62,7 @@ public: const char *Modifier = 0); void printParamOperand(const MachineInstr *MI, int opNum, raw_ostream &OS, const char *Modifier = 0); + void printPredicateOperand(const MachineInstr *MI, raw_ostream &O); // autogen'd. void printInstruction(const MachineInstr *MI, raw_ostream &OS); @@ -82,27 +77,20 @@ private: static const char PARAM_PREFIX[] = "__param_"; static const char *getRegisterTypeName(unsigned RegNo) { -#define TEST_REGCLS(cls, clsstr) \ +#define TEST_REGCLS(cls, clsstr) \ if (PTX::cls ## RegisterClass->contains(RegNo)) return # clsstr; - TEST_REGCLS(RRegs32, s32); TEST_REGCLS(Preds, pred); + TEST_REGCLS(RRegu16, u16); + TEST_REGCLS(RRegu32, u32); + TEST_REGCLS(RRegu64, u64); + TEST_REGCLS(RRegf32, f32); + TEST_REGCLS(RRegf64, f64); #undef TEST_REGCLS llvm_unreachable("Not in any register class!"); return NULL; } -static const char *getInstructionTypeName(const MachineInstr *MI) { - for (int i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (MO.getType() == MachineOperand::MO_Register) - return getRegisterTypeName(MO.getReg()); - } - - llvm_unreachable("No reg operand found in instruction!"); - return NULL; -} - static const char *getStateSpaceName(unsigned addressSpace) { switch (addressSpace) { default: llvm_unreachable("Unknown state space"); @@ -115,6 +103,28 @@ static const char *getStateSpaceName(unsigned addressSpace) { return NULL; } +static const char *getTypeName(const Type* type) { + while (true) { + switch (type->getTypeID()) { + default: llvm_unreachable("Unknown type"); + case Type::FloatTyID: return ".f32"; + case Type::DoubleTyID: return ".f64"; + case Type::IntegerTyID: + switch (type->getPrimitiveSizeInBits()) { + default: llvm_unreachable("Unknown integer bit-width"); + case 16: return ".u16"; + case 32: return ".u32"; + case 64: return ".u64"; + } + case Type::ArrayTyID: + case Type::PointerTyID: + type = dyn_cast<const SequentialType>(type)->getElementType(); + break; + } + } + return NULL; +} + bool PTXAsmPrinter::doFinalization(Module &M) { // XXX Temproarily remove global variables so that doFinalization() will not // emit them again (global variables are emitted at beginning). @@ -146,8 +156,12 @@ bool PTXAsmPrinter::doFinalization(Module &M) { void PTXAsmPrinter::EmitStartOfAsmFile(Module &M) { - OutStreamer.EmitRawText(Twine("\t.version " + OptPTXVersion)); - OutStreamer.EmitRawText(Twine("\t.target " + OptPTXTarget)); + const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>(); + + OutStreamer.EmitRawText(Twine("\t.version " + ST.getPTXVersionString())); + OutStreamer.EmitRawText(Twine("\t.target " + ST.getTargetString() + + (ST.supportsDouble() ? 
"" + : ", map_f64_to_f32"))); OutStreamer.AddBlankLine(); // declare global variables @@ -186,17 +200,16 @@ void PTXAsmPrinter::EmitInstruction(const MachineInstr *MI) { std::string str; str.reserve(64); - // Write instruction to str raw_string_ostream OS(str); + + // Emit predicate + printPredicateOperand(MI, OS); + + // Write instruction to str printInstruction(MI, OS); OS << ';'; OS.flush(); - // Replace "%type" if found - size_t pos; - if ((pos = str.find("%type")) != std::string::npos) - str.replace(pos, /*strlen("%type")==*/5, getInstructionTypeName(MI)); - StringRef strref = StringRef(str); OutStreamer.EmitRawText(strref); } @@ -213,11 +226,36 @@ void PTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, OS << *Mang->getSymbol(MO.getGlobal()); break; case MachineOperand::MO_Immediate: - OS << (int) MO.getImm(); + OS << (long) MO.getImm(); + break; + case MachineOperand::MO_MachineBasicBlock: + OS << *MO.getMBB()->getSymbol(); break; case MachineOperand::MO_Register: OS << getRegisterName(MO.getReg()); break; + case MachineOperand::MO_FPImmediate: + APInt constFP = MO.getFPImm()->getValueAPF().bitcastToAPInt(); + bool isFloat = MO.getFPImm()->getType()->getTypeID() == Type::FloatTyID; + // Emit 0F for 32-bit floats and 0D for 64-bit doubles. + if (isFloat) { + OS << "0F"; + } + else { + OS << "0D"; + } + // Emit the encoded floating-point value. + if (constFP.getZExtValue() > 0) { + OS << constFP.toString(16, false); + } + else { + OS << "00000000"; + // If We have a double-precision zero, pad to 8-bytes. + if (!isFloat) { + OS << "00000000"; + } + } + break; } } @@ -265,13 +303,77 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) { decl += " "; } - // TODO: add types - decl += ".s32 "; - decl += gvsym->getName(); + if (PointerType::classof(gv->getType())) { + const PointerType* pointerTy = dyn_cast<const PointerType>(gv->getType()); + const Type* elementTy = pointerTy->getElementType(); + + decl += ".b8 "; + decl += gvsym->getName(); + decl += "["; + + if (elementTy->isArrayTy()) + { + assert(elementTy->isArrayTy() && "Only pointers to arrays are supported"); - if (ArrayType::classof(gv->getType()) || PointerType::classof(gv->getType())) - decl += "[]"; + const ArrayType* arrayTy = dyn_cast<const ArrayType>(elementTy); + elementTy = arrayTy->getElementType(); + + unsigned numElements = arrayTy->getNumElements(); + + while (elementTy->isArrayTy()) { + + arrayTy = dyn_cast<const ArrayType>(elementTy); + elementTy = arrayTy->getElementType(); + + numElements *= arrayTy->getNumElements(); + } + + // FIXME: isPrimitiveType() == false for i16? + assert(elementTy->isSingleValueType() && + "Non-primitive types are not handled"); + + // Compute the size of the array, in bytes. + uint64_t arraySize = (elementTy->getPrimitiveSizeInBits() >> 3) + * numElements; + + decl += utostr(arraySize); + } + + decl += "]"; + + // handle string constants (assume ConstantArray means string) + + if (gv->hasInitializer()) + { + Constant *C = gv->getInitializer(); + if (const ConstantArray *CA = dyn_cast<ConstantArray>(C)) + { + decl += " = {"; + + for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) + { + if (i > 0) decl += ","; + + decl += "0x" + utohexstr(cast<ConstantInt>(CA->getOperand(i))->getZExtValue()); + } + + decl += "}"; + } + } + } + else { + // Note: this is currently the fall-through case and most likely generates + // incorrect code. 
+ decl += getTypeName(gv->getType()); + decl += " "; + + decl += gvsym->getName(); + + if (ArrayType::classof(gv->getType()) || + PointerType::classof(gv->getType())) + decl += "[]"; + } decl += ";"; @@ -313,16 +415,24 @@ void PTXAsmPrinter::EmitFunctionDeclaration() { if (!MFI->argRegEmpty()) { decl += " ("; if (isKernel) { - for (int i = 0, e = MFI->getNumArg(); i != e; ++i) { - if (i != 0) + unsigned cnt = 0; + for(PTXMachineFunctionInfo::reg_iterator + i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; + i != e; ++i) { + reg = *i; + assert(reg != PTX::NoRegister && "Not a valid register!"); + if (i != b) decl += ", "; - decl += ".param .s32 "; // TODO: add types + decl += ".param ."; + decl += getRegisterTypeName(reg); + decl += " "; decl += PARAM_PREFIX; - decl += utostr(i + 1); + decl += utostr(++cnt); } } else { for (PTXMachineFunctionInfo::reg_iterator - i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; i != e; ++i) { + i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; + i != e; ++i) { reg = *i; assert(reg != PTX::NoRegister && "Not a valid register!"); if (i != b) @@ -339,9 +449,29 @@ void PTXAsmPrinter::EmitFunctionDeclaration() { OutStreamer.EmitRawText(Twine(decl)); } +void PTXAsmPrinter:: +printPredicateOperand(const MachineInstr *MI, raw_ostream &O) { + int i = MI->findFirstPredOperandIdx(); + if (i == -1) + llvm_unreachable("missing predicate operand"); + + unsigned reg = MI->getOperand(i).getReg(); + int predOp = MI->getOperand(i+1).getImm(); + + DEBUG(dbgs() << "predicate: (" << reg << ", " << predOp << ")\n"); + + if (reg != PTX::NoRegister) { + O << '@'; + if (predOp == PTX::PRED_NEGATE) + O << '!'; + O << getRegisterName(reg); + } +} + #include "PTXGenAsmWriter.inc" // Force static initialization. extern "C" void LLVMInitializePTXAsmPrinter() { - RegisterAsmPrinter<PTXAsmPrinter> X(ThePTXTarget); + RegisterAsmPrinter<PTXAsmPrinter> X(ThePTX32Target); + RegisterAsmPrinter<PTXAsmPrinter> Y(ThePTX64Target); } diff --git a/lib/Target/PTX/PTXFrameLowering.h b/lib/Target/PTX/PTXFrameLowering.h index 574ae7a..9320676 100644 --- a/lib/Target/PTX/PTXFrameLowering.h +++ b/lib/Target/PTX/PTXFrameLowering.h @@ -27,7 +27,8 @@ protected: public: explicit PTXFrameLowering(const PTXSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2), STI(sti) { + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2), + STI(sti) { } /// emitProlog/emitEpilog - These methods insert prolog and epilog code into diff --git a/lib/Target/PTX/PTXISelDAGToDAG.cpp b/lib/Target/PTX/PTXISelDAGToDAG.cpp index efb0e8b..b3c85da 100644 --- a/lib/Target/PTX/PTXISelDAGToDAG.cpp +++ b/lib/Target/PTX/PTXISelDAGToDAG.cpp @@ -15,6 +15,7 @@ #include "PTXTargetMachine.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/DerivedTypes.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -42,8 +43,14 @@ class PTXDAGToDAGISel : public SelectionDAGISel { private: SDNode *SelectREAD_PARAM(SDNode *Node); + // We need this only because we can't match intruction BRAdp + // pattern (PTXbrcond bb:$d, ...) 
in PTXInstrInfo.td + SDNode *SelectBRCOND(SDNode *Node); + bool isImm(const SDValue &operand); bool SelectImm(const SDValue &operand, SDValue &imm); + + const PTXSubtarget& getSubtarget() const; }; // class PTXDAGToDAGISel } // namespace @@ -59,21 +66,62 @@ PTXDAGToDAGISel::PTXDAGToDAGISel(PTXTargetMachine &TM, : SelectionDAGISel(TM, OptLevel) {} SDNode *PTXDAGToDAGISel::Select(SDNode *Node) { - if (Node->getOpcode() == PTXISD::READ_PARAM) - return SelectREAD_PARAM(Node); - else - return SelectCode(Node); + switch (Node->getOpcode()) { + case PTXISD::READ_PARAM: + return SelectREAD_PARAM(Node); + case ISD::BRCOND: + return SelectBRCOND(Node); + default: + return SelectCode(Node); + } } SDNode *PTXDAGToDAGISel::SelectREAD_PARAM(SDNode *Node) { - SDValue index = Node->getOperand(1); - DebugLoc dl = Node->getDebugLoc(); + SDValue index = Node->getOperand(1); + DebugLoc dl = Node->getDebugLoc(); + unsigned opcode; if (index.getOpcode() != ISD::TargetConstant) llvm_unreachable("READ_PARAM: index is not ISD::TargetConstant"); + if (Node->getValueType(0) == MVT::i16) { + opcode = PTX::LDpiU16; + } + else if (Node->getValueType(0) == MVT::i32) { + opcode = PTX::LDpiU32; + } + else if (Node->getValueType(0) == MVT::i64) { + opcode = PTX::LDpiU64; + } + else if (Node->getValueType(0) == MVT::f32) { + opcode = PTX::LDpiF32; + } + else if (Node->getValueType(0) == MVT::f64) { + opcode = PTX::LDpiF64; + } + else { + llvm_unreachable("Unknown parameter type for ld.param"); + } + return PTXInstrInfo:: - GetPTXMachineNode(CurDAG, PTX::LDpi, dl, MVT::i32, index); + GetPTXMachineNode(CurDAG, opcode, dl, Node->getValueType(0), index); +} + +SDNode *PTXDAGToDAGISel::SelectBRCOND(SDNode *Node) { + assert(Node->getNumOperands() >= 3); + + SDValue Chain = Node->getOperand(0); + SDValue Pred = Node->getOperand(1); + SDValue Target = Node->getOperand(2); // branch target + SDValue PredOp = CurDAG->getTargetConstant(PTX::PRED_NORMAL, MVT::i32); + DebugLoc dl = Node->getDebugLoc(); + + assert(Target.getOpcode() == ISD::BasicBlock); + assert(Pred.getValueType() == MVT::i1); + + // Emit BRAdp + SDValue Ops[] = { Target, Pred, PredOp, Chain }; + return CurDAG->getMachineNode(PTX::BRAdp, dl, MVT::Other, Ops, 4); } // Match memory operand of the form [reg+reg] @@ -82,8 +130,11 @@ bool PTXDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2) { isImm(Addr.getOperand(0)) || isImm(Addr.getOperand(1))) return false; + assert(Addr.getValueType().isSimple() && "Type must be simple"); + R1 = Addr; - R2 = CurDAG->getTargetConstant(0, MVT::i32); + R2 = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); + return true; } @@ -95,8 +146,12 @@ bool PTXDAGToDAGISel::SelectADDRri(SDValue &Addr, SDValue &Base, if (isImm(Addr)) return false; // it is [reg] + + assert(Addr.getValueType().isSimple() && "Type must be simple"); + Base = Addr; - Offset = CurDAG->getTargetConstant(0, MVT::i32); + Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); + return true; } @@ -129,7 +184,10 @@ bool PTXDAGToDAGISel::SelectADDRii(SDValue &Addr, SDValue &Base, // is [imm]? 
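// Taken together, the SelectADDR* hooks in this file classify an address
// into PTX's three operand forms and pad the missing half with a zero
// constant of the address's own value type (the hard-coded MVT::i32 was
// the bug being fixed). A toy classifier with invented types, not the
// SelectionDAG interface:
#include <cstdint>
#include <cstdio>

struct Addr {
  bool baseIsImm;   // base is a constant rather than a register
  int64_t base;     // register number or immediate value
  int64_t offset;
  bool hasOffset;
};

static const char *classify(const Addr &a) {
  if (a.baseIsImm) return "[imm]";     // SelectADDRii
  if (a.hasOffset) return "[reg+imm]"; // SelectADDRri
  return "[reg+reg]";                  // SelectADDRrr, offset defaults to 0
}

int main() {
  std::printf("%s\n", classify({false, /*vreg*/ 5, 16, true})); // [reg+imm]
  std::printf("%s\n", classify({true, 4096, 0, false}));        // [imm]
  return 0;
}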
if (SelectImm(Addr, Base)) { - Offset = CurDAG->getTargetConstant(0, MVT::i32); + assert(Addr.getValueType().isSimple() && "Type must be simple"); + + Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); + return true; } @@ -146,6 +204,13 @@ bool PTXDAGToDAGISel::SelectImm(const SDValue &operand, SDValue &imm) { return false; ConstantSDNode *CN = cast<ConstantSDNode>(node); - imm = CurDAG->getTargetConstant(*CN->getConstantIntValue(), MVT::i32); + imm = CurDAG->getTargetConstant(*CN->getConstantIntValue(), + operand.getValueType()); return true; } + +const PTXSubtarget& PTXDAGToDAGISel::getSubtarget() const +{ + return TM.getSubtarget<PTXSubtarget>(); +} + diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp index e6d4490..23b93da 100644 --- a/lib/Target/PTX/PTXISelLowering.cpp +++ b/lib/Target/PTX/PTXISelLowering.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -27,21 +28,60 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()) { // Set up the register classes. addRegisterClass(MVT::i1, PTX::PredsRegisterClass); - addRegisterClass(MVT::i32, PTX::RRegs32RegisterClass); - + addRegisterClass(MVT::i16, PTX::RRegu16RegisterClass); + addRegisterClass(MVT::i32, PTX::RRegu32RegisterClass); + addRegisterClass(MVT::i64, PTX::RRegu64RegisterClass); + addRegisterClass(MVT::f32, PTX::RRegf32RegisterClass); + addRegisterClass(MVT::f64, PTX::RRegf64RegisterClass); + + setBooleanContents(ZeroOrOneBooleanContent); + setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + // Turn i16 (z)extload into load + (z)extend + setLoadExtAction(ISD::EXTLOAD, MVT::i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand); + + // Turn f32 extload into load + fextend + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + + // Turn f64 truncstore into trunc + store. 
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
   // Customize translation of memory addresses
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
-
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
+  // Expand BR_CC into BRCOND
+  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+
+  // Expand SELECT_CC into SETCC
+  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+
+  // need to lower SETCC of Preds into bitwise logic
+  setOperationAction(ISD::SETCC, MVT::i1, Custom);
+
   // Compute derived properties from the register classes
   computeRegisterProperties();
 }
 
+MVT::SimpleValueType PTXTargetLowering::getSetCCResultType(EVT VT) const {
+  return MVT::i1;
+}
+
 SDValue PTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
-    default:                 llvm_unreachable("Unimplemented operand");
-    case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+    default:
+      llvm_unreachable("Unimplemented operand");
+    case ISD::SETCC:
+      return LowerSETCC(Op, DAG);
+    case ISD::GlobalAddress:
+      return LowerGlobalAddress(Op, DAG);
   }
 }
 
@@ -49,6 +89,8 @@ const char *PTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
     default:
      llvm_unreachable("Unknown opcode");
+    case PTXISD::COPY_ADDRESS:
+      return "PTXISD::COPY_ADDRESS";
     case PTXISD::READ_PARAM:
       return "PTXISD::READ_PARAM";
     case PTXISD::EXIT:
@@ -62,12 +104,43 @@ const char *PTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
 // Custom Lower Operation
 //===----------------------------------------------------------------------===//
 
+SDValue PTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::i1 && "SetCC type must be 1-bit integer");
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op2 = Op.getOperand(2);
+  DebugLoc dl = Op.getDebugLoc();
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  // Look for X == 0, X == 1, X != 0, or X != 1
+  // We can simplify these to bitwise logic
+
+  if (Op1.getOpcode() == ISD::Constant &&
+      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
+       cast<ConstantSDNode>(Op1)->isNullValue()) &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+    return DAG.getNode(ISD::AND, dl, MVT::i1, Op0, Op1);
+  }
+
+  return DAG.getNode(ISD::SETCC, dl, MVT::i1, Op0, Op1, Op2);
+}
+
 SDValue PTXTargetLowering::
 LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
   EVT PtrVT = getPointerTy();
   DebugLoc dl = Op.getDebugLoc();
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
-  return DAG.getTargetGlobalAddress(GV, dl, PtrVT);
+
+  assert(PtrVT.isSimple() && "Pointer must be to primitive type.");
+
+  SDValue targetGlobal = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
+  SDValue movInstr = DAG.getNode(PTXISD::COPY_ADDRESS,
+                                 dl,
+                                 PtrVT.getSimpleVT(),
+                                 targetGlobal);
+
+  return movInstr;
 }
 
 //===----------------------------------------------------------------------===//
@@ -87,9 +160,13 @@ struct argmap_entry {
   bool operator==(MVT::SimpleValueType _VT) const { return VT == _VT; }
 } argmap[] = {
   argmap_entry(MVT::i1,  PTX::PredsRegisterClass),
-  argmap_entry(MVT::i32, PTX::RRegs32RegisterClass)
+  argmap_entry(MVT::i16, PTX::RRegu16RegisterClass),
+  argmap_entry(MVT::i32, PTX::RRegu32RegisterClass),
+  argmap_entry(MVT::i64, PTX::RRegu64RegisterClass),
+  argmap_entry(MVT::f32, PTX::RRegf32RegisterClass),
+  argmap_entry(MVT::f64, PTX::RRegf64RegisterClass)
 };
-} // end anonymous namespace
+} // end anonymous namespace
 
 SDValue PTXTargetLowering::
   LowerFormalArguments(SDValue Chain,
@@ -185,10 +262,25 @@ SDValue PTXTargetLowering::
   if (Outs.size() == 0)
     return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain);
 
-  assert(Outs[0].VT == MVT::i32 && "Can return only basic types");
-
   SDValue Flag;
-  unsigned reg = PTX::R0;
+  unsigned reg;
+
+  if (Outs[0].VT == MVT::i16) {
+    reg = PTX::RH0;
+  }
+  else if (Outs[0].VT == MVT::i32) {
+    reg = PTX::R0;
+  }
+  else if (Outs[0].VT == MVT::i64) {
+    reg = PTX::RD0;
+  }
+  else if (Outs[0].VT == MVT::f32) {
+    reg = PTX::F0;
+  }
+  else {
+    assert(Outs[0].VT == MVT::f64 && "Can return only basic types");
+    reg = PTX::FD0;
+  }
 
   MachineFunction &MF = DAG.getMachineFunction();
   PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
diff --git a/lib/Target/PTX/PTXISelLowering.h b/lib/Target/PTX/PTXISelLowering.h
index b03a9f6..6a7e3e6 100644
--- a/lib/Target/PTX/PTXISelLowering.h
+++ b/lib/Target/PTX/PTXISelLowering.h
@@ -26,7 +26,8 @@ namespace PTXISD {
     FIRST_NUMBER = ISD::BUILTIN_OP_END,
     READ_PARAM,
     EXIT,
-    RET
+    RET,
+    COPY_ADDRESS
   };
 } // namespace PTXISD
 
@@ -41,6 +42,8 @@ class PTXTargetLowering : public TargetLowering {
     virtual SDValue
       LowerOperation(SDValue Op, SelectionDAG &DAG) const;
 
+    virtual SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv,
@@ -58,7 +61,9 @@ class PTXTargetLowering : public TargetLowering {
                 const SmallVectorImpl<SDValue> &OutVals,
                 DebugLoc dl,
                 SelectionDAG &DAG) const;
-
+
+    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+
   private:
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
 }; // class PTXTargetLowering
diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp
index 805759b..a12a6d0 100644
--- a/lib/Target/PTX/PTXInstrInfo.cpp
+++ b/lib/Target/PTX/PTXInstrInfo.cpp
@@ -11,9 +11,15 @@
 //
 //===----------------------------------------------------------------------===//
 
+#define DEBUG_TYPE "ptx-instrinfo"
+
 #include "PTX.h"
 #include "PTXInstrInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
@@ -27,20 +33,27 @@ static const struct map_entry {
   const TargetRegisterClass *cls;
   const int opcode;
 } map[] = {
-  { &PTX::RRegs32RegClass, PTX::MOVrr },
-  { &PTX::PredsRegClass,   PTX::MOVpp }
+  { &PTX::RRegu16RegClass, PTX::MOVU16rr },
+  { &PTX::RRegu32RegClass, PTX::MOVU32rr },
+  { &PTX::RRegu64RegClass, PTX::MOVU64rr },
+  { &PTX::RRegf32RegClass, PTX::MOVF32rr },
+  { &PTX::RRegf64RegClass, PTX::MOVF64rr },
+  { &PTX::PredsRegClass,   PTX::MOVPREDrr }
 };
 
 void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I, DebugLoc DL,
                                unsigned DstReg, unsigned SrcReg,
                                bool KillSrc) const {
-  for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i)
-    if (PTX::RRegs32RegClass.contains(DstReg, SrcReg)) {
-      BuildMI(MBB, I, DL,
-              get(PTX::MOVrr), DstReg).addReg(SrcReg, getKillRegState(KillSrc));
+  for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i) {
+    if (map[i].cls->contains(DstReg, SrcReg)) {
+      const TargetInstrDesc &TID = get(map[i].opcode);
+      MachineInstr *MI = BuildMI(MBB, I, DL, TID, DstReg).
+        addReg(SrcReg, getKillRegState(KillSrc));
+      AddDefaultPredicate(MI);
       return;
     }
+  }
 
   llvm_unreachable("Impossible reg-to-reg copy");
 }
@@ -56,12 +69,9 @@ bool PTXInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
   for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i)
     if (DstRC == map[i].cls) {
-      MachineInstr *MI = BuildMI(MBB, I, DL, get(map[i].opcode),
-                                 DstReg).addReg(SrcReg);
-      if (MI->findFirstPredOperandIdx() == -1) {
-        MI->addOperand(MachineOperand::CreateReg(0, false));
-        MI->addOperand(MachineOperand::CreateImm(/*IsInv=*/0));
-      }
+      const TargetInstrDesc &TID = get(map[i].opcode);
+      MachineInstr *MI = BuildMI(MBB, I, DL, TID, DstReg).addReg(SrcReg);
+      AddDefaultPredicate(MI);
       return true;
     }
@@ -74,8 +84,12 @@ bool PTXInstrInfo::isMoveInstr(const MachineInstr& MI,
   switch (MI.getOpcode()) {
     default:
       return false;
-    case PTX::MOVpp:
-    case PTX::MOVrr:
+    case PTX::MOVU16rr:
+    case PTX::MOVU32rr:
+    case PTX::MOVU64rr:
+    case PTX::MOVF32rr:
+    case PTX::MOVF64rr:
+    case PTX::MOVPREDrr:
       assert(MI.getNumOperands() >= 2 &&
              MI.getOperand(0).isReg() && MI.getOperand(1).isReg() &&
             "Invalid register-register move instruction");
@@ -85,3 +99,239 @@ bool PTXInstrInfo::isMoveInstr(const MachineInstr& MI,
       return true;
   }
 }
+
+// predicate support
+
+bool PTXInstrInfo::isPredicated(const MachineInstr *MI) const {
+  int i = MI->findFirstPredOperandIdx();
+  return i != -1 && MI->getOperand(i).getReg() != PTX::NoRegister;
+}
+
+bool PTXInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  return !isPredicated(MI) && get(MI->getOpcode()).isTerminator();
+}
+
+bool PTXInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+                     const SmallVectorImpl<MachineOperand> &Pred) const {
+  if (Pred.size() < 2)
+    llvm_unreachable("fewer than 2 predicate operands provided");
+
+  int i = MI->findFirstPredOperandIdx();
+  if (i == -1)
+    llvm_unreachable("missing predicate operand");
+
+  MI->getOperand(i).setReg(Pred[0].getReg());
+  MI->getOperand(i+1).setImm(Pred[1].getImm());
+
+  return true;
+}
+
+bool PTXInstrInfo::
+SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                  const SmallVectorImpl<MachineOperand> &Pred2) const {
+  const MachineOperand &PredReg1 = Pred1[0];
+  const MachineOperand &PredReg2 = Pred2[0];
+  if (PredReg1.getReg() != PredReg2.getReg())
+    return false;
+
+  const MachineOperand &PredOp1 = Pred1[1];
+  const MachineOperand &PredOp2 = Pred2[1];
+  if (PredOp1.getImm() != PredOp2.getImm())
+    return false;
+
+  return true;
+}
+
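For context: these predication hooks exist to serve the generic if-conversion and branch-folding passes, which traffic in the same (predicate register, predicate-mode immediate) operand pair that AddDefaultPredicate appends. A minimal sketch of how a client pass might use them, assuming a live predicate register such as PTX::P0, a MachineInstr *MI, and a const TargetInstrInfo *TII; the surrounding pass code is illustrative only, not part of this patch:

    // Predicate MI on p0 using the two-operand convention above.
    SmallVector<MachineOperand, 2> Pred;
    Pred.push_back(MachineOperand::CreateReg(PTX::P0, /*isDef=*/false));
    Pred.push_back(MachineOperand::CreateImm(PTX::PRED_NORMAL));
    if (TII->isPredicable(MI) && !TII->isPredicated(MI))
      TII->PredicateInstruction(MI, Pred);  // rewrites the trailing pair in place

PredicateInstruction then only has to overwrite the placeholder operands that AddDefaultPredicate (or the instruction format itself) put in place.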
+bool PTXInstrInfo::
+DefinesPredicate(MachineInstr *MI,
+                 std::vector<MachineOperand> &Pred) const {
+  // If an instruction sets a predicate register, it defines a predicate.
+
+  // TODO: support the 5-operand format of the setp instruction
+
+  if (MI->getNumOperands() < 1)
+    return false;
+
+  const MachineOperand &MO = MI->getOperand(0);
+
+  if (!MO.isReg() || RI.getRegClass(MO.getReg()) != &PTX::PredsRegClass)
+    return false;
+
+  Pred.push_back(MO);
+  Pred.push_back(MachineOperand::CreateImm(PTX::PRED_NORMAL));
+  return true;
+}
+
+// branch support
+
+bool PTXInstrInfo::
+AnalyzeBranch(MachineBasicBlock &MBB,
+              MachineBasicBlock *&TBB,
+              MachineBasicBlock *&FBB,
+              SmallVectorImpl<MachineOperand> &Cond,
+              bool AllowModify) const {
+  // TODO: implement the cases where AllowModify is true
+
+  if (MBB.empty())
+    return true;
+
+  MachineBasicBlock::const_iterator iter = MBB.end();
+  const MachineInstr& instLast1 = *--iter;
+  const TargetInstrDesc &desc1 = instLast1.getDesc();
+  // handle the special case where MBB contains only one instruction
+  const bool IsSizeOne = MBB.size() == 1;
+  // when IsSizeOne is true, *--iter is invalid, so give instLast2 and desc2
+  // dummy values; they are still referenced below
+  const MachineInstr& instLast2 = IsSizeOne ? instLast1 : *--iter;
+  const TargetInstrDesc &desc2 = IsSizeOne ? desc1 : instLast2.getDesc();
+
+  DEBUG(dbgs() << "\n");
+  DEBUG(dbgs() << "AnalyzeBranch: opcode: " << instLast1.getOpcode() << "\n");
+  DEBUG(dbgs() << "AnalyzeBranch: MBB: " << MBB.getName().str() << "\n");
+  DEBUG(dbgs() << "AnalyzeBranch: TBB: " << TBB << "\n");
+  DEBUG(dbgs() << "AnalyzeBranch: FBB: " << FBB << "\n");
+
+  // this block ends with no branches
+  if (!IsAnyKindOfBranch(instLast1)) {
+    DEBUG(dbgs() << "AnalyzeBranch: ends with no branch\n");
+    return false;
+  }
+
+  // this block ends with only an unconditional branch
+  if (desc1.isUnconditionalBranch() &&
+      // when IsSizeOne is true, it "absorbs" the evaluation of instLast2
+      (IsSizeOne || !IsAnyKindOfBranch(instLast2))) {
+    DEBUG(dbgs() << "AnalyzeBranch: ends with only uncond branch\n");
+    TBB = GetBranchTarget(instLast1);
+    return false;
+  }
+
+  // this block ends with a conditional branch and
+  // it falls through to a successor block
+  if (desc1.isConditionalBranch() &&
+      IsAnySuccessorAlsoLayoutSuccessor(MBB)) {
+    DEBUG(dbgs() << "AnalyzeBranch: ends with cond branch and fall through\n");
+    TBB = GetBranchTarget(instLast1);
+    int i = instLast1.findFirstPredOperandIdx();
+    Cond.push_back(instLast1.getOperand(i));
+    Cond.push_back(instLast1.getOperand(i+1));
+    return false;
+  }
+
+  // when IsSizeOne is true, we are done
+  if (IsSizeOne)
+    return true;
+
+  // this block ends with a conditional branch
+  // followed by an unconditional branch
+  if (desc2.isConditionalBranch() &&
+      desc1.isUnconditionalBranch()) {
+    DEBUG(dbgs() << "AnalyzeBranch: ends with cond and uncond branch\n");
+    TBB = GetBranchTarget(instLast2);
+    FBB = GetBranchTarget(instLast1);
+    int i = instLast2.findFirstPredOperandIdx();
+    Cond.push_back(instLast2.getOperand(i));
+    Cond.push_back(instLast2.getOperand(i+1));
+    return false;
+  }
+
+  // the branch sequence cannot be analyzed
+  DEBUG(dbgs() << "AnalyzeBranch: cannot be understood\n");
+  return true;
+}
+
+unsigned PTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  unsigned count = 0;
+  while (!MBB.empty())
+    if (IsAnyKindOfBranch(MBB.back())) {
+      MBB.pop_back();
+      ++count;
+    } else
+      break;
+  DEBUG(dbgs() << "RemoveBranch: MBB: " << MBB.getName().str() << "\n");
+  DEBUG(dbgs() << "RemoveBranch: removed " << count << " branch inst\n");
+  return count;
+}
+
+unsigned PTXInstrInfo::
+InsertBranch(MachineBasicBlock &MBB,
+             MachineBasicBlock *TBB,
+             MachineBasicBlock *FBB,
+             const
SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { + DEBUG(dbgs() << "InsertBranch: MBB: " << MBB.getName().str() << "\n"); + DEBUG(if (TBB) dbgs() << "InsertBranch: TBB: " << TBB->getName().str() + << "\n"; + else dbgs() << "InsertBranch: TBB: (NULL)\n"); + DEBUG(if (FBB) dbgs() << "InsertBranch: FBB: " << FBB->getName().str() + << "\n"; + else dbgs() << "InsertBranch: FBB: (NULL)\n"); + DEBUG(dbgs() << "InsertBranch: Cond size: " << Cond.size() << "\n"); + + assert(TBB && "TBB is NULL"); + + if (FBB) { + BuildMI(&MBB, DL, get(PTX::BRAdp)) + .addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm()); + BuildMI(&MBB, DL, get(PTX::BRAd)) + .addMBB(FBB).addReg(PTX::NoRegister).addImm(PTX::PRED_NORMAL); + return 2; + } else if (Cond.size()) { + BuildMI(&MBB, DL, get(PTX::BRAdp)) + .addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm()); + return 1; + } else { + BuildMI(&MBB, DL, get(PTX::BRAd)) + .addMBB(TBB).addReg(PTX::NoRegister).addImm(PTX::PRED_NORMAL); + return 1; + } +} + +// static helper routines + +MachineSDNode *PTXInstrInfo:: +GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, + DebugLoc dl, EVT VT, SDValue Op1) { + SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1); + SDValue predOp = DAG->getTargetConstant(PTX::PRED_NORMAL, MVT::i32); + SDValue ops[] = { Op1, predReg, predOp }; + return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops)); +} + +MachineSDNode *PTXInstrInfo:: +GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, + DebugLoc dl, EVT VT, SDValue Op1, SDValue Op2) { + SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1); + SDValue predOp = DAG->getTargetConstant(PTX::PRED_NORMAL, MVT::i32); + SDValue ops[] = { Op1, Op2, predReg, predOp }; + return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops)); +} + +void PTXInstrInfo::AddDefaultPredicate(MachineInstr *MI) { + if (MI->findFirstPredOperandIdx() == -1) { + MI->addOperand(MachineOperand::CreateReg(PTX::NoRegister, /*IsDef=*/false)); + MI->addOperand(MachineOperand::CreateImm(PTX::PRED_NORMAL)); + } +} + +bool PTXInstrInfo::IsAnyKindOfBranch(const MachineInstr& inst) { + const TargetInstrDesc &desc = inst.getDesc(); + return desc.isTerminator() || desc.isBranch() || desc.isIndirectBranch(); +} + +bool PTXInstrInfo:: +IsAnySuccessorAlsoLayoutSuccessor(const MachineBasicBlock& MBB) { + for (MachineBasicBlock::const_succ_iterator + i = MBB.succ_begin(), e = MBB.succ_end(); i != e; ++i) + if (MBB.isLayoutSuccessor((const MachineBasicBlock*) &*i)) + return true; + return false; +} + +MachineBasicBlock *PTXInstrInfo::GetBranchTarget(const MachineInstr& inst) { + // FIXME So far all branch instructions put destination in 1st operand + const MachineOperand& target = inst.getOperand(0); + assert(target.isMBB() && "FIXME: detect branch target operand"); + return target.getMBB(); +} diff --git a/lib/Target/PTX/PTXInstrInfo.h b/lib/Target/PTX/PTXInstrInfo.h index e7f00f0..a04be77 100644 --- a/lib/Target/PTX/PTXInstrInfo.h +++ b/lib/Target/PTX/PTXInstrInfo.h @@ -15,61 +15,93 @@ #define PTX_INSTR_INFO_H #include "PTXRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/Target/TargetInstrInfo.h" namespace llvm { class PTXTargetMachine; +class MachineSDNode; +class SDValue; +class SelectionDAG; + class PTXInstrInfo : public TargetInstrInfoImpl { - private: - const PTXRegisterInfo RI; - PTXTargetMachine &TM; - - public: - explicit PTXInstrInfo(PTXTargetMachine &_TM); - - virtual const PTXRegisterInfo 
&getRegisterInfo() const { return RI; } - - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, unsigned SrcReg, - bool KillSrc) const; - - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg, - const TargetRegisterClass *DstRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; - - virtual bool isMoveInstr(const MachineInstr& MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - - // static helper routines - - static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, - DebugLoc dl, EVT VT, - SDValue Op1) { - SDValue pred_reg = DAG->getRegister(0, MVT::i1); - SDValue pred_imm = DAG->getTargetConstant(0, MVT::i32); - SDValue ops[] = { Op1, pred_reg, pred_imm }; - return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops)); - } - - static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, - DebugLoc dl, EVT VT, - SDValue Op1, - SDValue Op2) { - SDValue pred_reg = DAG->getRegister(0, MVT::i1); - SDValue pred_imm = DAG->getTargetConstant(0, MVT::i32); - SDValue ops[] = { Op1, Op2, pred_reg, pred_imm }; - return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops)); - } - - }; // class PTXInstrInfo +private: + const PTXRegisterInfo RI; + PTXTargetMachine &TM; + +public: + explicit PTXInstrInfo(PTXTargetMachine &_TM); + + virtual const PTXRegisterInfo &getRegisterInfo() const { return RI; } + + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DstReg, unsigned SrcReg, + bool KillSrc) const; + + virtual bool copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg, + const TargetRegisterClass *DstRC, + const TargetRegisterClass *SrcRC, + DebugLoc DL) const; + + virtual bool isMoveInstr(const MachineInstr& MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SrcSubIdx, unsigned &DstSubIdx) const; + + // predicate support + + virtual bool isPredicated(const MachineInstr *MI) const; + + virtual bool isUnpredicatedTerminator(const MachineInstr *MI) const; + + virtual + bool PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const; + + virtual + bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const; + + virtual bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const; + + // PTX is fully-predicable + virtual bool isPredicable(MachineInstr *MI) const { return true; } + + // branch support + + virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify = false) const; + + virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + + // static helper routines + + static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, + DebugLoc dl, EVT VT, + SDValue Op1); + + static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, + DebugLoc dl, EVT VT, + SDValue Op1, SDValue Op2); + + static void AddDefaultPredicate(MachineInstr *MI); + + static bool IsAnyKindOfBranch(const MachineInstr& inst); + + static bool 
IsAnySuccessorAlsoLayoutSuccessor(const MachineBasicBlock& MBB); + + static MachineBasicBlock *GetBranchTarget(const MachineInstr& inst); +}; // class PTXInstrInfo } // namespace llvm #endif // PTX_INSTR_INFO_H diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td index 9a74778..1ac9d3f 100644 --- a/lib/Target/PTX/PTXInstrInfo.td +++ b/lib/Target/PTX/PTXInstrInfo.td @@ -18,6 +18,26 @@ include "PTXInstrFormats.td" //===----------------------------------------------------------------------===// +// Code Generation Predicates +//===----------------------------------------------------------------------===// + +// Addressing +def Use32BitAddresses : Predicate<"!getSubtarget().is64Bit()">; +def Use64BitAddresses : Predicate<"getSubtarget().is64Bit()">; + +// Shader Model Support +def SupportsSM13 : Predicate<"getSubtarget().supportsSM13()">; +def DoesNotSupportSM13 : Predicate<"!getSubtarget().supportsSM13()">; +def SupportsSM20 : Predicate<"getSubtarget().supportsSM20()">; +def DoesNotSupportSM20 : Predicate<"!getSubtarget().supportsSM20()">; + +// PTX Version Support +def SupportsPTX21 : Predicate<"getSubtarget().supportsPTX21()">; +def DoesNotSupportPTX21 : Predicate<"!getSubtarget().supportsPTX21()">; +def SupportsPTX22 : Predicate<"getSubtarget().supportsPTX22()">; +def DoesNotSupportPTX22 : Predicate<"!getSubtarget().supportsPTX22()">; + +//===----------------------------------------------------------------------===// // Instruction Pattern Stuff //===----------------------------------------------------------------------===// @@ -107,24 +127,41 @@ def store_shared }]>; // Addressing modes. -def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>; -def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [], []>; -def ADDRii : ComplexPattern<i32, 2, "SelectADDRii", [], []>; +def ADDRrr32 : ComplexPattern<i32, 2, "SelectADDRrr", [], []>; +def ADDRrr64 : ComplexPattern<i64, 2, "SelectADDRrr", [], []>; +def ADDRri32 : ComplexPattern<i32, 2, "SelectADDRri", [], []>; +def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri", [], []>; +def ADDRii32 : ComplexPattern<i32, 2, "SelectADDRii", [], []>; +def ADDRii64 : ComplexPattern<i64, 2, "SelectADDRii", [], []>; // Address operands -def MEMri : Operand<i32> { +def MEMri32 : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops RRegu32, i32imm); +} +def MEMri64 : Operand<i64> { let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops RRegs32, i32imm); + let MIOperandInfo = (ops RRegu64, i64imm); } -def MEMii : Operand<i32> { +def MEMii32 : Operand<i32> { let PrintMethod = "printMemOperand"; let MIOperandInfo = (ops i32imm, i32imm); } +def MEMii64 : Operand<i64> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops i64imm, i64imm); +} +// The operand here does not correspond to an actual address, so we +// can use i32 in 64-bit address modes. def MEMpi : Operand<i32> { let PrintMethod = "printParamOperand"; let MIOperandInfo = (ops i32imm); } +// Branch & call targets have OtherVT type. 
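The predicate strings in the Code Generation Predicates block above are spliced verbatim into the generated instruction selector, so they resolve against the getSubtarget() helper added to PTXDAGToDAGISel earlier in this patch. The PTXSubtarget accessors themselves are outside this diff; a plausible sketch, assuming the subtarget records shader-model and PTX-version enums (all names below are assumptions, not code from the patch):

    // Possible shape of the accessors the Requires<[...]> clauses call.
    bool PTXSubtarget::supportsSM13() const {
      return PTXShaderModel >= PTX_SM_1_3;    // assumed field and enumerator
    }
    bool PTXSubtarget::supportsPTX21() const {
      return PTXVersion >= PTX_VERSION_2_1;   // assumed field and enumerator
    }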
+def brtarget : Operand<OtherVT>; +def calltarget : Operand<i32>; + //===----------------------------------------------------------------------===// // PTX Specific Node Definitions //===----------------------------------------------------------------------===// @@ -138,66 +175,389 @@ def PTXexit : SDNode<"PTXISD::EXIT", SDTNone, [SDNPHasChain]>; def PTXret : SDNode<"PTXISD::RET", SDTNone, [SDNPHasChain]>; +def PTXcopyaddress + : SDNode<"PTXISD::COPY_ADDRESS", SDTypeProfile<1, 1, []>, []>; //===----------------------------------------------------------------------===// // Instruction Class Templates //===----------------------------------------------------------------------===// +//===- Floating-Point Instructions - 2 Operand Form -----------------------===// +multiclass PTX_FLOAT_2OP<string opcstr, SDNode opnode> { + def rr32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a), + !strconcat(opcstr, ".f32\t$d, $a"), + [(set RRegf32:$d, (opnode RRegf32:$a))]>; + def ri32 : InstPTX<(outs RRegf32:$d), + (ins f32imm:$a), + !strconcat(opcstr, ".f32\t$d, $a"), + [(set RRegf32:$d, (opnode fpimm:$a))]>; + def rr64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a), + !strconcat(opcstr, ".f64\t$d, $a"), + [(set RRegf64:$d, (opnode RRegf64:$a))]>; + def ri64 : InstPTX<(outs RRegf64:$d), + (ins f64imm:$a), + !strconcat(opcstr, ".f64\t$d, $a"), + [(set RRegf64:$d, (opnode fpimm:$a))]>; +} + +//===- Floating-Point Instructions - 3 Operand Form -----------------------===// +multiclass PTX_FLOAT_3OP<string opcstr, SDNode opnode> { + def rr32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, RRegf32:$b), + !strconcat(opcstr, ".f32\t$d, $a, $b"), + [(set RRegf32:$d, (opnode RRegf32:$a, RRegf32:$b))]>; + def ri32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, f32imm:$b), + !strconcat(opcstr, ".f32\t$d, $a, $b"), + [(set RRegf32:$d, (opnode RRegf32:$a, fpimm:$b))]>; + def rr64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, RRegf64:$b), + !strconcat(opcstr, ".f64\t$d, $a, $b"), + [(set RRegf64:$d, (opnode RRegf64:$a, RRegf64:$b))]>; + def ri64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, f64imm:$b), + !strconcat(opcstr, ".f64\t$d, $a, $b"), + [(set RRegf64:$d, (opnode RRegf64:$a, fpimm:$b))]>; +} + +//===- Floating-Point Instructions - 4 Operand Form -----------------------===// +multiclass PTX_FLOAT_4OP<string opcstr, SDNode opnode1, SDNode opnode2> { + def rrr32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, RRegf32:$b, RRegf32:$c), + !strconcat(opcstr, ".f32\t$d, $a, $b, $c"), + [(set RRegf32:$d, (opnode2 (opnode1 RRegf32:$a, + RRegf32:$b), + RRegf32:$c))]>; + def rri32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, RRegf32:$b, f32imm:$c), + !strconcat(opcstr, ".f32\t$d, $a, $b, $c"), + [(set RRegf32:$d, (opnode2 (opnode1 RRegf32:$a, + RRegf32:$b), + fpimm:$c))]>; + def rrr64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, RRegf64:$b, RRegf64:$c), + !strconcat(opcstr, ".f64\t$d, $a, $b, $c"), + [(set RRegf64:$d, (opnode2 (opnode1 RRegf64:$a, + RRegf64:$b), + RRegf64:$c))]>; + def rri64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, RRegf64:$b, f64imm:$c), + !strconcat(opcstr, ".f64\t$d, $a, $b, $c"), + [(set RRegf64:$d, (opnode2 (opnode1 RRegf64:$a, + RRegf64:$b), + fpimm:$c))]>; +} + multiclass INT3<string opcstr, SDNode opnode> { - def rr : InstPTX<(outs RRegs32:$d), - (ins RRegs32:$a, RRegs32:$b), - !strconcat(opcstr, ".%type\t$d, $a, $b"), - [(set RRegs32:$d, (opnode RRegs32:$a, RRegs32:$b))]>; - def ri : InstPTX<(outs RRegs32:$d), - (ins RRegs32:$a, i32imm:$b), - !strconcat(opcstr, ".%type\t$d, $a, 
$b"), - [(set RRegs32:$d, (opnode RRegs32:$a, imm:$b))]>; + def rr16 : InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, RRegu16:$b), + !strconcat(opcstr, ".u16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>; + def ri16 : InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, i16imm:$b), + !strconcat(opcstr, ".u16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>; + def rr32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, RRegu32:$b), + !strconcat(opcstr, ".u32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>; + def ri32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, i32imm:$b), + !strconcat(opcstr, ".u32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>; + def rr64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, RRegu64:$b), + !strconcat(opcstr, ".u64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>; + def ri64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, i64imm:$b), + !strconcat(opcstr, ".u64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>; +} + +multiclass PTX_LOGIC<string opcstr, SDNode opnode> { + def ripreds : InstPTX<(outs Preds:$d), + (ins Preds:$a, i1imm:$b), + !strconcat(opcstr, ".pred\t$d, $a, $b"), + [(set Preds:$d, (opnode Preds:$a, imm:$b))]>; + def rrpreds : InstPTX<(outs Preds:$d), + (ins Preds:$a, Preds:$b), + !strconcat(opcstr, ".pred\t$d, $a, $b"), + [(set Preds:$d, (opnode Preds:$a, Preds:$b))]>; + def rr16 : InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, RRegu16:$b), + !strconcat(opcstr, ".b16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>; + def ri16 : InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, i16imm:$b), + !strconcat(opcstr, ".b16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>; + def rr32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, RRegu32:$b), + !strconcat(opcstr, ".b32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>; + def ri32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, i32imm:$b), + !strconcat(opcstr, ".b32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>; + def rr64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, RRegu64:$b), + !strconcat(opcstr, ".b64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>; + def ri64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, i64imm:$b), + !strconcat(opcstr, ".b64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>; } -// no %type directive, non-communtable multiclass INT3ntnc<string opcstr, SDNode opnode> { - def rr : InstPTX<(outs RRegs32:$d), - (ins RRegs32:$a, RRegs32:$b), - !strconcat(opcstr, "\t$d, $a, $b"), - [(set RRegs32:$d, (opnode RRegs32:$a, RRegs32:$b))]>; - def ri : InstPTX<(outs RRegs32:$d), - (ins RRegs32:$a, i32imm:$b), - !strconcat(opcstr, "\t$d, $a, $b"), - [(set RRegs32:$d, (opnode RRegs32:$a, imm:$b))]>; - def ir : InstPTX<(outs RRegs32:$d), - (ins i32imm:$a, RRegs32:$b), - !strconcat(opcstr, "\t$d, $a, $b"), - [(set RRegs32:$d, (opnode imm:$a, RRegs32:$b))]>; + def rr16 : InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, RRegu16:$b), + !strconcat(opcstr, "16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>; + def rr32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, RRegu32:$b), + !strconcat(opcstr, "32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>; + def rr64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, RRegu64:$b), + !strconcat(opcstr, "64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>; + def ri16 : 
InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, i16imm:$b), + !strconcat(opcstr, "16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>; + def ri32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, i32imm:$b), + !strconcat(opcstr, "32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>; + def ri64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, i64imm:$b), + !strconcat(opcstr, "64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>; + def ir16 : InstPTX<(outs RRegu16:$d), + (ins i16imm:$a, RRegu16:$b), + !strconcat(opcstr, "16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode imm:$a, RRegu16:$b))]>; + def ir32 : InstPTX<(outs RRegu32:$d), + (ins i32imm:$a, RRegu32:$b), + !strconcat(opcstr, "32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode imm:$a, RRegu32:$b))]>; + def ir64 : InstPTX<(outs RRegu64:$d), + (ins i64imm:$a, RRegu64:$b), + !strconcat(opcstr, "64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode imm:$a, RRegu64:$b))]>; } -multiclass PTX_LD<string opstr, RegisterClass RC, PatFrag pat_load> { - def rr : InstPTX<(outs RC:$d), - (ins MEMri:$a), - !strconcat(opstr, ".%type\t$d, [$a]"), - [(set RC:$d, (pat_load ADDRrr:$a))]>; - def ri : InstPTX<(outs RC:$d), - (ins MEMri:$a), - !strconcat(opstr, ".%type\t$d, [$a]"), - [(set RC:$d, (pat_load ADDRri:$a))]>; - def ii : InstPTX<(outs RC:$d), - (ins MEMii:$a), - !strconcat(opstr, ".%type\t$d, [$a]"), - [(set RC:$d, (pat_load ADDRii:$a))]>; +multiclass PTX_SETP_I<RegisterClass RC, string regclsname, Operand immcls, + CondCode cmp, string cmpstr> { + // TODO support 5-operand format: p|q, a, b, c + + def rr + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b), + !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), + [(set Preds:$p, (setcc RC:$a, RC:$b, cmp))]>; + def ri + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b), + !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), + [(set Preds:$p, (setcc RC:$a, imm:$b, cmp))]>; + + def rr_and_r + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (and (setcc RC:$a, RC:$b, cmp), Preds:$c))]>; + def ri_and_r + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (and (setcc RC:$a, imm:$b, cmp), Preds:$c))]>; + def rr_or_r + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (or (setcc RC:$a, RC:$b, cmp), Preds:$c))]>; + def ri_or_r + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (or (setcc RC:$a, imm:$b, cmp), Preds:$c))]>; + def rr_xor_r + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (xor (setcc RC:$a, RC:$b, cmp), Preds:$c))]>; + def ri_xor_r + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (xor (setcc RC:$a, imm:$b, cmp), Preds:$c))]>; + + def rr_and_not_r + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (and (setcc RC:$a, RC:$b, cmp), (not Preds:$c)))]>; + def ri_and_not_r + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".and.", 
regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (and (setcc RC:$a, imm:$b, cmp), (not Preds:$c)))]>; + def rr_or_not_r + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (or (setcc RC:$a, RC:$b, cmp), (not Preds:$c)))]>; + def ri_or_not_r + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (or (setcc RC:$a, imm:$b, cmp), (not Preds:$c)))]>; + def rr_xor_not_r + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (xor (setcc RC:$a, RC:$b, cmp), (not Preds:$c)))]>; + def ri_xor_not_r + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (xor (setcc RC:$a, imm:$b, cmp), (not Preds:$c)))]>; } -multiclass PTX_ST<string opstr, RegisterClass RC, PatFrag pat_store> { - def rr : InstPTX<(outs), - (ins RC:$d, MEMri:$a), - !strconcat(opstr, ".%type\t[$a], $d"), - [(pat_store RC:$d, ADDRrr:$a)]>; - def ri : InstPTX<(outs), - (ins RC:$d, MEMri:$a), - !strconcat(opstr, ".%type\t[$a], $d"), - [(pat_store RC:$d, ADDRri:$a)]>; - def ii : InstPTX<(outs), - (ins RC:$d, MEMii:$a), - !strconcat(opstr, ".%type\t[$a], $d"), - [(pat_store RC:$d, ADDRii:$a)]>; +multiclass PTX_SETP_FP<RegisterClass RC, string regclsname, + CondCode ucmp, CondCode ocmp, string cmpstr> { + // TODO support 5-operand format: p|q, a, b, c + + def rr_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b), + !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"), + [(set Preds:$p, (setcc RC:$a, RC:$b, ucmp))]>; + def rr_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b), + !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), + [(set Preds:$p, (setcc RC:$a, RC:$b, ocmp))]>; + + def rr_and_r_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, "u.and.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (and (setcc RC:$a, RC:$b, ucmp), Preds:$c))]>; + def rr_and_r_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (and (setcc RC:$a, RC:$b, ocmp), Preds:$c))]>; + + def rr_or_r_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, "u.or.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (or (setcc RC:$a, RC:$b, ucmp), Preds:$c))]>; + def rr_or_r_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (or (setcc RC:$a, RC:$b, ocmp), Preds:$c))]>; + + def rr_xor_r_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, "u.xor.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (xor (setcc RC:$a, RC:$b, ucmp), Preds:$c))]>; + def rr_xor_r_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (xor (setcc RC:$a, RC:$b, ocmp), Preds:$c))]>; + + def rr_and_not_r_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, "u.and.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (and (setcc RC:$a, RC:$b, ucmp), (not Preds:$c)))]>; + def rr_and_not_r_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + 
!strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (and (setcc RC:$a, RC:$b, ocmp), (not Preds:$c)))]>; + + def rr_or_not_r_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, "u.or.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (or (setcc RC:$a, RC:$b, ucmp), (not Preds:$c)))]>; + def rr_or_not_r_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (or (setcc RC:$a, RC:$b, ocmp), (not Preds:$c)))]>; + + def rr_xor_not_r_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, "u.xor.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (xor (setcc RC:$a, RC:$b, ucmp), (not Preds:$c)))]>; + def rr_xor_not_r_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (xor (setcc RC:$a, RC:$b, ocmp), (not Preds:$c)))]>; +} + +multiclass PTX_SELP<RegisterClass RC, string regclsname> { + def rr + : InstPTX<(outs RC:$r), (ins Preds:$a, RC:$b, RC:$c), + !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"), + [(set RC:$r, (select Preds:$a, RC:$b, RC:$c))]>; +} + +multiclass PTX_LD<string opstr, string typestr, RegisterClass RC, PatFrag pat_load> { + def rr32 : InstPTX<(outs RC:$d), + (ins MEMri32:$a), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), + [(set RC:$d, (pat_load ADDRrr32:$a))]>, Requires<[Use32BitAddresses]>; + def rr64 : InstPTX<(outs RC:$d), + (ins MEMri64:$a), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), + [(set RC:$d, (pat_load ADDRrr64:$a))]>, Requires<[Use64BitAddresses]>; + def ri32 : InstPTX<(outs RC:$d), + (ins MEMri32:$a), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), + [(set RC:$d, (pat_load ADDRri32:$a))]>, Requires<[Use32BitAddresses]>; + def ri64 : InstPTX<(outs RC:$d), + (ins MEMri64:$a), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), + [(set RC:$d, (pat_load ADDRri64:$a))]>, Requires<[Use64BitAddresses]>; + def ii32 : InstPTX<(outs RC:$d), + (ins MEMii32:$a), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), + [(set RC:$d, (pat_load ADDRii32:$a))]>, Requires<[Use32BitAddresses]>; + def ii64 : InstPTX<(outs RC:$d), + (ins MEMii64:$a), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), + [(set RC:$d, (pat_load ADDRii64:$a))]>, Requires<[Use64BitAddresses]>; +} + +multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> { + defm u16 : PTX_LD<opstr, ".u16", RRegu16, pat_load>; + defm u32 : PTX_LD<opstr, ".u32", RRegu32, pat_load>; + defm u64 : PTX_LD<opstr, ".u64", RRegu64, pat_load>; + defm f32 : PTX_LD<opstr, ".f32", RRegf32, pat_load>; + defm f64 : PTX_LD<opstr, ".f64", RRegf64, pat_load>; +} + +multiclass PTX_ST<string opstr, string typestr, RegisterClass RC, PatFrag pat_store> { + def rr32 : InstPTX<(outs), + (ins RC:$d, MEMri32:$a), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), + [(pat_store RC:$d, ADDRrr32:$a)]>, Requires<[Use32BitAddresses]>; + def rr64 : InstPTX<(outs), + (ins RC:$d, MEMri64:$a), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), + [(pat_store RC:$d, ADDRrr64:$a)]>, Requires<[Use64BitAddresses]>; + def ri32 : InstPTX<(outs), + (ins RC:$d, MEMri32:$a), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), + [(pat_store RC:$d, ADDRri32:$a)]>, Requires<[Use32BitAddresses]>; + def ri64 : InstPTX<(outs), + (ins RC:$d, MEMri64:$a), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), + 
[(pat_store RC:$d, ADDRri64:$a)]>, Requires<[Use64BitAddresses]>; + def ii32 : InstPTX<(outs), + (ins RC:$d, MEMii32:$a), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), + [(pat_store RC:$d, ADDRii32:$a)]>, Requires<[Use32BitAddresses]>; + def ii64 : InstPTX<(outs), + (ins RC:$d, MEMii64:$a), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), + [(pat_store RC:$d, ADDRii64:$a)]>, Requires<[Use64BitAddresses]>; +} + +multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> { + defm u16 : PTX_ST<opstr, ".u16", RRegu16, pat_store>; + defm u32 : PTX_ST<opstr, ".u32", RRegu32, pat_store>; + defm u64 : PTX_ST<opstr, ".u64", RRegu64, pat_store>; + defm f32 : PTX_ST<opstr, ".f32", RRegf32, pat_store>; + defm f64 : PTX_ST<opstr, ".f64", RRegf64, pat_store>; } //===----------------------------------------------------------------------===// @@ -208,50 +568,392 @@ multiclass PTX_ST<string opstr, RegisterClass RC, PatFrag pat_store> { defm ADD : INT3<"add", add>; defm SUB : INT3<"sub", sub>; +defm MUL : INT3<"mul.lo", mul>; // FIXME: Allow 32x32 -> 64 multiplies +defm DIV : INT3<"div", udiv>; +defm REM : INT3<"rem", urem>; + +///===- Floating-Point Arithmetic Instructions ----------------------------===// + +// Standard Unary Operations +defm FNEG : PTX_FLOAT_2OP<"neg", fneg>; + +// Standard Binary Operations +defm FADD : PTX_FLOAT_3OP<"add", fadd>; +defm FSUB : PTX_FLOAT_3OP<"sub", fsub>; +defm FMUL : PTX_FLOAT_3OP<"mul", fmul>; + +// TODO: Allow user selection of rounding modes for fdiv. +// For division, we need to have f32 and f64 differently. +// For f32, we just always use .approx since it is supported on all hardware +// for PTX 1.4+, which is our minimum target. +def FDIVrr32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, RRegf32:$b), + "div.approx.f32\t$d, $a, $b", + [(set RRegf32:$d, (fdiv RRegf32:$a, RRegf32:$b))]>; +def FDIVri32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, f32imm:$b), + "div.approx.f32\t$d, $a, $b", + [(set RRegf32:$d, (fdiv RRegf32:$a, fpimm:$b))]>; + +// For f64, we must specify a rounding for sm 1.3+ but *not* for sm 1.0. +def FDIVrr64SM13 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, RRegf64:$b), + "div.rn.f64\t$d, $a, $b", + [(set RRegf64:$d, (fdiv RRegf64:$a, RRegf64:$b))]>, + Requires<[SupportsSM13]>; +def FDIVri64SM13 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, f64imm:$b), + "div.rn.f64\t$d, $a, $b", + [(set RRegf64:$d, (fdiv RRegf64:$a, fpimm:$b))]>, + Requires<[SupportsSM13]>; +def FDIVrr64SM10 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, RRegf64:$b), + "div.f64\t$d, $a, $b", + [(set RRegf64:$d, (fdiv RRegf64:$a, RRegf64:$b))]>, + Requires<[DoesNotSupportSM13]>; +def FDIVri64SM10 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, f64imm:$b), + "div.f64\t$d, $a, $b", + [(set RRegf64:$d, (fdiv RRegf64:$a, fpimm:$b))]>, + Requires<[DoesNotSupportSM13]>; + + + +// Multi-operation hybrid instructions + +// The selection of mad/fma is tricky. In some cases, they are the *same* +// instruction, but in other cases we may prefer one or the other. Also, +// different PTX versions differ on whether rounding mode flags are required. +// In the short term, mad is supported on all PTX versions and we use a +// default rounding mode no matter what shader model or PTX version. +// TODO: Allow the rounding mode to be selectable through llc. 
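If the rounding-mode TODO above were taken up, the most direct mechanism would be an llc command-line option whose value is spliced into the mnemonic. A hypothetical sketch; no such flag exists in this patch:

    #include "llvm/Support/CommandLine.h"

    // Hypothetical knob for the mad rounding suffix (e.g. "rn", "rz").
    static llvm::cl::opt<std::string>
    MADRoundingMode("ptx-mad-rounding", llvm::cl::init("rn"),
                    llvm::cl::desc("Rounding suffix used when emitting mad"));

The multiclass instantiations below would then assemble the opcode string from this value instead of hard-coding mad.rn.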
+defm FMADSM13 : PTX_FLOAT_4OP<"mad.rn", fmul, fadd>, Requires<[SupportsSM13]>; +defm FMAD : PTX_FLOAT_4OP<"mad", fmul, fadd>, Requires<[DoesNotSupportSM13]>; + +///===- Floating-Point Intrinsic Instructions -----------------------------===// + +def FSQRT32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a), + "sqrt.rn.f32\t$d, $a", + [(set RRegf32:$d, (fsqrt RRegf32:$a))]>; + +def FSQRT64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a), + "sqrt.rn.f64\t$d, $a", + [(set RRegf64:$d, (fsqrt RRegf64:$a))]>; + +def FSIN32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a), + "sin.approx.f32\t$d, $a", + [(set RRegf32:$d, (fsin RRegf32:$a))]>; + +def FSIN64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a), + "sin.approx.f64\t$d, $a", + [(set RRegf64:$d, (fsin RRegf64:$a))]>; + +def FCOS32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a), + "cos.approx.f32\t$d, $a", + [(set RRegf32:$d, (fcos RRegf32:$a))]>; + +def FCOS64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a), + "cos.approx.f64\t$d, $a", + [(set RRegf64:$d, (fcos RRegf64:$a))]>; + + +///===- Comparison and Selection Instructions -----------------------------===// + +// Compare u16 + +defm SETPEQu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETEQ, "eq">; +defm SETPNEu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETNE, "ne">; +defm SETPLTu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETULT, "lt">; +defm SETPLEu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETULE, "le">; +defm SETPGTu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETUGT, "gt">; +defm SETPGEu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETUGE, "ge">; + +// Compare u32 + +defm SETPEQu32 : PTX_SETP_I<RRegu32, "u32", i32imm, SETEQ, "eq">; +defm SETPNEu32 : PTX_SETP_I<RRegu32, "u32", i32imm, SETNE, "ne">; +defm SETPLTu32 : PTX_SETP_I<RRegu32, "u32", i32imm, SETULT, "lt">; +defm SETPLEu32 : PTX_SETP_I<RRegu32, "u32", i32imm, SETULE, "le">; +defm SETPGTu32 : PTX_SETP_I<RRegu32, "u32", i32imm, SETUGT, "gt">; +defm SETPGEu32 : PTX_SETP_I<RRegu32, "u32", i32imm, SETUGE, "ge">; + +// Compare u64 + +defm SETPEQu64 : PTX_SETP_I<RRegu64, "u64", i64imm, SETEQ, "eq">; +defm SETPNEu64 : PTX_SETP_I<RRegu64, "u64", i64imm, SETNE, "ne">; +defm SETPLTu64 : PTX_SETP_I<RRegu64, "u64", i64imm, SETULT, "lt">; +defm SETPLEu64 : PTX_SETP_I<RRegu64, "u64", i64imm, SETULE, "le">; +defm SETPGTu64 : PTX_SETP_I<RRegu64, "u64", i64imm, SETUGT, "gt">; +defm SETPGEu64 : PTX_SETP_I<RRegu64, "u64", i64imm, SETUGE, "ge">; + +// Compare f32 + +defm SETPEQf32 : PTX_SETP_FP<RRegf32, "f32", SETUEQ, SETOEQ, "eq">; +defm SETPNEf32 : PTX_SETP_FP<RRegf32, "f32", SETUNE, SETONE, "ne">; +defm SETPLTf32 : PTX_SETP_FP<RRegf32, "f32", SETULT, SETOLT, "lt">; +defm SETPLEf32 : PTX_SETP_FP<RRegf32, "f32", SETULE, SETOLE, "le">; +defm SETPGTf32 : PTX_SETP_FP<RRegf32, "f32", SETUGT, SETOGT, "gt">; +defm SETPGEf32 : PTX_SETP_FP<RRegf32, "f32", SETUGE, SETOGE, "ge">; + +// Compare f64 + +defm SETPEQf64 : PTX_SETP_FP<RRegf64, "f64", SETUEQ, SETOEQ, "eq">; +defm SETPNEf64 : PTX_SETP_FP<RRegf64, "f64", SETUNE, SETONE, "ne">; +defm SETPLTf64 : PTX_SETP_FP<RRegf64, "f64", SETULT, SETOLT, "lt">; +defm SETPLEf64 : PTX_SETP_FP<RRegf64, "f64", SETULE, SETOLE, "le">; +defm SETPGTf64 : PTX_SETP_FP<RRegf64, "f64", SETUGT, SETOGT, "gt">; +defm SETPGEf64 : PTX_SETP_FP<RRegf64, "f64", SETUGE, SETOGE, "ge">; + +// .selp + +defm PTX_SELPu16 : PTX_SELP<RRegu16, "u16">; +defm PTX_SELPu32 : PTX_SELP<RRegu32, "u32">; +defm PTX_SELPu64 : PTX_SELP<RRegu64, "u64">; +defm PTX_SELPf32 : PTX_SELP<RRegf32, "f32">; +defm PTX_SELPf64 : PTX_SELP<RRegf64, "f64">; ///===- Logic and Shift Instructions 
--------------------------------------===// -defm SHL : INT3ntnc<"shl.b32", PTXshl>; -defm SRL : INT3ntnc<"shr.u32", PTXsrl>; -defm SRA : INT3ntnc<"shr.s32", PTXsra>; +defm SHL : INT3ntnc<"shl.b", PTXshl>; +defm SRL : INT3ntnc<"shr.u", PTXsrl>; +defm SRA : INT3ntnc<"shr.s", PTXsra>; + +defm AND : PTX_LOGIC<"and", and>; +defm OR : PTX_LOGIC<"or", or>; +defm XOR : PTX_LOGIC<"xor", xor>; ///===- Data Movement and Conversion Instructions -------------------------===// let neverHasSideEffects = 1 in { - // rely on isMoveInstr to separate MOVpp, MOVrr, etc. - def MOVpp + def MOVPREDrr : InstPTX<(outs Preds:$d), (ins Preds:$a), "mov.pred\t$d, $a", []>; - def MOVrr - : InstPTX<(outs RRegs32:$d), (ins RRegs32:$a), "mov.%type\t$d, $a", []>; + def MOVU16rr + : InstPTX<(outs RRegu16:$d), (ins RRegu16:$a), "mov.u16\t$d, $a", []>; + def MOVU32rr + : InstPTX<(outs RRegu32:$d), (ins RRegu32:$a), "mov.u32\t$d, $a", []>; + def MOVU64rr + : InstPTX<(outs RRegu64:$d), (ins RRegu64:$a), "mov.u64\t$d, $a", []>; + def MOVF32rr + : InstPTX<(outs RRegf32:$d), (ins RRegf32:$a), "mov.f32\t$d, $a", []>; + def MOVF64rr + : InstPTX<(outs RRegf64:$d), (ins RRegf64:$a), "mov.f64\t$d, $a", []>; } let isReMaterializable = 1, isAsCheapAsAMove = 1 in { - def MOVpi + def MOVPREDri : InstPTX<(outs Preds:$d), (ins i1imm:$a), "mov.pred\t$d, $a", [(set Preds:$d, imm:$a)]>; - def MOVri - : InstPTX<(outs RRegs32:$d), (ins i32imm:$a), "mov.s32\t$d, $a", - [(set RRegs32:$d, imm:$a)]>; + def MOVU16ri + : InstPTX<(outs RRegu16:$d), (ins i16imm:$a), "mov.u16\t$d, $a", + [(set RRegu16:$d, imm:$a)]>; + def MOVU32ri + : InstPTX<(outs RRegu32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", + [(set RRegu32:$d, imm:$a)]>; + def MOVU64ri + : InstPTX<(outs RRegu64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", + [(set RRegu64:$d, imm:$a)]>; + def MOVF32ri + : InstPTX<(outs RRegf32:$d), (ins f32imm:$a), "mov.f32\t$d, $a", + [(set RRegf32:$d, fpimm:$a)]>; + def MOVF64ri + : InstPTX<(outs RRegf64:$d), (ins f64imm:$a), "mov.f64\t$d, $a", + [(set RRegf64:$d, fpimm:$a)]>; } -defm LDg : PTX_LD<"ld.global", RRegs32, load_global>; -defm LDc : PTX_LD<"ld.const", RRegs32, load_constant>; -defm LDl : PTX_LD<"ld.local", RRegs32, load_local>; -defm LDp : PTX_LD<"ld.param", RRegs32, load_parameter>; -defm LDs : PTX_LD<"ld.shared", RRegs32, load_shared>; +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { + def MOVaddr32 + : InstPTX<(outs RRegu32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", + [(set RRegu32:$d, (PTXcopyaddress tglobaladdr:$a))]>; + def MOVaddr64 + : InstPTX<(outs RRegu64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", + [(set RRegu64:$d, (PTXcopyaddress tglobaladdr:$a))]>; +} + +// Loads +defm LDg : PTX_LD_ALL<"ld.global", load_global>; +defm LDc : PTX_LD_ALL<"ld.const", load_constant>; +defm LDl : PTX_LD_ALL<"ld.local", load_local>; +defm LDs : PTX_LD_ALL<"ld.shared", load_shared>; + +// This is a special instruction that is manually inserted for kernel parameters +def LDpiU16 : InstPTX<(outs RRegu16:$d), (ins MEMpi:$a), + "ld.param.u16\t$d, [$a]", []>; +def LDpiU32 : InstPTX<(outs RRegu32:$d), (ins MEMpi:$a), + "ld.param.u32\t$d, [$a]", []>; +def LDpiU64 : InstPTX<(outs RRegu64:$d), (ins MEMpi:$a), + "ld.param.u64\t$d, [$a]", []>; +def LDpiF32 : InstPTX<(outs RRegf32:$d), (ins MEMpi:$a), + "ld.param.f32\t$d, [$a]", []>; +def LDpiF64 : InstPTX<(outs RRegf64:$d), (ins MEMpi:$a), + "ld.param.f64\t$d, [$a]", []>; + +// Stores +defm STg : PTX_ST_ALL<"st.global", store_global>; +defm STl : PTX_ST_ALL<"st.local", store_local>; +defm STs : PTX_ST_ALL<"st.shared", 
store_shared>; + +// defm STp : PTX_ST_ALL<"st.param", store_parameter>; +// defm LDp : PTX_LD_ALL<"ld.param", load_parameter>; +// TODO: Do something with st.param if/when it is needed. + +// Conversion to pred + +def CVT_pred_u16 + : InstPTX<(outs Preds:$d), (ins RRegu16:$a), "cvt.pred.u16\t$d, $a", + [(set Preds:$d, (trunc RRegu16:$a))]>; -def LDpi : InstPTX<(outs RRegs32:$d), (ins MEMpi:$a), - "ld.param.%type\t$d, [$a]", []>; +def CVT_pred_u32 + : InstPTX<(outs Preds:$d), (ins RRegu32:$a), "cvt.pred.u32\t$d, $a", + [(set Preds:$d, (trunc RRegu32:$a))]>; -defm STg : PTX_ST<"st.global", RRegs32, store_global>; -defm STl : PTX_ST<"st.local", RRegs32, store_local>; -// Store to parameter state space requires PTX 2.0 or higher? -// defm STp : PTX_ST<"st.param", RRegs32, store_parameter>; -defm STs : PTX_ST<"st.shared", RRegs32, store_shared>; +def CVT_pred_u64 + : InstPTX<(outs Preds:$d), (ins RRegu64:$a), "cvt.pred.u64\t$d, $a", + [(set Preds:$d, (trunc RRegu64:$a))]>; + +def CVT_pred_f32 + : InstPTX<(outs Preds:$d), (ins RRegf32:$a), "cvt.rni.pred.f32\t$d, $a", + [(set Preds:$d, (fp_to_uint RRegf32:$a))]>; + +def CVT_pred_f64 + : InstPTX<(outs Preds:$d), (ins RRegf64:$a), "cvt.rni.pred.f64\t$d, $a", + [(set Preds:$d, (fp_to_uint RRegf64:$a))]>; + +// Conversion to u16 + +def CVT_u16_pred + : InstPTX<(outs RRegu16:$d), (ins Preds:$a), "cvt.u16.pred\t$d, $a", + [(set RRegu16:$d, (zext Preds:$a))]>; + +def CVT_u16_u32 + : InstPTX<(outs RRegu16:$d), (ins RRegu32:$a), "cvt.u16.u32\t$d, $a", + [(set RRegu16:$d, (trunc RRegu32:$a))]>; + +def CVT_u16_u64 + : InstPTX<(outs RRegu16:$d), (ins RRegu64:$a), "cvt.u16.u64\t$d, $a", + [(set RRegu16:$d, (trunc RRegu64:$a))]>; + +def CVT_u16_f32 + : InstPTX<(outs RRegu16:$d), (ins RRegf32:$a), "cvt.rni.u16.f32\t$d, $a", + [(set RRegu16:$d, (fp_to_uint RRegf32:$a))]>; + +def CVT_u16_f64 + : InstPTX<(outs RRegu16:$d), (ins RRegf64:$a), "cvt.rni.u16.f64\t$d, $a", + [(set RRegu16:$d, (fp_to_uint RRegf64:$a))]>; + +// Conversion to u32 + +def CVT_u32_pred + : InstPTX<(outs RRegu32:$d), (ins Preds:$a), "cvt.u32.pred\t$d, $a", + [(set RRegu32:$d, (zext Preds:$a))]>; + +def CVT_u32_u16 + : InstPTX<(outs RRegu32:$d), (ins RRegu16:$a), "cvt.u32.u16\t$d, $a", + [(set RRegu32:$d, (zext RRegu16:$a))]>; + +def CVT_u32_u64 + : InstPTX<(outs RRegu32:$d), (ins RRegu64:$a), "cvt.u32.u64\t$d, $a", + [(set RRegu32:$d, (trunc RRegu64:$a))]>; + +def CVT_u32_f32 + : InstPTX<(outs RRegu32:$d), (ins RRegf32:$a), "cvt.rni.u32.f32\t$d, $a", + [(set RRegu32:$d, (fp_to_uint RRegf32:$a))]>; + +def CVT_u32_f64 + : InstPTX<(outs RRegu32:$d), (ins RRegf64:$a), "cvt.rni.u32.f64\t$d, $a", + [(set RRegu32:$d, (fp_to_uint RRegf64:$a))]>; + +// Conversion to u64 + +def CVT_u64_pred + : InstPTX<(outs RRegu64:$d), (ins Preds:$a), "cvt.u64.pred\t$d, $a", + [(set RRegu64:$d, (zext Preds:$a))]>; + +def CVT_u64_u16 + : InstPTX<(outs RRegu64:$d), (ins RRegu16:$a), "cvt.u64.u16\t$d, $a", + [(set RRegu64:$d, (zext RRegu16:$a))]>; + +def CVT_u64_u32 + : InstPTX<(outs RRegu64:$d), (ins RRegu32:$a), "cvt.u64.u32\t$d, $a", + [(set RRegu64:$d, (zext RRegu32:$a))]>; + +def CVT_u64_f32 + : InstPTX<(outs RRegu64:$d), (ins RRegf32:$a), "cvt.rni.u64.f32\t$d, $a", + [(set RRegu64:$d, (fp_to_uint RRegf32:$a))]>; + +def CVT_u64_f64 + : InstPTX<(outs RRegu64:$d), (ins RRegf64:$a), "cvt.rni.u64.f64\t$d, $a", + [(set RRegu64:$d, (fp_to_uint RRegf64:$a))]>; + +// Conversion to f32 + +def CVT_f32_pred + : InstPTX<(outs RRegf32:$d), (ins Preds:$a), "cvt.rn.f32.pred\t$d, $a", + [(set RRegf32:$d, (uint_to_fp Preds:$a))]>; 
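Note the convention in these cvt patterns: integer-to-float conversions carry the .rn rounding modifier, while the float-to-integer forms earlier in this hunk use .rni. As a quick end-to-end illustration, a plain C-level cast exercises the u32 variant defined just below (the function is illustrative only):

    // Compiled through this backend, the uint_to_fp node produced by the
    // cast should select CVT_f32_u32, i.e. a cvt.rn.f32.u32 instruction.
    float to_float(unsigned x) { return (float)x; }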
+ +def CVT_f32_u16 + : InstPTX<(outs RRegf32:$d), (ins RRegu16:$a), "cvt.rn.f32.u16\t$d, $a", + [(set RRegf32:$d, (uint_to_fp RRegu16:$a))]>; + +def CVT_f32_u32 + : InstPTX<(outs RRegf32:$d), (ins RRegu32:$a), "cvt.rn.f32.u32\t$d, $a", + [(set RRegf32:$d, (uint_to_fp RRegu32:$a))]>; + +def CVT_f32_u64 + : InstPTX<(outs RRegf32:$d), (ins RRegu64:$a), "cvt.rn.f32.u64\t$d, $a", + [(set RRegf32:$d, (uint_to_fp RRegu64:$a))]>; + +def CVT_f32_f64 + : InstPTX<(outs RRegf32:$d), (ins RRegf64:$a), "cvt.rn.f32.f64\t$d, $a", + [(set RRegf32:$d, (fround RRegf64:$a))]>; + +// Conversion to f64 + +def CVT_f64_pred + : InstPTX<(outs RRegf64:$d), (ins Preds:$a), "cvt.rn.f64.pred\t$d, $a", + [(set RRegf64:$d, (uint_to_fp Preds:$a))]>; + +def CVT_f64_u16 + : InstPTX<(outs RRegf64:$d), (ins RRegu16:$a), "cvt.rn.f64.u16\t$d, $a", + [(set RRegf64:$d, (uint_to_fp RRegu16:$a))]>; + +def CVT_f64_u32 + : InstPTX<(outs RRegf64:$d), (ins RRegu32:$a), "cvt.rn.f64.u32\t$d, $a", + [(set RRegf64:$d, (uint_to_fp RRegu32:$a))]>; + +def CVT_f64_u64 + : InstPTX<(outs RRegf64:$d), (ins RRegu64:$a), "cvt.rn.f64.u64\t$d, $a", + [(set RRegf64:$d, (uint_to_fp RRegu64:$a))]>; + +def CVT_f64_f32 + : InstPTX<(outs RRegf64:$d), (ins RRegf32:$a), "cvt.f64.f32\t$d, $a", + [(set RRegf64:$d, (fextend RRegf32:$a))]>; ///===- Control Flow Instructions -----------------------------------------===// +let isBranch = 1, isTerminator = 1, isBarrier = 1 in { + def BRAd + : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", [(br bb:$d)]>; +} + +let isBranch = 1, isTerminator = 1 in { + // FIXME: The pattern part is blank because I cannot (or do not yet know + // how to) use the first operand of PredicateOperand (a Preds register) here + def BRAdp + : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", + [/*(brcond pred:$_p, bb:$d)*/]>; +} + let isReturn = 1, isTerminator = 1, isBarrier = 1 in { def EXIT : InstPTX<(outs), (ins), "exit", [(PTXexit)]>; def RET : InstPTX<(outs), (ins), "ret", [(PTXret)]>; } + +///===- Intrinsic Instructions --------------------------------------------===// + +include "PTXIntrinsicInstrInfo.td" diff --git a/lib/Target/PTX/PTXIntrinsicInstrInfo.td b/lib/Target/PTX/PTXIntrinsicInstrInfo.td new file mode 100644 index 0000000..320934a --- /dev/null +++ b/lib/Target/PTX/PTXIntrinsicInstrInfo.td @@ -0,0 +1,84 @@ +//===- PTXIntrinsicInstrInfo.td - Defines PTX intrinsics ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the PTX-specific intrinsic instructions. 
+// +//===----------------------------------------------------------------------===// + +// PTX Special Purpose Register Accessor Intrinsics + +class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop> + : InstPTX<(outs RRegu64:$d), (ins), + !strconcat("mov.u64\t$d, %", regname), + [(set RRegu64:$d, (intop))]>; + +class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop> + : InstPTX<(outs RRegu32:$d), (ins), + !strconcat("mov.u32\t$d, %", regname), + [(set RRegu32:$d, (intop))]>; + +// TODO Add read vector-version of special registers + +//def PTX_READ_TID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"tid", int_ptx_read_tid_r64>; +def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x", int_ptx_read_tid_x>; +def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y", int_ptx_read_tid_y>; +def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z", int_ptx_read_tid_z>; +def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w", int_ptx_read_tid_w>; + +//def PTX_READ_NTID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ntid", int_ptx_read_ntid_r64>; +def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x", int_ptx_read_ntid_x>; +def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y", int_ptx_read_ntid_y>; +def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z", int_ptx_read_ntid_z>; +def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w", int_ptx_read_ntid_w>; + +def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid", int_ptx_read_laneid>; +def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid", int_ptx_read_warpid>; +def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid", int_ptx_read_nwarpid>; + +//def PTX_READ_CTAID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ctaid", int_ptx_read_ctaid_r64>; +def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x", int_ptx_read_ctaid_x>; +def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y", int_ptx_read_ctaid_y>; +def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z", int_ptx_read_ctaid_z>; +def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w", int_ptx_read_ctaid_w>; + +//def PTX_READ_NCTAID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"nctaid", int_ptx_read_nctaid_r64>; +def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x", int_ptx_read_nctaid_x>; +def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y", int_ptx_read_nctaid_y>; +def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z", int_ptx_read_nctaid_z>; +def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w", int_ptx_read_nctaid_w>; + +def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid", int_ptx_read_smid>; +def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid", int_ptx_read_nsmid>; +def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid", int_ptx_read_gridid>; + +def PTX_READ_LANEMASK_EQ + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>; +def PTX_READ_LANEMASK_LE + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>; +def PTX_READ_LANEMASK_LT + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>; +def PTX_READ_LANEMASK_GE + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>; +def PTX_READ_LANEMASK_GT + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>; + +def PTX_READ_CLOCK + : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>; +def PTX_READ_CLOCK64 + : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>; + +def PTX_READ_PM0 : 
PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>; +def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>; +def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>; +def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>; + +// PTX Parallel Synchronization and Communication Intrinsics + +def PTX_BAR_SYNC : InstPTX<(outs), (ins i32imm:$i), "bar.sync\t$i", + [(int_ptx_bar_sync imm:$i)]>; diff --git a/lib/Target/PTX/PTXMCAsmStreamer.cpp b/lib/Target/PTX/PTXMCAsmStreamer.cpp index 0886ba8..1574670 100644 --- a/lib/Target/PTX/PTXMCAsmStreamer.cpp +++ b/lib/Target/PTX/PTXMCAsmStreamer.cpp @@ -143,9 +143,9 @@ public: virtual void EmitBytes(StringRef Data, unsigned AddrSpace); virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, - bool isPCRel, unsigned AddrSpace); - virtual void EmitULEB128Value(const MCExpr *Value, unsigned AddrSpace = 0); - virtual void EmitSLEB128Value(const MCExpr *Value, unsigned AddrSpace = 0); + unsigned AddrSpace); + virtual void EmitULEB128Value(const MCExpr *Value); + virtual void EmitSLEB128Value(const MCExpr *Value); virtual void EmitGPRel32Value(const MCExpr *Value); @@ -233,7 +233,7 @@ void PTXMCAsmStreamer::ChangeSection(const MCSection *Section) { void PTXMCAsmStreamer::EmitLabel(MCSymbol *Symbol) { assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); - assert(getCurrentSection() && "Cannot emit before setting section!"); + //assert(getCurrentSection() && "Cannot emit before setting section!"); OS << *Symbol << MAI.getLabelSuffix(); EmitEOL(); @@ -352,9 +352,8 @@ void PTXMCAsmStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) { } void PTXMCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - bool isPCRel, unsigned AddrSpace) { + unsigned AddrSpace) { assert(getCurrentSection() && "Cannot emit contents before setting section!"); - assert(!isPCRel && "Cannot emit pc relative relocations!"); const char *Directive = 0; switch (Size) { default: break; @@ -383,15 +382,13 @@ void PTXMCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, EmitEOL(); } -void PTXMCAsmStreamer::EmitULEB128Value(const MCExpr *Value, - unsigned AddrSpace) { +void PTXMCAsmStreamer::EmitULEB128Value(const MCExpr *Value) { assert(MAI.hasLEB128() && "Cannot print a .uleb"); OS << ".uleb128 " << *Value; EmitEOL(); } -void PTXMCAsmStreamer::EmitSLEB128Value(const MCExpr *Value, - unsigned AddrSpace) { +void PTXMCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) { assert(MAI.hasLEB128() && "Cannot print a .sleb"); OS << ".sleb128 " << *Value; EmitEOL(); @@ -423,7 +420,8 @@ void PTXMCAsmStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue, MCStreamer::EmitFill(NumBytes, FillValue, AddrSpace); } -void PTXMCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value, +void PTXMCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, + int64_t Value, unsigned ValueSize, unsigned MaxBytesToEmit) { // Some assemblers don't support non-power of two alignments, so we always @@ -532,7 +530,7 @@ void PTXMCAsmStreamer::Finish() {} namespace llvm { MCStreamer *createPTXAsmStreamer(MCContext &Context, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useLoc, + bool isVerboseAsm, bool useLoc, bool useCFI, MCInstPrinter *IP, MCCodeEmitter *CE, TargetAsmBackend *TAB, bool ShowInst) { diff --git a/lib/Target/PTX/PTXMFInfoExtract.cpp b/lib/Target/PTX/PTXMFInfoExtract.cpp index b37c740..c5e1910 100644 --- 
a/lib/Target/PTX/PTXMFInfoExtract.cpp +++ b/lib/Target/PTX/PTXMFInfoExtract.cpp @@ -79,12 +79,12 @@ bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) { DEBUG(for (PTXMachineFunctionInfo::reg_iterator i = MFI->argRegBegin(), e = MFI->argRegEnd(); - i != e; ++i) + i != e; ++i) dbgs() << "Arg Reg: " << *i << "\n";); DEBUG(for (PTXMachineFunctionInfo::reg_iterator i = MFI->localVarRegBegin(), e = MFI->localVarRegEnd(); - i != e; ++i) + i != e; ++i) dbgs() << "Local Var Reg: " << *i << "\n";); return false; diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.h b/lib/Target/PTX/PTXMachineFunctionInfo.h index 56d044b..81df1c2 100644 --- a/lib/Target/PTX/PTXMachineFunctionInfo.h +++ b/lib/Target/PTX/PTXMachineFunctionInfo.h @@ -42,36 +42,37 @@ public: void setRetReg(unsigned reg) { reg_ret = reg; } void doneAddArg(void) { - std::sort(reg_arg.begin(), reg_arg.end()); _isDoneAddArg = true; } - void doneAddLocalVar(void) { - std::sort(reg_local_var.begin(), reg_local_var.end()); - } + void doneAddLocalVar(void) {} bool isDoneAddArg(void) { return _isDoneAddArg; } bool isKernel() const { return is_kernel; } - typedef std::vector<unsigned>::const_iterator reg_iterator; + typedef std::vector<unsigned>::const_iterator reg_iterator; + typedef std::vector<unsigned>::const_reverse_iterator reg_reverse_iterator; - bool argRegEmpty() const { return reg_arg.empty(); } - int getNumArg() const { return reg_arg.size(); } + bool argRegEmpty() const { return reg_arg.empty(); } + int getNumArg() const { return reg_arg.size(); } reg_iterator argRegBegin() const { return reg_arg.begin(); } reg_iterator argRegEnd() const { return reg_arg.end(); } + reg_reverse_iterator argRegReverseBegin() const { return reg_arg.rbegin(); } + reg_reverse_iterator argRegReverseEnd() const { return reg_arg.rend(); } - bool localVarRegEmpty() const { return reg_local_var.empty(); } + bool localVarRegEmpty() const { return reg_local_var.empty(); } reg_iterator localVarRegBegin() const { return reg_local_var.begin(); } reg_iterator localVarRegEnd() const { return reg_local_var.end(); } unsigned retReg() const { return reg_ret; } bool isArgReg(unsigned reg) const { - return std::binary_search(reg_arg.begin(), reg_arg.end(), reg); + return std::find(reg_arg.begin(), reg_arg.end(), reg) != reg_arg.end(); } bool isLocalVarReg(unsigned reg) const { - return std::binary_search(reg_local_var.begin(), reg_local_var.end(), reg); + return std::find(reg_local_var.begin(), reg_local_var.end(), reg) + != reg_local_var.end(); } }; // class PTXMachineFunctionInfo } // namespace llvm diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td index 22e2b34..f616141 100644 --- a/lib/Target/PTX/PTXRegisterInfo.td +++ b/lib/Target/PTX/PTXRegisterInfo.td @@ -19,6 +19,8 @@ class PTXReg<string n> : Register<n> { // Registers //===----------------------------------------------------------------------===// +///===- Predicate Registers -----------------------------------------------===// + def P0 : PTXReg<"p0">; def P1 : PTXReg<"p1">; def P2 : PTXReg<"p2">; @@ -51,6 +53,108 @@ def P28 : PTXReg<"p28">; def P29 : PTXReg<"p29">; def P30 : PTXReg<"p30">; def P31 : PTXReg<"p31">; +def P32 : PTXReg<"p32">; +def P33 : PTXReg<"p33">; +def P34 : PTXReg<"p34">; +def P35 : PTXReg<"p35">; +def P36 : PTXReg<"p36">; +def P37 : PTXReg<"p37">; +def P38 : PTXReg<"p38">; +def P39 : PTXReg<"p39">; +def P40 : PTXReg<"p40">; +def P41 : PTXReg<"p41">; +def P42 : PTXReg<"p42">; +def P43 : PTXReg<"p43">; +def P44 : PTXReg<"p44">; +def P45 : 
PTXReg<"p45">; +def P46 : PTXReg<"p46">; +def P47 : PTXReg<"p47">; +def P48 : PTXReg<"p48">; +def P49 : PTXReg<"p49">; +def P50 : PTXReg<"p50">; +def P51 : PTXReg<"p51">; +def P52 : PTXReg<"p52">; +def P53 : PTXReg<"p53">; +def P54 : PTXReg<"p54">; +def P55 : PTXReg<"p55">; +def P56 : PTXReg<"p56">; +def P57 : PTXReg<"p57">; +def P58 : PTXReg<"p58">; +def P59 : PTXReg<"p59">; +def P60 : PTXReg<"p60">; +def P61 : PTXReg<"p61">; +def P62 : PTXReg<"p62">; +def P63 : PTXReg<"p63">; + +///===- 16-bit Integer Registers ------------------------------------------===// + +def RH0 : PTXReg<"rh0">; +def RH1 : PTXReg<"rh1">; +def RH2 : PTXReg<"rh2">; +def RH3 : PTXReg<"rh3">; +def RH4 : PTXReg<"rh4">; +def RH5 : PTXReg<"rh5">; +def RH6 : PTXReg<"rh6">; +def RH7 : PTXReg<"rh7">; +def RH8 : PTXReg<"rh8">; +def RH9 : PTXReg<"rh9">; +def RH10 : PTXReg<"rh10">; +def RH11 : PTXReg<"rh11">; +def RH12 : PTXReg<"rh12">; +def RH13 : PTXReg<"rh13">; +def RH14 : PTXReg<"rh14">; +def RH15 : PTXReg<"rh15">; +def RH16 : PTXReg<"rh16">; +def RH17 : PTXReg<"rh17">; +def RH18 : PTXReg<"rh18">; +def RH19 : PTXReg<"rh19">; +def RH20 : PTXReg<"rh20">; +def RH21 : PTXReg<"rh21">; +def RH22 : PTXReg<"rh22">; +def RH23 : PTXReg<"rh23">; +def RH24 : PTXReg<"rh24">; +def RH25 : PTXReg<"rh25">; +def RH26 : PTXReg<"rh26">; +def RH27 : PTXReg<"rh27">; +def RH28 : PTXReg<"rh28">; +def RH29 : PTXReg<"rh29">; +def RH30 : PTXReg<"rh30">; +def RH31 : PTXReg<"rh31">; +def RH32 : PTXReg<"rh32">; +def RH33 : PTXReg<"rh33">; +def RH34 : PTXReg<"rh34">; +def RH35 : PTXReg<"rh35">; +def RH36 : PTXReg<"rh36">; +def RH37 : PTXReg<"rh37">; +def RH38 : PTXReg<"rh38">; +def RH39 : PTXReg<"rh39">; +def RH40 : PTXReg<"rh40">; +def RH41 : PTXReg<"rh41">; +def RH42 : PTXReg<"rh42">; +def RH43 : PTXReg<"rh43">; +def RH44 : PTXReg<"rh44">; +def RH45 : PTXReg<"rh45">; +def RH46 : PTXReg<"rh46">; +def RH47 : PTXReg<"rh47">; +def RH48 : PTXReg<"rh48">; +def RH49 : PTXReg<"rh49">; +def RH50 : PTXReg<"rh50">; +def RH51 : PTXReg<"rh51">; +def RH52 : PTXReg<"rh52">; +def RH53 : PTXReg<"rh53">; +def RH54 : PTXReg<"rh54">; +def RH55 : PTXReg<"rh55">; +def RH56 : PTXReg<"rh56">; +def RH57 : PTXReg<"rh57">; +def RH58 : PTXReg<"rh58">; +def RH59 : PTXReg<"rh59">; +def RH60 : PTXReg<"rh60">; +def RH61 : PTXReg<"rh61">; +def RH62 : PTXReg<"rh62">; +def RH63 : PTXReg<"rh63">; + + +///===- 32-bit Integer Registers ------------------------------------------===// def R0 : PTXReg<"r0">; def R1 : PTXReg<"r1">; @@ -84,6 +188,243 @@ def R28 : PTXReg<"r28">; def R29 : PTXReg<"r29">; def R30 : PTXReg<"r30">; def R31 : PTXReg<"r31">; +def R32 : PTXReg<"r32">; +def R33 : PTXReg<"r33">; +def R34 : PTXReg<"r34">; +def R35 : PTXReg<"r35">; +def R36 : PTXReg<"r36">; +def R37 : PTXReg<"r37">; +def R38 : PTXReg<"r38">; +def R39 : PTXReg<"r39">; +def R40 : PTXReg<"r40">; +def R41 : PTXReg<"r41">; +def R42 : PTXReg<"r42">; +def R43 : PTXReg<"r43">; +def R44 : PTXReg<"r44">; +def R45 : PTXReg<"r45">; +def R46 : PTXReg<"r46">; +def R47 : PTXReg<"r47">; +def R48 : PTXReg<"r48">; +def R49 : PTXReg<"r49">; +def R50 : PTXReg<"r50">; +def R51 : PTXReg<"r51">; +def R52 : PTXReg<"r52">; +def R53 : PTXReg<"r53">; +def R54 : PTXReg<"r54">; +def R55 : PTXReg<"r55">; +def R56 : PTXReg<"r56">; +def R57 : PTXReg<"r57">; +def R58 : PTXReg<"r58">; +def R59 : PTXReg<"r59">; +def R60 : PTXReg<"r60">; +def R61 : PTXReg<"r61">; +def R62 : PTXReg<"r62">; +def R63 : PTXReg<"r63">; + + +///===- 64-bit Integer Registers ------------------------------------------===// + +def RD0 : PTXReg<"rd0">; +def RD1 : 
PTXReg<"rd1">; +def RD2 : PTXReg<"rd2">; +def RD3 : PTXReg<"rd3">; +def RD4 : PTXReg<"rd4">; +def RD5 : PTXReg<"rd5">; +def RD6 : PTXReg<"rd6">; +def RD7 : PTXReg<"rd7">; +def RD8 : PTXReg<"rd8">; +def RD9 : PTXReg<"rd9">; +def RD10 : PTXReg<"rd10">; +def RD11 : PTXReg<"rd11">; +def RD12 : PTXReg<"rd12">; +def RD13 : PTXReg<"rd13">; +def RD14 : PTXReg<"rd14">; +def RD15 : PTXReg<"rd15">; +def RD16 : PTXReg<"rd16">; +def RD17 : PTXReg<"rd17">; +def RD18 : PTXReg<"rd18">; +def RD19 : PTXReg<"rd19">; +def RD20 : PTXReg<"rd20">; +def RD21 : PTXReg<"rd21">; +def RD22 : PTXReg<"rd22">; +def RD23 : PTXReg<"rd23">; +def RD24 : PTXReg<"rd24">; +def RD25 : PTXReg<"rd25">; +def RD26 : PTXReg<"rd26">; +def RD27 : PTXReg<"rd27">; +def RD28 : PTXReg<"rd28">; +def RD29 : PTXReg<"rd29">; +def RD30 : PTXReg<"rd30">; +def RD31 : PTXReg<"rd31">; +def RD32 : PTXReg<"rd32">; +def RD33 : PTXReg<"rd33">; +def RD34 : PTXReg<"rd34">; +def RD35 : PTXReg<"rd35">; +def RD36 : PTXReg<"rd36">; +def RD37 : PTXReg<"rd37">; +def RD38 : PTXReg<"rd38">; +def RD39 : PTXReg<"rd39">; +def RD40 : PTXReg<"rd40">; +def RD41 : PTXReg<"rd41">; +def RD42 : PTXReg<"rd42">; +def RD43 : PTXReg<"rd43">; +def RD44 : PTXReg<"rd44">; +def RD45 : PTXReg<"rd45">; +def RD46 : PTXReg<"rd46">; +def RD47 : PTXReg<"rd47">; +def RD48 : PTXReg<"rd48">; +def RD49 : PTXReg<"rd49">; +def RD50 : PTXReg<"rd50">; +def RD51 : PTXReg<"rd51">; +def RD52 : PTXReg<"rd52">; +def RD53 : PTXReg<"rd53">; +def RD54 : PTXReg<"rd54">; +def RD55 : PTXReg<"rd55">; +def RD56 : PTXReg<"rd56">; +def RD57 : PTXReg<"rd57">; +def RD58 : PTXReg<"rd58">; +def RD59 : PTXReg<"rd59">; +def RD60 : PTXReg<"rd60">; +def RD61 : PTXReg<"rd61">; +def RD62 : PTXReg<"rd62">; +def RD63 : PTXReg<"rd63">; + + +///===- 32-bit Floating-Point Registers -----------------------------------===// + +def F0 : PTXReg<"f0">; +def F1 : PTXReg<"f1">; +def F2 : PTXReg<"f2">; +def F3 : PTXReg<"f3">; +def F4 : PTXReg<"f4">; +def F5 : PTXReg<"f5">; +def F6 : PTXReg<"f6">; +def F7 : PTXReg<"f7">; +def F8 : PTXReg<"f8">; +def F9 : PTXReg<"f9">; +def F10 : PTXReg<"f10">; +def F11 : PTXReg<"f11">; +def F12 : PTXReg<"f12">; +def F13 : PTXReg<"f13">; +def F14 : PTXReg<"f14">; +def F15 : PTXReg<"f15">; +def F16 : PTXReg<"f16">; +def F17 : PTXReg<"f17">; +def F18 : PTXReg<"f18">; +def F19 : PTXReg<"f19">; +def F20 : PTXReg<"f20">; +def F21 : PTXReg<"f21">; +def F22 : PTXReg<"f22">; +def F23 : PTXReg<"f23">; +def F24 : PTXReg<"f24">; +def F25 : PTXReg<"f25">; +def F26 : PTXReg<"f26">; +def F27 : PTXReg<"f27">; +def F28 : PTXReg<"f28">; +def F29 : PTXReg<"f29">; +def F30 : PTXReg<"f30">; +def F31 : PTXReg<"f31">; +def F32 : PTXReg<"f32">; +def F33 : PTXReg<"f33">; +def F34 : PTXReg<"f34">; +def F35 : PTXReg<"f35">; +def F36 : PTXReg<"f36">; +def F37 : PTXReg<"f37">; +def F38 : PTXReg<"f38">; +def F39 : PTXReg<"f39">; +def F40 : PTXReg<"f40">; +def F41 : PTXReg<"f41">; +def F42 : PTXReg<"f42">; +def F43 : PTXReg<"f43">; +def F44 : PTXReg<"f44">; +def F45 : PTXReg<"f45">; +def F46 : PTXReg<"f46">; +def F47 : PTXReg<"f47">; +def F48 : PTXReg<"f48">; +def F49 : PTXReg<"f49">; +def F50 : PTXReg<"f50">; +def F51 : PTXReg<"f51">; +def F52 : PTXReg<"f52">; +def F53 : PTXReg<"f53">; +def F54 : PTXReg<"f54">; +def F55 : PTXReg<"f55">; +def F56 : PTXReg<"f56">; +def F57 : PTXReg<"f57">; +def F58 : PTXReg<"f58">; +def F59 : PTXReg<"f59">; +def F60 : PTXReg<"f60">; +def F61 : PTXReg<"f61">; +def F62 : PTXReg<"f62">; +def F63 : PTXReg<"f63">; + + +///===- 64-bit Floating-Point Registers -----------------------------------===// + 
+def FD0 : PTXReg<"fd0">; +def FD1 : PTXReg<"fd1">; +def FD2 : PTXReg<"fd2">; +def FD3 : PTXReg<"fd3">; +def FD4 : PTXReg<"fd4">; +def FD5 : PTXReg<"fd5">; +def FD6 : PTXReg<"fd6">; +def FD7 : PTXReg<"fd7">; +def FD8 : PTXReg<"fd8">; +def FD9 : PTXReg<"fd9">; +def FD10 : PTXReg<"fd10">; +def FD11 : PTXReg<"fd11">; +def FD12 : PTXReg<"fd12">; +def FD13 : PTXReg<"fd13">; +def FD14 : PTXReg<"fd14">; +def FD15 : PTXReg<"fd15">; +def FD16 : PTXReg<"fd16">; +def FD17 : PTXReg<"fd17">; +def FD18 : PTXReg<"fd18">; +def FD19 : PTXReg<"fd19">; +def FD20 : PTXReg<"fd20">; +def FD21 : PTXReg<"fd21">; +def FD22 : PTXReg<"fd22">; +def FD23 : PTXReg<"fd23">; +def FD24 : PTXReg<"fd24">; +def FD25 : PTXReg<"fd25">; +def FD26 : PTXReg<"fd26">; +def FD27 : PTXReg<"fd27">; +def FD28 : PTXReg<"fd28">; +def FD29 : PTXReg<"fd29">; +def FD30 : PTXReg<"fd30">; +def FD31 : PTXReg<"fd31">; +def FD32 : PTXReg<"fd32">; +def FD33 : PTXReg<"fd33">; +def FD34 : PTXReg<"fd34">; +def FD35 : PTXReg<"fd35">; +def FD36 : PTXReg<"fd36">; +def FD37 : PTXReg<"fd37">; +def FD38 : PTXReg<"fd38">; +def FD39 : PTXReg<"fd39">; +def FD40 : PTXReg<"fd40">; +def FD41 : PTXReg<"fd41">; +def FD42 : PTXReg<"fd42">; +def FD43 : PTXReg<"fd43">; +def FD44 : PTXReg<"fd44">; +def FD45 : PTXReg<"fd45">; +def FD46 : PTXReg<"fd46">; +def FD47 : PTXReg<"fd47">; +def FD48 : PTXReg<"fd48">; +def FD49 : PTXReg<"fd49">; +def FD50 : PTXReg<"fd50">; +def FD51 : PTXReg<"fd51">; +def FD52 : PTXReg<"fd52">; +def FD53 : PTXReg<"fd53">; +def FD54 : PTXReg<"fd54">; +def FD55 : PTXReg<"fd55">; +def FD56 : PTXReg<"fd56">; +def FD57 : PTXReg<"fd57">; +def FD58 : PTXReg<"fd58">; +def FD59 : PTXReg<"fd59">; +def FD60 : PTXReg<"fd60">; +def FD61 : PTXReg<"fd61">; +def FD62 : PTXReg<"fd62">; +def FD63 : PTXReg<"fd63">; + //===----------------------------------------------------------------------===// // Register classes @@ -93,10 +434,58 @@ def Preds : RegisterClass<"PTX", [i1], 8, [P0, P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P20, P21, P22, P23, - P24, P25, P26, P27, P28, P29, P30, P31]>; + P24, P25, P26, P27, P28, P29, P30, P31, + P32, P33, P34, P35, P36, P37, P38, P39, + P40, P41, P42, P43, P44, P45, P46, P47, + P48, P49, P50, P51, P52, P53, P54, P55, + P56, P57, P58, P59, P60, P61, P62, P63]>; -def RRegs32 : RegisterClass<"PTX", [i32], 32, +def RRegu16 : RegisterClass<"PTX", [i16], 16, + [RH0, RH1, RH2, RH3, RH4, RH5, RH6, RH7, + RH8, RH9, RH10, RH11, RH12, RH13, RH14, RH15, + RH16, RH17, RH18, RH19, RH20, RH21, RH22, RH23, + RH24, RH25, RH26, RH27, RH28, RH29, RH30, RH31, + RH32, RH33, RH34, RH35, RH36, RH37, RH38, RH39, + RH40, RH41, RH42, RH43, RH44, RH45, RH46, RH47, + RH48, RH49, RH50, RH51, RH52, RH53, RH54, RH55, + RH56, RH57, RH58, RH59, RH60, RH61, RH62, RH63]>; + +def RRegu32 : RegisterClass<"PTX", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, - R24, R25, R26, R27, R28, R29, R30, R31]>; + R24, R25, R26, R27, R28, R29, R30, R31, + R32, R33, R34, R35, R36, R37, R38, R39, + R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, + R56, R57, R58, R59, R60, R61, R62, R63]>; + +def RRegu64 : RegisterClass<"PTX", [i64], 64, + [RD0, RD1, RD2, RD3, RD4, RD5, RD6, RD7, + RD8, RD9, RD10, RD11, RD12, RD13, RD14, RD15, + RD16, RD17, RD18, RD19, RD20, RD21, RD22, RD23, + RD24, RD25, RD26, RD27, RD28, RD29, RD30, RD31, + RD32, RD33, RD34, RD35, RD36, RD37, RD38, RD39, + RD40, RD41, RD42, RD43, RD44, RD45, RD46, RD47, +
RD48, RD49, RD50, RD51, RD52, RD53, RD54, RD55, + RD56, RD57, RD58, RD59, RD60, RD61, RD62, RD63]>; + +def RRegf32 : RegisterClass<"PTX", [f32], 32, + [F0, F1, F2, F3, F4, F5, F6, F7, + F8, F9, F10, F11, F12, F13, F14, F15, + F16, F17, F18, F19, F20, F21, F22, F23, + F24, F25, F26, F27, F28, F29, F30, F31, + F32, F33, F34, F35, F36, F37, F38, F39, + F40, F41, F42, F43, F44, F45, F46, F47, + F48, F49, F50, F51, F52, F53, F54, F55, + F56, F57, F58, F59, F60, F61, F62, F63]>; + +def RRegf64 : RegisterClass<"PTX", [f64], 64, + [FD0, FD1, FD2, FD3, FD4, FD5, FD6, FD7, + FD8, FD9, FD10, FD11, FD12, FD13, FD14, FD15, + FD16, FD17, FD18, FD19, FD20, FD21, FD22, FD23, + FD24, FD25, FD26, FD27, FD28, FD29, FD30, FD31, + FD32, FD33, FD34, FD35, FD36, FD37, FD38, FD39, + FD40, FD41, FD42, FD43, FD44, FD45, FD46, FD47, + FD48, FD49, FD50, FD51, FD52, FD53, FD54, FD55, + FD56, FD57, FD58, FD59, FD60, FD61, FD62, FD63]>; diff --git a/lib/Target/PTX/PTXSubtarget.cpp b/lib/Target/PTX/PTXSubtarget.cpp index 00e2c88..a224f2b 100644 --- a/lib/Target/PTX/PTXSubtarget.cpp +++ b/lib/Target/PTX/PTXSubtarget.cpp @@ -12,12 +12,36 @@ //===----------------------------------------------------------------------===// #include "PTXSubtarget.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; -PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS) { - std::string TARGET = "sm_20"; - // TODO: call ParseSubtargetFeatures(FS, TARGET); +PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS, + bool is64Bit) + : PTXShaderModel(PTX_SM_1_0), + PTXVersion(PTX_VERSION_2_0), + SupportsDouble(false), + Is64Bit(is64Bit) { + std::string TARGET = "generic"; + ParseSubtargetFeatures(FS, TARGET); +} + +std::string PTXSubtarget::getTargetString() const { + switch(PTXShaderModel) { + default: llvm_unreachable("Unknown shader model"); + case PTX_SM_1_0: return "sm_10"; + case PTX_SM_1_3: return "sm_13"; + case PTX_SM_2_0: return "sm_20"; + } +} + +std::string PTXSubtarget::getPTXVersionString() const { + switch(PTXVersion) { + default: llvm_unreachable("Unknown PTX version"); + case PTX_VERSION_2_0: return "2.0"; + case PTX_VERSION_2_1: return "2.1"; + case PTX_VERSION_2_2: return "2.2"; + } } #include "PTXGenSubtarget.inc" diff --git a/lib/Target/PTX/PTXSubtarget.h b/lib/Target/PTX/PTXSubtarget.h index 7fd85f8..47d9842 100644 --- a/lib/Target/PTX/PTXSubtarget.h +++ b/lib/Target/PTX/PTXSubtarget.h @@ -19,10 +19,57 @@ namespace llvm { class PTXSubtarget : public TargetSubtarget { private: - bool is_sm20; + + /** + * Enumeration of Shader Models supported by the back-end. + */ + enum PTXShaderModelEnum { + PTX_SM_1_0, /*< Shader Model 1.0 */ + PTX_SM_1_3, /*< Shader Model 1.3 */ + PTX_SM_2_0 /*< Shader Model 2.0 */ + }; + + /** + * Enumeration of PTX versions supported by the back-end. + * + * Currently, PTX 2.0 is the minimum supported version. + */ + enum PTXVersionEnum { + PTX_VERSION_2_0, /*< PTX Version 2.0 */ + PTX_VERSION_2_1, /*< PTX Version 2.1 */ + PTX_VERSION_2_2 /*< PTX Version 2.2 */ + }; + + /// Shader Model supported on the target GPU. + PTXShaderModelEnum PTXShaderModel; + + /// PTX Language Version. + PTXVersionEnum PTXVersion; + + // The native .f64 type is supported on the hardware. + bool SupportsDouble; + + // Use .u64 instead of .u32 for addresses. 
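+  // (For example, a pointer kernel parameter is then declared .param .u64
+  //  instead of .param .u32 in the emitted PTX.)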
+ bool Is64Bit; public: - PTXSubtarget(const std::string &TT, const std::string &FS); + PTXSubtarget(const std::string &TT, const std::string &FS, bool is64Bit); + + std::string getTargetString() const; + + std::string getPTXVersionString() const; + + bool supportsDouble() const { return SupportsDouble; } + + bool is64Bit() const { return Is64Bit; } + + bool supportsSM13() const { return PTXShaderModel >= PTX_SM_1_3; } + + bool supportsSM20() const { return PTXShaderModel >= PTX_SM_2_0; } + + bool supportsPTX21() const { return PTXVersion >= PTX_VERSION_2_1; } + + bool supportsPTX22() const { return PTXVersion >= PTX_VERSION_2_2; } std::string ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp index b263813..1b737c9 100644 --- a/lib/Target/PTX/PTXTargetMachine.cpp +++ b/lib/Target/PTX/PTXTargetMachine.cpp @@ -16,12 +16,14 @@ #include "PTXTargetMachine.h" #include "llvm/PassManager.h" #include "llvm/Target/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; namespace llvm { MCStreamer *createPTXAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, bool isVerboseAsm, bool useLoc, + bool useCFI, MCInstPrinter *InstPrint, MCCodeEmitter *CE, TargetAsmBackend *TAB, @@ -29,21 +31,47 @@ namespace llvm { } extern "C" void LLVMInitializePTXTarget() { - RegisterTargetMachine<PTXTargetMachine> X(ThePTXTarget); - RegisterAsmInfo<PTXMCAsmInfo> Y(ThePTXTarget); - TargetRegistry::RegisterAsmStreamer(ThePTXTarget, createPTXAsmStreamer); + + RegisterTargetMachine<PTX32TargetMachine> X(ThePTX32Target); + RegisterTargetMachine<PTX64TargetMachine> Y(ThePTX64Target); + + RegisterAsmInfo<PTXMCAsmInfo> Z(ThePTX32Target); + RegisterAsmInfo<PTXMCAsmInfo> W(ThePTX64Target); + + TargetRegistry::RegisterAsmStreamer(ThePTX32Target, createPTXAsmStreamer); + TargetRegistry::RegisterAsmStreamer(ThePTX64Target, createPTXAsmStreamer); +} + +namespace { + const char* DataLayout32 = + "e-p:32:32-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64"; + const char* DataLayout64 = + "e-p:64:64-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64"; } // DataLayout and FrameLowering are filled with dummy data PTXTargetMachine::PTXTargetMachine(const Target &T, const std::string &TT, - const std::string &FS) + const std::string &FS, + bool is64Bit) : LLVMTargetMachine(T, TT), - DataLayout("e-p:32:32-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64"), + DataLayout(is64Bit ? 
DataLayout64 : DataLayout32), + Subtarget(TT, FS, is64Bit), FrameLowering(Subtarget), InstrInfo(*this), - TLInfo(*this), - Subtarget(TT, FS) { + TLInfo(*this) { +} + +PTX32TargetMachine::PTX32TargetMachine(const Target &T, + const std::string& TT, + const std::string& FS) + : PTXTargetMachine(T, TT, FS, false) { +} + +PTX64TargetMachine::PTX64TargetMachine(const Target &T, + const std::string& TT, + const std::string& FS) + : PTXTargetMachine(T, TT, FS, true) { } bool PTXTargetMachine::addInstSelector(PassManagerBase &PM, diff --git a/lib/Target/PTX/PTXTargetMachine.h b/lib/Target/PTX/PTXTargetMachine.h index 728e36f..149be8e 100644 --- a/lib/Target/PTX/PTXTargetMachine.h +++ b/lib/Target/PTX/PTXTargetMachine.h @@ -25,15 +25,15 @@ namespace llvm { class PTXTargetMachine : public LLVMTargetMachine { private: - const TargetData DataLayout; - PTXFrameLowering FrameLowering; - PTXInstrInfo InstrInfo; + const TargetData DataLayout; + PTXSubtarget Subtarget; // has to be initialized before FrameLowering + PTXFrameLowering FrameLowering; + PTXInstrInfo InstrInfo; PTXTargetLowering TLInfo; - PTXSubtarget Subtarget; public: PTXTargetMachine(const Target &T, const std::string &TT, - const std::string &FS); + const std::string &FS, bool is64Bit); virtual const TargetData *getTargetData() const { return &DataLayout; } @@ -55,6 +55,22 @@ class PTXTargetMachine : public LLVMTargetMachine { virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); }; // class PTXTargetMachine + + +class PTX32TargetMachine : public PTXTargetMachine { +public: + + PTX32TargetMachine(const Target &T, const std::string &TT, + const std::string& FS); +}; // class PTX32TargetMachine + +class PTX64TargetMachine : public PTXTargetMachine { +public: + + PTX64TargetMachine(const Target &T, const std::string &TT, + const std::string& FS); +}; // class PTX64TargetMachine + } // namespace llvm #endif // PTX_TARGET_MACHINE_H diff --git a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp index a577d77..9df6c75 100644 --- a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp +++ b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp @@ -13,9 +13,13 @@ using namespace llvm; -Target llvm::ThePTXTarget; +Target llvm::ThePTX32Target; +Target llvm::ThePTX64Target; extern "C" void LLVMInitializePTXTargetInfo() { // see llvm/ADT/Triple.h - RegisterTarget<Triple::ptx> X(ThePTXTarget, "ptx", "PTX"); + RegisterTarget<Triple::ptx32> X32(ThePTX32Target, "ptx32", + "PTX (32-bit) [Experimental]"); + RegisterTarget<Triple::ptx64> X64(ThePTX64Target, "ptx64", + "PTX (64-bit) [Experimental]"); } diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index ebc10da..9cf9db9 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -17,13 +17,16 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { - class MCOperand; + +class MCOperand; +class TargetMachine; class PPCInstPrinter : public MCInstPrinter { // 0 -> AIX, 1 -> Darwin.
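+  // (SyntaxVariant == 1 selects Darwin syntax; see isDarwinSyntax() below.)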
unsigned SyntaxVariant; public: - PPCInstPrinter(const MCAsmInfo &MAI, unsigned syntaxVariant) + PPCInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI, + unsigned syntaxVariant) : MCInstPrinter(MAI), SyntaxVariant(syntaxVariant) {} bool isDarwinSyntax() const { diff --git a/lib/Target/PowerPC/PPCAsmBackend.cpp b/lib/Target/PowerPC/PPCAsmBackend.cpp index c4d4ac9..f562a3f 100644 --- a/lib/Target/PowerPC/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/PPCAsmBackend.cpp @@ -110,10 +110,8 @@ namespace { TargetAsmBackend *llvm::createPPCAsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + if (Triple(TT).isOSDarwin()) return new DarwinPPCAsmBackend(T); - default: - return 0; - } + + return 0; } diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 8ed5d7f..09a9be9 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -680,9 +680,10 @@ static AsmPrinter *createPPCAsmPrinterPass(TargetMachine &tm, } static MCInstPrinter *createPPCMCInstPrinter(const Target &T, + TargetMachine &TM, unsigned SyntaxVariant, const MCAsmInfo &MAI) { - return new PPCInstPrinter(MAI, SyntaxVariant); + return new PPCInstPrinter(TM, MAI, SyntaxVariant); } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 70d00e4..128522c 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -899,7 +899,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, short Imm; if (isIntS16Immediate(CN, Imm)) { Disp = DAG.getTargetConstant(Imm, CN->getValueType(0)); - Base = DAG.getRegister(PPC::R0, CN->getValueType(0)); + Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::X0 : PPC::R0, + CN->getValueType(0)); return true; } @@ -947,7 +948,8 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, } // Otherwise, do it the hard way, using R0 as the base register. - Base = DAG.getRegister(PPC::R0, N.getValueType()); + Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::X0 : PPC::R0, + N.getValueType()); Index = N; return true; } @@ -2153,7 +2155,7 @@ CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG, } /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be -/// adjusted to accomodate the arguments for the tailcall. +/// adjusted to accommodate the arguments for the tailcall. static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, unsigned ParamSize) { @@ -2394,7 +2396,7 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, // Emit a sequence of copyto/copyfrom virtual registers for arguments that // might overwrite each other in case of tail call optimization. SmallVector<SDValue, 8> MemOpChains2; - // Do not flag preceeding copytoreg stuff together with the following stuff. + // Do not flag preceding copytoreg stuff together with the following stuff. 
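+  // (Assigning a default-constructed SDValue clears the glue operand, so the
+  //  following copies start a fresh glue chain.)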
InFlag = SDValue(); StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, MemOpChains2, dl); @@ -2442,7 +2444,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) { unsigned OpFlags = 0; if (DAG.getTarget().getRelocationModel() != Reloc::Static && - PPCSubTarget.getDarwinVers() < 9 && + (!PPCSubTarget.getTargetTriple().isMacOSX() || + PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && (G->getGlobal()->isDeclaration() || G->getGlobal()->isWeakForLinker())) { // PC-relative references to external symbols should go through $stub, @@ -2465,7 +2468,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, unsigned char OpFlags = 0; if (DAG.getTarget().getRelocationModel() != Reloc::Static && - PPCSubTarget.getDarwinVers() < 9) { + (!PPCSubTarget.getTargetTriple().isMacOSX() || + PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -4571,6 +4575,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, // registers without caring whether they're 32 or 64, but here we're // doing actual arithmetic on the addresses. bool is64bit = PPCSubTarget.isPPC64(); + unsigned ZeroReg = is64bit ? PPC::X0 : PPC::R0; const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); @@ -4634,8 +4639,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, // bne- loopMBB // fallthrough --> exitMBB // srw dest, tmpDest, shift - - if (ptrA!=PPC::R0) { + if (ptrA != ZeroReg) { Ptr1Reg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) .addReg(ptrA).addReg(ptrB); @@ -4665,7 +4669,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, BB = loopMBB; BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) - .addReg(PPC::R0).addReg(PtrReg); + .addReg(ZeroReg).addReg(PtrReg); if (BinOpcode) BuildMI(BB, dl, TII->get(BinOpcode), TmpReg) .addReg(Incr2Reg).addReg(TmpDestReg); @@ -4676,7 +4680,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg) .addReg(Tmp3Reg).addReg(Tmp2Reg); BuildMI(BB, dl, TII->get(PPC::STWCX)) - .addReg(Tmp4Reg).addReg(PPC::R0).addReg(PtrReg); + .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); BB->addSuccessor(loopMBB); @@ -4685,7 +4689,8 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, // exitMBB: // ... BB = exitMBB; - BuildMI(BB, dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg).addReg(ShiftReg); + BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg) + .addReg(ShiftReg); return BB; } @@ -4933,6 +4938,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); unsigned Ptr1Reg; unsigned TmpReg = RegInfo.createVirtualRegister(RC); + unsigned ZeroReg = is64bit ? PPC::X0 : PPC::R0; // thisMBB: // ... // fallthrough --> loopMBB @@ -4965,7 +4971,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // stwcx. tmpDest, ptr // exitBB: // srw dest, tmpDest, shift - if (ptrA!=PPC::R0) { + if (ptrA != ZeroReg) { Ptr1Reg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? 
PPC::ADD8 : PPC::ADD4), Ptr1Reg) .addReg(ptrA).addReg(ptrB); @@ -5002,7 +5008,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BB = loop1MBB; BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) - .addReg(PPC::R0).addReg(PtrReg); + .addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::AND),TmpReg) .addReg(TmpDestReg).addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) @@ -5018,7 +5024,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg) .addReg(Tmp2Reg).addReg(NewVal3Reg); BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg) - .addReg(PPC::R0).addReg(PtrReg); + .addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); @@ -5027,13 +5033,14 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BB = midMBB; BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg) - .addReg(PPC::R0).addReg(PtrReg); + .addReg(ZeroReg).addReg(PtrReg); BB->addSuccessor(exitMBB); // exitMBB: // ... BB = exitMBB; - BuildMI(BB, dl, TII->get(PPC::SRW),dest).addReg(TmpReg).addReg(ShiftReg); + BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg) + .addReg(ShiftReg); } else { llvm_unreachable("Unexpected instr type to insert"); } diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 6636b69..9f0fae5 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -130,7 +130,7 @@ def : Pat<(PPCnop), // Atomic operations let usesCustomInserter = 1 in { - let Uses = [CR0] in { + let Defs = [CR0] in { def ATOMIC_LOAD_ADD_I64 : Pseudo< (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "", [(set G8RC:$dst, (atomic_load_add_64 xoaddr:$ptr, G8RC:$incr))]>; diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 82aadeb..24071b7 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -550,7 +550,7 @@ def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst), // Atomic operations let usesCustomInserter = 1 in { - let Uses = [CR0] in { + let Defs = [CR0] in { def ATOMIC_LOAD_ADD_I8 : Pseudo< (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "", [(set GPRC:$dst, (atomic_load_add_8 xoaddr:$ptr, GPRC:$incr))]>; diff --git a/lib/Target/PowerPC/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/PPCMCAsmInfo.cpp index d1178dd..9e508cc 100644 --- a/lib/Target/PowerPC/PPCMCAsmInfo.cpp +++ b/lib/Target/PowerPC/PPCMCAsmInfo.cpp @@ -17,7 +17,7 @@ using namespace llvm; PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) { PCSymbol = "."; CommentString = ";"; - ExceptionsType = ExceptionHandling::DwarfTable; + ExceptionsType = ExceptionHandling::DwarfCFI; if (!is64Bit) Data64bitsDirective = 0; // We can't emit a 64-bit unit in PPC32 mode. diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index 72a1dee..5f3aa23 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -70,7 +70,7 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &FS, , HasSTFIWX(false) , HasLazyResolverStubs(false) , IsJITCodeModel(false) - , DarwinVers(0) { + , TargetTriple(TT) { // Determine default and user specified characteristics std::string CPU = "generic"; @@ -92,19 +92,6 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &FS, // support it, ignore. 
if (use64BitRegs() && !has64BitSupport()) Use64BitRegs = false; - - // Set the boolean corresponding to the current target triple, or the default - // if one cannot be determined, to true. - if (TT.length() > 7) { - // Determine which version of darwin this is. - size_t DarwinPos = TT.find("-darwin"); - if (DarwinPos != std::string::npos) { - if (isdigit(TT[DarwinPos+7])) - DarwinVers = atoi(&TT[DarwinPos+7]); - else - DarwinVers = 8; // Minimum supported darwin is Tiger. - } - } // Set up darwin-specific properties. if (isDarwin()) diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 00ec747..8fd1a44 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -14,6 +14,7 @@ #ifndef POWERPCSUBTARGET_H #define POWERPCSUBTARGET_H +#include "llvm/ADT/Triple.h" #include "llvm/Target/TargetInstrItineraries.h" #include "llvm/Target/TargetSubtarget.h" @@ -65,9 +66,9 @@ protected: bool HasLazyResolverStubs; bool IsJITCodeModel; - /// DarwinVers - Nonzero if this is a darwin platform. Otherwise, the numeric - /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc. - unsigned char DarwinVers; // Is any darwin-ppc platform. + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + public: /// This constructor initializes the data members to match that /// of the specified triple. @@ -134,13 +135,10 @@ public: bool hasAltivec() const { return HasAltivec; } bool isGigaProcessor() const { return IsGigaProcessor; } - /// isDarwin - True if this is any darwin platform. - bool isDarwin() const { return DarwinVers != 0; } - /// isDarwin - True if this is darwin9 (leopard, 10.5) or above. - bool isDarwin9() const { return DarwinVers >= 9; } + const Triple &getTargetTriple() const { return TargetTriple; } - /// getDarwinVers - Return the darwin version number, 8 = tiger, 9 = leopard. - unsigned getDarwinVers() const { return DarwinVers; } + /// isDarwin - True if this is any darwin platform. + bool isDarwin() const { return TargetTriple.isMacOSX(); } bool isDarwinABI() const { return isDarwin(); } bool isSVR4ABI() const { return !isDarwin(); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 212b450..d27e54e 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -24,7 +24,7 @@ using namespace llvm; static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { Triple TheTriple(TT); bool isPPC64 = TheTriple.getArch() == Triple::ppc64; - if (TheTriple.getOS() == Triple::Darwin) + if (TheTriple.isOSDarwin()) return new PPCMCAsmInfoDarwin(isPPC64); return new PPCLinuxMCAsmInfo(isPPC64); @@ -37,12 +37,10 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, MCCodeEmitter *Emitter, bool RelaxAll, bool NoExecStack) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + if (Triple(TT).isOSDarwin()) return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll); - default: - return NULL; - } + + return NULL; } extern "C" void LLVMInitializePowerPCTarget() { diff --git a/lib/Target/README.txt b/lib/Target/README.txt index f85914b..ffe3fa4 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -392,34 +392,6 @@ PHI Slicing could be extended to do this. //===---------------------------------------------------------------------===// -LSR should know what GPR types a target has from TargetData. 
This code: - -volatile short X, Y; // globals - -void foo(int N) { - int i; - for (i = 0; i < N; i++) { X = i; Y = i*4; } -} - -produces two near identical IV's (after promotion) on PPC/ARM: - -LBB1_2: - ldr r3, LCPI1_0 - ldr r3, [r3] - strh r2, [r3] - ldr r3, LCPI1_1 - ldr r3, [r3] - strh r1, [r3] - add r1, r1, #4 - add r2, r2, #1 <- [0,+,1] - sub r0, r0, #1 <- [0,-,1] - cmp r0, #0 - bne LBB1_2 - -LSR should reuse the "+" IV for the exit test. - -//===---------------------------------------------------------------------===// - Tail call elim should be more aggressive, checking to see if the call is followed by an uncond branch to an exit block. @@ -1325,6 +1297,21 @@ codegen. //===---------------------------------------------------------------------===// +simplifylibcalls should turn these snprintf idioms into memcpy (GCC PR47917) + +char buf1[6], buf2[6], buf3[4], buf4[4]; +int i; + +int foo (void) { + int ret = snprintf (buf1, sizeof buf1, "abcde"); + ret += snprintf (buf2, sizeof buf2, "abcdef") * 16; + ret += snprintf (buf3, sizeof buf3, "%s", i++ < 6 ? "abc" : "def") * 256; + ret += snprintf (buf4, sizeof buf4, "%s", i++ > 10 ? "abcde" : "defgh")*4096; + return ret; +} + +//===---------------------------------------------------------------------===// + "gas" uses this idiom: else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string)) .. @@ -1780,43 +1767,6 @@ case it choses instead to keep the max operation obvious. //===---------------------------------------------------------------------===// -Take the following testcase on x86-64 (similar testcases exist for all targets -with addc/adde): - -define void @a(i64* nocapture %s, i64* nocapture %t, i64 %a, i64 %b, -i64 %c) nounwind { -entry: - %0 = zext i64 %a to i128 ; <i128> [#uses=1] - %1 = zext i64 %b to i128 ; <i128> [#uses=1] - %2 = add i128 %1, %0 ; <i128> [#uses=2] - %3 = zext i64 %c to i128 ; <i128> [#uses=1] - %4 = shl i128 %3, 64 ; <i128> [#uses=1] - %5 = add i128 %4, %2 ; <i128> [#uses=1] - %6 = lshr i128 %5, 64 ; <i128> [#uses=1] - %7 = trunc i128 %6 to i64 ; <i64> [#uses=1] - store i64 %7, i64* %s, align 8 - %8 = trunc i128 %2 to i64 ; <i64> [#uses=1] - store i64 %8, i64* %t, align 8 - ret void -} - -Generated code: - addq %rcx, %rdx - sbbq %rax, %rax - subq %rax, %r8 - movq %r8, (%rdi) - movq %rdx, (%rsi) - ret - -Expected code: - addq %rcx, %rdx - adcq $0, %r8 - movq %r8, (%rdi) - movq %rdx, (%rsi) - ret - -//===---------------------------------------------------------------------===// - Switch lowering generates less than ideal code for the following switch: define void @a(i32 %x) nounwind { entry: @@ -2124,11 +2074,12 @@ for.end: ; preds = %entry } This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold -the two memset's together. The issue with %n seems to stem from poor handling -of the original loop. +the two memsets together. -To simplify this, we need SCEV to know that "n != 0" because of the dominating -conditional. That would turn the second memset into a simple memset of 'n'. +The issue with the addition only occurs in 64-bit mode, and appears to be at +least partially caused by Scalar Evolution not keeping its cache updated: it +returns the "wrong" result immediately after indvars runs, but figures out the +expected result if it is run from scratch on IR resulting from running indvars.
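+
+A reduced illustration (hypothetical; the original testcase for this entry is
+not shown here): for code along the lines of
+
+void f(char *p, int n) {
+  int i;
+  for (i = 0; i < n; i++) p[i] = 0;
+  for (i = 0; i < n; i++) p[i + n] = 0;
+}
+
+loop idiom recognition produces two adjacent memsets whose lengths are
+computed as ((zext i32 (%n - 1) to i64) + 1) on 64-bit targets; folding them
+into a single memset of 2*n would be the ideal result.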
//===---------------------------------------------------------------------===// @@ -2287,4 +2238,71 @@ missed cases: //===---------------------------------------------------------------------===// + +define i1 @test1(i32 %x) nounwind { + %and = and i32 %x, 3 + %cmp = icmp ult i32 %and, 2 + ret i1 %cmp +} + +Can be folded to (x & 2) == 0. + +define i1 @test2(i32 %x) nounwind { + %and = and i32 %x, 3 + %cmp = icmp ugt i32 %and, 1 + ret i1 %cmp +} + +Can be folded to (x & 2) != 0. + +SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the +icmp transform. + +//===---------------------------------------------------------------------===// + +This code: + +typedef struct { +int f1:1; +int f2:1; +int f3:1; +int f4:29; +} t1; + +typedef struct { +int f1:1; +int f2:1; +int f3:30; +} t2; + +t1 s1; +t2 s2; + +void func1(void) +{ +s1.f1 = s2.f1; +s1.f2 = s2.f2; +} + +Compiles into this IR (on x86-64 at least): + +%struct.t1 = type { i8, [3 x i8] } +@s2 = global %struct.t1 zeroinitializer, align 4 +@s1 = global %struct.t1 zeroinitializer, align 4 +define void @func1() nounwind ssp noredzone { +entry: + %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4 + %bf.val.sext5 = and i32 %0, 1 + %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4 + %2 = and i32 %1, -4 + %3 = or i32 %2, %bf.val.sext5 + %bf.val.sext26 = and i32 %0, 2 + %4 = or i32 %3, %bf.val.sext26 + store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4 + ret void +} + +The two or/and pairs should be merged into one each. + +//===---------------------------------------------------------------------===// + diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 70574c3..edb62fa 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -544,7 +544,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token // chain and flag operands which copy the outgoing args into registers. - // The InFlag in necessary since all emited instructions must be + // The InFlag is necessary since all emitted instructions must be // stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { diff --git a/lib/Target/SubtargetFeature.cpp b/lib/Target/SubtargetFeature.cpp index 3cf95b5..e0a9de8 100644 --- a/lib/Target/SubtargetFeature.cpp +++ b/lib/Target/SubtargetFeature.cpp @@ -211,7 +211,7 @@ const std::string & SubtargetFeatures::getCPU() const { /// feature, set it. /// static -void SetImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry, +void SetImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry, const SubtargetFeatureKV *FeatureTable, size_t FeatureTableSize) { for (size_t i = 0; i < FeatureTableSize; ++i) { @@ -230,7 +230,7 @@ void SetImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry, /// feature, clear it. /// static -void ClearImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry, +void ClearImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry, const SubtargetFeatureKV *FeatureTable, size_t FeatureTableSize) { for (size_t i = 0; i < FeatureTableSize; ++i) { @@ -247,7 +247,7 @@ void ClearImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry, /// getBits - Get feature bits.
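+/// The bits are carried in a uint64_t, so a target may define up to 64
+/// independent subtarget features.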
/// -uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable, +uint64_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable, size_t CPUTableSize, const SubtargetFeatureKV *FeatureTable, size_t FeatureTableSize) { @@ -263,7 +263,7 @@ uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable, "CPU features table is not sorted"); } #endif - uint32_t Bits = 0; // Resulting bits + uint64_t Bits = 0; // Resulting bits // Check if help is needed if (Features[0] == "help") diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 90939c3..d331614 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -451,7 +451,7 @@ SystemZTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain and // flag operands which copy the outgoing args into registers. The InFlag in - // necessary since all emited instructions must be stuck together. + // necessary since all emitted instructions must be stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp index c628df0..1990bc7 100644 --- a/lib/Target/TargetData.cpp +++ b/lib/Target/TargetData.cpp @@ -617,10 +617,14 @@ uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices, unsigned TargetData::getPreferredAlignment(const GlobalVariable *GV) const { const Type *ElemType = GV->getType()->getElementType(); unsigned Alignment = getPrefTypeAlignment(ElemType); - if (GV->getAlignment() > Alignment) - Alignment = GV->getAlignment(); + unsigned GVAlignment = GV->getAlignment(); + if (GVAlignment >= Alignment) { + Alignment = GVAlignment; + } else if (GVAlignment != 0) { + Alignment = std::max(GVAlignment, getABITypeAlignment(ElemType)); + } - if (GV->hasInitializer()) { + if (GV->hasInitializer() && GVAlignment == 0) { if (Alignment < 16) { // If the global is not external, see if it is large. If so, give it a // larger alignment. diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp index 97f3bf6..d4b7697 100644 --- a/lib/Target/TargetInstrInfo.cpp +++ b/lib/Target/TargetInstrInfo.cpp @@ -149,10 +149,10 @@ bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { /// Measure the specified inline asm to determine an approximation of its /// length. -/// Comments (which run till the next SeparatorChar or newline) do not +/// Comments (which run till the next SeparatorString or newline) do not /// count as an instruction. /// Any other non-whitespace text is considered an instruction, with -/// multiple instructions separated by SeparatorChar or newlines. +/// multiple instructions separated by SeparatorString or newlines. /// Variable-length instructions are not handled here; this function /// may be overloaded in the target code to do that. 
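+/// For example (illustrative): with SeparatorString ";" and MaxInstLength 4,
+/// the string "mov %eax, %ebx ; add %ecx, %edx\n" is measured as two
+/// instructions, for an estimate of 8 bytes.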
unsigned TargetInstrInfo::getInlineAsmLength(const char *Str, @@ -163,7 +163,8 @@ unsigned TargetInstrInfo::getInlineAsmLength(const char *Str, bool atInsnStart = true; unsigned Length = 0; for (; *Str; ++Str) { - if (*Str == '\n' || *Str == MAI.getSeparatorChar()) + if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(), + strlen(MAI.getSeparatorString())) == 0) atInsnStart = true; if (atInsnStart && !std::isspace(*Str)) { Length += MAI.getMaxInstLength(); diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp index c8bed18..e336b09 100644 --- a/lib/Target/TargetLibraryInfo.cpp +++ b/lib/Target/TargetLibraryInfo.cpp @@ -28,9 +28,22 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T) { // memset_pattern16 is only available on iOS 3.0 and Mac OS/X 10.5 and later. - if (T.getOS() != Triple::Darwin || T.getDarwinMajorNumber() < 9) + if (T.isMacOSX()) { + if (T.isMacOSXVersionLT(10, 5)) + TLI.setUnavailable(LibFunc::memset_pattern16); + } else if (T.getOS() == Triple::IOS) { + if (T.isOSVersionLT(3, 0)) + TLI.setUnavailable(LibFunc::memset_pattern16); + } else { TLI.setUnavailable(LibFunc::memset_pattern16); - + } + + // iprintf and friends are only available on XCore. + if (T.getArch() != Triple::xcore) { + TLI.setUnavailable(LibFunc::iprintf); + TLI.setUnavailable(LibFunc::siprintf); + TLI.setUnavailable(LibFunc::fiprintf); + } } diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index 5d34c7d..717ad41 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -120,6 +120,18 @@ static bool IsNullTerminatedString(const Constant *C) { return false; } +MCSymbol *TargetLoweringObjectFile:: +getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const { + return Mang->getSymbol(GV); +} + +void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer, + const TargetMachine &TM, + const MCSymbol *Sym) const { +} + + /// getKindForGlobal - This is a top-level target-independent classifier for /// a global variable. 
Given an global variable and information from TM, it /// classifies the global in a variety of ways that make various target @@ -305,16 +317,15 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI, unsigned Encoding, MCStreamer &Streamer) const { const MCSymbol *Sym = Mang->getSymbol(GV); - return getExprForDwarfReference(Sym, Mang, MMI, Encoding, Streamer); + return getExprForDwarfReference(Sym, Encoding, Streamer); } const MCExpr *TargetLoweringObjectFile:: -getExprForDwarfReference(const MCSymbol *Sym, Mangler *Mang, - MachineModuleInfo *MMI, unsigned Encoding, +getExprForDwarfReference(const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const { const MCExpr *Res = MCSymbolRefExpr::Create(Sym, getContext()); - switch (Encoding & 0xF0) { + switch (Encoding & 0x70) { default: report_fatal_error("We do not support this DWARF encoding yet!"); case dwarf::DW_EH_PE_absptr: @@ -339,7 +350,7 @@ unsigned TargetLoweringObjectFile::getLSDAEncoding() const { return dwarf::DW_EH_PE_absptr; } -unsigned TargetLoweringObjectFile::getFDEEncoding() const { +unsigned TargetLoweringObjectFile::getFDEEncoding(bool CFI) const { return dwarf::DW_EH_PE_absptr; } diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index d579d95..76ccc09 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -48,6 +48,7 @@ namespace llvm { bool RealignStack; bool DisableJumpTables; bool StrongPHIElim; + bool HasDivModLibcall; bool AsmVerbosityDefault(false); } @@ -205,6 +206,10 @@ EnableStrongPHIElim(cl::Hidden, "strong-phi-elim", cl::desc("Use strong PHI elimination."), cl::location(StrongPHIElim), cl::init(false)); +static cl::opt<std::string> +TrapFuncName("trap-func", cl::Hidden, + cl::desc("Emit a call to trap function rather than a trap instruction"), + cl::init("")); static cl::opt<bool> DataSections("fdata-sections", cl::desc("Emit data into separate sections"), @@ -221,7 +226,9 @@ TargetMachine::TargetMachine(const Target &T) : TheTarget(T), AsmInfo(0), MCRelaxAll(false), MCNoExecStack(false), - MCUseLoc(true) { + MCSaveTempLabels(false), + MCUseLoc(true), + MCUseCFI(true) { // Typically it will be subtargets that will adjust FloatABIType from Default // to Soft or Hard. if (UseSoftFloat) @@ -303,4 +310,11 @@ namespace llvm { bool HonorSignDependentRoundingFPMath() { return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption; } + + /// getTrapFunctionName - If this returns a non-empty string, this means isel + /// should lower Intrinsic::trap to a call to the specified function name + /// instead of an ISD::TRAP node. + StringRef getTrapFunctionName() { + return TrapFuncName; + } } diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 8fe549b..c352bfc 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -53,6 +53,14 @@ private: SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCStreamer &Out); + /// isSrcOp - Returns true if operand is either (%rsi) or %ds:(%rsi) + /// in 64bit mode or (%esi) or %ds:(%esi) in 32bit mode. + bool isSrcOp(X86Operand &Op); + + /// isDstOp - Returns true if operand is either %es:(%rdi) in 64bit mode + /// or %es:(%edi) in 32bit mode. + bool isDstOp(X86Operand &Op); + /// @name Auto-generated Matcher Functions /// { @@ -356,6 +364,24 @@ struct X86Operand : public MCParsedAsmOperand { } // end anonymous namespace. +bool X86ATTAsmParser::isSrcOp(X86Operand &Op) { + unsigned basereg = Is64Bit ?
X86::RSI : X86::ESI; + + return (Op.isMem() && + (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::DS) && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0); +} + +bool X86ATTAsmParser::isDstOp(X86Operand &Op) { + unsigned basereg = Is64Bit ? X86::RDI : X86::EDI; + + return Op.isMem() && Op.Mem.SegReg == X86::ES && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0; +} bool X86ATTAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { @@ -788,7 +814,106 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, delete &Op; } } - + // Transform "ins[bwl] %dx, %es:(%edi)" into "ins[bwl]" + if (Name.startswith("ins") && Operands.size() == 3 && + (Name == "insb" || Name == "insw" || Name == "insl")) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + if (Op.isReg() && Op.getReg() == X86::DX && isDstOp(Op2)) { + Operands.pop_back(); + Operands.pop_back(); + delete &Op; + delete &Op2; + } + } + + // Transform "outs[bwl] %ds:(%esi), %dx" into "outs[bwl]" + if (Name.startswith("outs") && Operands.size() == 3 && + (Name == "outsb" || Name == "outsw" || Name == "outsl")) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + if (isSrcOp(Op) && Op2.isReg() && Op2.getReg() == X86::DX) { + Operands.pop_back(); + Operands.pop_back(); + delete &Op; + delete &Op2; + } + } + + // Transform "movs[bwl] %ds:(%esi), %es:(%edi)" into "movs[bwl]" + if (Name.startswith("movs") && Operands.size() == 3 && + (Name == "movsb" || Name == "movsw" || Name == "movsl" || + (Is64Bit && Name == "movsq"))) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + if (isSrcOp(Op) && isDstOp(Op2)) { + Operands.pop_back(); + Operands.pop_back(); + delete &Op; + delete &Op2; + } + } + // Transform "lods[bwl] %ds:(%esi),{%al,%ax,%eax,%rax}" into "lods[bwl]" + if (Name.startswith("lods") && Operands.size() == 3 && + (Name == "lods" || Name == "lodsb" || Name == "lodsw" || + Name == "lodsl" || (Is64Bit && Name == "lodsq"))) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]); + if (isSrcOp(*Op1) && Op2->isReg()) { + const char *ins; + unsigned reg = Op2->getReg(); + bool isLods = Name == "lods"; + if (reg == X86::AL && (isLods || Name == "lodsb")) + ins = "lodsb"; + else if (reg == X86::AX && (isLods || Name == "lodsw")) + ins = "lodsw"; + else if (reg == X86::EAX && (isLods || Name == "lodsl")) + ins = "lodsl"; + else if (reg == X86::RAX && (isLods || Name == "lodsq")) + ins = "lodsq"; + else + ins = NULL; + if (ins != NULL) { + Operands.pop_back(); + Operands.pop_back(); + delete Op1; + delete Op2; + if (Name != ins) + static_cast<X86Operand*>(Operands[0])->setTokenValue(ins); + } + } + } + // Transform "stos[bwl] {%al,%ax,%eax,%rax},%es:(%edi)" into "stos[bwl]" + if (Name.startswith("stos") && Operands.size() == 3 && + (Name == "stos" || Name == "stosb" || Name == "stosw" || + Name == "stosl" || (Is64Bit && Name == "stosq"))) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]); + if (isDstOp(*Op2) && Op1->isReg()) { + const char *ins; + unsigned reg = Op1->getReg(); + bool isStos = Name == "stos"; + if (reg == X86::AL && (isStos ||
Name == "stosb")) + ins = "stosb"; + else if (reg == X86::AX && (isStos || Name == "stosw")) + ins = "stosw"; + else if (reg == X86::EAX && (isStos || Name == "stosl")) + ins = "stosl"; + else if (reg == X86::RAX && (isStos || Name == "stosq")) + ins = "stosq"; + else + ins = NULL; + if (ins != NULL) { + Operands.pop_back(); + Operands.pop_back(); + delete Op1; + delete Op2; + if (Name != ins) + static_cast<X86Operand*>(Operands[0])->setTokenValue(ins); + } + } + } + // FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>. Canonicalize to // "shift <op>". if ((Name.startswith("shr") || Name.startswith("sar") || @@ -803,6 +928,18 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, Operands.erase(Operands.begin() + 1); } } + + // Transforms "int $3" into "int3" as a size optimization. We can't write an + // instalias with an immediate operand yet. + if (Name == "int" && Operands.size() == 2) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && + cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) { + delete Operands[1]; + Operands.erase(Operands.begin() + 1); + static_cast<X86Operand*>(Operands[0])->setTokenValue("int3"); + } + } return false; } diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index f777756..d8a105e 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -409,6 +409,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM32: case TYPE_XMM64: case TYPE_XMM128: + case TYPE_XMM256: case TYPE_DEBUGREG: case TYPE_CONTROLREG: return translateRMRegister(mcInst, insn); @@ -418,6 +419,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_M32: case TYPE_M64: case TYPE_M128: + case TYPE_M256: case TYPE_M512: case TYPE_Mv: case TYPE_M32FP: @@ -500,6 +502,9 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, case ENCODING_Rv: translateRegister(mcInst, insn.opcodeRegister); return false; + case ENCODING_VVVV: + translateRegister(mcInst, insn.vvvv); + return false; case ENCODING_DUP: return translateOperand(mcInst, insn.spec->operands[operand.type - TYPE_DUP0], diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c index b6546fc..de1610b 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -75,6 +75,12 @@ static int modRMRequired(OpcodeType type, case THREEBYTE_3A: decision = &THREEBYTE3A_SYM; break; + case THREEBYTE_A6: + decision = &THREEBYTEA6_SYM; + break; + case THREEBYTE_A7: + decision = &THREEBYTEA7_SYM; + break; } return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. 
@@ -115,6 +121,12 @@ static InstrUID decode(OpcodeType type, case THREEBYTE_3A: dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; break; + case THREEBYTE_A6: + dec = &THREEBYTEA6_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_A7: + dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; } switch (dec->modrm_type) { @@ -368,29 +380,109 @@ static int readPrefixes(struct InternalInstruction* insn) { if (isPrefix) dbgprintf(insn, "Found prefix 0x%hhx", byte); } + + insn->vexSize = 0; - if (insn->mode == MODE_64BIT) { - if ((byte & 0xf0) == 0x40) { - uint8_t opcodeByte; + if (byte == 0xc4) { + uint8_t byte1; - if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { - dbgprintf(insn, "Redundant REX prefix"); - return -1; + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || byte1 & 0x8) { + insn->vexSize = 3; + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + + if (insn->vexSize == 3) { + insn->vexPrefix[0] = byte; + consumeByte(insn, &insn->vexPrefix[1]); + consumeByte(insn, &insn->vexPrefix[2]); + + /* We simulate the REX prefix for simplicity's sake */ + + insn->rexPrefix = 0x40 + | (wFromVEX3of3(insn->vexPrefix[2]) << 3) + | (rFromVEX2of3(insn->vexPrefix[1]) << 2) + | (xFromVEX2of3(insn->vexPrefix[1]) << 1) + | (bFromVEX2of3(insn->vexPrefix[1]) << 0); + + switch (ppFromVEX3of3(insn->vexPrefix[2])) + { + default: + break; + case VEX_PREFIX_66: + hasOpSize = TRUE; + break; } + + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1], insn->vexPrefix[2]); + } + } + else if (byte == 0xc5) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } - insn->rexPrefix = byte; - insn->necessaryPrefixLocation = insn->readerCursor - 2; - - dbgprintf(insn, "Found REX prefix 0x%hhx", byte); - } else { + if (insn->mode == MODE_64BIT || byte1 & 0x8) { + insn->vexSize = 2; + } + else { + unconsumeByte(insn); + } + + if (insn->vexSize == 2) { + insn->vexPrefix[0] = byte; + consumeByte(insn, &insn->vexPrefix[1]); + + insn->rexPrefix = 0x40 + | (rFromVEX2of2(insn->vexPrefix[1]) << 2); + + switch (ppFromVEX2of2(insn->vexPrefix[1])) + { + default: + break; + case VEX_PREFIX_66: + hasOpSize = TRUE; + break; + } + + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1]); + } + } + else { + if (insn->mode == MODE_64BIT) { + if ((byte & 0xf0) == 0x40) { + uint8_t opcodeByte; + + if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { + dbgprintf(insn, "Redundant REX prefix"); + return -1; + } + + insn->rexPrefix = byte; + insn->necessaryPrefixLocation = insn->readerCursor - 2; + + dbgprintf(insn, "Found REX prefix 0x%hhx", byte); + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + } else { unconsumeByte(insn); insn->necessaryPrefixLocation = insn->readerCursor - 1; } - } else { - unconsumeByte(insn); - insn->necessaryPrefixLocation = insn->readerCursor - 1; } - + if (insn->mode == MODE_16BIT) { insn->registerSize = (hasOpSize ? 4 : 2); insn->addressSize = (hasAdSize ? 
4 : 2); @@ -438,6 +530,39 @@ static int readOpcode(struct InternalInstruction* insn) { dbgprintf(insn, "readOpcode()"); insn->opcodeType = ONEBYTE; + + if (insn->vexSize == 3) + { + switch (mmmmmFromVEX2of3(insn->vexPrefix[1])) + { + default: + dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", mmmmmFromVEX2of3(insn->vexPrefix[1])); + return -1; + case 0: + break; + case VEX_LOB_0F: + insn->twoByteEscape = 0x0f; + insn->opcodeType = TWOBYTE; + return consumeByte(insn, &insn->opcode); + case VEX_LOB_0F38: + insn->twoByteEscape = 0x0f; + insn->threeByteEscape = 0x38; + insn->opcodeType = THREEBYTE_38; + return consumeByte(insn, &insn->opcode); + case VEX_LOB_0F3A: + insn->twoByteEscape = 0x0f; + insn->threeByteEscape = 0x3a; + insn->opcodeType = THREEBYTE_3A; + return consumeByte(insn, &insn->opcode); + } + } + else if (insn->vexSize == 2) + { + insn->twoByteEscape = 0x0f; + insn->opcodeType = TWOBYTE; + return consumeByte(insn, &insn->opcode); + } + if (consumeByte(insn, ¤t)) return -1; @@ -467,6 +592,24 @@ static int readOpcode(struct InternalInstruction* insn) { return -1; insn->opcodeType = THREEBYTE_3A; + } else if (current == 0xa6) { + dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); + + insn->threeByteEscape = current; + + if (consumeByte(insn, ¤t)) + return -1; + + insn->opcodeType = THREEBYTE_A6; + } else if (current == 0xa7) { + dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); + + insn->threeByteEscape = current; + + if (consumeByte(insn, ¤t)) + return -1; + + insn->opcodeType = THREEBYTE_A7; } else { dbgprintf(insn, "Didn't find a three-byte escape prefix"); @@ -600,20 +743,64 @@ static int getID(struct InternalInstruction* insn) { dbgprintf(insn, "getID()"); attrMask = ATTR_NONE; - + if (insn->mode == MODE_64BIT) attrMask |= ATTR_64BIT; - - if (insn->rexPrefix & 0x08) - attrMask |= ATTR_REXW; - - if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) - attrMask |= ATTR_OPSIZE; - else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) - attrMask |= ATTR_XS; - else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) - attrMask |= ATTR_XD; - + + if (insn->vexSize) { + attrMask |= ATTR_VEX; + + if (insn->vexSize == 3) { + switch (ppFromVEX3of3(insn->vexPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (wFromVEX3of3(insn->vexPrefix[2])) + attrMask |= ATTR_REXW; + if (lFromVEX3of3(insn->vexPrefix[2])) + attrMask |= ATTR_VEXL; + } + else if (insn->vexSize == 2) { + switch (ppFromVEX2of2(insn->vexPrefix[1])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX2of2(insn->vexPrefix[1])) + attrMask |= ATTR_VEXL; + } + else { + return -1; + } + } + else { + if (insn->rexPrefix & 0x08) + attrMask |= ATTR_REXW; + + if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) + attrMask |= ATTR_OPSIZE; + else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XS; + else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XD; + + } + if (getIDWithAttrMask(&instructionID, insn, attrMask)) return -1; @@ -749,7 +936,7 @@ static int readSIB(struct InternalInstruction* insn) { insn->sibIndex = SIB_INDEX_NONE; break; default: - insn->sibIndex = 
(EABase)(sibIndexBase + index); + insn->sibIndex = (SIBIndex)(sibIndexBase + index); if (insn->sibIndex == SIB_INDEX_sib || insn->sibIndex == SIB_INDEX_sib64) insn->sibIndex = SIB_INDEX_NONE; @@ -796,7 +983,7 @@ static int readSIB(struct InternalInstruction* insn) { } break; default: - insn->sibBase = (EABase)(sibBaseBase + base); + insn->sibBase = (SIBBase)(sibBaseBase + base); break; } @@ -1012,6 +1199,8 @@ static int readModRM(struct InternalInstruction* insn) { return prefix##_EAX + index; \ case TYPE_R64: \ return prefix##_RAX + index; \ + case TYPE_XMM256: \ + return prefix##_YMM0 + index; \ case TYPE_XMM128: \ case TYPE_XMM64: \ case TYPE_XMM32: \ @@ -1073,6 +1262,14 @@ static int fixupReg(struct InternalInstruction *insn, default: debug("Expected a REG or R/M encoding in fixupReg"); return -1; + case ENCODING_VVVV: + insn->vvvv = (Reg)fixupRegValue(insn, + (OperandType)op->type, + insn->vvvv, + &valid); + if (!valid) + return -1; + break; case ENCODING_REG: insn->reg = (Reg)fixupRegValue(insn, (OperandType)op->type, @@ -1237,6 +1434,27 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) { } /* + * readVVVV - Reads the VEX.vvvv register operand field out of the + * instruction's VEX prefix. + * + * @param insn - The instruction whose vvvv field is to be read. + * @return - 0 if the field was successfully read; nonzero + * otherwise. + */ +static int readVVVV(struct InternalInstruction* insn) { + dbgprintf(insn, "readVVVV()"); + + if (insn->vexSize == 3) + insn->vvvv = vvvvFromVEX3of3(insn->vexPrefix[2]); + else if (insn->vexSize == 2) + insn->vvvv = vvvvFromVEX2of2(insn->vexPrefix[1]); + else + return -1; + + return 0; +} + +/* * readOperands - Consults the specifier for an instruction and consumes all * operands for that instruction, interpreting them as it goes. 
* @@ -1317,6 +1535,13 @@ static int readOperands(struct InternalInstruction* insn) { case ENCODING_I: if (readOpcodeModifier(insn)) return -1; + break; + case ENCODING_VVVV: + if (readVVVV(insn)) + return -1; + if (fixupReg(insn, &insn->spec->operands[index])) + return -1; + break; case ENCODING_DUP: break; default: diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index d0dc8b5..a9c90f8 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -34,16 +34,30 @@ extern "C" { /* * Accessor functions for various fields of an Intel instruction */ -#define modFromModRM(modRM) ((modRM & 0xc0) >> 6) -#define regFromModRM(modRM) ((modRM & 0x38) >> 3) -#define rmFromModRM(modRM) (modRM & 0x7) -#define scaleFromSIB(sib) ((sib & 0xc0) >> 6) -#define indexFromSIB(sib) ((sib & 0x38) >> 3) -#define baseFromSIB(sib) (sib & 0x7) -#define wFromREX(rex) ((rex & 0x8) >> 3) -#define rFromREX(rex) ((rex & 0x4) >> 2) -#define xFromREX(rex) ((rex & 0x2) >> 1) -#define bFromREX(rex) (rex & 0x1) +#define modFromModRM(modRM) (((modRM) & 0xc0) >> 6) +#define regFromModRM(modRM) (((modRM) & 0x38) >> 3) +#define rmFromModRM(modRM) ((modRM) & 0x7) +#define scaleFromSIB(sib) (((sib) & 0xc0) >> 6) +#define indexFromSIB(sib) (((sib) & 0x38) >> 3) +#define baseFromSIB(sib) ((sib) & 0x7) +#define wFromREX(rex) (((rex) & 0x8) >> 3) +#define rFromREX(rex) (((rex) & 0x4) >> 2) +#define xFromREX(rex) (((rex) & 0x2) >> 1) +#define bFromREX(rex) ((rex) & 0x1) + +#define rFromVEX2of3(vex) (((~(vex)) & 0x80) >> 7) +#define xFromVEX2of3(vex) (((~(vex)) & 0x40) >> 6) +#define bFromVEX2of3(vex) (((~(vex)) & 0x20) >> 5) +#define mmmmmFromVEX2of3(vex) ((vex) & 0x1f) +#define wFromVEX3of3(vex) (((vex) & 0x80) >> 7) +#define vvvvFromVEX3of3(vex) (((~(vex)) & 0x78) >> 3) +#define lFromVEX3of3(vex) (((vex) & 0x4) >> 2) +#define ppFromVEX3of3(vex) ((vex) & 0x3) + +#define rFromVEX2of2(vex) (((~(vex)) & 0x80) >> 7) +#define vvvvFromVEX2of2(vex) (((~(vex)) & 0x78) >> 3) +#define lFromVEX2of2(vex) (((vex) & 0x4) >> 2) +#define ppFromVEX2of2(vex) ((vex) & 0x3) /* * These enums represent Intel registers for use by the decoder. 
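Because the R, X, B and vvvv bits above are stored inverted, they are easy to get backwards; the following self-contained sketch (the prefix bytes c4 e1 71 are hypothetical, chosen only for illustration) runs the accessors on a 3-byte VEX prefix and rebuilds the simulated REX value the same way readPrefixes does:

#include <stdint.h>
#include <stdio.h>

/* Field accessors, same definitions as in X86DisassemblerDecoder.h above. */
#define rFromVEX2of3(vex)     (((~(vex)) & 0x80) >> 7)
#define xFromVEX2of3(vex)     (((~(vex)) & 0x40) >> 6)
#define bFromVEX2of3(vex)     (((~(vex)) & 0x20) >> 5)
#define mmmmmFromVEX2of3(vex) ((vex) & 0x1f)
#define wFromVEX3of3(vex)     (((vex) & 0x80) >> 7)
#define vvvvFromVEX3of3(vex)  (((~(vex)) & 0x78) >> 3)
#define lFromVEX3of3(vex)     (((vex) & 0x4) >> 2)
#define ppFromVEX3of3(vex)    ((vex) & 0x3)

int main(void) {
  uint8_t vex[3] = {0xc4, 0xe1, 0x71};  /* hypothetical 3-byte VEX prefix */

  /* R, X and B are inverted in the encoding, so 0xe1 (top three bits set)
     decodes to R = X = B = 0 and the simulated REX collapses to 0x40. */
  uint8_t rex = 0x40
              | (wFromVEX3of3(vex[2]) << 3)
              | (rFromVEX2of3(vex[1]) << 2)
              | (xFromVEX2of3(vex[1]) << 1)
              | (bFromVEX2of3(vex[1]) << 0);

  printf("simulated REX: 0x%02x\n", rex);                       /* 0x40 */
  printf("m-mmmm: %d (0F escape)\n", mmmmmFromVEX2of3(vex[1])); /* 1 */
  printf("vvvv: %d (extra source = register 1)\n", vvvvFromVEX3of3(vex[2]));
  printf("L: %d  pp: %d (66 prefix)\n",
         lFromVEX3of3(vex[2]), ppFromVEX3of3(vex[2]));          /* 0, 1 */
  return 0;
}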
@@ -206,7 +220,25 @@ extern "C" { ENTRY(XMM13) \ ENTRY(XMM14) \ ENTRY(XMM15) - + +#define REGS_YMM \ + ENTRY(YMM0) \ + ENTRY(YMM1) \ + ENTRY(YMM2) \ + ENTRY(YMM3) \ + ENTRY(YMM4) \ + ENTRY(YMM5) \ + ENTRY(YMM6) \ + ENTRY(YMM7) \ + ENTRY(YMM8) \ + ENTRY(YMM9) \ + ENTRY(YMM10) \ + ENTRY(YMM11) \ + ENTRY(YMM12) \ + ENTRY(YMM13) \ + ENTRY(YMM14) \ + ENTRY(YMM15) + #define REGS_SEGMENT \ ENTRY(ES) \ ENTRY(CS) \ @@ -252,6 +284,7 @@ extern "C" { REGS_64BIT \ REGS_MMX \ REGS_XMM \ + REGS_YMM \ REGS_SEGMENT \ REGS_DEBUG \ REGS_CONTROL \ @@ -332,6 +365,27 @@ typedef enum { SEG_OVERRIDE_GS, SEG_OVERRIDE_max } SegmentOverride; + +/* + * VEXLeadingOpcodeByte - Possible values for the VEX.m-mmmm field + */ + +typedef enum { + VEX_LOB_0F = 0x1, + VEX_LOB_0F38 = 0x2, + VEX_LOB_0F3A = 0x3 +} VEXLeadingOpcodeByte; + +/* + * VEXPrefixCode - Possible values for the VEX.pp field + */ + +typedef enum { + VEX_PREFIX_NONE = 0x0, + VEX_PREFIX_66 = 0x1, + VEX_PREFIX_F3 = 0x2, + VEX_PREFIX_F2 = 0x3 +} VEXPrefixCode; typedef uint8_t BOOL; @@ -389,10 +443,12 @@ struct InternalInstruction { uint8_t prefixPresent[0x100]; /* contains the location (for use with the reader) of the prefix byte */ uint64_t prefixLocations[0x100]; + /* The value of the VEX prefix, if present */ + uint8_t vexPrefix[3]; + /* The length of the VEX prefix (0 if not present) */ + uint8_t vexSize; /* The value of the REX prefix, if present */ uint8_t rexPrefix; - /* The location of the REX prefix */ - uint64_t rexLocation; /* The location where a mandatory prefix would have to be (i.e., right before the opcode, or right before the REX prefix if one is present) */ uint64_t necessaryPrefixLocation; @@ -428,6 +484,10 @@ struct InternalInstruction { /* state for additional bytes, consumed during operand decode. 
Pattern: consumed___ indicates that the byte was already consumed and does not need to be consumed again */ + + /* The VEX.vvvv field, which contains a third register operand for some AVX + instructions */ + Reg vvvv; /* The ModR/M byte, which contains most register operands and some portion of all memory operands */ diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 1425b86..70315ed 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -30,6 +30,8 @@ #define TWOBYTE_SYM x86DisassemblerTwoByteOpcodes #define THREEBYTE38_SYM x86DisassemblerThreeByte38Opcodes #define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes +#define THREEBYTEA6_SYM x86DisassemblerThreeByteA6Opcodes +#define THREEBYTEA7_SYM x86DisassemblerThreeByteA7Opcodes #define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers" #define CONTEXTS_STR "x86DisassemblerContexts" @@ -37,6 +39,8 @@ #define TWOBYTE_STR "x86DisassemblerTwoByteOpcodes" #define THREEBYTE38_STR "x86DisassemblerThreeByte38Opcodes" #define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes" +#define THREEBYTEA6_STR "x86DisassemblerThreeByteA6Opcodes" +#define THREEBYTEA7_STR "x86DisassemblerThreeByteA7Opcodes" /* * Attributes of an instruction that must be known before the opcode can be @@ -49,7 +53,9 @@ ENUM_ENTRY(ATTR_XS, 0x02) \ ENUM_ENTRY(ATTR_XD, 0x04) \ ENUM_ENTRY(ATTR_REXW, 0x08) \ - ENUM_ENTRY(ATTR_OPSIZE, 0x10) + ENUM_ENTRY(ATTR_OPSIZE, 0x10) \ + ENUM_ENTRY(ATTR_VEX, 0x20) \ + ENUM_ENTRY(ATTR_VEXL, 0x40) #define ENUM_ENTRY(n, v) n = v, enum attributeBits { @@ -87,7 +93,20 @@ enum attributeBits { "IC_64BIT_REXW_XS") \ ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 7, "The Dynamic Duo! 
Prefer over all " \ "else because this changes most " \ - "operands' meaning") + "operands' meaning") \ + ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \ + ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \ + ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \ + ENUM_ENTRY(IC_VEX_OPSIZE, 2, "requires VEX and the OpSize prefix") \ + ENUM_ENTRY(IC_VEX_W, 3, "requires VEX and the W prefix") \ + ENUM_ENTRY(IC_VEX_W_XS, 4, "requires VEX, W, and XS prefix") \ + ENUM_ENTRY(IC_VEX_W_XD, 4, "requires VEX, W, and XD prefix") \ + ENUM_ENTRY(IC_VEX_W_OPSIZE, 4, "requires VEX, W, and OpSize") \ + ENUM_ENTRY(IC_VEX_L, 3, "requires VEX and the L prefix") \ + ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\ + ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\ + ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") + #define ENUM_ENTRY(n, r, d) n, typedef enum { @@ -104,7 +123,9 @@ typedef enum { ONEBYTE = 0, TWOBYTE = 1, THREEBYTE_38 = 2, - THREEBYTE_3A = 3 + THREEBYTE_3A = 3, + THREEBYTE_A6 = 4, + THREEBYTE_A7 = 5 } OpcodeType; /* @@ -183,6 +204,7 @@ struct ContextDecision { ENUM_ENTRY(ENCODING_NONE, "") \ ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \ ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \ ENUM_ENTRY(ENCODING_CW, "2-byte") \ ENUM_ENTRY(ENCODING_CD, "4-byte") \ @@ -278,6 +300,7 @@ struct ContextDecision { ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \ ENUM_ENTRY(TYPE_XMM64, "8-byte") \ ENUM_ENTRY(TYPE_XMM128, "16-byte") \ + ENUM_ENTRY(TYPE_XMM256, "32-byte") \ ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index d6950f4..dd6e353 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -15,6 +15,7 @@ #define DEBUG_TYPE "asm-printer" #include "X86ATTInstPrinter.h" #include "X86InstComments.h" +#include "X86Subtarget.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" @@ -22,24 +23,38 @@ #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" #include "X86GenInstrNames.inc" +#include <map> using namespace llvm; // Include the auto-generated portion of the assembly writer. #define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "X86GenRegisterNames.inc" #include "X86GenAsmWriter.inc" +#undef PRINT_ALIAS_INSTR +#undef GET_INSTRUCTION_NAME + +X86ATTInstPrinter::X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) + : MCInstPrinter(MAI) { + // Initialize the set of available features. + setAvailableFeatures(ComputeAvailableFeatures( + &TM.getSubtarget<X86Subtarget>())); +} void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) { - printInstruction(MI, OS); + // Try to print any aliases first. + if (!printAliasInstr(MI, OS)) + printInstruction(MI, OS); // If verbose assembly is enabled, we can print some informative comments. 
if (CommentStream) EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); } + StringRef X86ATTInstPrinter::getOpcodeName(unsigned Opcode) const { return getInstructionName(Opcode); } - void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O) { switch (MI->getOperand(Op).getImm()) { diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index eb98664..8d69391 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -17,16 +17,24 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { - class MCOperand; + +class MCOperand; +class X86Subtarget; +class TargetMachine; class X86ATTInstPrinter : public MCInstPrinter { public: - X86ATTInstPrinter(const MCAsmInfo &MAI) : MCInstPrinter(MAI) {} - + X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI); virtual void printInst(const MCInst *MI, raw_ostream &OS); virtual StringRef getOpcodeName(unsigned Opcode) const; + // Methods used to print the alias of an instruction. + unsigned ComputeAvailableFeatures(const X86Subtarget *Subtarget) const; + // Autogenerated by tblgen, returns true if we successfully printed an + // alias. + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &OS); static const char *getRegisterName(unsigned RegNo); diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 12144e3..c642acc 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -111,28 +111,28 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::PUNPCKLBWrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLMask(16, ShuffleMask); + DecodePUNPCKLBWMask(16, ShuffleMask); break; case X86::PUNPCKLWDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLWDrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLMask(8, ShuffleMask); + DecodePUNPCKLWDMask(8, ShuffleMask); break; case X86::PUNPCKLDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLMask(4, ShuffleMask); + DecodePUNPCKLDQMask(4, ShuffleMask); break; case X86::PUNPCKLQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLQDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLMask(2, ShuffleMask); + DecodePUNPCKLQDQMask(2, ShuffleMask); break; case X86::SHUFPDrri: @@ -153,16 +153,44 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPDrm: - DecodeUNPCKLPMask(2, ShuffleMask); + DecodeUNPCKLPDMask(2, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VUNPCKLPDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPDrm: + DecodeUNPCKLPDMask(2, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + case X86::VUNPCKLPDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VUNPCKLPDYrm: + DecodeUNPCKLPDMask(4, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; case X86::UNPCKLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPSrm: - DecodeUNPCKLPMask(4, ShuffleMask); + DecodeUNPCKLPSMask(4, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VUNPCKLPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPSrm: + DecodeUNPCKLPSMask(4, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + case X86::VUNPCKLPSYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPSYrm: + DecodeUNPCKLPSMask(8, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; case X86::UNPCKHPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 0484529..47253eb 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -15,6 +15,7 @@ #define DEBUG_TYPE "asm-printer" #include "X86IntelInstPrinter.h" #include "X86InstComments.h" +#include "X86Subtarget.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 6f12032..ca99dc0 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -18,13 +18,15 @@ #include "llvm/Support/raw_ostream.h" namespace llvm { - class MCOperand; + +class MCOperand; +class TargetMachine; class X86IntelInstPrinter : public MCInstPrinter { public: - X86IntelInstPrinter(const MCAsmInfo &MAI) + X86IntelInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) : MCInstPrinter(MAI) {} - + virtual void printInst(const MCInst *MI, raw_ostream &OS); virtual StringRef getOpcodeName(unsigned Opcode) const; @@ -33,7 +35,6 @@ public: static const char *getRegisterName(unsigned RegNo); static const char *getInstructionName(unsigned Opcode); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O); diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt index e21d69a..e7429a3 100644 --- a/lib/Target/X86/README-X86-64.txt +++ b/lib/Target/X86/README-X86-64.txt @@ -36,7 +36,7 @@ _conv: cmovb %rcx, %rax ret -Seems like the jb branch has high likelyhood of being taken. It would have +Seems like the jb branch has high likelihood of being taken. It would have saved a few instructions. //===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index abd1515..ea3014e 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -7,14 +7,6 @@ copy (3-addr bswap + memory support?) This is available on Atom processors. //===---------------------------------------------------------------------===// -CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86 -backend knows how to three-addressify this shift, but it appears the register -allocator isn't even asking it to do so in this case. 
We should investigate -why this isn't happening, it could have significant impact on other important -cases for X86 as well. - -//===---------------------------------------------------------------------===// - This should be one DIV/IDIV instruction, not a libcall: unsigned test(unsigned long long X, unsigned Y) { @@ -1572,7 +1564,7 @@ Implement processor-specific optimizations for parity with GCC on these processors. GCC does two optimizations: 1. ix86_pad_returns inserts a noop before ret instructions if immediately - preceeded by a conditional branch or is the target of a jump. + preceded by a conditional branch or is the target of a jump. 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of code contains more than 3 branches. @@ -1656,28 +1648,61 @@ information to add the "lock" prefix. //===---------------------------------------------------------------------===// -_Bool bar(int *x) { return *x & 1; } +struct B { + unsigned char y0 : 1; +}; -define zeroext i1 @bar(i32* nocapture %x) nounwind readonly { -entry: - %tmp1 = load i32* %x ; <i32> [#uses=1] - %and = and i32 %tmp1, 1 ; <i32> [#uses=1] - %tobool = icmp ne i32 %and, 0 ; <i1> [#uses=1] - ret i1 %tobool +int bar(struct B* a) { return a->y0; } + +define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize { + %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0 + %2 = load i8* %1, align 1 + %3 = and i8 %2, 1 + %4 = zext i8 %3 to i32 + ret i32 %4 } -bar: # @bar -# BB#0: # %entry - movl 4(%esp), %eax - movb (%eax), %al - andb $1, %al - movzbl %al, %eax - ret +bar: # @bar +# BB#0: + movb (%rdi), %al + andb $1, %al + movzbl %al, %eax + ret Missed optimization: should be movl+andl. //===---------------------------------------------------------------------===// +The x86_64 abi says: + +Booleans, when stored in a memory object, are stored as single byte objects the +value of which is always 0 (false) or 1 (true). + +We are not using this fact: + +int bar(_Bool *a) { return *a; } + +define i32 @bar(i8* nocapture %a) nounwind readonly optsize { + %1 = load i8* %a, align 1, !tbaa !0 + %tmp = and i8 %1, 1 + %2 = zext i8 %tmp to i32 + ret i32 %2 +} + +bar: + movb (%rdi), %al + andb $1, %al + movzbl %al, %eax + ret + +GCC produces + +bar: + movzbl (%rdi), %eax + ret + +//===---------------------------------------------------------------------===// + Consider the following two functions compiled with clang: _Bool foo(int *x) { return !(*x & 4); } unsigned bar(int *x) { return !(*x & 4); } @@ -1703,26 +1728,6 @@ are functionally identical. //===---------------------------------------------------------------------===// Take the following C code: -int x(int y) { return (y & 63) << 14; } - -Code produced by gcc: - andl $63, %edi - sall $14, %edi - movl %edi, %eax - ret - -Code produced by clang: - shll $14, %edi - movl %edi, %eax - andl $1032192, %eax - ret - -The code produced by gcc is 3 bytes shorter. This sort of construct often -shows up with bitfields. - -//===---------------------------------------------------------------------===// - -Take the following C code: int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } We generate the following IR with clang: @@ -1947,3 +1952,91 @@ which is "perfect". 
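The single-byte 0/1 guarantee in the x86_64 ABI note above is easy to check empirically. A throwaway test like the following (not part of this patch; assumes a psABI-conforming compiler) shows that storing any nonzero value into a _Bool leaves exactly 1 in the underlying byte, which is why a bare movzbl load would be sound:

#include <stdio.h>
#include <string.h>

int main(void) {
  _Bool b = 42;        /* conversion to _Bool normalizes any nonzero value to 1 */
  unsigned char raw;
  memcpy(&raw, &b, 1); /* inspect the byte actually stored in memory */
  printf("%u\n", raw); /* prints 1, never 42 */
  return 0;
}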
//===---------------------------------------------------------------------===// +For the branch in the following code: +int a(); +int b(int x, int y) { + if (x & (1<<(y&7))) + return a(); + return y; +} + +We currently generate: + movb %sil, %al + andb $7, %al + movzbl %al, %eax + btl %eax, %edi + jae .LBB0_2 + +movl+andl would be shorter than the movb+andb+movzbl sequence. + +//===---------------------------------------------------------------------===// + +For the following: +struct u1 { + float x, y; +}; +float foo(struct u1 u) { + return u.x + u.y; +} + +We currently generate: + movdqa %xmm0, %xmm1 + pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0] + addss %xmm1, %xmm0 + ret + +We could save an instruction here by commuting the addss. + +//===---------------------------------------------------------------------===// + +This (from PR9661): + +float clamp_float(float a) { + if (a > 1.0f) + return 1.0f; + else if (a < 0.0f) + return 0.0f; + else + return a; +} + +Could compile to: + +clamp_float: # @clamp_float + movss .LCPI0_0(%rip), %xmm1 + minss %xmm1, %xmm0 + pxor %xmm1, %xmm1 + maxss %xmm1, %xmm0 + ret + +with -ffast-math. + +//===---------------------------------------------------------------------===// + +This function (from PR9803): + +int clamp2(int a) { + if (a > 5) + a = 5; + if (a < 0) + return 0; + return a; +} + +Compiles to: + +_clamp2: ## @clamp2 + pushq %rbp + movq %rsp, %rbp + cmpl $5, %edi + movl $5, %ecx + cmovlel %edi, %ecx + testl %ecx, %ecx + movl $0, %eax + cmovnsl %ecx, %eax + popq %rbp + ret + +The move of 0 could be scheduled above the test to make it an xor reg,reg. + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 1287977..cd06060 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -1,4 +1,4 @@ -//===-- X86ShuffleDecode.h - X86 shuffle decode logic ---------------------===// +//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===// // // The LLVM Compiler Infrastructure // @@ -95,12 +95,29 @@ void DecodePSHUFLWMask(unsigned Imm, ShuffleMask.push_back(7); } -void DecodePUNPCKLMask(unsigned NElts, +void DecodePUNPCKLBWMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i8, NElts), ShuffleMask); +} + +void DecodePUNPCKLWDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i16, NElts), ShuffleMask); +} + +void DecodePUNPCKLDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); +} + +void DecodePUNPCKLQDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), ShuffleMask); +} + +void DecodePUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i); - ShuffleMask.push_back(i+NElts); - } + DecodeUNPCKLPMask(VT, ShuffleMask); } void DecodePUNPCKHMask(unsigned NElts, @@ -133,15 +150,40 @@ void DecodeUNPCKHPMask(unsigned NElts, } } +void DecodeUNPCKLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); +} + +void DecodeUNPCKLPDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), 
ShuffleMask); +} /// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd -/// etc. NElts indicates the number of elements in the vector allowing it to -/// handle different datatypes and vector widths. -void DecodeUNPCKLPMask(unsigned NElts, +/// etc. VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. +void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i); // Reads from dest - ShuffleMask.push_back(i+NElts); // Reads from src + unsigned NumElts = VT.getVectorNumElements(); + + // Handle vector lengths > 128 bits. Define a "section" as a set of + // 128 bits. AVX defines UNPCK* to operate independently on 128-bit + // sections. + unsigned NumSections = VT.getSizeInBits() / 128; + if (NumSections == 0 ) NumSections = 1; // Handle MMX + unsigned NumSectionElts = NumElts / NumSections; + + unsigned Start = 0; + unsigned End = NumSectionElts / 2; + for (unsigned s = 0; s < NumSections; ++s) { + for (unsigned i = Start; i != End; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumSectionElts); // Reads from src/src2 + } + // Process the next 128 bits. + Start += NumSectionElts; + End += NumSectionElts; } } diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 50d9ccb..b18f670 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -16,6 +16,7 @@ #define X86_SHUFFLE_DECODE_H #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/ValueTypes.h" //===----------------------------------------------------------------------===// // Vector Mask Decoding @@ -45,7 +46,19 @@ void DecodePSHUFHWMask(unsigned Imm, void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask); -void DecodePUNPCKLMask(unsigned NElts, +void DecodePUNPCKLBWMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLWDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLQDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); void DecodePUNPCKHMask(unsigned NElts, @@ -57,11 +70,16 @@ void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, void DecodeUNPCKHPMask(unsigned NElts, SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeUNPCKLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodeUNPCKLPDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); /// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd -/// etc. NElts indicates the number of elements in the vector allowing it to -/// handle different datatypes and vector widths. -void DecodeUNPCKLPMask(unsigned NElts, +/// etc. VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. +void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); } // llvm namespace diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index efb6c8c..25b8d3e 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -1,13 +1,13 @@ //===- X86.td - Target definition file for the Intel X86 ---*- tablegen -*-===// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. 
See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // -// This is a target description file for the Intel i386 architecture, refered to +// This is a target description file for the Intel i386 architecture, referred to // here as the "X86" architecture. // //===----------------------------------------------------------------------===// @@ -32,7 +32,7 @@ def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX", def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", "Enable SSE instructions", // SSE codegen depends on cmovs, and all - // SSE1+ processors support them. + // SSE1+ processors support them. [FeatureMMX, FeatureCMOV]>; def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", "Enable SSE2 instructions", @@ -50,7 +50,8 @@ def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42", "Enable SSE 4.2 instructions", [FeatureSSE41, FeaturePOPCNT]>; def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", - "Enable 3DNow! instructions">; + "Enable 3DNow! instructions", + [FeatureMMX]>; def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", "Enable 3DNow! Athlon instructions", [Feature3DNow]>; @@ -125,10 +126,10 @@ def : Proc<"sandybridge", [FeatureSSE42, Feature64Bit, FeatureAES, FeatureCLMUL]>; def : Proc<"k6", [FeatureMMX]>; -def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>; -def : Proc<"k6-3", [FeatureMMX, Feature3DNow]>; -def : Proc<"athlon", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>; -def : Proc<"athlon-tbird", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"k6-2", [Feature3DNow]>; +def : Proc<"k6-3", [Feature3DNow]>; +def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem]>; def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; @@ -156,8 +157,8 @@ def : Proc<"shanghai", [Feature3DNowA, Feature64Bit, FeatureSSE4A, Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureMMX]>; -def : Proc<"winchip2", [FeatureMMX, Feature3DNow]>; -def : Proc<"c3", [FeatureMMX, Feature3DNow]>; +def : Proc<"winchip2", [Feature3DNow]>; +def : Proc<"c3", [Feature3DNow]>; def : Proc<"c3-2", [FeatureSSE1]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp index da5f5b1..4d7d96d 100644 --- a/lib/Target/X86/X86AsmBackend.cpp +++ b/lib/Target/X86/X86AsmBackend.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Object/MachOFormat.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -28,6 +29,13 @@ #include "llvm/Target/TargetAsmBackend.h" using namespace llvm; +// Option to allow disabling arithmetic relaxation to workaround PR9807, which +// is useful when running bitwise comparison experiments on Darwin. We should be +// able to remove this once PR9807 is resolved. 
+static cl::opt<bool> +MCDisableArithRelaxation("mc-x86-disable-arith-relaxation", + cl::desc("Disable relaxation of arithmetic instruction for X86")); + static unsigned getFixupKindLog2Size(unsigned Kind) { switch (Kind) { default: assert(0 && "invalid fixup kind!"); @@ -201,6 +209,9 @@ bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const { if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode()) return true; + if (MCDisableArithRelaxation) + return false; + // Check if this instruction is ever relaxable. if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode()) return false; @@ -307,10 +318,13 @@ public: : ELFX86AsmBackend(T, OSType) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createELFObjectWriter(new X86ELFObjectWriter(false, OSType, - ELF::EM_386, false), + return createELFObjectWriter(createELFObjectTargetWriter(), OS, /*IsLittleEndian*/ true); } + + MCELFObjectTargetWriter *createELFObjectTargetWriter() const { + return new X86ELFObjectWriter(false, OSType, ELF::EM_386, false); + } }; class ELFX86_64AsmBackend : public ELFX86AsmBackend { @@ -319,10 +333,13 @@ public: : ELFX86AsmBackend(T, OSType) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createELFObjectWriter(new X86ELFObjectWriter(true, OSType, - ELF::EM_X86_64, true), + return createELFObjectWriter(createELFObjectTargetWriter(), OS, /*IsLittleEndian*/ true); } + + MCELFObjectTargetWriter *createELFObjectTargetWriter() const { + return new X86ELFObjectWriter(true, OSType, ELF::EM_X86_64, true); + } }; class WindowsX86AsmBackend : public X86AsmBackend { @@ -408,34 +425,26 @@ public: TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return new DarwinX86_32AsmBackend(T); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (Triple(TT).getEnvironment() == Triple::MachO) - return new DarwinX86_32AsmBackend(T); - else - return new WindowsX86AsmBackend(T, false); - default: - return new ELFX86_32AsmBackend(T, Triple(TT).getOS()); - } + + if (TheTriple.isOSWindows()) + return new WindowsX86AsmBackend(T, false); + + return new ELFX86_32AsmBackend(T, TheTriple.getOS()); } TargetAsmBackend *llvm::createX86_64AsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return new DarwinX86_64AsmBackend(T); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (Triple(TT).getEnvironment() == Triple::MachO) - return new DarwinX86_64AsmBackend(T); - else - return new WindowsX86AsmBackend(T, true); - default: - return new ELFX86_64AsmBackend(T, Triple(TT).getOS()); - } + + if (TheTriple.isOSWindows()) + return new WindowsX86AsmBackend(T, true); + + return new ELFX86_64AsmBackend(T, TheTriple.getOS()); } diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 99b4479..c2d53c4 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -709,12 +709,13 @@ void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, //===----------------------------------------------------------------------===// static MCInstPrinter *createX86MCInstPrinter(const Target &T, + TargetMachine &TM, unsigned SyntaxVariant, const MCAsmInfo &MAI) { 
if (SyntaxVariant == 0) - return new X86ATTInstPrinter(MAI); + return new X86ATTInstPrinter(TM, MAI); if (SyntaxVariant == 1) - return new X86IntelInstPrinter(MAI); + return new X86IntelInstPrinter(TM, MAI); return 0; } diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index a44fb69..5635175 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -215,6 +215,13 @@ def CC_X86_Win64_C : CallingConv<[ // The first 4 integer arguments are passed in integer registers. CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ], [XMM0, XMM1, XMM2, XMM3]>>, + + // Do not pass the sret argument in RCX, the Win64 thiscall calling + // convention requires "this" to be passed in RCX. + CCIfCC<"CallingConv::X86_ThisCall", + CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ], + [XMM1, XMM2, XMM3]>>>>, + CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ], [XMM0, XMM1, XMM2, XMM3]>>, diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index 60d9d4a..421e221 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -652,6 +652,8 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case X86II::TB: // Two-byte opcode prefix case X86II::T8: // 0F 38 case X86II::TA: // 0F 3A + case X86II::A6: // 0F A6 + case X86II::A7: // 0F A7 Need0FPrefix = true; break; case X86II::TF: // F2 0F 38 @@ -695,6 +697,12 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case X86II::TA: // 0F 3A MCE.emitByte(0x3A); break; + case X86II::A6: // 0F A6 + MCE.emitByte(0xA6); + break; + case X86II::A7: // 0F A7 + MCE.emitByte(0xA7); + break; } // If this is a two-address instruction, skip one of the register operands. diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 6fa9284..1382f18 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -23,6 +23,7 @@ #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Operator.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -77,10 +78,8 @@ private: bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); - bool X86FastEmitStore(EVT VT, const Value *Val, - const X86AddressMode &AM); - bool X86FastEmitStore(EVT VT, unsigned Val, - const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); @@ -125,6 +124,8 @@ private: unsigned TargetMaterializeAlloca(const AllocaInst *C); + unsigned TargetMaterializeFloatZero(const ConstantFP *CF); + /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is /// computed in an SSE register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { @@ -133,6 +134,9 @@ private: } bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false); + + bool TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len); }; } // end anonymous namespace. @@ -224,8 +228,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. 
bool -X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, - const X86AddressMode &AM) { +X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -395,37 +398,45 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { const Value *Op = *i; if (const StructType *STy = dyn_cast<StructType>(*GTI)) { const StructLayout *SL = TD.getStructLayout(STy); - unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); - Disp += SL->getElementOffset(Idx); - } else { - uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); - SmallVector<const Value *, 4> Worklist; - Worklist.push_back(Op); - do { - Op = Worklist.pop_back_val(); - if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { - // Constant-offset addressing. - Disp += CI->getSExtValue() * S; - } else if (isa<AddOperator>(Op) && - isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { - // An add with a constant operand. Fold the constant. - ConstantInt *CI = - cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); - Disp += CI->getSExtValue() * S; - // Add the other operand back to the work list. - Worklist.push_back(cast<AddOperator>(Op)->getOperand(0)); - } else if (IndexReg == 0 && - (!AM.GV || !Subtarget->isPICStyleRIPRel()) && - (S == 1 || S == 2 || S == 4 || S == 8)) { - // Scaled-index addressing. - Scale = S; - IndexReg = getRegForGEPIndex(Op).first; - if (IndexReg == 0) - return false; - } else - // Unsupported. - goto unsupported_gep; - } while (!Worklist.empty()); + Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); + continue; + } + + // An array/variable index is always of the form i*S where S is the + // constant scale size. See if we can push the scale into immediates. + uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + Disp += CI->getSExtValue() * S; + break; + } + if (isa<AddOperator>(Op) && + (!isa<Instruction>(Op) || + FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()] + == FuncInfo.MBB) && + isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { + // An add (in the same block) with a constant operand. Fold the + // constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + Disp += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + if (IndexReg == 0 && + (!AM.GV || !Subtarget->isPICStyleRIPRel()) && + (S == 1 || S == 2 || S == 4 || S == 8)) { + // Scaled-index addressing. + Scale = S; + IndexReg = getRegForGEPIndex(Op).first; + if (IndexReg == 0) + return false; + break; + } + // Unsupported. + goto unsupported_gep; } } // Check for displacement overflow. @@ -439,7 +450,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { if (X86SelectAddress(U->getOperand(0), AM)) return true; - // If we couldn't merge the sub value into this addr mode, revert back to + // If we couldn't merge the gep value into this addr mode, revert back to // our address and just match the value instead of completely failing. AM = SavedAM; break; @@ -451,91 +462,91 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { // Handle constant address. if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { - // Can't handle alternate code models yet. + // Can't handle alternate code models or TLS yet. 
if (TM.getCodeModel() != CodeModel::Small) return false; - // RIP-relative addresses can't have additional register operands. - if (Subtarget->isPICStyleRIPRel() && - (AM.Base.Reg != 0 || AM.IndexReg != 0)) - return false; - - // Can't handle TLS yet. if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) if (GVar->isThreadLocal()) return false; + + // RIP-relative addresses can't have additional register operands, so if + // we've already folded stuff into the addressing mode, just force the + // global value into its own register, which we can use as the basereg. + if (!Subtarget->isPICStyleRIPRel() || + (AM.Base.Reg == 0 && AM.IndexReg == 0)) { + // Okay, we've committed to selecting this global. Set up the address. + AM.GV = GV; + + // Allow the subtarget to classify the global. + unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + // If this reference is relative to the pic base, set it now. + if (isGlobalRelativeToPICBase(GVFlags)) { + // FIXME: How do we know Base.Reg is free?? + AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } - // Okay, we've committed to selecting this global. Set up the basic address. - AM.GV = GV; - - // Allow the subtarget to classify the global. - unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); - - // If this reference is relative to the pic base, set it now. - if (isGlobalRelativeToPICBase(GVFlags)) { - // FIXME: How do we know Base.Reg is free?? - AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); - } - - // Unless the ABI requires an extra load, return a direct reference to - // the global. - if (!isGlobalStubReference(GVFlags)) { - if (Subtarget->isPICStyleRIPRel()) { - // Use rip-relative addressing if we can. Above we verified that the - // base and index registers are unused. - assert(AM.Base.Reg == 0 && AM.IndexReg == 0); - AM.Base.Reg = X86::RIP; + // Unless the ABI requires an extra load, return a direct reference to + // the global. + if (!isGlobalStubReference(GVFlags)) { + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } + AM.GVOpFlags = GVFlags; + return true; } - AM.GVOpFlags = GVFlags; - return true; - } - // Ok, we need to do a load from a stub. If we've already loaded from this - // stub, reuse the loaded pointer, otherwise emit the load now. - DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); - unsigned LoadReg; - if (I != LocalValueMap.end() && I->second != 0) { - LoadReg = I->second; - } else { - // Issue load from stub. - unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; - X86AddressMode StubAM; - StubAM.Base.Reg = AM.Base.Reg; - StubAM.GV = GV; - StubAM.GVOpFlags = GVFlags; - - // Prepare for inserting code in the local-value area. - SavePoint SaveInsertPt = enterLocalValueArea(); - - if (TLI.getPointerTy() == MVT::i64) { - Opc = X86::MOV64rm; - RC = X86::GR64RegisterClass; - - if (Subtarget->isPICStyleRIPRel()) - StubAM.Base.Reg = X86::RIP; + // Ok, we need to do a load from a stub. If we've already loaded from + // this stub, reuse the loaded pointer, otherwise emit the load now. + DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); + unsigned LoadReg; + if (I != LocalValueMap.end() && I->second != 0) { + LoadReg = I->second; } else { - Opc = X86::MOV32rm; - RC = X86::GR32RegisterClass; - } + // Issue load from stub. 
+ unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + X86AddressMode StubAM; + StubAM.Base.Reg = AM.Base.Reg; + StubAM.GV = GV; + StubAM.GVOpFlags = GVFlags; + + // Prepare for inserting code in the local-value area. + SavePoint SaveInsertPt = enterLocalValueArea(); + + if (TLI.getPointerTy() == MVT::i64) { + Opc = X86::MOV64rm; + RC = X86::GR64RegisterClass; + + if (Subtarget->isPICStyleRIPRel()) + StubAM.Base.Reg = X86::RIP; + } else { + Opc = X86::MOV32rm; + RC = X86::GR32RegisterClass; + } - LoadReg = createResultReg(RC); - MachineInstrBuilder LoadMI = - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); - addFullAddress(LoadMI, StubAM); + LoadReg = createResultReg(RC); + MachineInstrBuilder LoadMI = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); + addFullAddress(LoadMI, StubAM); - // Ok, back to normal mode. - leaveLocalValueArea(SaveInsertPt); + // Ok, back to normal mode. + leaveLocalValueArea(SaveInsertPt); - // Prevent loading GV stub multiple times in same MBB. - LocalValueMap[V] = LoadReg; - } + // Prevent loading GV stub multiple times in same MBB. + LocalValueMap[V] = LoadReg; + } - // Now construct the final address. Note that the Disp, Scale, - // and Index values may already be set here. - AM.Base.Reg = LoadReg; - AM.GV = 0; - return true; + // Now construct the final address. Note that the Disp, Scale, + // and Index values may already be set here. + AM.Base.Reg = LoadReg; + AM.GV = 0; + return true; + } } // If all else fails, try to materialize the value in a register. @@ -856,12 +867,9 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { unsigned NEReg = createResultReg(&X86::GR8RegClass); unsigned PReg = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::SETNEr), NEReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::SETPr), PReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::OR8rr), ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETNEr), NEReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETPr), PReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::OR8rr),ResultReg) .addReg(PReg).addReg(NEReg); UpdateValueMap(I, ResultReg); return true; @@ -1059,14 +1067,49 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { } } } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which + // typically happen for _Bool and C++ bools. + MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) { + unsigned TestOpc = 0; + switch (SourceVT.SimpleTy) { + default: break; + case MVT::i8: TestOpc = X86::TEST8ri; break; + case MVT::i16: TestOpc = X86::TEST16ri; break; + case MVT::i32: TestOpc = X86::TEST32ri; break; + case MVT::i64: TestOpc = X86::TEST64ri32; break; + } + if (TestOpc) { + unsigned OpReg = getRegForValue(TI->getOperand(0)); + if (OpReg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TestOpc)) + .addReg(OpReg).addImm(1); + + unsigned JmpOpc = X86::JNE_4; + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + JmpOpc = X86::JE_4; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(JmpOpc)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); + return true; + } + } } // Otherwise do a clumsy setcc and re-test it. 
+ // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used + // in an explicit cast, so make sure to handle that correctly. unsigned OpReg = getRegForValue(BI->getCondition()); if (OpReg == 0) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8rr)) - .addReg(OpReg).addReg(OpReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8ri)) + .addReg(OpReg).addImm(1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JNE_4)) .addMBB(TrueMBB); FastEmitBranch(FalseMBB, DL); @@ -1075,42 +1118,42 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { } bool X86FastISel::X86SelectShift(const Instruction *I) { - unsigned CReg = 0, OpReg = 0, OpImm = 0; + unsigned CReg = 0, OpReg = 0; const TargetRegisterClass *RC = NULL; if (I->getType()->isIntegerTy(8)) { CReg = X86::CL; RC = &X86::GR8RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR8rCL; OpImm = X86::SHR8ri; break; - case Instruction::AShr: OpReg = X86::SAR8rCL; OpImm = X86::SAR8ri; break; - case Instruction::Shl: OpReg = X86::SHL8rCL; OpImm = X86::SHL8ri; break; + case Instruction::LShr: OpReg = X86::SHR8rCL; break; + case Instruction::AShr: OpReg = X86::SAR8rCL; break; + case Instruction::Shl: OpReg = X86::SHL8rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(16)) { CReg = X86::CX; RC = &X86::GR16RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR16rCL; OpImm = X86::SHR16ri; break; - case Instruction::AShr: OpReg = X86::SAR16rCL; OpImm = X86::SAR16ri; break; - case Instruction::Shl: OpReg = X86::SHL16rCL; OpImm = X86::SHL16ri; break; + case Instruction::LShr: OpReg = X86::SHR16rCL; break; + case Instruction::AShr: OpReg = X86::SAR16rCL; break; + case Instruction::Shl: OpReg = X86::SHL16rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(32)) { CReg = X86::ECX; RC = &X86::GR32RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR32rCL; OpImm = X86::SHR32ri; break; - case Instruction::AShr: OpReg = X86::SAR32rCL; OpImm = X86::SAR32ri; break; - case Instruction::Shl: OpReg = X86::SHL32rCL; OpImm = X86::SHL32ri; break; + case Instruction::LShr: OpReg = X86::SHR32rCL; break; + case Instruction::AShr: OpReg = X86::SAR32rCL; break; + case Instruction::Shl: OpReg = X86::SHL32rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(64)) { CReg = X86::RCX; RC = &X86::GR64RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR64rCL; OpImm = X86::SHR64ri; break; - case Instruction::AShr: OpReg = X86::SAR64rCL; OpImm = X86::SAR64ri; break; - case Instruction::Shl: OpReg = X86::SHL64rCL; OpImm = X86::SHL64ri; break; + case Instruction::LShr: OpReg = X86::SHR64rCL; break; + case Instruction::AShr: OpReg = X86::SAR64rCL; break; + case Instruction::Shl: OpReg = X86::SHL64rCL; break; default: return false; } } else { @@ -1124,15 +1167,6 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned Op0Reg = getRegForValue(I->getOperand(0)); if (Op0Reg == 0) return false; - // Fold immediate in shl(x,3). 
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { - unsigned ResultReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpImm), - ResultReg).addReg(Op0Reg).addImm(CI->getZExtValue() & 0xff); - UpdateValueMap(I, ResultReg); - return true; - } - unsigned Op1Reg = getRegForValue(I->getOperand(1)); if (Op1Reg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), @@ -1294,10 +1328,61 @@ bool X86FastISel::X86SelectExtractValue(const Instruction *I) { return false; } +bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len) { + // Make sure we don't bloat code by inlining very large memcpy's. + bool i64Legal = TLI.isTypeLegal(MVT::i64); + if (Len > (i64Legal ? 32 : 16)) return false; + + // We don't care about alignment here since we just emit integer accesses. + while (Len) { + MVT VT; + if (Len >= 8 && i64Legal) + VT = MVT::i64; + else if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else { + assert(Len == 1); + VT = MVT::i8; + } + + unsigned Reg; + bool RV = X86FastEmitLoad(VT, SrcAM, Reg); + RV &= X86FastEmitStore(VT, Reg, DestAM); + assert(RV && "Failed to emit load or store??"); + + unsigned Size = VT.getSizeInBits()/8; + Len -= Size; + DestAM.Disp += Size; + SrcAM.Disp += Size; + } + + return true; +} + bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // FIXME: Handle more intrinsics. switch (I.getIntrinsicID()) { default: return false; + case Intrinsic::memcpy: { + const MemCpyInst &MCI = cast<MemCpyInst>(I); + // Don't handle volatile or variable length memcpys. + if (MCI.isVolatile() || !isa<ConstantInt>(MCI.getLength())) + return false; + + uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue(); + + // Get the address of the dest and source addresses. + X86AddressMode DestAM, SrcAM; + if (!X86SelectAddress(MCI.getRawDest(), DestAM) || + !X86SelectAddress(MCI.getRawSource(), SrcAM)) + return false; + + return TryEmitSmallMemcpy(DestAM, SrcAM, Len); + } + case Intrinsic::stackprotector: { // Emit code inline code to store the stack guard onto the stack. EVT PtrTy = TLI.getPointerTy(); @@ -1308,17 +1393,14 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // Grab the frame index. X86AddressMode AM; if (!X86SelectAddress(Slot, AM)) return false; - if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; - return true; } case Intrinsic::objectsize: { - ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1)); + // FIXME: This should be moved to generic code! + ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1)); const Type *Ty = I.getCalledFunction()->getReturnType(); - assert(CI && "Non-constant type in Intrinsic::objectsize?"); - MVT VT; if (!isTypeLegal(Ty, VT)) return false; @@ -1356,6 +1438,8 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { } case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: { + // FIXME: Should fold immediates. + // Replace "add with overflow" intrinsics with an "add" instruction followed // by a seto/setc instruction. Later on, when the "extractvalue" // instructions are encountered, we use the fact that two registers were @@ -1427,8 +1511,7 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { // Handle only C and fastcc calling conventions for now. 
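// --- Sketch, not part of the patch: a standalone model (assumed names) of
// --- the widest-first chunking TryEmitSmallMemcpy above performs — copy
// --- 8/4/2/1-byte pieces until Len is zero, bumping both displacements.
#include <cassert>
#include <cstdint>
#include <cstring>

static void smallMemcpy(uint8_t *Dst, const uint8_t *Src, uint64_t Len,
                        bool i64Legal) {
  while (Len) {
    unsigned Size;
    if (Len >= 8 && i64Legal) Size = 8;   // one i64 load/store pair
    else if (Len >= 4)        Size = 4;   // i32
    else if (Len >= 2)        Size = 2;   // i16
    else                      Size = 1;   // i8
    std::memcpy(Dst, Src, Size);          // stands in for FastEmitLoad/Store
    Len -= Size;
    Dst += Size;                          // advance Disp on both sides
    Src += Size;
  }
}

int main() {
  uint8_t Src[13], Dst[13] = {0};
  for (int i = 0; i != 13; ++i) Src[i] = uint8_t(i * 7);
  smallMemcpy(Dst, Src, 13, /*i64Legal=*/true);   // 8 + 4 + 1 bytes
  assert(std::memcmp(Dst, Src, 13) == 0);
}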
ImmutableCallSite CS(CI); CallingConv::ID CC = CS.getCallingConv(); - if (CC != CallingConv::C && - CC != CallingConv::Fast && + if (CC != CallingConv::C && CC != CallingConv::Fast && CC != CallingConv::X86_FastCall) return false; @@ -1437,14 +1520,17 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (CC == CallingConv::Fast && GuaranteedTailCallOpt) return false; - // Let SDISel handle vararg functions. const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); const FunctionType *FTy = cast<FunctionType>(PT->getElementType()); - if (FTy->isVarArg()) + bool isVarArg = FTy->isVarArg(); + + // Don't know how to handle Win64 varargs yet. Nothing special needed for + // x86-32. Special handling for x86-64 is implemented. + if (isVarArg && Subtarget->isTargetWin64()) return false; // Fast-isel doesn't know about callee-pop yet. - if (Subtarget->IsCalleePop(FTy->isVarArg(), CC)) + if (Subtarget->IsCalleePop(isVarArg, CC)) return false; // Handle *simple* calls for now. @@ -1487,9 +1573,7 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { ArgFlags.reserve(CS.arg_size()); for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { - unsigned Arg = getRegForValue(*i); - if (Arg == 0) - return false; + Value *ArgVal = *i; ISD::ArgFlagsTy Flags; unsigned AttrInd = i - CS.arg_begin() + 1; if (CS.paramHasAttr(AttrInd, Attribute::SExt)) @@ -1497,34 +1581,67 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) Flags.setZExt(); + // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra + // instruction. This is safe because it is common to all fastisel supported + // calling conventions on x86. + if (ConstantInt *CI = dyn_cast<ConstantInt>(ArgVal)) { + if (CI->getBitWidth() == 1 || CI->getBitWidth() == 8 || + CI->getBitWidth() == 16) { + if (Flags.isSExt()) + ArgVal = ConstantExpr::getSExt(CI,Type::getInt32Ty(CI->getContext())); + else + ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext())); + } + } + + unsigned ArgReg; + + // Passing bools around ends up doing a trunc to i1 and passing it. + // Codegen this as an argument + "and 1". + if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) && + cast<TruncInst>(ArgVal)->getParent() == I->getParent() && + ArgVal->hasOneUse()) { + ArgVal = cast<TruncInst>(ArgVal)->getOperand(0); + ArgReg = getRegForValue(ArgVal); + if (ArgReg == 0) return false; + + MVT ArgVT; + if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false; + + ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg, + ArgVal->hasOneUse(), 1); + } else { + ArgReg = getRegForValue(ArgVal); + } + + if (ArgReg == 0) return false; + // FIXME: Only handle *easy* calls for now. if (CS.paramHasAttr(AttrInd, Attribute::InReg) || - CS.paramHasAttr(AttrInd, Attribute::StructRet) || CS.paramHasAttr(AttrInd, Attribute::Nest) || CS.paramHasAttr(AttrInd, Attribute::ByVal)) return false; - const Type *ArgTy = (*i)->getType(); + const Type *ArgTy = ArgVal->getType(); MVT ArgVT; if (!isTypeLegal(ArgTy, ArgVT)) return false; unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy); Flags.setOrigAlign(OriginalAlignment); - Args.push_back(Arg); - ArgVals.push_back(*i); + Args.push_back(ArgReg); + ArgVals.push_back(ArgVal); ArgVTs.push_back(ArgVT); ArgFlags.push_back(Flags); } // Analyze operands of the call, assigning locations to each operand. 
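// --- Sketch, not part of the patch: the i1/i8/i16 constant-argument
// --- promotion above, modeled in plain C++. Whether the small immediate is
// --- sign- or zero-extended to 32 bits follows the parameter's sext/zext
// --- attribute, so no extend instruction is needed at the call site.
#include <cassert>
#include <cstdint>

static int32_t promoteConstArg(int8_t C, bool IsSExt) {
  return IsSExt ? int32_t(C)                       // ConstantExpr::getSExt
                : int32_t(uint32_t(uint8_t(C)));   // ConstantExpr::getZExt
}

int main() {
  assert(promoteConstArg(int8_t(-1), true) == -1);    // sext 0xFF -> -1
  assert(promoteConstArg(int8_t(-1), false) == 255);  // zext 0xFF -> 255
}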
SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, false, TM, ArgLocs, I->getParent()->getContext()); + CCState CCInfo(CC, isVarArg, TM, ArgLocs, I->getParent()->getContext()); // Allocate shadow area for Win64 - if (Subtarget->isTargetWin64()) { + if (Subtarget->isTargetWin64()) CCInfo.AllocateStack(32, 8); - } CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86); @@ -1618,6 +1735,17 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { X86::EBX).addReg(Base); } + if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) { + // Count the number of XMM registers allocated. + static const unsigned XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::MOV8ri), + X86::AL).addImm(NumXMMRegs); + } + // Issue the call. MachineInstrBuilder MIB; if (CalleeOp) { @@ -1656,7 +1784,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (GV->isDeclaration() || GV->isWeakForLinker()) && - Subtarget->getDarwinVers() < 9) { + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -1672,14 +1801,20 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (Subtarget->isPICStyleGOT()) MIB.addReg(X86::EBX); + if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) + MIB.addReg(X86::AL); + // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) MIB.addReg(RegArgs[i]); // Issue CALLSEQ_END unsigned AdjStackUp = TM.getRegisterInfo()->getCallFrameDestroyOpcode(); + unsigned NumBytesCallee = 0; + if (!Subtarget->is64Bit() && CS.paramHasAttr(1, Attribute::StructRet)) + NumBytesCallee = 4; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp)) - .addImm(NumBytes).addImm(0); + .addImm(NumBytes).addImm(NumBytesCallee); // Now handle call return value (if any). SmallVector<unsigned, 4> UsedRegs; @@ -1850,10 +1985,13 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { if (isa<GlobalValue>(C)) { X86AddressMode AM; if (X86SelectAddress(C, AM)) { - if (TLI.getPointerTy() == MVT::i32) - Opc = X86::LEA32r; - else - Opc = X86::LEA64r; + // If the expression is just a basereg, then we're done, otherwise we need + // to emit an LEA. + if (AM.BaseType == X86AddressMode::RegBase && + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0) + return AM.Base.Reg; + + Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg), AM); @@ -1915,6 +2053,45 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { return ResultReg; } +unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { + MVT VT; + if (!isTypeLegal(CF->getType(), VT)) + return false; + + // Get opcode and regclass for the given zero. 
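// --- Sketch, not part of the patch: the SysV x86-64 ABI expects %al to hold
// --- an upper bound on the number of vector registers a varargs call uses;
// --- the code above derives it as the first unallocated XMM argument
// --- register. Plain C++ model of that count:
#include <cassert>

static unsigned firstUnallocated(const bool Used[], unsigned N) {
  for (unsigned i = 0; i != N; ++i)
    if (!Used[i])
      return i;                 // first XMM arg register not yet assigned
  return N;
}

int main() {
  bool Used[8] = {true, true, true, false, false, false, false, false};
  assert(firstUnallocated(Used, 8) == 3);   // MOV8ri %al, 3 before the call
}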
+ unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + switch (VT.SimpleTy) { + default: return false; + case MVT::f32: + if (Subtarget->hasSSE1()) { + Opc = X86::FsFLD0SS; + RC = X86::FR32RegisterClass; + } else { + Opc = X86::LD_Fp032; + RC = X86::RFP32RegisterClass; + } + break; + case MVT::f64: + if (Subtarget->hasSSE2()) { + Opc = X86::FsFLD0SD; + RC = X86::FR64RegisterClass; + } else { + Opc = X86::LD_Fp064; + RC = X86::RFP64RegisterClass; + } + break; + case MVT::f80: + // No f80 support yet. + return false; + } + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg); + return ResultReg; +} + + /// TryToFoldLoad - The specified machine instr operand is a vreg, and that /// vreg is being provided by the specified load instruction. If possible, /// try to fold the load as an operand to the instruction, returning true if diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 3aaa693..325d061 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -1307,7 +1307,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // set up by FpSET_ST0, and our StackTop is off by one because of it. unsigned Op0 = getFPReg(MI->getOperand(0)); // Restore the actual StackTop from before Fp_SET_ST0. - // Note we can't handle Fp_SET_ST1 without a preceeding Fp_SET_ST0, and we + // Note we can't handle Fp_SET_ST1 without a preceding Fp_SET_ST0, and we // are not enforcing the constraint. ++StackTop; unsigned RegOnTop = getStackEntry(0); // This reg must remain in st(0). diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 0a3f931..06d12fc 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" @@ -296,7 +297,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, // FIXME: This is dirty hack. The code itself is pretty mess right now. // It should be rewritten from scratch and generalized sometimes. - // Determine maximum offset (minumum due to stack growth). + // Determine maximum offset (minimum due to stack growth). int64_t MaxOffset = 0; for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) @@ -551,65 +552,71 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // responsible for adjusting the stack pointer. Touching the stack at 4K // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. - if (NumBytes >= 4096 && - (STI.isTargetCygMing() || STI.isTargetWin32()) && - !STI.isTargetEnvMacho()) { + if (NumBytes >= 4096 && STI.isTargetCOFF() && !STI.isTargetEnvMacho()) { + const char *StackProbeSymbol; + bool isSPUpdateNeeded = false; + + if (Is64Bit) { + if (STI.isTargetCygMing()) + StackProbeSymbol = "___chkstk"; + else { + StackProbeSymbol = "__chkstk"; + isSPUpdateNeeded = true; + } + } else if (STI.isTargetCygMing()) + StackProbeSymbol = "_alloca"; + else + StackProbeSymbol = "_chkstk"; + // Check whether EAX is livein for this function. bool isEAXAlive = isEAXLiveIn(MF); - const char *StackProbeSymbol = - STI.isTargetWindows() ? 
"_chkstk" : "_alloca"; - if (Is64Bit && STI.isTargetCygMing()) - StackProbeSymbol = "__chkstk"; - unsigned CallOp = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; - if (!isEAXAlive) { - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(NumBytes); - BuildMI(MBB, MBBI, DL, TII.get(CallOp)) - .addExternalSymbol(StackProbeSymbol) - .addReg(StackPtr, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - } else { + if (isEAXAlive) { + // Sanity check that EAX is not livein for this function. + // It should not be, so throw an assert. + assert(!Is64Bit && "EAX is livein in x64 case!"); + // Save EAX BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) .addReg(X86::EAX, RegState::Kill); + } - // Allocate NumBytes-4 bytes on stack. We'll also use 4 already - // allocated bytes for EAX. + if (Is64Bit) { + // Handle the 64-bit Windows ABI case where we need to call __chkstk. + // Function prologue is responsible for adjusting the stack pointer. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) + .addImm(NumBytes); + } else { + // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. + // We'll also use 4 already allocated bytes for EAX. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(NumBytes - 4); - BuildMI(MBB, MBBI, DL, TII.get(CallOp)) - .addExternalSymbol(StackProbeSymbol) - .addReg(StackPtr, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - - // Restore EAX - MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), - X86::EAX), - StackPtr, false, NumBytes - 4); - MBB.insert(MBBI, MI); + .addImm(isEAXAlive ? NumBytes - 4 : NumBytes); + } + + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32)) + .addExternalSymbol(StackProbeSymbol) + .addReg(StackPtr, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + + // MSVC x64's __chkstk needs to adjust %rsp. + // FIXME: %rax preserves the offset and should be available. + if (isSPUpdateNeeded) + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, + TII, *RegInfo); + + if (isEAXAlive) { + // Restore EAX + MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), + X86::EAX), + StackPtr, false, NumBytes - 4); + MBB.insert(MBBI, MI); } - } else if (NumBytes >= 4096 && - STI.isTargetWin64() && - !STI.isTargetEnvMacho()) { - // Sanity check that EAX is not livein for this function. It should - // not be, so throw an assert. - assert(!isEAXLiveIn(MF) && "EAX is livein in the Win64 case!"); - - // Handle the 64-bit Windows ABI case where we need to call __chkstk. - // Function prologue is responsible for adjusting the stack pointer. - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(NumBytes); - BuildMI(MBB, MBBI, DL, TII.get(X86::WINCALL64pcrel32)) - .addExternalSymbol("__chkstk") - .addReg(StackPtr, RegState::Define | RegState::Implicit); - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, - TII, *RegInfo); } else if (NumBytes) emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII, *RegInfo); - if ((NumBytes || PushedRegs) && needsFrameMoves) { + if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { // Mark end of stack pointer adjustment. 
MCSymbol *Label = MMI.getContext().CreateTempSymbol(); BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label); @@ -779,7 +786,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, assert(Offset >= 0 && "Offset should never be negative"); if (Offset) { - // Check for possible merge with preceeding ADD instruction. + // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII, *RegInfo); } @@ -823,7 +830,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, int delta = -1*X86FI->getTCReturnAddrDelta(); MBBI = MBB.getLastNonDebugInstr(); - // Check for possible merge with preceeding ADD instruction. + // Check for possible merge with preceding ADD instruction. delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII, *RegInfo); } @@ -892,7 +899,6 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); - bool isWin64 = STI.isTargetWin64(); unsigned SlotSize = STI.is64Bit() ? 8 : 4; unsigned FPReg = TRI->getFrameRegister(MF); unsigned CalleeFrameSize = 0; @@ -900,25 +906,39 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); if (Reg == FPReg) // X86RegisterInfo::emitPrologue will handle spilling of frame register. continue; - if (!X86::VR128RegClass.contains(Reg) && !isWin64) { - CalleeFrameSize += SlotSize; - BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill); - } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), - RC, TRI); - } + CalleeFrameSize += SlotSize; + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill); } X86FI->setCalleeSavedFrameSize(CalleeFrameSize); + + // Make XMM regs spilled. X86 does not have ability of push/pop XMM. + // It can be done by spilling XMMs to stack frame. + // Note that only Win64 ABI might spill XMMs. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), + RC, TRI); + } + return true; } @@ -933,21 +953,30 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + // Reload XMMs from stack frame. + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), + RC, TRI); + } + + // POP GPRs. 
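// --- Sketch (hypothetical standalone model), not part of the patch: the
// --- two-pass spill order used above. GPRs are PUSHed first and grow
// --- CalleeFrameSize; XMMs cannot be pushed on x86, so they are stored to
// --- frame slots in a second pass.
#include <cassert>
#include <string>
#include <vector>

struct CSR { std::string Name; bool IsGPR; };

static std::vector<std::string> spillOrder(const std::vector<CSR> &CSI) {
  std::vector<std::string> Out;
  for (auto I = CSI.rbegin(); I != CSI.rend(); ++I)   // pass 1: push GPRs
    if (I->IsGPR) Out.push_back("push " + I->Name);
  for (auto I = CSI.rbegin(); I != CSI.rend(); ++I)   // pass 2: spill XMMs
    if (!I->IsGPR) Out.push_back("movaps [slot], " + I->Name);
  return Out;
}

int main() {
  auto Order = spillOrder({{"rbx", true}, {"xmm6", false}, {"r12", true}});
  assert(Order.size() == 3 && Order[0] == "push r12" &&
         Order[2] == "movaps [slot], xmm6");
}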
unsigned FPReg = TRI->getFrameRegister(MF); - bool isWin64 = STI.isTargetWin64(); unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; if (Reg == FPReg) // X86RegisterInfo::emitEpilogue will handle restoring of frame register. continue; - if (!X86::VR128RegClass.contains(Reg) && !isWin64) { - BuildMI(MBB, MI, DL, TII.get(Opc), Reg); - } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), - RC, TRI); - } + BuildMI(MBB, MI, DL, TII.get(Opc), Reg); } return true; } diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 9b0ec6e..4534e85 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1580,6 +1580,81 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return RetVal; break; } + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + // For operations of the form (x << C1) op C2, check if we can use a smaller + // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse()) + break; + + // i8 is unshrinkable, i16 should be promoted to i32. + if (NVT != MVT::i32 && NVT != MVT::i64) + break; + + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + if (!Cst || !ShlCst) + break; + + int64_t Val = Cst->getSExtValue(); + uint64_t ShlVal = ShlCst->getZExtValue(); + + // Make sure that we don't change the operation by removing bits. + // This only matters for OR and XOR, AND is unaffected. + if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val) + break; + + unsigned ShlOp, Op = 0; + EVT CstVT = NVT; + + // Check the minimum bitwidth for the new constant. + // TODO: AND32ri is the same as AND64ri32 with zext imm. + // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr + // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. + if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal)) + CstVT = MVT::i8; + else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal)) + CstVT = MVT::i32; + + // Bail if there is no smaller encoding. + if (NVT == CstVT) + break; + + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i32: + assert(CstVT == MVT::i8); + ShlOp = X86::SHL32ri; + + switch (Opcode) { + case ISD::AND: Op = X86::AND32ri8; break; + case ISD::OR: Op = X86::OR32ri8; break; + case ISD::XOR: Op = X86::XOR32ri8; break; + } + break; + case MVT::i64: + assert(CstVT == MVT::i8 || CstVT == MVT::i32); + ShlOp = X86::SHL64ri; + + switch (Opcode) { + case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break; + case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break; + case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break; + } + break; + } + + // Emit the smaller op and the shift. 
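// --- Sketch, not part of the patch: the algebraic identity behind the
// --- immediate-shrinking transform above,
// ---   (x << C1) op C2 == (x op (C2 >> C1)) << C1,
// --- which holds unconditionally for AND, and for OR/XOR exactly when C2
// --- has no bits set below C1 (the ((Val >> ShlVal) << ShlVal) == Val guard).
#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 0x12345678, C2 = 0xF00, C1 = 8;
  assert(((C2 >> C1) << C1) == C2);                      // guard holds
  assert(((x << C1) | C2) == ((x | (C2 >> C1)) << C1));  // OR
  assert(((x << C1) ^ C2) == ((x ^ (C2 >> C1)) << C1));  // XOR
  assert(((x << C1) & C2) == ((x & (C2 >> C1)) << C1));  // AND
}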
+ SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT); + SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst); + return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), + getI8Imm(ShlVal)); + break; + } case X86ISD::UMUL: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2f49dbc..703c01d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -45,6 +45,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/VectorExtras.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" @@ -221,7 +222,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // X86 is weird, it always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); - setSchedulingPreference(Sched::RegPressure); + + // For 64-bit since we have so many registers use the ILP scheduler, for + // 32-bit code use the register pressure specific scheduling. + if (Subtarget->is64Bit()) + setSchedulingPreference(Sched::ILP); + else + setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { @@ -543,12 +550,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (Subtarget->is64Bit()) - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - if (Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); - else - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, + (Subtarget->is64Bit() ? MVT::i64 : MVT::i32), + (Subtarget->isTargetCOFF() + && !Subtarget->isTargetEnvMacho() + ? Custom : Expand)); if (!UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. @@ -921,6 +927,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Can turn SHL into an integer multiply. setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SHL, MVT::v16i8, Custom); + setOperationAction(ISD::SRL, MVT::v4i32, Legal); // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. f32 vectors are @@ -1271,27 +1278,6 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{ return std::make_pair(RRC, Cost); } -// FIXME: Why this routine is here? Move to RegInfo! -unsigned -X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; - switch (RC->getID()) { - default: - return 0; - case X86::GR32RegClassID: - return 4 - FPDiff; - case X86::GR64RegClassID: - return 8 - FPDiff; - case X86::VR128RegClassID: - return Subtarget->is64Bit() ? 
10 : 4; - case X86::VR64RegClassID: - return 4; - } -} - bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const { if (!Subtarget->isTargetLinux()) @@ -1463,6 +1449,20 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { return HasRet; } +EVT +X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const { + MVT ReturnMVT; + // TODO: Is this also valid on 32-bit? + if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) + ReturnMVT = MVT::i8; + else + ReturnMVT = MVT::i32; + + EVT MinVT = getRegisterType(Context, ReturnMVT); + return VT.bitsLT(MinVT) ? MinVT : VT; +} + /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// @@ -1595,6 +1595,18 @@ static bool IsTailCallConvention(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC); } +bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + if (!CI->isTailCall()) + return false; + + CallSite CS(CI); + CallingConv::ID CalleeCC = CS.getCallingConv(); + if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) + return false; + + return true; +} + /// FuncIsMadeTailCallSafe - Return true if the function is being made into /// a tailcall target by changing its ABI. static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { @@ -1627,8 +1639,9 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, // In case of tail call optimization mark all arguments mutable. Since they // could be overwritten by lowering of arguments in case of a tail call. if (Flags.isByVal()) { - int FI = MFI->CreateFixedObject(Flags.getByValSize(), - VA.getLocMemOffset(), isImmutable); + unsigned Bytes = Flags.getByValSize(); + if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. + int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); return DAG.getFrameIndex(FI, getPointerTy()); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, @@ -1765,8 +1778,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { - if (!IsWin64 && (Is64Bit || (CallConv != CallingConv::X86_FastCall && - CallConv != CallingConv::X86_ThisCall))) { + if (Is64Bit || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall)) { FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); } if (Is64Bit) { @@ -1818,7 +1831,9 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, int HomeOffset = TFI.getOffsetOfLocalArea() + 8; FuncInfo->setRegSaveFrameIndex( MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); - FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + // Fixup to set vararg frame on shadow area (4 x i64). 
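// --- Sketch, not part of the patch: the rule getTypeForExtArgOrReturn
// --- implements above, in a standalone model (bit widths stand in for the
// --- LLVM EVTs). A zero-extended i1 return needs only an 8-bit register on
// --- x86-64; everything else is widened to at least i32.
#include <cassert>

static unsigned typeForExtReturn(unsigned Bits, bool Is64Bit, bool IsZExt) {
  unsigned MinBits = (Is64Bit && Bits == 1 && IsZExt) ? 8 : 32;
  return Bits < MinBits ? MinBits : Bits;   // VT.bitsLT(MinVT) ? MinVT : VT
}

int main() {
  assert(typeForExtReturn(1, true, true) == 8);     // i1 zext: AL suffices
  assert(typeForExtReturn(1, false, true) == 32);   // 32-bit: widen to i32
  assert(typeForExtReturn(16, true, false) == 32);  // sext i16 -> i32
}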
+ if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { // For X86-64, if there are vararg parameters that are passed via // registers, then we must store them to their spots on the stack so they @@ -1937,7 +1952,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, return SDValue(OutRetAddr.getNode(), 1); } -/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call +/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, @@ -2028,7 +2043,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); SDValue RetAddrFrIdx; - // Load return adress for tail calls. + // Load return address for tail calls. if (isTailCall && FPDiff) Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); @@ -2185,7 +2200,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<SDValue, 8> MemOpChains2; SDValue FIN; int FI = 0; - // Do not flag preceeding copytoreg stuff together with the following stuff. + // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); if (GuaranteedTailCallOpt) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { @@ -2266,7 +2281,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (GV->isDeclaration() || GV->isWeakForLinker()) && - Subtarget->getDarwinVers() < 9) { + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -2285,7 +2301,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, getTargetMachine().getRelocationModel() == Reloc::PIC_) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && - Subtarget->getDarwinVers() < 9) { + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -3173,7 +3190,8 @@ bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { unsigned NumElems = N->getValueType(0).getVectorNumElements(); - if (NumElems != 2 && NumElems != 4) + if ((NumElems != 2 && NumElems != 4) + || N->getValueType(0).getSizeInBits() > 128) return false; for (unsigned i = 0; i < NumElems/2; ++i) @@ -3195,19 +3213,36 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) return false; - for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (V2IsSplat) { - if (!isUndefOrEqual(BitI1, NumElts)) - return false; - } else { - if (!isUndefOrEqual(BitI1, j + NumElts)) + // Handle vector lengths > 128 bits. Define a "section" as a set of + // 128 bits. AVX defines UNPCK* to operate independently on 128-bit + // sections. 
+ unsigned NumSections = VT.getSizeInBits() / 128; + if (NumSections == 0 ) NumSections = 1; // Handle MMX + unsigned NumSectionElts = NumElts / NumSections; + + unsigned Start = 0; + unsigned End = NumSectionElts; + for (unsigned s = 0; s < NumSections; ++s) { + for (unsigned i = Start, j = s * NumSectionElts; + i != End; + i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) return false; + if (V2IsSplat) { + if (!isUndefOrEqual(BitI1, NumElts)) + return false; + } else { + if (!isUndefOrEqual(BitI1, j + NumElts)) + return false; + } } + // Process the next 128 bits. + Start += NumSectionElts; + End += NumSectionElts; } + return true; } @@ -3255,14 +3290,27 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) return false; - for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (!isUndefOrEqual(BitI1, j)) - return false; + // Handle vector lengths > 128 bits. Define a "section" as a set of + // 128 bits. AVX defines UNPCK* to operate independently on 128-bit + // sections. + unsigned NumSections = VT.getSizeInBits() / 128; + if (NumSections == 0 ) NumSections = 1; // Handle MMX + unsigned NumSectionElts = NumElems / NumSections; + + for (unsigned s = 0; s < NumSections; ++s) { + for (unsigned i = s * NumSectionElts, j = s * NumSectionElts; + i != NumSectionElts * (s + 1); + i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + + if (!isUndefOrEqual(BitI, j)) + return false; + if (!isUndefOrEqual(BitI1, j)) + return false; + } } + return true; } @@ -3846,8 +3894,8 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, /// getShuffleScalarElt - Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. -SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, - unsigned Depth) { +static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, + unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. @@ -3895,11 +3943,15 @@ SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, case X86ISD::PUNPCKLWD: case X86ISD::PUNPCKLDQ: case X86ISD::PUNPCKLQDQ: - DecodePUNPCKLMask(NumElems, ShuffleMask); + DecodePUNPCKLMask(VT, ShuffleMask); break; case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: - DecodeUNPCKLPMask(NumElems, ShuffleMask); + case X86ISD::VUNPCKLPS: + case X86ISD::VUNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: + DecodeUNPCKLPMask(VT, ShuffleMask); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, ShuffleMask); @@ -3968,7 +4020,7 @@ SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, /// getNumOfConsecutiveZeros - Return the number of elements of a vector /// shuffle operation which come from a consecutively from a zero. The -/// search can start in two diferent directions, from left or right. +/// search can start in two different directions, from left or right. static unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, bool ZerosFromLeft, SelectionDAG &DAG) { @@ -5263,6 +5315,7 @@ LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // Break it into (shuffle shuffle_hi, shuffle_lo). 
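// --- Sketch, not part of the patch: the per-128-bit-"section" mask shape the
// --- loops above accept. With AVX, unpcklps interleaves independently within
// --- each 128-bit lane, so the v8f32 mask is {0,8,1,9, 4,12,5,13}, not
// --- {0,8,1,9,2,10,3,11}. Standalone generator for the expected mask:
#include <cassert>
#include <vector>

static std::vector<int> unpcklMask(unsigned NumElts, unsigned VecBits) {
  unsigned NumSections = VecBits / 128;
  if (NumSections == 0) NumSections = 1;          // MMX: one 64-bit section
  unsigned SecElts = NumElts / NumSections;
  std::vector<int> M;
  for (unsigned s = 0; s != NumSections; ++s)
    for (unsigned j = s * SecElts; j != s * SecElts + SecElts / 2; ++j) {
      M.push_back(int(j));                        // element from V1
      M.push_back(int(j + NumElts));              // interleaved from V2
    }
  return M;
}

int main() {
  const int Expect[8] = {0, 8, 1, 9, 4, 12, 5, 13};
  std::vector<int> M = unpcklMask(8, 256);        // v8f32 under AVX
  for (int i = 0; i != 8; ++i)
    assert(M[size_t(i)] == Expect[i]);
}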
Locs.clear(); + Locs.resize(4); SmallVector<int,8> LoMask(4U, -1); SmallVector<int,8> HiMask(4U, -1); @@ -5508,12 +5561,16 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { X86::getShuffleSHUFImmediate(SVOp), DAG); } -static inline unsigned getUNPCKLOpcode(EVT VT) { +static inline unsigned getUNPCKLOpcode(EVT VT, const X86Subtarget *Subtarget) { switch(VT.getSimpleVT().SimpleTy) { case MVT::v4i32: return X86ISD::PUNPCKLDQ; case MVT::v2i64: return X86ISD::PUNPCKLQDQ; - case MVT::v4f32: return X86ISD::UNPCKLPS; - case MVT::v2f64: return X86ISD::UNPCKLPD; + case MVT::v4f32: + return Subtarget->hasAVX() ? X86ISD::VUNPCKLPS : X86ISD::UNPCKLPS; + case MVT::v2f64: + return Subtarget->hasAVX() ? X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD; + case MVT::v8f32: return X86ISD::VUNPCKLPSY; + case MVT::v4f64: return X86ISD::VUNPCKLPDY; case MVT::v16i8: return X86ISD::PUNPCKLBW; case MVT::v8i16: return X86ISD::PUNPCKLWD; default: @@ -5641,7 +5698,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // unpckh_undef). Only use pshufd if speed is more important than size. if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) if (VT != MVT::v2i64 && VT != MVT::v2f64) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), dl, VT, V1, V1, DAG); if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) if (VT != MVT::v2i64 && VT != MVT::v2f64) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); @@ -5762,7 +5819,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } if (X86::isUNPCKLMask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), + dl, VT, V1, V2, DAG); if (X86::isUNPCKHMask(SVOp)) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); @@ -5789,7 +5847,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); if (X86::isUNPCKLMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), + dl, VT, V2, V1, DAG); if (X86::isUNPCKHMask(NewSVOp)) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); @@ -5812,8 +5871,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { - if (VT == MVT::v2f64) - return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); + if (VT == MVT::v2f64) { + X86ISD::NodeType Opcode = + getSubtarget()->hasAVX() ? 
X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD; + return getTargetShuffleNode(Opcode, dl, VT, V1, V1, DAG); + } if (VT == MVT::v2i64) return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); } @@ -5840,7 +5902,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (X86::isUNPCKL_v_undef_Mask(SVOp)) if (VT != MVT::v2i64 && VT != MVT::v2f64) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), + dl, VT, V1, V1, DAG); if (X86::isUNPCKH_v_undef_Mask(SVOp)) if (VT != MVT::v2i64 && VT != MVT::v2f64) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); @@ -7868,6 +7931,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && "This should be used only on Windows targets"); + assert(!Subtarget->isTargetEnvMacho()); DebugLoc dl = Op.getDebugLoc(); // Get the inputs. @@ -7878,8 +7942,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Flag; EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; + unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); - Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); + Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); Flag = Chain.getValue(1); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -8809,8 +8874,8 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { case ISD::SADDO: // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) - if (C->getAPIntValue() == 1) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) + if (C->isOne()) { BaseOp = X86ISD::INC; Cond = X86::COND_O; break; @@ -8825,8 +8890,8 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { case ISD::SSUBO: // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) - if (C->getAPIntValue() == 1) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) + if (C->isOne()) { BaseOp = X86ISD::DEC; Cond = X86::COND_O; break; @@ -10351,21 +10416,48 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); + assert(!Subtarget->isTargetEnvMacho()); + // The lowering is pretty easy: we're just emitting the call to _alloca. The // non-trivial part is impdef of ESP. - // FIXME: The code should be tweaked as soon as we'll try to do codegen for - // mingw-w64. - const char *StackProbeSymbol = + if (Subtarget->isTargetWin64()) { + if (Subtarget->isTargetCygMing()) { + // ___chkstk(Mingw64): + // Clobbers R10, R11, RAX and EFLAGS. + // Updates RSP. + BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) + .addExternalSymbol("___chkstk") + .addReg(X86::RAX, RegState::Implicit) + .addReg(X86::RSP, RegState::Implicit) + .addReg(X86::RAX, RegState::Define | RegState::Implicit) + .addReg(X86::RSP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + } else { + // __chkstk(MSVCRT): does not update stack pointer. + // Clobbers R10, R11 and EFLAGS. + // FIXME: RAX(allocated size) might be reused and not killed. 
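// --- Sketch, not part of the patch: as the comments above note, MinGW64's
// --- ___chkstk both probes and moves RSP, while MSVC's __chkstk only probes,
// --- so the lowering emits an explicit SUB64rr RSP, RAX afterwards. A tiny
// --- standalone model of the caller's obligation:
#include <cassert>
#include <cstdint>

struct Cpu { uint64_t rsp; uint64_t rax; };

static void chkstkMinGW(Cpu &C) { C.rsp -= C.rax; }  // probes and adjusts RSP
static void chkstkMSVC(Cpu &C)  { (void)C; }         // probes only

static void emitWinAlloca(Cpu &C, uint64_t Size, bool IsMinGW) {
  C.rax = Size;                       // allocation size is passed in RAX
  if (IsMinGW) {
    chkstkMinGW(C);
  } else {
    chkstkMSVC(C);
    C.rsp -= C.rax;                   // the extra SUB64rr RSP, RAX
  }
}

int main() {
  Cpu A{0x100000, 0}, B{0x100000, 0};
  emitWinAlloca(A, 0x2000, true);
  emitWinAlloca(B, 0x2000, false);
  assert(A.rsp == B.rsp && A.rsp == 0xFE000);   // same final RSP either way
}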
+ BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) + .addExternalSymbol("__chkstk") + .addReg(X86::RAX, RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + // RAX has the offset to subtracted from RSP. + BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(X86::RAX); + } + } else { + const char *StackProbeSymbol = Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; - BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) - .addExternalSymbol(StackProbeSymbol) - .addReg(X86::EAX, RegState::Implicit) - .addReg(X86::ESP, RegState::Implicit) - .addReg(X86::EAX, RegState::Define | RegState::Implicit) - .addReg(X86::ESP, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) + .addExternalSymbol(StackProbeSymbol) + .addReg(X86::EAX, RegState::Implicit) + .addReg(X86::ESP, RegState::Implicit) + .addReg(X86::EAX, RegState::Define | RegState::Implicit) + .addReg(X86::ESP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + } MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; @@ -12126,7 +12218,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces.clear(); SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. - // FIXME: this should verify that we are targetting a 486 or better. If not, + // FIXME: this should verify that we are targeting a 486 or better. If not, // we will turn this bswap into something that will be lowered to logical ops // instead of emitting the bswap asm. For now, we don't support 486 or lower // so don't worry about this. diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 6ec4a7d..6301057 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -677,9 +677,6 @@ namespace llvm { /// getFunctionAlignment - Return the Log2 alignment of this function. virtual unsigned getFunctionAlignment(const Function *F) const; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const; - /// getStackCookieLocation - Return true if the target stores stack /// protector cookies at a fixed offset in some non-standard address /// space, and populates the address space and offset as @@ -846,6 +843,12 @@ namespace llvm { virtual bool isUsedByReturnOnly(SDNode *N) const; + virtual bool mayBeEmittedAsTailCall(CallInst *CI) const; + + virtual EVT + getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const; + virtual bool CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td index 45d1c6b..dd4f6a5 100644 --- a/lib/Target/X86/X86Instr3DNow.td +++ b/lib/Target/X86/X86Instr3DNow.td @@ -12,66 +12,91 @@ // //===----------------------------------------------------------------------===// -// FIXME: We don't support any intrinsics for these instructions yet. 
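// --- Sketch, not part of the patch: the 3DNow! multiclasses below derive
// --- each intrinsic name from the mnemonic via
// --- !strconcat("int_x86_3dnow", Ver, "_", Mn). A trivial standalone model
// --- of that name construction:
#include <cassert>
#include <string>

static std::string intrinsicName(const std::string &Mn,
                                 const std::string &Ver) {
  return "int_x86_3dnow" + Ver + "_" + Mn;
}

int main() {
  assert(intrinsicName("pfadd", "")   == "int_x86_3dnow_pfadd");
  assert(intrinsicName("pswapd", "a") == "int_x86_3dnowa_pswapd");  // 3DNowA
}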
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat> + : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> { +} -class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, Requires<[Has3DNow]> { +class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>, + Has3DNow0F0FOpcode { + // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. + let isAsmParserOnly = 1; + let Constraints = "$src1 = $dst"; } -class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic> - : I<o, F, (outs VR64:$dst), ins, - !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), []>, - TB, Requires<[Has3DNow]>, Has3DNow0F0FOpcode { +class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, + Has3DNow0F0FOpcode { // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. let isAsmParserOnly = 1; } +multiclass I3DNow_binop_rm<bits<8> opc, string Mn> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>; +} + +multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>; +} + +multiclass I3DNow_conv_rm<bits<8> opc, string Mn> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>; +} -let Constraints = "$src1 = $dst" in { - // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. - // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. 
- multiclass I3DNow_binop_rm<bits<8> opc, string Mn> { - def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn>; - def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn>; - } +multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) + (bitconvert (load_mmx addr:$src))))]>; } -defm PAVGUSB : I3DNow_binop_rm<0xBF, "pavgusb">; -defm PF2ID : I3DNow_binop_rm<0x1D, "pf2id">; -defm PFACC : I3DNow_binop_rm<0xAE, "pfacc">; -defm PFADD : I3DNow_binop_rm<0x9E, "pfadd">; -defm PFCMPEQ : I3DNow_binop_rm<0xB0, "pfcmpeq">; -defm PFCMPGE : I3DNow_binop_rm<0x90, "pfcmpge">; -defm PFCMPGT : I3DNow_binop_rm<0xA0, "pfcmpgt">; -defm PFMAX : I3DNow_binop_rm<0xA4, "pfmax">; -defm PFMIN : I3DNow_binop_rm<0x94, "pfmin">; -defm PFMUL : I3DNow_binop_rm<0xB4, "pfmul">; -defm PFRCP : I3DNow_binop_rm<0x96, "pfrcp">; -defm PFRCPIT1 : I3DNow_binop_rm<0xA6, "pfrcpit1">; -defm PFRCPIT2 : I3DNow_binop_rm<0xB6, "pfrcpit2">; -defm PFRSQIT1 : I3DNow_binop_rm<0xA7, "pfrsqit1">; -defm PFRSQRT : I3DNow_binop_rm<0x97, "pfrsqrt">; -defm PFSUB : I3DNow_binop_rm<0x9A, "pfsub">; -defm PFSUBR : I3DNow_binop_rm<0xAA, "pfsubr">; -defm PI2FD : I3DNow_binop_rm<0x0D, "pi2fd">; -defm PMULHRW : I3DNow_binop_rm<0xB7, "pmulhrw">; +defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb">; +defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">; +defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">; +defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">; +defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq">; +defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">; +defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">; +defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">; +defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">; +defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul">; +defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">; +defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">; +defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">; +defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">; +defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">; +defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub">; +defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr">; +defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">; +defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">; def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>; def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i32mem:$addr), "prefetch $addr", []>; - + // FIXME: Diassembler gets a bogus decode conflict. 
-let isAsmParserOnly = 1 in { +let isAsmParserOnly = 1 in def PREFETCHW : I3DNow<0x0D, MRM1m, (outs), (ins i16mem:$addr), "prefetchw $addr", []>; -} // "3DNowA" instructions -defm PF2IW : I3DNow_binop_rm<0x1C, "pf2iw">; -defm PI2FW : I3DNow_binop_rm<0x0C, "pi2fw">; -defm PFNACC : I3DNow_binop_rm<0x8A, "pfnacc">; -defm PFPNACC : I3DNow_binop_rm<0x8E, "pfpnacc">; -defm PSWAPD : I3DNow_binop_rm<0xBB, "pswapd">; +defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">; +defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">; +defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">; +defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">; +defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index f0ea068..9f7a4b0 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -163,7 +163,7 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), } // Defs = [EFLAGS] -// Suprisingly enough, these are not two address instructions! +// Surprisingly enough, these are not two address instructions! let Defs = [EFLAGS] in { // Register-Integer Signed Integer Multiply def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index 77f4725..c228a0ae 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -263,6 +263,16 @@ let isCall = 1, isCodeGenOnly = 1 in Requires<[IsWin64]>; } +let isCall = 1, isCodeGenOnly = 1 in + // __chkstk(MSVC): clobber R10, R11 and EFLAGS. + // ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP. + let Defs = [RAX, R10, R11, RSP, EFLAGS], + Uses = [RSP] in { + def W64ALLOCA : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i64i32imm_pcrel:$dst, variable_ops), + "call{q}\t$dst", []>, + Requires<[IsWin64]>; + } let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, isCodeGenOnly = 1 in diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 0660072..7daa264 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -91,21 +91,23 @@ class REX_W { bit hasREX_WPrefix = 1; } class LOCK { bit hasLockPrefix = 1; } class SegFS { bits<2> SegOvrBits = 1; } class SegGS { bits<2> SegOvrBits = 2; } -class TB { bits<4> Prefix = 1; } -class REP { bits<4> Prefix = 2; } -class D8 { bits<4> Prefix = 3; } -class D9 { bits<4> Prefix = 4; } -class DA { bits<4> Prefix = 5; } -class DB { bits<4> Prefix = 6; } -class DC { bits<4> Prefix = 7; } -class DD { bits<4> Prefix = 8; } -class DE { bits<4> Prefix = 9; } -class DF { bits<4> Prefix = 10; } -class XD { bits<4> Prefix = 11; } -class XS { bits<4> Prefix = 12; } -class T8 { bits<4> Prefix = 13; } -class TA { bits<4> Prefix = 14; } -class TF { bits<4> Prefix = 15; } +class TB { bits<5> Prefix = 1; } +class REP { bits<5> Prefix = 2; } +class D8 { bits<5> Prefix = 3; } +class D9 { bits<5> Prefix = 4; } +class DA { bits<5> Prefix = 5; } +class DB { bits<5> Prefix = 6; } +class DC { bits<5> Prefix = 7; } +class DD { bits<5> Prefix = 8; } +class DE { bits<5> Prefix = 9; } +class DF { bits<5> Prefix = 10; } +class XD { bits<5> Prefix = 11; } +class XS { bits<5> Prefix = 12; } +class T8 { bits<5> Prefix = 13; } +class TA { bits<5> Prefix = 14; } +class A6 { bits<5> Prefix = 15; } +class A7 { bits<5> Prefix = 16; } +class TF { bits<5> Prefix = 17; } class VEX { bit hasVEXPrefix = 1; } class VEX_W { bit hasVEX_WPrefix = 1; } class VEX_4V : VEX { bit 
hasVEX_4VPrefix = 1; } @@ -136,7 +138,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasOpSizePrefix = 0; // Does this inst have a 0x66 prefix? bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix? - bits<4> Prefix = 0; // Which prefix byte does this inst have? + bits<5> Prefix = 0; // Which prefix byte does this inst have? bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix? FPFormat FPForm = NotFP; // What flavor of FP instruction is this? bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? @@ -154,20 +156,20 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{5-0} = FormBits; let TSFlags{6} = hasOpSizePrefix; let TSFlags{7} = hasAdSizePrefix; - let TSFlags{11-8} = Prefix; - let TSFlags{12} = hasREX_WPrefix; - let TSFlags{15-13} = ImmT.Value; - let TSFlags{18-16} = FPForm.Value; - let TSFlags{19} = hasLockPrefix; - let TSFlags{21-20} = SegOvrBits; - let TSFlags{23-22} = ExeDomain.Value; - let TSFlags{31-24} = Opcode; - let TSFlags{32} = hasVEXPrefix; - let TSFlags{33} = hasVEX_WPrefix; - let TSFlags{34} = hasVEX_4VPrefix; - let TSFlags{35} = hasVEX_i8ImmReg; - let TSFlags{36} = hasVEX_L; - let TSFlags{37} = has3DNow0F0FOpcode; + let TSFlags{12-8} = Prefix; + let TSFlags{13} = hasREX_WPrefix; + let TSFlags{16-14} = ImmT.Value; + let TSFlags{19-17} = FPForm.Value; + let TSFlags{20} = hasLockPrefix; + let TSFlags{22-21} = SegOvrBits; + let TSFlags{24-23} = ExeDomain.Value; + let TSFlags{32-25} = Opcode; + let TSFlags{33} = hasVEXPrefix; + let TSFlags{34} = hasVEX_WPrefix; + let TSFlags{35} = hasVEX_4VPrefix; + let TSFlags{36} = hasVEX_i8ImmReg; + let TSFlags{37} = hasVEX_L; + let TSFlags{38} = has3DNow0F0FOpcode; } class PseudoI<dag oops, dag iops, list<dag> pattern> @@ -319,7 +321,7 @@ class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[HasAVX]>; class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, + : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, TB, Requires<[HasAVX]>; // SSE2 Instruction Templates: @@ -353,7 +355,7 @@ class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[HasAVX]>; class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>, + : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>, TB, OpSize, Requires<[HasAVX]>; // SSE3 Instruction Templates: diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 5016c0f..3cbfac1 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -132,6 +132,8 @@ def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>; def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>; +def X86Unpcklpsy : SDNode<"X86ISD::VUNPCKLPSY", SDTShuff2Op>; +def X86Unpcklpdy : SDNode<"X86ISD::VUNPCKLPDY", SDTShuff2Op>; def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>; def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 76a9b12..83f0260 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -232,7 +232,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable2Addr.count(RegOp) && "Duplicated entries?"); 
RegOp2MemOpTable2Addr[RegOp] = std::make_pair(MemOp, 0U); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl2Addr[i][1] & TB_NOT_REVERSABLE) continue; @@ -335,7 +335,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable0.count(RegOp) && "Duplicated entries?"); RegOp2MemOpTable0[RegOp] = std::make_pair(MemOp, Align); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl0[i][1] & TB_NOT_REVERSABLE) continue; @@ -460,7 +460,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable1.count(RegOp) && "Duplicate entries"); RegOp2MemOpTable1[RegOp] = std::make_pair(MemOp, Align); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl1[i][1] & TB_NOT_REVERSABLE) continue; @@ -682,7 +682,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable2.count(RegOp) && "Duplicate entry!"); RegOp2MemOpTable2[RegOp] = std::make_pair(MemOp, Align); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl2[i][1] & TB_NOT_REVERSABLE) continue; @@ -916,7 +916,6 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::MOVSDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MMX_MOVD64rm: @@ -2241,6 +2240,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, bool isTwoAddr = NumOps > 1 && MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1; + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. + if (MI->getOpcode() == X86::ADD32ri && + MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return NULL; + MachineInstr *NewMI = NULL; // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires @@ -2535,6 +2540,12 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, case X86::TEST32rr: case X86::TEST64rr: return true; + case X86::ADD32ri: + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. 
+ if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return false; + break; } } @@ -2845,11 +2856,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: - case X86::MOVDQUrm_Int: break; } switch (Opc2) { @@ -2869,11 +2878,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: - case X86::MOVDQUrm_Int: break; } @@ -3085,12 +3092,8 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } -bool X86InstrInfo:: -hasHighOperandLatency(const InstrItineraryData *ItinData, - const MachineRegisterInfo *MRI, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const { - switch (DefMI->getOpcode()) { +bool X86InstrInfo::isHighLatencyDef(int opc) const { + switch (opc) { default: return false; case X86::DIVSDrm: case X86::DIVSDrm_Int: @@ -3120,6 +3123,14 @@ hasHighOperandLatency(const InstrItineraryData *ItinData, } } +bool X86InstrInfo:: +hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + return isHighLatencyDef(DefMI->getOpcode()); +} + namespace { /// CGBR - Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index fcb5a25..8da68b5 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -33,15 +33,15 @@ namespace X86 { AddrScaleAmt = 1, AddrIndexReg = 2, AddrDisp = 3, - + /// AddrSegmentReg - The operand # of the segment in the memory operand. AddrSegmentReg = 4, /// AddrNumOperands - Total number of operands in a memory reference. AddrNumOperands = 5 }; - - + + // X86 specific condition code. These correspond to X86_*_COND in // X86InstrInfo.td. They must be kept in synch. enum CondCode { @@ -72,16 +72,16 @@ namespace X86 { COND_INVALID }; - + // Turn condition code into conditional branch opcode. unsigned GetCondBranchFromCond(CondCode CC); - + /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. CondCode GetOppositeBranchCondition(X86::CondCode CC); } - + /// X86II - This namespace holds all of the target specific flags that /// instruction info tracks. /// @@ -90,14 +90,14 @@ namespace X86II { enum TOF { //===------------------------------------------------------------------===// // X86 Specific MachineOperand flags. - + MO_NO_FLAG, - + /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a /// relocation of: /// SYMBOL_LABEL + [. - PICBASELABEL] MO_GOT_ABSOLUTE_ADDRESS, - + /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the /// immediate should get the value of the symbol minus the PIC base label: /// SYMBOL_LABEL - PICBASELABEL @@ -106,77 +106,77 @@ namespace X86II { /// MO_GOT - On a symbol operand this indicates that the immediate is the /// offset to the GOT entry for the symbol name from the base of the GOT. /// - /// See the X86-64 ELF ABI supplement for more details. + /// See the X86-64 ELF ABI supplement for more details. 
/// SYMBOL_LABEL @GOT MO_GOT, - + /// MO_GOTOFF - On a symbol operand this indicates that the immediate is - /// the offset to the location of the symbol name from the base of the GOT. + /// the offset to the location of the symbol name from the base of the GOT. /// - /// See the X86-64 ELF ABI supplement for more details. + /// See the X86-64 ELF ABI supplement for more details. /// SYMBOL_LABEL @GOTOFF MO_GOTOFF, - + /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is /// offset to the GOT entry for the symbol name from the current code - /// location. + /// location. /// - /// See the X86-64 ELF ABI supplement for more details. + /// See the X86-64 ELF ABI supplement for more details. /// SYMBOL_LABEL @GOTPCREL MO_GOTPCREL, - + /// MO_PLT - On a symbol operand this indicates that the immediate is - /// offset to the PLT entry of symbol name from the current code location. + /// offset to the PLT entry of symbol name from the current code location. /// - /// See the X86-64 ELF ABI supplement for more details. + /// See the X86-64 ELF ABI supplement for more details. /// SYMBOL_LABEL @PLT MO_PLT, - + /// MO_TLSGD - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @TLSGD MO_TLSGD, - + /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @GOTTPOFF MO_GOTTPOFF, - + /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @INDNTPOFF MO_INDNTPOFF, - + /// MO_TPOFF - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @TPOFF MO_TPOFF, - + /// MO_NTPOFF - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @NTPOFF MO_NTPOFF, - + /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the /// reference is actually to the "__imp_FOO" symbol. This is used for /// dllimport linkage on windows. MO_DLLIMPORT, - + /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the /// reference is actually to the "FOO$stub" symbol. This is used for calls /// and jumps to external functions on Tiger and earlier. MO_DARWIN_STUB, - + /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub. @@ -186,19 +186,19 @@ namespace X86II { /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub. 
MO_DARWIN_NONLAZY_PIC_BASE, - + /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE", /// which is a PIC-base-relative reference to a hidden dyld lazy pointer /// stub. MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, - + /// MO_TLVP - On a symbol operand this indicates that the immediate is /// some TLS offset. /// /// This is the TLS offset for the Darwin TLS mechanism. MO_TLVP, - + /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate /// is some TLS offset from the picbase. /// @@ -239,7 +239,7 @@ inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) { return false; } } - + /// X86II - This namespace holds all of the target specific flags that /// instruction info tracks. /// @@ -299,7 +299,7 @@ namespace X86II { // MRMInitReg - This form is used for instructions whose source and // destinations are the same register. MRMInitReg = 32, - + //// MRM_C1 - A mod/rm byte of exactly 0xC1. MRM_C1 = 33, MRM_C2 = 34, @@ -318,7 +318,7 @@ namespace X86II { /// immediates, the first of which is a 16-bit immediate (specified by /// the imm encoding) and the second is a 8-bit fixed value. RawFrmImm8 = 43, - + /// RawFrmImm16 - This is used for CALL FAR instructions, which have two /// immediates, the first of which is a 16 or 32-bit immediate (specified by /// the imm encoding) and the second is a 16-bit fixed value. In the AMD @@ -347,7 +347,7 @@ namespace X86II { // set, there is no prefix byte for obtaining a multibyte opcode. // Op0Shift = 8, - Op0Mask = 0xF << Op0Shift, + Op0Mask = 0x1F << Op0Shift, // TB - TwoByte - Set if this instruction has a two byte opcode, which // starts with a 0x0F byte before the real opcode. @@ -368,11 +368,12 @@ namespace X86II { // floating point operations performed in the SSE registers. XD = 11 << Op0Shift, XS = 12 << Op0Shift, - // T8, TA - Prefix after the 0x0F prefix. + // T8, TA, A6, A7 - Prefix after the 0x0F prefix. T8 = 13 << Op0Shift, TA = 14 << Op0Shift, - + A6 = 15 << Op0Shift, A7 = 16 << Op0Shift, + // TF - Prefix before and after 0x0F - TF = 15 << Op0Shift, + TF = 17 << Op0Shift, //===------------------------------------------------------------------===// // REX_W - REX prefixes are instruction prefixes used in 64-bit mode. @@ -380,13 +381,13 @@ namespace X86II { // etc. We only care about REX.W and REX.R bits and only the former is // statically determined. // - REXShift = 12, + REXShift = Op0Shift + 5, REX_W = 1 << REXShift, //===------------------------------------------------------------------===// // This three-bit field describes the size of an immediate operand. Zero is // unused so that we can tell if we forgot to set a value. - ImmShift = 13, + ImmShift = REXShift + 1, ImmMask = 7 << ImmShift, Imm8 = 1 << ImmShift, Imm8PCRel = 2 << ImmShift, @@ -400,7 +401,7 @@ namespace X86II { // FP Instruction Classification... Zero is non-fp instruction. // FPTypeMask - Mask for all of the FP types... - FPTypeShift = 16, + FPTypeShift = ImmShift + 3, FPTypeMask = 7 << FPTypeShift, // NotFP - The default, set for instructions that do not use FP registers. @@ -433,25 +434,26 @@ namespace X86II { SpecialFP = 7 << FPTypeShift, // Lock prefix - LOCKShift = 19, + LOCKShift = FPTypeShift + 3, LOCK = 1 << LOCKShift, // Segment override prefixes. Currently we just need the ability to address // stuff in gs and fs segments.
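An aside on the relayout above: with the prefix field widened to five bits, every later field is defined by chaining off the previous shift, which puts the 8-bit opcode at bit 25. A minimal C++ sketch, assuming illustrative offsets that mirror that layout (names here are hypothetical, not LLVM's API), of how such a packed TSFlags word is decoded and why the opcode mask must become a 64-bit constant; the remaining X86II fields continue in the diff below:

#include <cassert>
#include <cstdint>

// Illustrative offsets mirroring the chained-shift layout: Prefix is 5 bits
// at bit 8, and the 8-bit opcode lands at bit 25.
enum : unsigned { DemoOp0Shift = 8, DemoOpcodeShift = 25 };

// Decoding a field is a shift down plus a mask of the field's width.
static unsigned demoGetPrefix(uint64_t TSFlags) {
  return (TSFlags >> DemoOp0Shift) & 0x1F;          // 5 bits wide now, not 4
}
static unsigned char demoGetOpcode(uint64_t TSFlags) {
  return (unsigned char)(TSFlags >> DemoOpcodeShift);
}

int main() {
  // 0xFF << 25 reaches bit 32 and overflows a 32-bit int, which is why the
  // mask in the diff is spelled 0xFFULL << OpcodeShift.
  const uint64_t DemoOpcodeMask = 0xFFULL << DemoOpcodeShift;
  uint64_t Flags = (0x0FULL << DemoOpcodeShift) | (11ULL << DemoOp0Shift);
  assert(demoGetOpcode(Flags) == 0x0F);             // opcode round-trips
  assert(demoGetPrefix(Flags) == 11);               // prefix field (XD above)
  assert((Flags & DemoOpcodeMask) == (0x0FULL << DemoOpcodeShift));
  return 0;
}

The chaining also keeps later fields consistent automatically: VEXShift is simply OpcodeShift + 8, so growing any earlier field moves everything after it without hand-renumbering.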
- SegOvrShift = 20, + SegOvrShift = LOCKShift + 1, SegOvrMask = 3 << SegOvrShift, FS = 1 << SegOvrShift, GS = 2 << SegOvrShift, - // Execution domain for SSE instructions in bits 22, 23. - // 0 in bits 22-23 means normal, non-SSE instruction. - SSEDomainShift = 22, + // Execution domain for SSE instructions in bits 23, 24. + // 0 in bits 23-24 means normal, non-SSE instruction. + SSEDomainShift = SegOvrShift + 2, - OpcodeShift = 24, - OpcodeMask = 0xFF << OpcodeShift, + OpcodeShift = SSEDomainShift + 2, + OpcodeMask = 0xFFULL << OpcodeShift, //===------------------------------------------------------------------===// /// VEX - The opcode prefix used by AVX instructions + VEXShift = OpcodeShift + 8, VEX = 1U << 0, /// VEX_W - Has an opcode specific functionality, but is used in the same @@ -473,7 +475,7 @@ namespace X86II { /// if a VR256 register is used, but some AVX instructions also have this /// field marked when using an f256 memory reference. VEX_L = 1U << 4, - + /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction @@ -482,18 +484,18 @@ namespace X86II { /// this flag to indicate that the encoder should do the wacky 3DNow! thing. Has3DNow0F0FOpcode = 1U << 5 }; - + // getBaseOpcodeFor - This function returns the "base" X86 opcode for the // specified machine instruction. // static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) { return TSFlags >> X86II::OpcodeShift; } - + static inline bool hasImm(uint64_t TSFlags) { return (TSFlags & X86II::ImmMask) != 0; } - + /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field /// of the specified instruction. static inline unsigned getSizeOfImm(uint64_t TSFlags) { @@ -508,7 +510,7 @@ namespace X86II { case X86II::Imm64: return 8; } } - + /// isImmPCRel - Return true if the immediate of the specified instruction's /// TSFlags indicates that it is pc relative. static inline unsigned isImmPCRel(uint64_t TSFlags) { @@ -525,7 +527,7 @@ namespace X86II { return false; } } - + /// getMemoryOperandNo - The function returns the MCInst operand # for the /// first field of the memory operand. If the instruction doesn't have a /// memory operand, this returns -1. @@ -549,11 +551,11 @@ namespace X86II { case X86II::MRMDestMem: return 0; case X86II::MRMSrcMem: { - bool HasVEX_4V = (TSFlags >> 32) & X86II::VEX_4V; + bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). - + // FIXME: Maybe lea should have its own form? This is a horrible hack. //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || // Opcode == X86::LEA16r || Opcode == X86::LEA32r) @@ -613,7 +615,7 @@ inline static bool isMem(const MachineInstr *MI, unsigned Op) { class X86InstrInfo : public TargetInstrInfoImpl { X86TargetMachine &TM; const X86RegisterInfo RI; - + /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1, /// RegOp2MemOpTable2 - Load / store folding opcode maps. /// @@ -621,7 +623,7 @@ class X86InstrInfo : public TargetInstrInfoImpl { DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable0; DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable1; DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable2; - + /// MemOp2RegOpTable - Load / store unfolding opcode map.
/// DenseMap<unsigned, std::pair<unsigned, unsigned> > MemOp2RegOpTable; @@ -795,7 +797,7 @@ public: virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex = 0) const; - + /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler /// to determine if two loads are loading from the same base address. It /// should only return true if the base pointers are the same and the /// @@ -805,7 +807,7 @@ public: int64_t &Offset1, int64_t &Offset2) const; /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to - /// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should + /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should /// be scheduled together. On some targets if two loads are loading from /// addresses in the same cache line, it's better if they are scheduled /// together. This function takes two integers that represent the load offsets @@ -829,7 +831,7 @@ public: return (reg == X86::SPL || reg == X86::BPL || reg == X86::SIL || reg == X86::DIL); } - + static bool isX86_64ExtendedReg(const MachineOperand &MO) { if (!MO.isReg()) return false; return isX86_64ExtendedReg(MO.getReg()); @@ -858,11 +860,13 @@ public: const SmallVectorImpl<MachineOperand> &MOs, unsigned Size, unsigned Alignment) const; + bool isHighLatencyDef(int opc) const; + bool hasHighOperandLatency(const InstrItineraryData *ItinData, const MachineRegisterInfo *MRI, const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const; - + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index f832a7c..03a0b0c 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -459,7 +459,7 @@ def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; include "X86InstrFormats.td" //===----------------------------------------------------------------------===// -// Pattern fragments... +// Pattern fragments. // // X86 specific condition code. These correspond to CondCode in @@ -481,21 +481,21 @@ def X86_COND_O : PatLeaf<(i8 13)>; def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE def X86_COND_S : PatLeaf<(i8 15)>; -def immSext8 : PatLeaf<(imm), [{ return immSext8(N); }]>; +let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs. + def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; + def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; + def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>; +} -def i16immSExt8 : PatLeaf<(i16 immSext8)>; -def i32immSExt8 : PatLeaf<(i32 immSext8)>; -def i64immSExt8 : PatLeaf<(i64 immSext8)>; -def i64immSExt32 : PatLeaf<(i64 imm), [{ return i64immSExt32(N); }]>; -def i64immZExt32 : PatLeaf<(i64 imm), [{ - // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit - // unsignedsign extended field. - return (uint64_t)N->getZExtValue() == (uint32_t)N->getZExtValue(); -}]>; +def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>; + + +// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit // unsigned field.
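A note on the PatLeaf-to-ImmLeaf conversion above: ImmLeaf hands the predicate a plain integer rather than an SDNode, so each body reduces to a round-trip cast test. A minimal C++ sketch of those fit checks, with hypothetical helper names mirroring the predicate bodies (the i64immZExt32 def itself continues just below):

#include <cassert>
#include <cstdint>

// A value "fits" a narrower type exactly when casting to that type and back
// preserves it; these mirror the ImmLeaf bodies above.
static bool fitsSExt8(int64_t Imm)  { return Imm == (int8_t)Imm; }
static bool fitsSExt32(int64_t Imm) { return Imm == (int32_t)Imm; }
static bool fitsZExt32(int64_t Imm) { return (uint64_t)Imm == (uint32_t)Imm; }

int main() {
  assert(fitsSExt8(-128) && !fitsSExt8(128)); // int8_t range is [-128, 127]
  assert(fitsZExt32(0xFFFFFFFFLL));           // fits a 32-bit unsigned field
  assert(!fitsSExt32(0xFFFFFFFFLL));          // but sign-extending would
                                              // read it back as -1
  return 0;
}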
+def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>; -def i64immZExt32SExt8 : PatLeaf<(i64 imm), [{ - uint64_t v = N->getZExtValue(); - return v == (uint32_t)v && (int32_t)v == (int8_t)v; +def i64immZExt32SExt8 : ImmLeaf<i64, [{ + return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm; }]>; // Helper fragments for loads. @@ -1437,7 +1437,7 @@ def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>; // Various unary fpstack operations default to operating on ST1. // For example, "fxch" -> "fxch %st(1)" -def : InstAlias<"faddp", (ADD_FPrST0 ST1)>; +def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; def : InstAlias<"fsubp", (SUBR_FPrST0 ST1)>; def : InstAlias<"fsubrp", (SUB_FPrST0 ST1)>; def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>; @@ -1455,13 +1455,15 @@ def : InstAlias<"fucompi", (UCOM_FIPr ST1)>; // For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate // instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with // gas. -multiclass FpUnaryAlias<string Mnemonic, Instruction Inst> { - def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), (Inst RST:$op)>; - def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), (Inst ST0)>; +multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { + def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), + (Inst RST:$op), EmitAlias>; + def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), + (Inst ST0), EmitAlias>; } defm : FpUnaryAlias<"fadd", ADD_FST0r>; -defm : FpUnaryAlias<"faddp", ADD_FPrST0>; +defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; defm : FpUnaryAlias<"fsub", SUB_FST0r>; defm : FpUnaryAlias<"fsubp", SUBR_FPrST0>; defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; @@ -1472,8 +1474,8 @@ defm : FpUnaryAlias<"fdiv", DIV_FST0r>; defm : FpUnaryAlias<"fdivp", DIVR_FPrST0>; defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>; -defm : FpUnaryAlias<"fcomi", COM_FIr>; -defm : FpUnaryAlias<"fucomi", UCOM_FIr>; +defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; +defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; defm : FpUnaryAlias<"fcompi", COM_FIPr>; defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; @@ -1481,7 +1483,7 @@ defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; // Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they // commute. We also allow fdiv[r]p/fsubrp even though they don't commute, // solely because gas supports it. -def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op)>; +def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>; def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>; def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>; def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; @@ -1534,29 +1536,31 @@ def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>; def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>; // Match 'movq GR64, MMX' as an alias for movd. -def : InstAlias<"movq $src, $dst", (MMX_MOVD64to64rr VR64:$dst, GR64:$src)>; -def : InstAlias<"movq $src, $dst", (MMX_MOVD64from64rr GR64:$dst, VR64:$src)>; +def : InstAlias<"movq $src, $dst", + (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; +def : InstAlias<"movq $src, $dst", + (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; // movsd with no operands (as opposed to the SSE scalar move of a double) is an // alias for movsl.
(as in rep; movsd) def : InstAlias<"movsd", (MOVSD)>; // movsx aliases -def : InstAlias<"movsx $src, $dst", (MOVSX16rr8W GR16:$dst, GR8:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX16rm8W GR16:$dst, i8mem:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src)>; +def : InstAlias<"movsx $src, $dst", (MOVSX16rr8W GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX16rm8W GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; // movzx aliases -def : InstAlias<"movzx $src, $dst", (MOVZX16rr8W GR16:$dst, GR8:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX16rm8W GR16:$dst, i8mem:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src)>; +def : InstAlias<"movzx $src, $dst", (MOVZX16rr8W GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX16rm8W GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. // outb %dx -> outb %al, %dx diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index b912949..cde3f6b 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -135,18 +135,16 @@ class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, // is used instead. Register-to-register movss/movsd is not modeled as an // INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable // in terms of a copy, and just mentioned, we don't use movss/movsd for copies. 
-let isAsmParserOnly = 1 in { - def VMOVSSrr : sse12_move_rr<FR32, v4f32, - "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V; - def VMOVSDrr : sse12_move_rr<FR64, v2f64, - "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V; +def VMOVSSrr : sse12_move_rr<FR32, v4f32, + "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V; +def VMOVSDrr : sse12_move_rr<FR64, v2f64, + "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V; - let canFoldAsLoad = 1, isReMaterializable = 1 in { - def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX; +let canFoldAsLoad = 1, isReMaterializable = 1 in { + def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX; - let AddedComplexity = 20 in - def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX; - } + let AddedComplexity = 20 in + def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX; } let Constraints = "$src1 = $dst" in { @@ -218,14 +216,12 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", [(store FR64:$src, addr:$dst)]>; -let isAsmParserOnly = 1 in { def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), "movss\t{$src, $dst|$dst, $src}", [(store FR32:$src, addr:$dst)]>, XS, VEX; def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", [(store FR64:$src, addr:$dst)]>, XD, VEX; -} // Extract and store. def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), @@ -251,7 +247,6 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in [(set RC:$dst, (ld_frag addr:$src))], d>; } -let isAsmParserOnly = 1 in { defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle>, VEX; defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, @@ -269,7 +264,6 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", SSEPackedSingle>, VEX; defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, 0>, OpSize, VEX; -} defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle>, TB; defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, @@ -279,7 +273,6 @@ defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, 0>, TB, OpSize; -let isAsmParserOnly = 1 in { def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)]>, VEX; @@ -304,7 +297,6 @@ def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movupd\t{$src, $dst|$dst, $src}", [(store (v4f64 VR256:$src), addr:$dst)]>, VEX; -} def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>; def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), @@ -328,32 +320,14 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), [(store (v2f64 VR128:$src), addr:$dst)]>; // Intrinsic forms of MOVUPS/D load and store -let isAsmParserOnly = 1 in { - let canFoldAsLoad = 1, isReMaterializable = 1 in - def VMOVUPSrm_Int : VPSI<0x10, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "movups\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>, VEX; - def 
VMOVUPDrm_Int : VPDI<0x10, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>, VEX; - def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movups\t{$src, $dst|$dst, $src}", - [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX; - def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX; -} -let canFoldAsLoad = 1, isReMaterializable = 1 in -def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movups\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>; -def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>; +def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX; +def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX; def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movups\t{$src, $dst|$dst, $src}", @@ -382,7 +356,7 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC, SSEPackedDouble>, TB, OpSize; } -let isAsmParserOnly = 1, AddedComplexity = 20 in { +let AddedComplexity = 20 in { defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp", "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V; defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp", @@ -395,7 +369,6 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in { "\t{$src2, $dst|$dst, $src2}">; } -let isAsmParserOnly = 1 in { def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), @@ -404,7 +377,6 @@ def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), addr:$dst)]>, VEX; -} def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), @@ -416,7 +388,6 @@ def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), // v2f64 extract element 1 is always custom lowered to unpack high to low // and extract element 0 so the non-store version isn't too horrible. 
-let isAsmParserOnly = 1 in { def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract @@ -429,7 +400,6 @@ def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), (v2f64 (unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst)]>, VEX; -} def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract @@ -441,7 +411,7 @@ def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), (v2f64 (unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst)]>; -let isAsmParserOnly = 1, AddedComplexity = 20 in { +let AddedComplexity = 20 in { def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -516,7 +486,6 @@ multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>; } -let isAsmParserOnly = 1 in { defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX; defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, @@ -542,7 +511,6 @@ defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD, VEX_4V; defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD, VEX_4V, VEX_W; -} defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si\t{$src, $dst|$dst, $src}">, XS; @@ -591,27 +559,25 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>; } -let isAsmParserOnly = 1 in { - defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, - f32mem, load, "cvtss2si">, XS, VEX; - defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, - int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">, - XS, VEX, VEX_W; - defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, - f128mem, load, "cvtsd2si">, XD, VEX; - defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, - int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">, - XD, VEX, VEX_W; - - // FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_ - // Get rid of this hack or rename the intrinsics, there are several - // intructions that only match with the intrinsic form, why create duplicates - // to let them be recognized by the assembler? 
- defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem, - "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; - defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem, - "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W; -} +defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, + f32mem, load, "cvtss2si">, XS, VEX; +defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">, + XS, VEX, VEX_W; +defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, + f128mem, load, "cvtsd2si">, XD, VEX; +defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">, + XD, VEX, VEX_W; + +// FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_ +// Get rid of this hack or rename the intrinsics, there are several +// instructions that only match with the intrinsic form, why create duplicates +// to let them be recognized by the assembler? +defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem, + "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; +defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem, + "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W; defm Int_CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, f32mem, load, "cvtss2si">, XS; defm Int_CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, @@ -622,18 +588,16 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si{q}">, XD, REX_W; -let isAsmParserOnly = 1 in { - defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V; - defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V, - VEX_W; - defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V; - defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD, - VEX_4V, VEX_W; -} +defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V; +defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V, + VEX_W; +defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V; +defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD, + VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in { defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, @@ -653,7 +617,6 @@ let Constraints = "$src1 = $dst" in { /// SSE 1 Only // Aliases for intrinsics -let isAsmParserOnly = 1 in { defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, f32mem, load, "cvttss2si">, XS, VEX; defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, @@ -664,7 +627,6 @@ defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, f128mem, load, "cvttsd2si">, XD, VEX, VEX_W; -} defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, f32mem, load, "cvttss2si">, XS; defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, @@ -676,7 +638,7 @@ defm Int_CVTTSD2SI64 :
sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, f128mem, load, "cvttsd2si{q}">, XD, REX_W; -let isAsmParserOnly = 1, Pattern = []<dag> in { +let Pattern = []<dag> in { defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load, "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, VEX; defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load, @@ -702,7 +664,6 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/, /// SSE 2 Only // Convert scalar double to scalar single -let isAsmParserOnly = 1 in { def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, @@ -711,7 +672,6 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR64:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V; -} def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, Requires<[HasAVX]>; @@ -723,7 +683,6 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD, Requires<[HasSSE2, OptForSize]>; -let isAsmParserOnly = 1 in defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss", 0>, XS, VEX_4V; @@ -732,7 +691,7 @@ defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss">, XS; // Convert scalar single to scalar double -let isAsmParserOnly = 1 in { // SSE2 instructions with XS prefix +// SSE2 instructions with XS prefix def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -741,7 +700,6 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR32:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XS, VEX_4V, Requires<[HasAVX, OptForSize]>; -} def : Pat<(f64 (fextend FR32:$src)), (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>; @@ -754,7 +712,6 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), [(set FR64:$dst, (extloadf32 addr:$src))]>, XS, Requires<[HasSSE2, OptForSize]>; -let isAsmParserOnly = 1 in { def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -767,7 +724,6 @@ def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, (load addr:$src2)))]>, XS, VEX_4V, Requires<[HasAVX]>; -} let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -788,7 +744,7 @@ def : Pat<(extloadf32 addr:$src), Requires<[HasSSE2, OptForSpeed]>; // Convert doubleword to packed single/double fp -let isAsmParserOnly = 1 in { // SSE2 instructions without OpSize prefix +// SSE2 instructions without OpSize prefix def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>, @@ -798,7 +754,6 @@ def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtdq2ps (bitconvert (memopv2i64 addr:$src))))]>, TB, VEX, Requires<[HasAVX]>; -} def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps 
VR128:$src))]>, @@ -810,7 +765,7 @@ def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), TB, Requires<[HasSSE2]>; // FIXME: why the non-intrinsic version is described as SSE3? -let isAsmParserOnly = 1 in { // SSE2 instructions with XS prefix +// SSE2 instructions with XS prefix def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, @@ -820,7 +775,6 @@ def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtdq2pd (bitconvert (memopv2i64 addr:$src))))]>, XS, VEX, Requires<[HasAVX]>; -} def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, @@ -833,7 +787,6 @@ def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), // Convert packed single/double fp to doubleword -let isAsmParserOnly = 1 in { def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), @@ -842,13 +795,11 @@ def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; -} def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>; def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>; -let isAsmParserOnly = 1 in { def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>, @@ -858,7 +809,6 @@ def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq (memop addr:$src)))]>, VEX; -} def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>; @@ -867,7 +817,7 @@ def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtps2dq (memop addr:$src)))]>; -let isAsmParserOnly = 1 in { // SSE2 packed instructions with XD prefix +// SSE2 packed instructions with XD prefix def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, @@ -877,7 +827,6 @@ def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtpd2dq (memop addr:$src)))]>, XD, VEX, Requires<[HasAVX]>; -} def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, @@ -890,7 +839,7 @@ def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), // Convert with truncation packed single/double fp to doubleword -let isAsmParserOnly = 1 in { // SSE2 packed instructions with XS prefix +// SSE2 packed instructions with XS prefix def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; 
def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), @@ -899,7 +848,6 @@ def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; -} def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -910,7 +858,6 @@ def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), (int_x86_sse2_cvttps2dq (memop addr:$src)))]>; -let isAsmParserOnly = 1 in { def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -921,9 +868,7 @@ def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvttps2dq (memop addr:$src)))]>, XS, VEX, Requires<[HasAVX]>; -} -let isAsmParserOnly = 1 in { def Int_VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", @@ -934,7 +879,6 @@ def Int_VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq (memop addr:$src)))]>, VEX; -} def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>; @@ -943,7 +887,6 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvttpd2dq (memop addr:$src)))]>; -let isAsmParserOnly = 1 in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. 
@@ -963,10 +906,9 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; -} // Convert packed single to packed double -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // SSE2 instructions without OpSize prefix def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX; @@ -982,7 +924,6 @@ def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB; -let isAsmParserOnly = 1 in { def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, @@ -992,7 +933,6 @@ def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtps2pd (load addr:$src)))]>, VEX, Requires<[HasAVX]>; -} def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, @@ -1004,7 +944,6 @@ def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), TB, Requires<[HasSSE2]>; // Convert packed double to packed single -let isAsmParserOnly = 1 in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. @@ -1024,14 +963,12 @@ def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; -} def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; -let isAsmParserOnly = 1 in { def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; @@ -1040,7 +977,6 @@ def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps (memop addr:$src)))]>; -} def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; @@ -1089,26 +1025,27 @@ def : Pat<(int_x86_avx_cvtt_ps2dq_256 (memopv8f32 addr:$src)), // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, string asm, string asm_alt> { - def rr : SIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc), - asm, []>; - let mayLoad = 1 in - def rm : SIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src, SSECC:$cc), - asm, []>; - // Accept explicit immediate argument form instead of comparison code. 
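For context on the two assembler forms in these compare multiclasses: the "cmp${cc}ss" mnemonics bake the predicate into the instruction name, while the "_alt" variants accept the raw imm8 that the ${cc} name abbreviates. A small C++ table of the standard SSE compare-predicate encodings (general ISA background, not taken from this patch; the multiclass continues below):

#include <cassert>
#include <cstdint>
#include <map>
#include <string>

// The eight classic SSE/SSE2 compare predicates and their imm8 encodings;
// "cmpeqss %xmm1, %xmm0" is shorthand for "cmpss $0, %xmm1, %xmm0", which is
// why each multiclass carries both an SSECC form and an i8imm "_alt" form.
static const std::map<std::string, uint8_t> SSECmpPredicate = {
  {"eq", 0},  {"lt", 1},  {"le", 2},  {"unord", 3},
  {"neq", 4}, {"nlt", 5}, {"nle", 6}, {"ord", 7},
};

int main() {
  assert(SSECmpPredicate.at("eq") == 0);
  assert(SSECmpPredicate.at("ord") == 7);
  return 0;
}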
let isAsmParserOnly = 1 in { - def rr_alt : SIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2), - asm_alt, []>; + def rr : SIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc), + asm, []>; let mayLoad = 1 in - def rm_alt : SIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src, i8imm:$src2), - asm_alt, []>; + def rm : SIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src, SSECC:$cc), + asm, []>; } + + // Accept explicit immediate argument form instead of comparison code. + def rr_alt : SIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2), + asm_alt, []>; + let mayLoad = 1 in + def rm_alt : SIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src, i8imm:$src2), + asm_alt, []>; } -let neverHasSideEffects = 1, isAsmParserOnly = 1 in { +let neverHasSideEffects = 1 in { defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", "cmpss\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}">, @@ -1141,14 +1078,12 @@ multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop, } // Aliases to match intrinsics which expect XMM operand(s). -let isAsmParserOnly = 1 in { - defm Int_VCMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, - "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">, - XS, VEX_4V; - defm Int_VCMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, - "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">, - XD, VEX_4V; -} +defm Int_VCMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">, + XS, VEX_4V; +defm Int_VCMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">, + XD, VEX_4V; let Constraints = "$src1 = $dst" in { defm Int_CMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $dst|$dst, $src}">, XS; @@ -1171,28 +1106,26 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, } let Defs = [EFLAGS] in { - let isAsmParserOnly = 1 in { - defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, VEX; - defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", SSEPackedDouble>, OpSize, VEX; - let Pattern = []<dag> in { - defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, - "comiss", SSEPackedSingle>, VEX; - defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, - "comisd", SSEPackedDouble>, OpSize, VEX; - } - - defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, "ucomiss", SSEPackedSingle>, VEX; - defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd", SSEPackedDouble>, OpSize, VEX; - - defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, - load, "comiss", SSEPackedSingle>, VEX; - defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, - load, "comisd", SSEPackedDouble>, OpSize, VEX; + defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, VEX; + defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, OpSize, VEX; + let Pattern = []<dag> in { + defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, VEX; + defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + "comisd", 
SSEPackedDouble>, OpSize, VEX; } + + defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss", SSEPackedSingle>, VEX; + defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd", SSEPackedDouble>, OpSize, VEX; + + defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, + load, "comiss", SSEPackedSingle>, VEX; + defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, + load, "comisd", SSEPackedDouble>, OpSize, VEX; defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, "ucomiss", SSEPackedSingle>, TB; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, @@ -1220,41 +1153,40 @@ let Defs = [EFLAGS] in { multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int, string asm, string asm_alt, Domain d> { - def rri : PIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc), asm, - [(set RC:$dst, (Int RC:$src1, RC:$src, imm:$cc))], d>; - def rmi : PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, f128mem:$src, SSECC:$cc), asm, - [(set RC:$dst, (Int RC:$src1, (memop addr:$src), imm:$cc))], d>; - // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1 in { - def rri_alt : PIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2), - asm_alt, [], d>; - def rmi_alt : PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, f128mem:$src, i8imm:$src2), - asm_alt, [], d>; + def rri : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, RC:$src, imm:$cc))], d>; + def rmi : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, f128mem:$src, SSECC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, (memop addr:$src), imm:$cc))], d>; } -} -let isAsmParserOnly = 1 in { - defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, - "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedSingle>, VEX_4V; - defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, - "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256, - "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedSingle>, VEX_4V; - defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256, - "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; -} + // Accept explicit immediate argument form instead of comparison code. 
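// e.g. "cmpunordps %xmm1, %xmm0" and "cmpps $3, %xmm1, %xmm0" are the same
// instruction; the rri_alt/rmi_alt defs below supply the latter spelling.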
+ def rri_alt : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2), + asm_alt, [], d>; + def rmi_alt : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, f128mem:$src, i8imm:$src2), + asm_alt, [], d>; +} + +defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedSingle>, VEX_4V; +defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; +defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256, + "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedSingle>, VEX_4V; +defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256, + "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src, $dst|$dst, $src}", @@ -1294,20 +1226,18 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, (vt (shufp:$src3 RC:$src1, RC:$src2)))], d>; } -let isAsmParserOnly = 1 in { - defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, - "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - memopv4f32, SSEPackedSingle>, VEX_4V; - defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, - "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - memopv8f32, SSEPackedSingle>, VEX_4V; - defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, - "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", - memopv2f64, SSEPackedDouble>, OpSize, VEX_4V; - defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, - "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", - memopv4f64, SSEPackedDouble>, OpSize, VEX_4V; -} +defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + memopv4f32, SSEPackedSingle>, TB, VEX_4V; +defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + memopv8f32, SSEPackedSingle>, TB, VEX_4V; +defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", + memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V; +defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", + memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, @@ -1340,33 +1270,31 @@ multiclass sse12_unpack_interleave<bits<8> opc, PatFrag OpNode, ValueType vt, } let AddedComplexity = 10 in { - let isAsmParserOnly = 1 in { - defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, - VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64, - VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, - VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, 
VEX_4V; - defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, - VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - - defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32, - VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64, - VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32, - VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64, - VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - } + defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; + defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; + + defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32, + VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64, + VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; + defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32, + VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64, + VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, @@ -1404,30 +1332,28 @@ defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", SSEPackedDouble>, TB, OpSize; -let isAsmParserOnly = 1 in { - defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, - "movmskps", SSEPackedSingle>, VEX; - defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, - "movmskpd", SSEPackedDouble>, OpSize, - VEX; - defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, - "movmskps", SSEPackedSingle>, VEX; - defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, - "movmskpd", SSEPackedDouble>, OpSize, - VEX; +defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, + "movmskps", SSEPackedSingle>, VEX; +defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, + "movmskpd", SSEPackedDouble>, OpSize, + VEX; +defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, + "movmskps", SSEPackedSingle>, VEX; +defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, 
int_x86_avx_movmsk_pd_256, + "movmskpd", SSEPackedDouble>, OpSize, + VEX; - // Assembler Only - def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; - def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, - VEX; - def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; - def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, - VEX; -} +// Assembler Only +def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; +def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, + VEX; +def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; +def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, + VEX; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Misc aliasing of packed SSE 1 & 2 instructions @@ -1482,13 +1408,11 @@ def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), /// multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, SDNode OpNode> { - let isAsmParserOnly = 1 in { - defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V; + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V; - defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V; - } + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, @@ -1514,7 +1438,7 @@ let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, SDNode OpNode, int HasPat = 0, list<list<dag>> Pattern = []> { - let isAsmParserOnly = 1, Pattern = []<dag> in { + let Pattern = []<dag> in { defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, !if(HasPat, Pattern[0], // rr @@ -1561,7 +1485,6 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, /// sse12_fp_packed_logical_y - AVX 256-bit SSE 1 & 2 logical ops forms /// -let isAsmParserOnly = 1 in { multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr> { defm PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f256mem, [], [], 0>, VEX_4V; @@ -1569,7 +1492,6 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr> { defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f256mem, [], [], 0>, OpSize, VEX_4V; } -} // AVX 256-bit packed logical ops forms defm VAND : sse12_fp_packed_logical_y<0x54, "and">; @@ -1667,38 +1589,36 @@ multiclass 
basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr> { } // Binary Arithmetic instructions -let isAsmParserOnly = 1 in { - defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, - basic_sse12_fp_binop_s_int<0x58, "add", 0>, - basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, - basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; - defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, - basic_sse12_fp_binop_s_int<0x59, "mul", 0>, - basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, - basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; +defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_s_int<0x58, "add", 0>, + basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; +defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_s_int<0x59, "mul", 0>, + basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; - let isCommutable = 0 in { - defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, - basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, - basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, - basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; - defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, - basic_sse12_fp_binop_s_int<0x5E, "div", 0>, - basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, - basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; - defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, - basic_sse12_fp_binop_s_int<0x5F, "max", 0>, - basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, - basic_sse12_fp_binop_p_int<0x5F, "max", 0>, - basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, - basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V; - defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, - basic_sse12_fp_binop_s_int<0x5D, "min", 0>, - basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, - basic_sse12_fp_binop_p_int<0x5D, "min", 0>, - basic_sse12_fp_binop_p_y_int<0x5D, "min">, - basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; - } +let isCommutable = 0 in { + defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, + basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; + defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_s_int<0x5E, "div", 0>, + basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; + defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_s_int<0x5F, "max", 0>, + basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_p_int<0x5F, "max", 0>, + basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, + basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V; + defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_s_int<0x5D, "min", 0>, + basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_p_int<0x5D, "min", 0>, + basic_sse12_fp_binop_p_y_int<0x5D, "min">, + basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; } let Constraints = "$src1 = $dst" in { @@ -1899,7 +1819,7 @@ multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))]>; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // Square root. 
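// Note that the AVX scalar forms are non-destructive three-operand
// instructions: roughly, "vsqrtss %xmm1, %xmm2, %xmm0" writes the square
// root of the low element of %xmm1 into %xmm0 and copies the upper elements
// from %xmm2, whereas the SSE "sqrtss %xmm1, %xmm0" merges into its
// destination. That is why the separate *_avx multiclasses are used below
// rather than the plain SSE ones.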
defm VSQRT : sse1_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse_sqrt_ss>, sse2_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse2_sqrt_sd>, @@ -1955,67 +1875,65 @@ defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>, // SSE 1 & 2 - Non-temporal stores //===----------------------------------------------------------------------===// -let isAsmParserOnly = 1 in { - def VMOVNTPSmr_Int : VPSI<0x2B, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>, VEX; - def VMOVNTPDmr_Int : VPDI<0x2B, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>, VEX; +def VMOVNTPSmr_Int : VPSI<0x2B, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>, VEX; +def VMOVNTPDmr_Int : VPDI<0x2B, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>, VEX; - let ExeDomain = SSEPackedInt in - def VMOVNTDQmr_Int : VPDI<0xE7, MRMDestMem, (outs), +let ExeDomain = SSEPackedInt in + def VMOVNTDQmr_Int : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>, VEX; + +let AddedComplexity = 400 in { // Prefer non-temporal versions + def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX; + def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>, VEX; - - let AddedComplexity = 400 in { // Prefer non-temporal versions - def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), - addr:$dst)]>, VEX; - def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v2f64 VR128:$src), - addr:$dst)]>, VEX; - def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v2f64 VR128:$src), - addr:$dst)]>, VEX; - let ExeDomain = SSEPackedInt in - def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), + addr:$dst)]>, VEX; + def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), + [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>, VEX; - - def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8f32 VR256:$src), - addr:$dst)]>, VEX; - def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f64 VR256:$src), - addr:$dst)]>, VEX; - def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f64 VR256:$src), - addr:$dst)]>, VEX; - let ExeDomain = SSEPackedInt in - def VMOVNTDQYmr : 
VPDI<0xE7, MRMDestMem, (outs), + let ExeDomain = SSEPackedInt in + def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX; + + def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX; + def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f64 VR256:$src), + addr:$dst)]>, VEX; + def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8f32 VR256:$src), + [(alignednontemporalstore (v4f64 VR256:$src), addr:$dst)]>, VEX; - } + let ExeDomain = SSEPackedInt in + def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX; } def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src), @@ -2138,12 +2056,10 @@ def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), // SSE 1 & 2 - Load/Store XCSR register //===----------------------------------------------------------------------===// -let isAsmParserOnly = 1 in { - def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), - "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX; - def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), - "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX; -} +def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX; +def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX; def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>; @@ -2156,45 +2072,43 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), let ExeDomain = SSEPackedInt in { // SSE integer instructions -let isAsmParserOnly = 1 in { - let neverHasSideEffects = 1 in { - def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - } - def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; - def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; - - let canFoldAsLoad = 1, mayLoad = 1 in { - def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - let Predicates = [HasAVX] in { - def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - } - } +let neverHasSideEffects = 1 in { +def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 
+ "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +} +def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; +def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; - let mayStore = 1 in { - def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), - (ins i256mem:$dst, VR256:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - let Predicates = [HasAVX] in { - def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), +let canFoldAsLoad = 1, mayLoad = 1 in { +def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +let Predicates = [HasAVX] in { + def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - } - } +} +} + +let mayStore = 1 in { +def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +let Predicates = [HasAVX] in { +def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; +def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; +} } let neverHasSideEffects = 1 in @@ -2226,23 +2140,11 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), } // Intrinsic forms of MOVDQU load and store -let isAsmParserOnly = 1 in { -let canFoldAsLoad = 1 in -def VMOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, - XS, VEX, Requires<[HasAVX]>; def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "vmovdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, XS, VEX, Requires<[HasAVX]>; -} -let canFoldAsLoad = 1 in -def MOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, - XS, Requires<[HasSSE2]>; def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, @@ -2347,7 +2249,7 @@ multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode, // 128-bit Integer Arithmetic -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPADDB : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, 1, 0 /*3addr*/>, VEX_4V; defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, 1, 0>, VEX_4V; defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, 1, 0>, VEX_4V; @@ -2437,7 +2339,7 @@ defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>; // SSE2 - Packed Integer Logical Instructions 
//===---------------------------------------------------------------------===// -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "vpsllw", int_x86_sse2_psll_w, int_x86_sse2_pslli_w, 0>, VEX_4V; @@ -2584,7 +2486,7 @@ let Predicates = [HasSSE2] in { // SSE2 - Packed Integer Comparison Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPCMPEQB : PDI_binop_rm_int<0x74, "vpcmpeqb", int_x86_sse2_pcmpeq_b, 1, 0>, VEX_4V; defm VPCMPEQW : PDI_binop_rm_int<0x75, "vpcmpeqw", int_x86_sse2_pcmpeq_w, 1, @@ -2638,7 +2540,7 @@ def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), // SSE2 - Packed Integer Pack Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128, 0, 0>, VEX_4V; defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128, @@ -2676,7 +2578,7 @@ def mi : Ii8<0x70, MRMSrcMem, } } // ExeDomain = SSEPackedInt -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let AddedComplexity = 5 in defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, OpSize, VEX; @@ -2724,7 +2626,7 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, addr:$src2))))]>; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, unpckl, bc_v16i8, 0>, VEX_4V; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, unpckl, bc_v8i16, @@ -2834,7 +2736,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { } // Extract -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in def VPEXTRWri : Ii8<0xC5, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -2847,7 +2749,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg, imm:$src2))]>; // Insert -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPINSRW : sse2_pinsrw<0>, OpSize, VEX_4V; def VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), @@ -2866,13 +2768,11 @@ let Constraints = "$src1 = $dst" in let ExeDomain = SSEPackedInt in { -let isAsmParserOnly = 1 in { def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>, VEX; def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX; -} def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>; @@ -2885,7 +2785,6 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), let ExeDomain = SSEPackedInt in { -let isAsmParserOnly = 1 in { let Uses = [EDI] in def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), @@ -2896,7 +2795,6 @@ def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, VEX; -} let Uses = [EDI] in def MASKMOVDQU : 
PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), @@ -2914,7 +2812,6 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), //===---------------------------------------------------------------------===// // Move Int Doubleword to Packed Double Int -let isAsmParserOnly = 1 in { def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2924,7 +2821,6 @@ def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), [(set VR128:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, VEX; -} def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2943,7 +2839,6 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), // Move Int Doubleword to Single Scalar -let isAsmParserOnly = 1 in { def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX; @@ -2952,7 +2847,6 @@ def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, VEX; -} def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))]>; @@ -2962,7 +2856,6 @@ def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>; // Move Packed Doubleword Int to Packed Double Int -let isAsmParserOnly = 1 in { def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), @@ -2972,7 +2865,6 @@ def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (vector_extract (v4i32 VR128:$src), (iPTR 0))), addr:$dst)]>, VEX; -} def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), @@ -2998,14 +2890,12 @@ def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>; // Move Scalar Single to Double Int -let isAsmParserOnly = 1 in { def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))]>, VEX; def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, VEX; -} def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))]>; @@ -3014,7 +2904,7 @@ def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>; // movd / movq to XMM register zero-extends -let AddedComplexity = 15, isAsmParserOnly = 1 in { +let AddedComplexity = 15 in { def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl @@ -3038,7 +2928,6 @@ def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), } let AddedComplexity = 20 in { -let isAsmParserOnly = 1 in def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, 
$src}", [(set VR128:$dst, @@ -3064,7 +2953,6 @@ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), //===---------------------------------------------------------------------===// // Move Quadword Int to Packed Quadword Int -let isAsmParserOnly = 1 in def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -3077,7 +2965,6 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix // Move Packed Quadword Int to Quadword Int -let isAsmParserOnly = 1 in def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (vector_extract (v2i64 VR128:$src), @@ -3091,7 +2978,6 @@ def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; // Store / copy lower 64-bits of a XMM register. -let isAsmParserOnly = 1 in def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX; @@ -3099,7 +2985,7 @@ def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>; -let AddedComplexity = 20, isAsmParserOnly = 1 in +let AddedComplexity = 20 in def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -3124,7 +3010,7 @@ def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in // IA32 document. movq xmm1, xmm2 does clear the high bits. 
-let isAsmParserOnly = 1, AddedComplexity = 15 in +let AddedComplexity = 15 in def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, @@ -3135,7 +3021,7 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, XS, Requires<[HasSSE2]>; -let AddedComplexity = 20, isAsmParserOnly = 1 in +let AddedComplexity = 20 in def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl @@ -3153,7 +3039,6 @@ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))), } // Instructions to match in the assembler -let isAsmParserOnly = 1 in { def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), @@ -3161,13 +3046,12 @@ def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), // Recognize "movd" with GR64 destination, but encode as a "movq" def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; -} // Instructions for the disassembler // xr = XMM register // xm = mem64 -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS; def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -3209,7 +3093,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, //===---------------------------------------------------------------------===// // Convert Packed Double FP to Packed DW Integers -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. 
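// For example, "vcvtpd2dq (%rax), %xmm0" is ambiguous in assembly: the
// memory source could be 128 or 256 bits wide. The explicitly suffixed
// "x"/"y" mnemonics (as with cvtpd2psy and cvttpd2dqy earlier in this file)
// pin down the operand size for the memory forms.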
@@ -3237,7 +3121,7 @@ def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", []>; // Convert Packed DW Integers to Packed Double FP -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { def VCVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -3288,7 +3172,7 @@ def rm : S3SI<op, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // FIXME: Merge above classes when we have patterns for the ymm version defm VMOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "vmovshdup">, VEX; defm VMOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "vmovsldup">, VEX; @@ -3319,7 +3203,7 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), []>; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // FIXME: Merge above classes when we have patterns for the ymm version defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX; @@ -3327,7 +3211,7 @@ let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm MOVDDUP : sse3_replicate_dfp<"movddup">; // Move Unaligned Integer -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vlddqu\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; @@ -3391,21 +3275,21 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))]>; } -let isAsmParserOnly = 1, Predicates = [HasAVX], +let Predicates = [HasAVX], ExeDomain = SSEPackedDouble in { defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, - f128mem, 0>, XD, VEX_4V; + f128mem, 0>, TB, XD, VEX_4V; defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, - f128mem, 0>, OpSize, VEX_4V; + f128mem, 0>, TB, OpSize, VEX_4V; defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, - f256mem, 0>, XD, VEX_4V; + f256mem, 0>, TB, XD, VEX_4V; defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, - f256mem, 0>, OpSize, VEX_4V; + f256mem, 0>, TB, OpSize, VEX_4V; } let Constraints = "$src1 = $dst", Predicates = [HasSSE3], ExeDomain = SSEPackedDouble in { defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, - f128mem>, XD; + f128mem>, TB, XD; defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, f128mem>, TB, OpSize; } @@ -3444,7 +3328,7 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, int_x86_sse3_hadd_ps, 0>, VEX_4V; defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, @@ -3496,7 +3380,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (mem_frag128 addr:$src))))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv16i8, int_x86_ssse3_pabs_b_128>, VEX; defm VPABSW : SS3I_unop_rm_int<0x1D, 
"vpabsw", memopv8i16, @@ -3538,7 +3422,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16, int_x86_ssse3_phadd_w_128, 0>, VEX_4V; @@ -3630,7 +3514,7 @@ multiclass ssse3_palign<string asm, bit Is2Addr = 1> { []>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PALIGN : ssse3_palign<"palignr">; @@ -3985,7 +3869,7 @@ multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>, VEX; defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>, @@ -4051,7 +3935,7 @@ multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>, VEX; defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>, @@ -4092,7 +3976,7 @@ multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>, VEX; defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>, @@ -4134,7 +4018,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; def VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst), (ins VR128:$src1, i32i8imm:$src2), @@ -4156,7 +4040,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { // (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; defm PEXTRW : SS41I_extract16<0x15, "pextrw">; @@ -4178,7 +4062,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; defm PEXTRD : SS41I_extract32<0x16, "pextrd">; @@ -4199,7 +4083,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize, REX_W; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; @@ -4222,7 +4106,7 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), (ins VR128:$src1, i32i8imm:$src2), @@ -4262,7 +4146,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3))]>, 
OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRB : SS41I_insert8<0x20, "pinsrb">; @@ -4288,7 +4172,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRD : SS41I_insert32<0x22, "pinsrd">; @@ -4314,7 +4198,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; @@ -4347,7 +4231,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { let Constraints = "$src1 = $dst" in defm INSERTPS : SS41I_insertf32<0x21, "insertps">; -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), @@ -4517,7 +4401,7 @@ multiclass sse41_fp_binop_rm_avx_s<bits<8> opcss, bits<8> opcsd, } // FP round - roundss, roundps, roundsd, roundpd -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // Intrinsic form defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, memopv4f32, memopv2f64, @@ -4552,7 +4436,7 @@ defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", // ptest instruction we'll lower to this in X86ISelLowering primarily from // the intel intrinsic that corresponds to this. 
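// For reference: ptest ANDs its two sources and sets ZF if the result is
// all zeroes (CF is set analogously from the AND-NOT), so an intrinsic such
// as _mm_testz_si128 becomes a ptest followed by a read of ZF.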
-let Defs = [EFLAGS], isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Defs = [EFLAGS], Predicates = [HasAVX] in { def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>, @@ -4595,7 +4479,7 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, OpSize, VEX; } -let Defs = [EFLAGS], isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Defs = [EFLAGS], Predicates = [HasAVX] in { defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>; defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>; defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>; @@ -4644,7 +4528,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, (bitconvert (memopv8i16 addr:$src))))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", int_x86_sse41_phminposuw>, VEX; defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", @@ -4670,7 +4554,7 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let isCommutable = 0 in defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, 0>, VEX_4V; @@ -4737,7 +4621,7 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>; @@ -4769,7 +4653,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, VR128, memopv16i8, i128mem, 0>, VEX_4V; @@ -4810,7 +4694,7 @@ let Constraints = "$src1 = $dst" in { } /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, PatFrag mem_frag, Intrinsic IntId> { @@ -4870,7 +4754,7 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>; def : Pat<(X86pblendv VR128:$src1, VR128:$src2, XMM0), (PBLENDVBrr0 VR128:$src1, VR128:$src2)>; -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, @@ -4904,7 +4788,7 @@ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq, 0>, VEX_4V; let Constraints = "$src1 = $dst" in @@ -4936,8 +4820,7 @@ let Defs = [EFLAGS], usesCustomInserter = 1 in { defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; } -let Defs = [XMM0, EFLAGS], isAsmParserOnly = 1, - Predicates = [HasAVX] in { +let Defs = [XMM0, EFLAGS], 
Predicates = [HasAVX] in { def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX; @@ -4972,7 +4855,7 @@ let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; } -let isAsmParserOnly = 1, Predicates = [HasAVX], +let Predicates = [HasAVX], Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in { def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src3, i8imm:$src5), @@ -5007,7 +4890,7 @@ let Defs = [ECX, EFLAGS] in { } } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128, "vpcmpistri">, VEX; defm VPCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128, "vpcmpistri">, @@ -5046,7 +4929,7 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX] in { } } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128, "vpcmpestri">, VEX; defm VPCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128, "vpcmpestri">, @@ -5165,7 +5048,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, } // Perform One Round of an AES Encryption/Decryption Flow -let isAsmParserOnly = 1, Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, HasAES] in { defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", int_x86_aesni_aesenc, 0>, VEX_4V; defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", @@ -5205,7 +5088,7 @@ def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))), (AESDECLASTrm VR128:$src1, addr:$src2)>; // Perform the AES InvMixColumn Transformation -let isAsmParserOnly = 1, Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, HasAES] in { def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1), "vaesimc\t{$src1, $dst|$dst, $src1}", @@ -5233,7 +5116,7 @@ def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), OpSize; // AES Round Key Generation Assist -let isAsmParserOnly = 1, Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, HasAES] in { def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -5269,7 +5152,6 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), // Only the AVX version of CLMUL instructions are described here. 
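// (CLMUL = carry-less multiplication: pclmulqdq multiplies two 64-bit
// operands as polynomials over GF(2), i.e. with XOR in place of addition,
// into a 128-bit product; it is used by GHASH in AES-GCM and by fast CRC
// computations. The imm8 selects the high or low quadword of each source,
// and the pclmul*qdq mnemonics defined below are fixed-immediate spellings
// of that choice.)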
// Carry-less Multiplication instructions -let isAsmParserOnly = 1 in { def VPCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -5295,13 +5177,10 @@ defm VPCLMULHQLQDQ : avx_vpclmul<"vpclmulhqlqdq">; defm VPCLMULLQHQDQ : avx_vpclmul<"vpclmullqhqdq">; defm VPCLMULLQLQDQ : avx_vpclmul<"vpclmullqlqdq">; -} // isAsmParserOnly - //===----------------------------------------------------------------------===// // AVX Instructions //===----------------------------------------------------------------------===// -let isAsmParserOnly = 1 in { // Load from memory and broadcast to all elements of the destination operand class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, @@ -5435,8 +5314,6 @@ def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", [(int_x86_avx_vzeroupper)]>, VEX, Requires<[HasAVX]>; -} // isAsmParserOnly - def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3), (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3), @@ -5622,11 +5499,15 @@ def : Pat<(X86Movddup (bc_v2f64 // Shuffle with UNPCKLPS def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), (VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; +def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>; def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), (UNPCKLPSrm VR128:$src1, addr:$src2)>; def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), (VUNPCKLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; +def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>; def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), (UNPCKLPSrr VR128:$src1, VR128:$src2)>; @@ -5644,11 +5525,15 @@ def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), // Shuffle with UNPCKLPD def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), (VUNPCKLPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; +def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), (UNPCKLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), (VUNPCKLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; +def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), (UNPCKLPDrr VR128:$src1, VR128:$src2)>; diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 6a24d14..f73cff3 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -34,9 +34,16 @@ let Uses = [EFLAGS] in def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>; + +// The long form of "int $3" turns into int3 as a size optimization. +// FIXME: This doesn't work because InstAlias can't match immediate constants. 
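// ("int $3" encodes as CD 03, two bytes, while int3 is the single byte CC,
// which is also the breakpoint byte debuggers patch in; the disabled alias
// below would let the assembler perform that shrink automatically.)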
+//def : InstAlias<"int\t$3", (INT3)>; + + def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", [(int_x86_int imm:$trap)]>; + def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; def SYSRETL : I<0x07, RawFrm, (outs), (ins), "sysretl", []>, TB; def SYSRETQ :RI<0x07, RawFrm, (outs), (ins), "sysretq", []>, TB, @@ -207,10 +214,15 @@ def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB; -def STRr : I<0x00, MRM1r, (outs GR16:$dst), (ins), - "str{w}\t{$dst}", []>, TB; -def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), - "str{w}\t{$dst}", []>, TB; +def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins), + "str{w}\t{$dst}", []>, TB, OpSize; +def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins), + "str{l}\t{$dst}", []>, TB; +def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins), + "str{q}\t{$dst}", []>, TB; +def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), + "str{w}\t{$dst}", []>, TB; + def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), "ltr{w}\t{$src}", []>, TB; def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), @@ -393,3 +405,23 @@ let Defs = [RDX, RAX], Uses = [RCX] in let Uses = [RDX, RAX, RCX] in def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; + +//===----------------------------------------------------------------------===// +// VIA PadLock crypto instructions +let Defs = [RAX, RDI], Uses = [RDX, RDI] in + def XSTORE : I<0xc0, RawFrm, (outs), (ins), "xstore", []>, A7; + +let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in { + def XCRYPTECB : I<0xc8, RawFrm, (outs), (ins), "xcryptecb", []>, A7; + def XCRYPTCBC : I<0xd0, RawFrm, (outs), (ins), "xcryptcbc", []>, A7; + def XCRYPTCTR : I<0xd8, RawFrm, (outs), (ins), "xcryptctr", []>, A7; + def XCRYPTCFB : I<0xe0, RawFrm, (outs), (ins), "xcryptcfb", []>, A7; + def XCRYPTOFB : I<0xe8, RawFrm, (outs), (ins), "xcryptofb", []>, A7; +} + +let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { + def XSHA1 : I<0xc8, RawFrm, (outs), (ins), "xsha1", []>, A6; + def XSHA256 : I<0xd0, RawFrm, (outs), (ins), "xsha256", []>, A6; +} +let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in + def MONTMUL : I<0xc0, RawFrm, (outs), (ins), "montmul", []>, A6; diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp index 6686214..83bba52 100644 --- a/lib/Target/X86/X86MCAsmInfo.cpp +++ b/lib/Target/X86/X86MCAsmInfo.cpp @@ -15,7 +15,9 @@ #include "X86TargetMachine.h" #include "llvm/ADT/Triple.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ELF.h" using namespace llvm; @@ -69,7 +71,22 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &Triple) { DwarfUsesInlineInfoSection = true; // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfTable; + ExceptionsType = ExceptionHandling::DwarfCFI; +} + +const MCExpr * +X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const { + MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context); + const MCExpr *Four = MCConstantExpr::Create(4, Context); + return MCBinaryExpr::CreateAdd(Res, Four, Context); +} + +X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) + : X86MCAsmInfoDarwin(Triple) { } X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { @@ -89,7 +106,9 @@ 
X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
   SupportsDebugInformation = true;

   // Exceptions handling
-  ExceptionsType = ExceptionHandling::DwarfTable;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  DwarfRequiresFrameSection = false;

   // OpenBSD has buggy support for .quad in 32-bit mode, just split into two
   // .words.
diff --git a/lib/Target/X86/X86MCAsmInfo.h b/lib/Target/X86/X86MCAsmInfo.h
index 5815225..2cd4c8e 100644
--- a/lib/Target/X86/X86MCAsmInfo.h
+++ b/lib/Target/X86/X86MCAsmInfo.h
@@ -25,6 +25,14 @@ namespace llvm {
     explicit X86MCAsmInfoDarwin(const Triple &Triple);
   };

+  struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
+    explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
+    virtual const MCExpr *
+    getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                unsigned Encoding,
+                                MCStreamer &Streamer) const;
+  };
+
   struct X86ELFMCAsmInfo : public MCAsmInfo {
     explicit X86ELFMCAsmInfo(const Triple &Triple);
     virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const;
diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp
index 0e3b571..f195a67 100644
--- a/lib/Target/X86/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/X86MCCodeEmitter.cpp
@@ -382,7 +382,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
                                            const TargetInstrDesc &Desc,
                                            raw_ostream &OS) const {
   bool HasVEX_4V = false;
-  if ((TSFlags >> 32) & X86II::VEX_4V)
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V)
     HasVEX_4V = true;

   // VEX_R: opcode extension equivalent to REX.R in
@@ -446,10 +446,10 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   if (TSFlags & X86II::OpSize)
     VEX_PP = 0x01;

-  if ((TSFlags >> 32) & X86II::VEX_W)
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
     VEX_W = 1;

-  if ((TSFlags >> 32) & X86II::VEX_L)
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
     VEX_L = 1;

   switch (TSFlags & X86II::Op0Mask) {
@@ -470,6 +470,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   case X86II::XD:  // F2 0F
     VEX_PP = 0x3;
     break;
+  case X86II::A6:  // Bypass: Not used by VEX
+  case X86II::A7:  // Bypass: Not used by VEX
   case X86II::TB:  // Bypass: Not used by VEX
   case 0:
     break;  // No prefix!
@@ -512,13 +514,13 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   }

   // To only check operands before the memory address ones, start
-  // the search from the begining
+  // the search from the beginning
   if (IsDestMem)
     CurOp = 0;

   // If the last register should be encoded in the immediate field,
   // do not use any bit from the VEX prefix for this register; ignore it
-  if ((TSFlags >> 32) & X86II::VEX_I8IMM)
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM)
     NumOps--;

   for (; CurOp != NumOps; ++CurOp) {
@@ -742,6 +744,8 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   case X86II::TB:  // Two-byte opcode prefix
   case X86II::T8:  // 0F 38
   case X86II::TA:  // 0F 3A
+  case X86II::A6:  // 0F A6
+  case X86II::A7:  // 0F A7
     Need0FPrefix = true;
     break;
   case X86II::TF: // F2 0F 38
@@ -786,6 +790,12 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   case X86II::TA:    // 0F 3A
     EmitByte(0x3A, CurByte, OS);
     break;
+  case X86II::A6:    // 0F A6
+    EmitByte(0xA6, CurByte, OS);
+    break;
+  case X86II::A7:    // 0F A7
+    EmitByte(0xA7, CurByte, OS);
+    break;
   }
 }

@@ -819,9 +829,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   // It uses the VEX.VVVV field?
bool HasVEX_4V = false;
-  if ((TSFlags >> 32) & X86II::VEX)
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX)
     HasVEXPrefix = true;
-  if ((TSFlags >> 32) & X86II::VEX_4V)
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V)
     HasVEX_4V = true;
@@ -837,7 +847,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,

   unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);

-  if ((TSFlags >> 32) & X86II::Has3DNow0F0FOpcode)
+  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
     BaseOpcode = 0x0F; // Weird 3DNow! encoding.

   unsigned SrcRegNum = 0;
@@ -994,7 +1004,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   if (CurOp != NumOps) {
     // The last source register of a 4-operand instruction in AVX is encoded
     // in bits[7:4] of an immediate byte, and bits[3:0] are ignored.
-    if ((TSFlags >> 32) & X86II::VEX_I8IMM) {
+    if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
       const MCOperand &MO = MI.getOperand(CurOp++);
       bool IsExtReg = X86InstrInfo::isX86_64ExtendedReg(MO.getReg());
@@ -1017,7 +1027,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
     }
   }

-  if ((TSFlags >> 32) & X86II::Has3DNow0F0FOpcode)
+  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
     EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);

diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 2f6bd88..37fb0fe 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -308,6 +308,33 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
   return 0;
 }

+const TargetRegisterClass*
+X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
+  const TargetRegisterClass *Super = RC;
+  TargetRegisterClass::sc_iterator I = RC->superclasses_begin();
+  do {
+    switch (Super->getID()) {
+    case X86::GR8RegClassID:
+    case X86::GR16RegClassID:
+    case X86::GR32RegClassID:
+    case X86::GR64RegClassID:
+    case X86::FR32RegClassID:
+    case X86::FR64RegClassID:
+    case X86::RFP32RegClassID:
+    case X86::RFP64RegClassID:
+    case X86::RFP80RegClassID:
+    case X86::VR128RegClassID:
+    case X86::VR256RegClassID:
+      // Don't return a super-class that would shrink the spill size.
+      // That can happen with the vector and float classes.
+      if (Super->getSize() == RC->getSize())
+        return Super;
+    }
+    Super = *I++;
+  } while (Super);
+  return RC;
+}
+
 const TargetRegisterClass *
 X86RegisterInfo::getPointerRegClass(unsigned Kind) const {
   switch (Kind) {
@@ -337,7 +364,27 @@ X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
     else
       return &X86::GR32RegClass;
   }
-  return NULL;
+  return RC;
+}
+
+unsigned
+X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                     MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case X86::GR32RegClassID:
+    return 4 - FPDiff;
+  case X86::GR64RegClassID:
+    return 12 - FPDiff;
+  case X86::VR128RegClassID:
+    return TM.getSubtarget<X86Subtarget>().is64Bit() ? 10 : 4;
+  case X86::VR64RegClassID:
+    return 4;
+  }
 }

 const unsigned *
@@ -450,7 +497,7 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {

   // FIXME: It's more complicated than this...
if (0 && requiresRealignment && MFI->hasVarSizedObjects()) report_fatal_error( - "Stack realignment in presense of dynamic allocas is not supported"); + "Stack realignment in presence of dynamic allocas is not supported"); // If we've requested that we force align the stack do so now. if (ForceStackAlign) diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 064be64..9970c52 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -91,6 +91,9 @@ public: getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned Idx) const; + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + /// getPointerRegClass - Returns a TargetRegisterClass used for pointer /// values. const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; @@ -101,6 +104,9 @@ public: const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const; + /// getCalleeSavedRegs - Return a null-terminated list of all of the /// callee-save registers on this target. const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const; diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 612fac2..fd7a247 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -46,7 +46,8 @@ let Namespace = "X86" in { def CL : Register<"cl">, DwarfRegNum<[2, 1, 1]>; def BL : Register<"bl">, DwarfRegNum<[3, 3, 3]>; - // X86-64 only + // X86-64 only, requires REX. + let CostPerUse = 1 in { def SIL : Register<"sil">, DwarfRegNum<[4, 6, 6]>; def DIL : Register<"dil">, DwarfRegNum<[5, 7, 7]>; def BPL : Register<"bpl">, DwarfRegNum<[6, 4, 5]>; @@ -59,6 +60,7 @@ let Namespace = "X86" in { def R13B : Register<"r13b">, DwarfRegNum<[13, -2, -2]>; def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>; def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>; + } // High registers. On x86-64, these cannot be used in any instruction // with a REX prefix. @@ -82,8 +84,8 @@ let Namespace = "X86" in { } def IP : Register<"ip">, DwarfRegNum<[16]>; - // X86-64 only - let SubRegIndices = [sub_8bit] in { + // X86-64 only, requires REX. 
+ let SubRegIndices = [sub_8bit], CostPerUse = 1 in { def R8W : RegisterWithSubRegs<"r8w", [R8B]>, DwarfRegNum<[8, -2, -2]>; def R9W : RegisterWithSubRegs<"r9w", [R9B]>, DwarfRegNum<[9, -2, -2]>; def R10W : RegisterWithSubRegs<"r10w", [R10B]>, DwarfRegNum<[10, -2, -2]>; @@ -105,7 +107,8 @@ let Namespace = "X86" in { def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[7, 5, 4]>; def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>; - // X86-64 only + // X86-64 only, requires REX + let CostPerUse = 1 in { def R8D : RegisterWithSubRegs<"r8d", [R8W]>, DwarfRegNum<[8, -2, -2]>; def R9D : RegisterWithSubRegs<"r9d", [R9W]>, DwarfRegNum<[9, -2, -2]>; def R10D : RegisterWithSubRegs<"r10d", [R10W]>, DwarfRegNum<[10, -2, -2]>; @@ -114,7 +117,7 @@ let Namespace = "X86" in { def R13D : RegisterWithSubRegs<"r13d", [R13W]>, DwarfRegNum<[13, -2, -2]>; def R14D : RegisterWithSubRegs<"r14d", [R14W]>, DwarfRegNum<[14, -2, -2]>; def R15D : RegisterWithSubRegs<"r15d", [R15W]>, DwarfRegNum<[15, -2, -2]>; - } + }} // 64-bit registers, X86-64 only let SubRegIndices = [sub_32bit] in { @@ -127,6 +130,8 @@ let Namespace = "X86" in { def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>; def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>; + // These also require REX. + let CostPerUse = 1 in { def R8 : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>; def R9 : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>; def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>; @@ -136,7 +141,7 @@ let Namespace = "X86" in { def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>; def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>; def RIP : RegisterWithSubRegs<"rip", [EIP]>, DwarfRegNum<[16, -2, -2]>; - } + }} // MMX Registers. These are actually aliased to ST0 .. ST7 def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>; @@ -170,6 +175,7 @@ let Namespace = "X86" in { def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>; // X86-64 only + let CostPerUse = 1 in { def XMM8: Register<"xmm8">, DwarfRegNum<[25, -2, -2]>; def XMM9: Register<"xmm9">, DwarfRegNum<[26, -2, -2]>; def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>; @@ -178,7 +184,7 @@ let Namespace = "X86" in { def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>; def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>; def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>; - } + }} // YMM Registers, used by AVX instructions let SubRegIndices = [sub_xmm] in { diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 42e8193..02754f9 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -178,7 +178,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { - // This requires the copy size to be a constant, preferrably + // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); if (!ConstantSize) diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 1ee7312..ba5864e 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -144,7 +144,8 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { /// passed as the second argument. 
Otherwise it returns null. const char *X86Subtarget::getBZeroEntry() const { // Darwin 10 has a __bzero entry point for this purpose. - if (getDarwinVers() >= 10) + if (getTargetTriple().isMacOSX() && + !getTargetTriple().isMacOSXVersionLT(10, 6)) return "__bzero"; return 0; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 0a62a02..286a798 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -165,9 +165,15 @@ public: bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool hasVectorUAMem() const { return HasVectorUAMem; } - bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; } - bool isTargetFreeBSD() const { return TargetTriple.getOS() == Triple::FreeBSD; } - bool isTargetSolaris() const { return TargetTriple.getOS() == Triple::Solaris; } + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetFreeBSD() const { + return TargetTriple.getOS() == Triple::FreeBSD; + } + bool isTargetSolaris() const { + return TargetTriple.getOS() == Triple::Solaris; + } // ELF is a reasonably sane default and the only other X86 targets we // support are Darwin and Windows. Just use "not those". @@ -215,13 +221,6 @@ public: return PICStyle == PICStyles::StubDynamicNoPIC || PICStyle == PICStyles::StubPIC; } - /// getDarwinVers - Return the darwin version number, 8 = Tiger, 9 = Leopard, - /// 10 = Snow Leopard, etc. - unsigned getDarwinVers() const { - if (isTargetDarwin()) return TargetTriple.getDarwinMajorNumber(); - return 0; - } - /// ClassifyGlobalReference - Classify a global variable reference for the /// current subtarget according to how we should reference it in a non-pcrel /// context. 
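With getDarwinVers() removed, Darwin version checks are phrased directly against the triple; the old scheme numbered the Darwin kernel (Darwin 10 == Mac OS X 10.6), which is why the check above reads isMacOSXVersionLT(10, 6). A minimal standalone sketch of the new idiom (the free function is hypothetical; only llvm::Triple from this tree is assumed):

    #include "llvm/ADT/Triple.h"

    // Hypothetical mirror of X86Subtarget::getBZeroEntry(): __bzero is
    // available on Mac OS X 10.6 (Darwin 10) and later.
    static const char *getBZeroEntryFor(const llvm::Triple &TT) {
      if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6))
        return "__bzero";
      return 0; // no dedicated entry point; callers fall back to memset
    }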
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 889c824..7483329 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -26,19 +26,18 @@ using namespace llvm; static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: - return new X86MCAsmInfoDarwin(TheTriple); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (TheTriple.getEnvironment() == Triple::MachO) - return new X86MCAsmInfoDarwin(TheTriple); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) { + if (TheTriple.getArch() == Triple::x86_64) + return new X86_64MCAsmInfoDarwin(TheTriple); else - return new X86MCAsmInfoCOFF(TheTriple); - default: - return new X86ELFMCAsmInfo(TheTriple); + return new X86MCAsmInfoDarwin(TheTriple); } + + if (TheTriple.isOSWindows()) + return new X86MCAsmInfoCOFF(TheTriple); + + return new X86ELFMCAsmInfo(TheTriple); } static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, @@ -48,19 +47,14 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, bool RelaxAll, bool NoExecStack) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (TheTriple.getEnvironment() == Triple::MachO) - return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll); - else - return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll); - default: - return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack); - } + + if (TheTriple.isOSWindows()) + return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll); + + return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack); } extern "C" void LLVMInitializeX86Target() { @@ -96,11 +90,11 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT, const std::string &FS) : X86TargetMachine(T, TT, FS, false), DataLayout(getSubtargetImpl()->isTargetDarwin() ? - "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-n8:16:32" : + "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-n8:16:32" : (getSubtargetImpl()->isTargetCygMing() || getSubtargetImpl()->isTargetWindows()) ? 
- "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-n8:16:32" : - "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-n8:16:32"), + "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-n8:16:32" : + "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-n8:16:32"), InstrInfo(*this), TSInfo(*this), TLInfo(*this), @@ -111,7 +105,7 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT, X86_64TargetMachine::X86_64TargetMachine(const Target &T, const std::string &TT, const std::string &FS) : X86TargetMachine(T, TT, FS, true), - DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-n8:16:32:64"), + DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-n8:16:32:64"), InstrInfo(*this), TSInfo(*this), TLInfo(*this), diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index c15dfbb..1231798 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -38,6 +38,12 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer); } +MCSymbol *X8664_MachoTargetObjectFile:: +getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const { + return Mang->getSymbol(GV); +} + unsigned X8632_ELFTargetObjectFile::getPersonalityEncoding() const { if (TM.getRelocationModel() == Reloc::PIC_) return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4; @@ -52,7 +58,7 @@ unsigned X8632_ELFTargetObjectFile::getLSDAEncoding() const { return DW_EH_PE_absptr; } -unsigned X8632_ELFTargetObjectFile::getFDEEncoding() const { +unsigned X8632_ELFTargetObjectFile::getFDEEncoding(bool FDE) const { if (TM.getRelocationModel() == Reloc::PIC_) return DW_EH_PE_pcrel | DW_EH_PE_sdata4; else @@ -91,17 +97,14 @@ unsigned X8664_ELFTargetObjectFile::getLSDAEncoding() const { return DW_EH_PE_absptr; } -unsigned X8664_ELFTargetObjectFile::getFDEEncoding() const { - CodeModel::Model Model = TM.getCodeModel(); - if (TM.getRelocationModel() == Reloc::PIC_) - return DW_EH_PE_pcrel | (Model == CodeModel::Small || - Model == CodeModel::Medium ? - DW_EH_PE_sdata4 : DW_EH_PE_sdata8); +unsigned X8664_ELFTargetObjectFile::getFDEEncoding(bool CFI) const { + if (CFI) + return DW_EH_PE_pcrel | DW_EH_PE_sdata4; - if (Model == CodeModel::Small || Model == CodeModel::Medium) - return DW_EH_PE_udata4; + if (TM.getRelocationModel() == Reloc::PIC_) + return DW_EH_PE_pcrel | DW_EH_PE_sdata4; - return DW_EH_PE_absptr; + return DW_EH_PE_udata4; } unsigned X8664_ELFTargetObjectFile::getTTypeEncoding() const { diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index f2fd49c..e21b5bf 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -25,6 +25,12 @@ namespace llvm { getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI, unsigned Encoding, MCStreamer &Streamer) const; + + // getCFIPersonalitySymbol - The symbol that gets passed to + // .cfi_personality. 
+ virtual MCSymbol * + getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const; }; class X8632_ELFTargetObjectFile : public TargetLoweringObjectFileELF { @@ -34,7 +40,7 @@ namespace llvm { :TM(tm) { } virtual unsigned getPersonalityEncoding() const; virtual unsigned getLSDAEncoding() const; - virtual unsigned getFDEEncoding() const; + virtual unsigned getFDEEncoding(bool CFI) const; virtual unsigned getTTypeEncoding() const; }; @@ -45,7 +51,7 @@ namespace llvm { :TM(tm) { } virtual unsigned getPersonalityEncoding() const; virtual unsigned getLSDAEncoding() const; - virtual unsigned getFDEEncoding() const; + virtual unsigned getFDEEncoding(bool CFI) const; virtual unsigned getTTypeEncoding() const; }; diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp index fc8a07a..6bec9f9 100644 --- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -30,8 +30,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include <queue> -#include <set> using namespace llvm; /// XCoreDAGToDAGISel - XCore specific code to select XCore machine @@ -49,7 +47,8 @@ namespace { Subtarget(*TM.getSubtargetImpl()) { } SDNode *Select(SDNode *N); - + SDNode *SelectBRIND(SDNode *N); + /// getI32Imm - Return a target constant with the specified value, of type /// i32. inline SDValue getI32Imm(unsigned Imm) { @@ -154,62 +153,133 @@ bool XCoreDAGToDAGISel::SelectADDRcpii(SDValue Addr, SDValue &Base, SDNode *XCoreDAGToDAGISel::Select(SDNode *N) { DebugLoc dl = N->getDebugLoc(); - EVT NVT = N->getValueType(0); - if (NVT == MVT::i32) { - switch (N->getOpcode()) { - default: break; - case ISD::Constant: { - uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue(); - if (immMskBitp(N)) { - // Transformation function: get the size of a mask - // Look for the first non-zero bit - SDValue MskSize = getI32Imm(32 - CountLeadingZeros_32(Val)); - return CurDAG->getMachineNode(XCore::MKMSK_rus, dl, - MVT::i32, MskSize); - } - else if (!isUInt<16>(Val)) { - SDValue CPIdx = - CurDAG->getTargetConstantPool(ConstantInt::get( - Type::getInt32Ty(*CurDAG->getContext()), Val), - TLI.getPointerTy()); - return CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32, - MVT::Other, CPIdx, - CurDAG->getEntryNode()); - } - break; - } - case XCoreISD::LADD: { - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), - N->getOperand(2) }; - return CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32, MVT::i32, - Ops, 3); - } - case XCoreISD::LSUB: { - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), - N->getOperand(2) }; - return CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32, MVT::i32, - Ops, 3); - } - case XCoreISD::MACCU: { - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), - N->getOperand(2), N->getOperand(3) }; - return CurDAG->getMachineNode(XCore::MACCU_l4r, dl, MVT::i32, MVT::i32, - Ops, 4); - } - case XCoreISD::MACCS: { - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), - N->getOperand(2), N->getOperand(3) }; - return CurDAG->getMachineNode(XCore::MACCS_l4r, dl, MVT::i32, MVT::i32, - Ops, 4); - } - case XCoreISD::LMUL: { - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), - N->getOperand(2), N->getOperand(3) }; - return CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32, MVT::i32, - Ops, 4); - } - // Other cases are autogenerated. 
+ switch (N->getOpcode()) { + default: break; + case ISD::Constant: { + uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue(); + if (immMskBitp(N)) { + // Transformation function: get the size of a mask + // Look for the first non-zero bit + SDValue MskSize = getI32Imm(32 - CountLeadingZeros_32(Val)); + return CurDAG->getMachineNode(XCore::MKMSK_rus, dl, + MVT::i32, MskSize); + } + else if (!isUInt<16>(Val)) { + SDValue CPIdx = + CurDAG->getTargetConstantPool(ConstantInt::get( + Type::getInt32Ty(*CurDAG->getContext()), Val), + TLI.getPointerTy()); + return CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32, + MVT::Other, CPIdx, + CurDAG->getEntryNode()); } + break; + } + case XCoreISD::LADD: { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2) }; + return CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32, MVT::i32, + Ops, 3); + } + case XCoreISD::LSUB: { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2) }; + return CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32, MVT::i32, + Ops, 3); + } + case XCoreISD::MACCU: { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3) }; + return CurDAG->getMachineNode(XCore::MACCU_l4r, dl, MVT::i32, MVT::i32, + Ops, 4); + } + case XCoreISD::MACCS: { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3) }; + return CurDAG->getMachineNode(XCore::MACCS_l4r, dl, MVT::i32, MVT::i32, + Ops, 4); + } + case XCoreISD::LMUL: { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3) }; + return CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32, MVT::i32, + Ops, 4); + } + case ISD::BRIND: + if (SDNode *ResNode = SelectBRIND(N)) + return ResNode; + break; + // Other cases are autogenerated. } return SelectCode(N); } + +/// Given a chain return a new chain where any appearance of Old is replaced +/// by New. There must be at most one instruction between Old and Chain and +/// this instruction must be a TokenFactor. Returns an empty SDValue if +/// these conditions don't hold. +static SDValue +replaceInChain(SelectionDAG *CurDAG, SDValue Chain, SDValue Old, SDValue New) +{ + if (Chain == Old) + return New; + if (Chain->getOpcode() != ISD::TokenFactor) + return SDValue(); + SmallVector<SDValue, 8> Ops; + bool found = false; + for (unsigned i = 0, e = Chain->getNumOperands(); i != e; ++i) { + if (Chain->getOperand(i) == Old) { + Ops.push_back(New); + found = true; + } else { + Ops.push_back(Chain->getOperand(i)); + } + } + if (!found) + return SDValue(); + return CurDAG->getNode(ISD::TokenFactor, Chain->getDebugLoc(), MVT::Other, + &Ops[0], Ops.size()); +} + +SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + // (brind (int_xcore_checkevent (addr))) + SDValue Chain = N->getOperand(0); + SDValue Addr = N->getOperand(1); + if (Addr->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return 0; + unsigned IntNo = cast<ConstantSDNode>(Addr->getOperand(1))->getZExtValue(); + if (IntNo != Intrinsic::xcore_checkevent) + return 0; + SDValue nextAddr = Addr->getOperand(2); + SDValue CheckEventChainOut(Addr.getNode(), 1); + if (!CheckEventChainOut.use_empty()) { + // If the chain out of the checkevent intrinsic is an operand of the + // indirect branch or used in a TokenFactor which is the operand of the + // indirect branch then build a new chain which uses the chain coming into + // the checkevent intrinsic instead. 
+    SDValue CheckEventChainIn = Addr->getOperand(0);
+    SDValue NewChain = replaceInChain(CurDAG, Chain, CheckEventChainOut,
+                                      CheckEventChainIn);
+    if (!NewChain.getNode())
+      return 0;
+    Chain = NewChain;
+  }
+  // Enable events on the thread using setsr 1 and then disable them
+  // immediately after with clrsr 1. If any resources owned by the thread are
+  // ready, an event will be taken. If no resource is ready, we branch to the
+  // address which was the operand to the checkevent intrinsic.
+  SDValue constOne = getI32Imm(1);
+  SDValue Glue =
+    SDValue(CurDAG->getMachineNode(XCore::SETSR_branch_u6, dl, MVT::Glue,
+                                   constOne, Chain), 0);
+  Glue =
+    SDValue(CurDAG->getMachineNode(XCore::CLRSR_branch_u6, dl, MVT::Glue,
+                                   constOne, Glue), 0);
+  if (nextAddr->getOpcode() == XCoreISD::PCRelativeWrapper &&
+      nextAddr->getOperand(0)->getOpcode() == ISD::TargetBlockAddress) {
+    return CurDAG->SelectNodeTo(N, XCore::BRFU_lu6, MVT::Other,
+                                nextAddr->getOperand(0), Glue);
+  }
+  return CurDAG->SelectNodeTo(N, XCore::BAU_1r, MVT::Other, nextAddr, Glue);
+}
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 4817787..5987e8b 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -37,8 +37,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/VectorExtras.h"
-#include <queue>
-#include <set>
 using namespace llvm;

 const char *XCoreTargetLowering::
@@ -967,7 +965,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,

   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
+  // The InFlag is necessary since all emitted instructions must be
   // stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index ecdd4cb..789546e 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -308,6 +308,16 @@ multiclass FU6_LU6<string OpcStr, SDNode OpNode> {
                  !strconcat(OpcStr, " $b"),
                  [(OpNode immU16:$b)]>;
 }
+multiclass FU6_LU6_int<string OpcStr, Intrinsic Int> {
+  def _u6: _FU6<
+                 (outs), (ins i32imm:$b),
+                 !strconcat(OpcStr, " $b"),
+                 [(Int immU6:$b)]>;
+  def _lu6: _FLU6<
+                 (outs), (ins i32imm:$b),
+                 !strconcat(OpcStr, " $b"),
+                 [(Int immU16:$b)]>;
+}

 multiclass FU6_LU6_np<string OpcStr> {
   def _u6: _FU6<
@@ -638,8 +648,8 @@ defm RETSP : FU6_LU6<"retsp", XCoreRetsp>;
 }
 }

-// TODO extdp, kentsp, krestsp, blat, setsr
-// clrsr, getsr, kalli
+// TODO extdp, kentsp, krestsp, blat
+// getsr, kalli
 let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
 def BRBU_u6 : _FU6<
                  (outs),
@@ -678,6 +688,17 @@ def LDAWCP_lu6: _FLRU6<
                  "ldaw r11, cp[$a]",
                  [(set R11, ADDRcpii:$a)]>;

+defm SETSR : FU6_LU6_int<"setsr", int_xcore_setsr>;
+
+defm CLRSR : FU6_LU6_int<"clrsr", int_xcore_clrsr>;
+
+// setsr may cause a branch if it is used to enable events. clrsr may
+// branch if it is executed while events are enabled.
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in { +defm SETSR_branch : FU6_LU6_np<"setsr">; +defm CLRSR_branch : FU6_LU6_np<"clrsr">; +} + // U10 // TODO ldwcpl, blacp @@ -718,7 +739,7 @@ def BL_lu10 : _FLU10< } // Two operand short -// TODO getr, getst +// TODO eet, eef, testwct, tsetmr, sext (reg), zext (reg) def NOT : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b), "not $dst, $b", [(set GRRegs:$dst, (not GRRegs:$b))]>; @@ -727,8 +748,6 @@ def NEG : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b), "neg $dst, $b", [(set GRRegs:$dst, (ineg GRRegs:$b))]>; -// TODO setd, eet, eef, testwct, tinitpc, tinitdp, -// tinitsp, tinitcp, tsetmr, sext (reg), zext (reg) let Constraints = "$src1 = $dst" in { let neverHasSideEffects = 1 in def SEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2), @@ -816,9 +835,29 @@ def SETD_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), "setd res[$r], $val", [(int_xcore_setd GRRegs:$r, GRRegs:$val)]>; +def GETST_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r), + "getst $dst, res[$r]", + [(set GRRegs:$dst, (int_xcore_getst GRRegs:$r))]>; + +def INITSP_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src), + "init t[$t]:sp, $src", + [(int_xcore_initsp GRRegs:$t, GRRegs:$src)]>; + +def INITPC_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src), + "init t[$t]:pc, $src", + [(int_xcore_initpc GRRegs:$t, GRRegs:$src)]>; + +def INITCP_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src), + "init t[$t]:cp, $src", + [(int_xcore_initcp GRRegs:$t, GRRegs:$src)]>; + +def INITDP_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src), + "init t[$t]:dp, $src", + [(int_xcore_initdp GRRegs:$t, GRRegs:$src)]>; + // Two operand long -// TODO setclk, setrdy, setpsc, endin, peek, -// getd, testlcl, tinitlr, getps, setps +// TODO endin, peek, +// getd, testlcl def BITREV_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), "bitrev $dst, $src", [(set GRRegs:$dst, (int_xcore_bitrev GRRegs:$src))]>; @@ -839,10 +878,41 @@ def SETTW_l2r : _FL2R<(outs), (ins GRRegs:$r, GRRegs:$val), "settw res[$r], $val", [(int_xcore_settw GRRegs:$r, GRRegs:$val)]>; +def GETPS_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), + "get $dst, ps[$src]", + [(set GRRegs:$dst, (int_xcore_getps GRRegs:$src))]>; + +def SETPS_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2), + "set ps[$src1], $src2", + [(int_xcore_setps GRRegs:$src1, GRRegs:$src2)]>; + +def INITLR_l2r : _FL2R<(outs), (ins GRRegs:$t, GRRegs:$src), + "init t[$t]:lr, $src", + [(int_xcore_initlr GRRegs:$t, GRRegs:$src)]>; + +def SETCLK_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2), + "setclk res[$src1], $src2", + [(int_xcore_setclk GRRegs:$src1, GRRegs:$src2)]>; + +def SETRDY_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2), + "setrdy res[$src1], $src2", + [(int_xcore_setrdy GRRegs:$src1, GRRegs:$src2)]>; + +def SETPSC_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2), + "setpsc res[$src1], $src2", + [(int_xcore_setpsc GRRegs:$src1, GRRegs:$src2)]>; + // One operand short -// TODO edu, eeu, waitet, waitef, tstart, msync, mjoin, clrtp +// TODO edu, eeu, waitet, waitef, tstart, clrtp // setdp, setcp, setev, kcall // dgetreg +def MSYNC_1r : _F1R<(outs), (ins GRRegs:$i), + "msync res[$i]", + [(int_xcore_msync GRRegs:$i)]>; +def MJOIN_1r : _F1R<(outs), (ins GRRegs:$i), + "mjoin res[$i]", + [(int_xcore_mjoin GRRegs:$i)]>; + let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in def BAU_1r : _F1R<(outs), (ins GRRegs:$addr), "bau $addr", @@ -899,7 +969,7 @@ def EEU_1r : _F1R<(outs), (ins GRRegs:$r), [(int_xcore_eeu GRRegs:$r)]>; // Zero 
operand short
-// TODO ssync, freet, ldspc, stspc, ldssr, stssr, ldsed, stsed,
+// TODO freet, ldspc, stspc, ldssr, stssr, ldsed, stsed,
 // stet, geted, getet, getkep, getksp, setkep, getid, kret, dcall, dret,
 // dentsp, drestsp
@@ -910,6 +980,10 @@ def GETID_0R : _F0R<(outs), (ins),
                 "get r11, id",
                 [(set R11, (int_xcore_getid))]>;

+def SSYNC_0r : _F0R<(outs), (ins),
+                "ssync",
+                [(int_xcore_ssync)]>;
+
 let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1,
     hasSideEffects = 1 in
 def WAITEU_0R : _F0R<(outs), (ins),
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 56c0879..0287a51 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -104,6 +104,11 @@ XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
   return TFI->hasFP(MF);
 }

+bool
+XCoreRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  return false;
+}
+
 // This function eliminates ADJCALLSTACKDOWN,
 // ADJCALLSTACKUP pseudo instructions
 void XCoreRegisterInfo::
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 2185755..770483b 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -48,6 +48,8 @@ public:

   bool requiresRegisterScavenging(const MachineFunction &MF) const;

+  bool useFPForScavengingIndex(const MachineFunction &MF) const;
+
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I) const;
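For reference, the BRIND-of-checkevent selection added in XCoreISelDAGToDAG.cpp above reduces to a three-instruction sequence; a sketch of the selected code (the branch-target register is illustrative, and BRFU_lu6 is selected instead of bau when the target is a known block address):

    setsr 1      // enable events: if a resource owned by the thread is ready,
                 // an event is taken here and control never falls through
    clrsr 1      // no event was ready: disable events again
    bau   r0     // branch to the address that was passed to checkevent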