Diffstat (limited to 'contrib/llvm/lib/Target/ARM')
52 files changed, 3402 insertions, 2106 deletions
diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h index ae7ae59..14825a7 100644 --- a/contrib/llvm/lib/Target/ARM/ARM.h +++ b/contrib/llvm/lib/Target/ARM/ARM.h @@ -90,10 +90,6 @@ inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { } } -/// ModelWithRegSequence - Return true if isel should use REG_SEQUENCE to model -/// operations involving sub-registers. -bool ModelWithRegSequence(); - FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel); diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td index f1e6a9f..fa64d6c 100644 --- a/contrib/llvm/lib/Target/ARM/ARM.td +++ b/contrib/llvm/lib/Target/ARM/ARM.td @@ -48,6 +48,8 @@ def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", "Enable divide instructions">; def FeatureT2ExtractPack: SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true", "Enable Thumb2 extract and pack instructions">; +def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", + "FP compare + branch is slow">; // Some processors have multiply-accumulate instructions that don't // play nicely with other VFP instructions, and it's generally better @@ -129,7 +131,7 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, // V7 Processors. def : Processor<"cortex-a8", CortexA8Itineraries, [ArchV7A, FeatureThumb2, FeatureNEON, FeatureHasSlowVMLx, - FeatureNEONForFP, FeatureT2ExtractPack]>; + FeatureSlowFPBrcc, FeatureNEONForFP, FeatureT2ExtractPack]>; def : Processor<"cortex-a9", CortexA9Itineraries, [ArchV7A, FeatureThumb2, FeatureNEON, FeatureT2ExtractPack]>; def : ProcNoItin<"cortex-m3", [ArchV7M, FeatureThumb2, FeatureHWDiv]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h b/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h index e68354a..92a13f1 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h +++ b/contrib/llvm/lib/Target/ARM/ARMAddressingModes.h @@ -519,7 +519,70 @@ namespace ARM_AM { // // This is stored in two operands [regaddr, align]. The first is the // address register. The second operand is the value of the alignment - // specifier to use or zero if no explicit alignment. + // specifier in bytes or zero if no explicit alignment. + // Valid alignments depend on the specific instruction. + + //===--------------------------------------------------------------------===// + // NEON Modified Immediates + //===--------------------------------------------------------------------===// + // + // Several NEON instructions (e.g., VMOV) take a "modified immediate" + // vector operand, where a small immediate encoded in the instruction + // specifies a full NEON vector value. These modified immediates are + // represented here as encoded integers. The low 8 bits hold the immediate + // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold + // the "Cmode" field of the instruction. The interfaces below treat the + // Op and Cmode values as a single 5-bit value. + + static inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) { + return (OpCmode << 8) | Val; + } + static inline unsigned getNEONModImmOpCmode(unsigned ModImm) { + return (ModImm >> 8) & 0x1f; + } + static inline unsigned getNEONModImmVal(unsigned ModImm) { + return ModImm & 0xff; + } + + /// decodeNEONModImm - Decode a NEON modified immediate value into the + /// element value and the element size in bits. (If the element size is + /// smaller than the vector, it is splatted into all the elements.) 
+ static inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) { + unsigned OpCmode = getNEONModImmOpCmode(ModImm); + unsigned Imm8 = getNEONModImmVal(ModImm); + uint64_t Val = 0; + + if (OpCmode == 0xe) { + // 8-bit vector elements + Val = Imm8; + EltBits = 8; + } else if ((OpCmode & 0xc) == 0x8) { + // 16-bit vector elements + unsigned ByteNum = (OpCmode & 0x6) >> 1; + Val = Imm8 << (8 * ByteNum); + EltBits = 16; + } else if ((OpCmode & 0x8) == 0) { + // 32-bit vector elements, zero with one byte set + unsigned ByteNum = (OpCmode & 0x6) >> 1; + Val = Imm8 << (8 * ByteNum); + EltBits = 32; + } else if ((OpCmode & 0xe) == 0xc) { + // 32-bit vector elements, one byte with low bits set + unsigned ByteNum = 1 + (OpCmode & 0x1); + Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum))); + EltBits = 32; + } else if (OpCmode == 0x1e) { + // 64-bit vector elements + for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { + if ((ModImm >> ByteNum) & 1) + Val |= (uint64_t)0xff << (8 * ByteNum); + } + EltBits = 64; + } else { + assert(false && "Unsupported NEON immediate"); + } + return Val; + } } // end namespace ARM_AM } // end namespace llvm diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 2528854..49c16f3 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -56,7 +56,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstr *MI = MBBI; MachineFunction &MF = *MI->getParent()->getParent(); - unsigned TSFlags = MI->getDesc().TSFlags; + uint64_t TSFlags = MI->getDesc().TSFlags; bool isPre = false; switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) { default: return NULL; @@ -199,9 +199,9 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, bool ARMBaseInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { if (CSI.empty()) return false; @@ -227,8 +227,9 @@ ARMBaseInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, // Insert the spill to the stack frame. The register is killed at the spill // + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); storeRegToStackSlot(MBB, MI, Reg, isKill, - CSI[i].getFrameIdx(), CSI[i].getRegClass(), TRI); + CSI[i].getFrameIdx(), RC, TRI); } return true; } @@ -347,10 +348,8 @@ unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { unsigned ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const { - // FIXME this should probably have a DebugLoc argument - DebugLoc dl; - + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>(); int BOpc = !AFI->isThumbFunction() ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB); @@ -364,17 +363,17 @@ ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, if (FBB == 0) { if (Cond.empty()) // Unconditional branch? 
- BuildMI(&MBB, dl, get(BOpc)).addMBB(TBB); + BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); else - BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB) + BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); return 1; } // Two-way conditional branch. - BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB) + BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); - BuildMI(&MBB, dl, get(BOpc)).addMBB(FBB); + BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); return 2; } @@ -487,7 +486,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { // Basic size info comes from the TSFlags field. const TargetInstrDesc &TID = MI->getDesc(); - unsigned TSFlags = TID.TSFlags; + uint64_t TSFlags = TID.TSFlags; unsigned Opc = MI->getOpcode(); switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) { @@ -524,11 +523,11 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { return 10; case ARM::Int_eh_sjlj_setjmp: case ARM::Int_eh_sjlj_setjmp_nofp: - return 24; + return 20; case ARM::tInt_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp_nofp: - return 14; + return 12; case ARM::BR_JTr: case ARM::BR_JTm: case ARM::BR_JTadd: @@ -595,6 +594,7 @@ ARMBaseInstrInfo::isMoveInstr(const MachineInstr &MI, return true; } case ARM::MOVr: + case ARM::MOVr_TC: case ARM::tMOVr: case ARM::tMOVgpr2tgpr: case ARM::tMOVtgpr2gpr: @@ -693,75 +693,44 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, return 0; } -bool -ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - // tGPR is used sometimes in ARM instructions that need to avoid using - // certain registers. Just treat it as GPR here. - if (DestRC == ARM::tGPRRegisterClass) - DestRC = ARM::GPRRegisterClass; - if (SrcRC == ARM::tGPRRegisterClass) - SrcRC = ARM::GPRRegisterClass; - - // Allow DPR / DPR_VFP2 / DPR_8 cross-class copies. - if (DestRC == ARM::DPR_8RegisterClass) - DestRC = ARM::DPR_VFP2RegisterClass; - if (SrcRC == ARM::DPR_8RegisterClass) - SrcRC = ARM::DPR_VFP2RegisterClass; - - // Allow QPR / QPR_VFP2 / QPR_8 cross-class copies. - if (DestRC == ARM::QPR_VFP2RegisterClass || - DestRC == ARM::QPR_8RegisterClass) - DestRC = ARM::QPRRegisterClass; - if (SrcRC == ARM::QPR_VFP2RegisterClass || - SrcRC == ARM::QPR_8RegisterClass) - SrcRC = ARM::QPRRegisterClass; - - // Allow QQPR / QQPR_VFP2 cross-class copies. - if (DestRC == ARM::QQPR_VFP2RegisterClass) - DestRC = ARM::QQPRRegisterClass; - if (SrcRC == ARM::QQPR_VFP2RegisterClass) - SrcRC = ARM::QQPRRegisterClass; - - // Disallow copies of unequal sizes. - if (DestRC != SrcRC && DestRC->getSize() != SrcRC->getSize()) - return false; - - if (DestRC == ARM::GPRRegisterClass) { - if (SrcRC == ARM::SPRRegisterClass) - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VMOVRS), DestReg) - .addReg(SrcReg)); - else - AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), - DestReg).addReg(SrcReg))); - } else { - unsigned Opc; - - if (DestRC == ARM::SPRRegisterClass) - Opc = (SrcRC == ARM::GPRRegisterClass ? ARM::VMOVSR : ARM::VMOVS); - else if (DestRC == ARM::DPRRegisterClass) - Opc = ARM::VMOVD; - else if (DestRC == ARM::DPR_VFP2RegisterClass || - SrcRC == ARM::DPR_VFP2RegisterClass) - // Always use neon reg-reg move if source or dest is NEON-only regclass. 
- Opc = ARM::VMOVDneon; - else if (DestRC == ARM::QPRRegisterClass) - Opc = ARM::VMOVQ; - else if (DestRC == ARM::QQPRRegisterClass) - Opc = ARM::VMOVQQ; - else if (DestRC == ARM::QQQQPRRegisterClass) - Opc = ARM::VMOVQQQQ; - else - return false; - - AddDefaultPred(BuildMI(MBB, I, DL, get(Opc), DestReg).addReg(SrcReg)); +void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + bool GPRDest = ARM::GPRRegClass.contains(DestReg); + bool GPRSrc = ARM::GPRRegClass.contains(SrcReg); + + if (GPRDest && GPRSrc) { + AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)))); + return; } - return true; + bool SPRDest = ARM::SPRRegClass.contains(DestReg); + bool SPRSrc = ARM::SPRRegClass.contains(SrcReg); + + unsigned Opc; + if (SPRDest && SPRSrc) + Opc = ARM::VMOVS; + else if (GPRDest && SPRSrc) + Opc = ARM::VMOVRS; + else if (SPRDest && GPRSrc) + Opc = ARM::VMOVSR; + else if (ARM::DPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVD; + else if (ARM::QPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVQ; + else if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVQQ; + else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVQQQQ; + else + llvm_unreachable("Impossible reg-to-reg copy"); + + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg); + MIB.addReg(SrcReg, getKillRegState(KillSrc)); + if (Opc != ARM::VMOVQQ && Opc != ARM::VMOVQQQQ) + AddDefaultPred(MIB); } static const @@ -795,30 +764,34 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // tGPR is used sometimes in ARM instructions that need to avoid using // certain registers. Just treat it as GPR here. 
- if (RC == ARM::tGPRRegisterClass) + if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass) RC = ARM::GPRRegisterClass; - if (RC == ARM::GPRRegisterClass) { + switch (RC->getID()) { + case ARM::GPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::SPRRegisterClass) { + break; + case ARM::SPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::DPRRegisterClass || - RC == ARM::DPR_VFP2RegisterClass || - RC == ARM::DPR_8RegisterClass) { + break; + case ARM::DPRRegClassID: + case ARM::DPR_VFP2RegClassID: + case ARM::DPR_8RegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::QPRRegisterClass || - RC == ARM::QPR_VFP2RegisterClass || - RC == ARM::QPR_8RegisterClass) { + break; + case ARM::QPRRegClassID: + case ARM::QPR_VFP2RegClassID: + case ARM::QPR_8RegClassID: // FIXME: Neon instructions should support predicates if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q)) - .addFrameIndex(FI).addImm(128) + .addFrameIndex(FI).addImm(16) .addReg(SrcReg, getKillRegState(isKill)) .addMemOperand(MMO)); } else { @@ -828,12 +801,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) .addMemOperand(MMO)); } - } else if (RC == ARM::QQPRRegisterClass || RC == ARM::QQPR_VFP2RegisterClass){ + break; + case ARM::QQPRRegClassID: + case ARM::QQPR_VFP2RegClassID: if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { // FIXME: It's possible to only store part of the QQ register if the // spilled def has a sub-register index. - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VST2q32)) - .addFrameIndex(FI).addImm(128); + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VST1d64Q)) + .addFrameIndex(FI).addImm(16); MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); @@ -850,8 +825,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); } - } else { - assert(RC == ARM::QQQQPRRegisterClass && "Unknown regclass!"); + break; + case ARM::QQQQPRRegClassID: { MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMD)) .addFrameIndex(FI) @@ -865,6 +840,10 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI); AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI); + break; + } + default: + llvm_unreachable("Unknown regclass!"); } } @@ -886,26 +865,30 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // tGPR is used sometimes in ARM instructions that need to avoid using // certain registers. Just treat it as GPR here. 
- if (RC == ARM::tGPRRegisterClass) + if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass) RC = ARM::GPRRegisterClass; - if (RC == ARM::GPRRegisterClass) { + switch (RC->getID()) { + case ARM::GPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg) .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::SPRRegisterClass) { + break; + case ARM::SPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::DPRRegisterClass || - RC == ARM::DPR_VFP2RegisterClass || - RC == ARM::DPR_8RegisterClass) { + break; + case ARM::DPRRegClassID: + case ARM::DPR_VFP2RegClassID: + case ARM::DPR_8RegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::QPRRegisterClass || - RC == ARM::QPR_VFP2RegisterClass || - RC == ARM::QPR_8RegisterClass) { + break; + case ARM::QPRRegClassID: + case ARM::QPR_VFP2RegClassID: + case ARM::QPR_8RegClassID: if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q), DestReg) - .addFrameIndex(FI).addImm(128) + .addFrameIndex(FI).addImm(16) .addMemOperand(MMO)); } else { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQ), DestReg) @@ -913,14 +896,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) .addMemOperand(MMO)); } - } else if (RC == ARM::QQPRRegisterClass || RC == ARM::QQPR_VFP2RegisterClass){ + break; + case ARM::QQPRRegClassID: + case ARM::QQPR_VFP2RegClassID: if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLD2q32)); + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLD1d64Q)); MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); - AddDefaultPred(MIB.addFrameIndex(FI).addImm(128).addMemOperand(MMO)); + AddDefaultPred(MIB.addFrameIndex(FI).addImm(16).addMemOperand(MMO)); } else { MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) @@ -932,21 +917,25 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); } - } else { - assert(RC == ARM::QQQQPRRegisterClass && "Unknown regclass!"); - MachineInstrBuilder MIB = - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) - .addFrameIndex(FI) - .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) - .addMemOperand(MMO); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::Define, TRI); - AddDReg(MIB, DestReg, ARM::dsub_7, RegState::Define, TRI); + break; + case ARM::QQQQPRRegClassID: { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) + 
.addMemOperand(MMO); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::Define, TRI); + AddDReg(MIB, DestReg, ARM::dsub_7, RegState::Define, TRI); + break; + } + default: + llvm_unreachable("Unknown regclass!"); } } @@ -960,223 +949,6 @@ ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, return &*MIB; } -MachineInstr *ARMBaseInstrInfo:: -foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, int FI) const { - if (Ops.size() != 1) return NULL; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - MachineInstr *NewMI = NULL; - if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) { - // If it is updating CPSR, then it cannot be folded. - if (MI->getOperand(4).getReg() == ARM::CPSR && !MI->getOperand(4).isDead()) - return NULL; - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - if (Opc == ARM::MOVr) - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::STR)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg); - else // ARM::t2MOVr - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - if (Opc == ARM::MOVr) - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::LDR)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), DstSubReg) - .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg); - else // ARM::t2MOVr - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), DstSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } - } else if (Opc == ARM::tMOVgpr2gpr || - Opc == ARM::tMOVtgpr2gpr || - Opc == ARM::tMOVgpr2tgpr) { - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12)) - 
.addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0); - } - } else if (Opc == ARM::VMOVS) { - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTRS)) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI) - .addImm(0).addImm(Pred).addReg(PredReg); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDRS)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } - } else if (Opc == ARM::VMOVD || Opc == ARM::VMOVDneon) { - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTRD)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDRD)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } - } else if (Opc == ARM::VMOVQ) { - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - if (MFI.getObjectAlignment(FI) >= 16 && - getRegisterInfo().canRealignStack(MF)) { - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VST1q)) - .addFrameIndex(FI).addImm(128) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addImm(Pred).addReg(PredReg); - } else { - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTMQ)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) - .addImm(Pred).addReg(PredReg); - } - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - if (MFI.getObjectAlignment(FI) >= 16 && - getRegisterInfo().canRealignStack(MF)) { - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLD1q)) - 
.addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(128).addImm(Pred).addReg(PredReg); - } else { - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDMQ)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) - .addImm(Pred).addReg(PredReg); - } - } - } - - return NewMI; -} - -MachineInstr* -ARMBaseInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const { - // FIXME - return 0; -} - -bool -ARMBaseInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const { - if (Ops.size() != 1) return false; - - unsigned Opc = MI->getOpcode(); - if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) { - // If it is updating CPSR, then it cannot be folded. - return MI->getOperand(4).getReg() != ARM::CPSR || - MI->getOperand(4).isDead(); - } else if (Opc == ARM::tMOVgpr2gpr || - Opc == ARM::tMOVtgpr2gpr || - Opc == ARM::tMOVgpr2tgpr) { - return true; - } else if (Opc == ARM::VMOVS || Opc == ARM::VMOVD || - Opc == ARM::VMOVDneon || Opc == ARM::VMOVQ) { - return true; - } - - // FIXME: VMOVQQ and VMOVQQQQ? - - return false; -} - /// Create a copy of a const pool value. Update CPI to the new index and return /// the label UID. static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { @@ -1211,17 +983,12 @@ reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const { - if (SubIdx && TargetRegisterInfo::isPhysicalRegister(DestReg)) { - DestReg = TRI->getSubReg(DestReg, SubIdx); - SubIdx = 0; - } - + const TargetRegisterInfo &TRI) const { unsigned Opcode = Orig->getOpcode(); switch (Opcode) { default: { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); - MI->getOperand(0).setReg(DestReg); + MI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); MBB.insert(I, MI); break; } @@ -1237,9 +1004,6 @@ reMaterialize(MachineBasicBlock &MBB, break; } } - - MachineInstr *NewMI = prior(I); - NewMI->getOperand(0).setSubReg(SubIdx); } MachineInstr * @@ -1291,6 +1055,165 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs); } +/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to +/// determine if two loads are loading from the same base address. It should +/// only return true if the base pointers are the same and the only differences +/// between the two addresses is the offset. It also returns the offsets by +/// reference. +bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, + int64_t &Offset2) const { + // Don't worry about Thumb: just ARM and Thumb2. 
+ if (Subtarget.isThumb1Only()) return false; + + if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) + return false; + + switch (Load1->getMachineOpcode()) { + default: + return false; + case ARM::LDR: + case ARM::LDRB: + case ARM::LDRD: + case ARM::LDRH: + case ARM::LDRSB: + case ARM::LDRSH: + case ARM::VLDRD: + case ARM::VLDRS: + case ARM::t2LDRi8: + case ARM::t2LDRDi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRi12: + case ARM::t2LDRSHi12: + break; + } + + switch (Load2->getMachineOpcode()) { + default: + return false; + case ARM::LDR: + case ARM::LDRB: + case ARM::LDRD: + case ARM::LDRH: + case ARM::LDRSB: + case ARM::LDRSH: + case ARM::VLDRD: + case ARM::VLDRS: + case ARM::t2LDRi8: + case ARM::t2LDRDi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRi12: + case ARM::t2LDRSHi12: + break; + } + + // Check if base addresses and chain operands match. + if (Load1->getOperand(0) != Load2->getOperand(0) || + Load1->getOperand(4) != Load2->getOperand(4)) + return false; + + // Index should be Reg0. + if (Load1->getOperand(3) != Load2->getOperand(3)) + return false; + + // Determine the offsets. + if (isa<ConstantSDNode>(Load1->getOperand(1)) && + isa<ConstantSDNode>(Load2->getOperand(1))) { + Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getSExtValue(); + Offset2 = cast<ConstantSDNode>(Load2->getOperand(1))->getSExtValue(); + return true; + } + + return false; +} + +/// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to +/// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should +/// be scheduled togther. On some targets if two loads are loading from +/// addresses in the same cache line, it's better if they are scheduled +/// together. This function takes two integers that represent the load offsets +/// from the common base address. It returns true if it decides it's desirable +/// to schedule the two loads together. "NumLoads" is the number of loads that +/// have already been scheduled after Load1. +bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const { + // Don't worry about Thumb: just ARM and Thumb2. + if (Subtarget.isThumb1Only()) return false; + + assert(Offset2 > Offset1); + + if ((Offset2 - Offset1) / 8 > 64) + return false; + + if (Load1->getMachineOpcode() != Load2->getMachineOpcode()) + return false; // FIXME: overly conservative? + + // Four loads in a row should be sufficient. + if (NumLoads >= 3) + return false; + + return true; +} + +bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + // Debug info is never a scheduling boundary. It's necessary to be explicit + // due to the special treatment of IT instructions below, otherwise a + // dbg_value followed by an IT will result in the IT instruction being + // considered a scheduling hazard, which is wrong. It should be the actual + // instruction preceding the dbg_value instruction(s), just like it is + // when debug info is not present. + if (MI->isDebugValue()) + return false; + + // Terminators and labels can't be scheduled around. + if (MI->getDesc().isTerminator() || MI->isLabel()) + return true; + + // Treat the start of the IT block as a scheduling boundary, but schedule + // t2IT along with all instructions following it. + // FIXME: This is a big hammer. 
But the alternative is to add all potential + // true and anti dependencies to IT block instructions as implicit operands + // to the t2IT instruction. The added compile time and complexity does not + // seem worth it. + MachineBasicBlock::const_iterator I = MI; + // Make sure to skip any dbg_value instructions + while (++I != MBB->end() && I->isDebugValue()) + ; + if (I != MBB->end() && I->getOpcode() == ARM::t2IT) + return true; + + // Don't attempt to schedule around any instruction that defines + // a stack-oriented pointer, as it's unlikely to be profitable. This + // saves compile time, because it doesn't require every single + // stack slot reference to depend on the instruction that does the + // modification. + if (MI->definesRegister(ARM::SP)) + return true; + + return false; +} + +bool ARMBaseInstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs) const { + if (!NumInstrs) + return false; + if (Subtarget.getCPUString() == "generic") + // Generic (and overly aggressive) if-conversion limits for testing. + return NumInstrs <= 10; + else if (Subtarget.hasV7Ops()) + return NumInstrs <= 3; + return NumInstrs <= 2; +} + +bool ARMBaseInstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, + MachineBasicBlock &FMBB, unsigned NumF) const { + return NumT && NumF && NumT <= 2 && NumF <= 2; +} + /// getInstrPredicate - If instruction is predicated, returns its predicate /// condition, otherwise returns AL. It also returns the condition code /// register by reference. diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index b566271..89a2db7 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -116,11 +116,25 @@ namespace ARMII { // Thumb format ThumbFrm = 24 << FormShift, - // NEON format - NEONFrm = 25 << FormShift, - NEONGetLnFrm = 26 << FormShift, - NEONSetLnFrm = 27 << FormShift, - NEONDupFrm = 28 << FormShift, + // Miscelleaneous format + MiscFrm = 25 << FormShift, + + // NEON formats + NGetLnFrm = 26 << FormShift, + NSetLnFrm = 27 << FormShift, + NDupFrm = 28 << FormShift, + NLdStFrm = 29 << FormShift, + N1RegModImmFrm= 30 << FormShift, + N2RegFrm = 31 << FormShift, + NVCVTFrm = 32 << FormShift, + NVDupLnFrm = 33 << FormShift, + N2RegVShLFrm = 34 << FormShift, + N2RegVShRFrm = 35 << FormShift, + N3RegFrm = 36 << FormShift, + N3RegVShFrm = 37 << FormShift, + NVExtFrm = 38 << FormShift, + NVMulSLFrm = 39 << FormShift, + NVTBLFrm = 40 << FormShift, //===------------------------------------------------------------------===// // Misc flags. 
@@ -213,7 +227,8 @@ public: virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond) const; + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; virtual bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; @@ -258,12 +273,10 @@ public: virtual unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const; - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; virtual void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -283,29 +296,51 @@ public: const MDNode *MDPtr, DebugLoc DL) const; - virtual bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const; - - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const; - - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const; - virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo &TRI) const; MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const; virtual bool produceSameValue(const MachineInstr *MI0, const MachineInstr *MI1) const; + + /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to + /// determine if two loads are loading from the same base address. It should + /// only return true if the base pointers are the same and the only + /// differences between the two addresses is the offset. It also returns the + /// offsets by reference. + virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, int64_t &Offset2)const; + + /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to + /// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should + /// be scheduled togther. On some targets if two loads are loading from + /// addresses in the same cache line, it's better if they are scheduled + /// together. This function takes two integers that represent the load offsets + /// from the common base address. It returns true if it decides it's desirable + /// to schedule the two loads together. "NumLoads" is the number of loads that + /// have already been scheduled after Load1. 
+ virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const; + + virtual bool isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const; + + virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumInstrs) const; + + virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,unsigned NumT, + MachineBasicBlock &FMBB,unsigned NumF) const; + + virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumInstrs) const { + return NumInstrs && NumInstrs == 1; + } }; static inline diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 82458d2..182bd99 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -170,56 +170,6 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return STI.isTargetDarwin() ? DarwinCalleeSavedRegs : CalleeSavedRegs; } -const TargetRegisterClass* const * -ARMBaseRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { - static const TargetRegisterClass * const CalleeSavedRegClasses[] = { - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = { - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass, - &ARM::tGPRRegClass,&ARM::tGPRRegClass,&ARM::tGPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = { - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] ={ - &ARM::GPRRegClass, &ARM::tGPRRegClass, &ARM::tGPRRegClass, - &ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - if (STI.isThumb1Only()) { - return STI.isTargetDarwin() - ? DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses; - } - return STI.isTargetDarwin() - ? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses; -} - BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { // FIXME: avoid re-calculating this everytime. 
@@ -352,7 +302,7 @@ ARMBaseRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, } bool -ARMBaseRegisterInfo::canCombinedSubRegIndex(const TargetRegisterClass *RC, +ARMBaseRegisterInfo::canCombineSubRegIndices(const TargetRegisterClass *RC, SmallVectorImpl<unsigned> &SubIndices, unsigned &NewSubIdx) const { @@ -724,6 +674,15 @@ ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const { I != E; ++I) { for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { if (!I->getOperand(i).isFI()) continue; + + // When using ADDri to get the address of a stack object, 255 is the + // largest offset guaranteed to fit in the immediate offset. + if (I->getOpcode() == ARM::ADDri) { + Limit = std::min(Limit, (1U << 8) - 1); + break; + } + + // Otherwise check the addressing mode. switch (I->getDesc().TSFlags & ARMII::AddrModeMask) { case ARMII::AddrMode3: case ARMII::AddrModeT2_i8: @@ -765,6 +724,7 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, SmallVector<unsigned, 4> UnspilledCS1GPRs; SmallVector<unsigned, 4> UnspilledCS2GPRs; ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); // Spill R4 if Thumb2 function requires stack realignment - it will be used as // scratch register. @@ -780,7 +740,6 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Don't spill FP if the frame can be eliminated. This is determined // by scanning the callee-save registers to see if any is used. const unsigned *CSRegs = getCalleeSavedRegs(); - const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses(); for (unsigned i = 0; CSRegs[i]; ++i) { unsigned Reg = CSRegs[i]; bool Spilled = false; @@ -798,50 +757,50 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } } - if (CSRegClasses[i] == ARM::GPRRegisterClass || - CSRegClasses[i] == ARM::tGPRRegisterClass) { - if (Spilled) { - NumGPRSpills++; + if (!ARM::GPRRegisterClass->contains(Reg)) + continue; - if (!STI.isTargetDarwin()) { - if (Reg == ARM::LR) - LRSpilled = true; - CS1Spilled = true; - continue; - } + if (Spilled) { + NumGPRSpills++; - // Keep track if LR and any of R4, R5, R6, and R7 is spilled. - switch (Reg) { - case ARM::LR: + if (!STI.isTargetDarwin()) { + if (Reg == ARM::LR) LRSpilled = true; - // Fallthrough - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - CS1Spilled = true; - break; - default: - break; - } - } else { - if (!STI.isTargetDarwin()) { - UnspilledCS1GPRs.push_back(Reg); - continue; - } + CS1Spilled = true; + continue; + } - switch (Reg) { - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - case ARM::LR: - UnspilledCS1GPRs.push_back(Reg); - break; - default: - UnspilledCS2GPRs.push_back(Reg); - break; - } + // Keep track if LR and any of R4, R5, R6, and R7 is spilled. 
+ switch (Reg) { + case ARM::LR: + LRSpilled = true; + // Fallthrough + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + CS1Spilled = true; + break; + default: + break; + } + } else { + if (!STI.isTargetDarwin()) { + UnspilledCS1GPRs.push_back(Reg); + continue; + } + + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + UnspilledCS1GPRs.push_back(Reg); + break; + default: + UnspilledCS2GPRs.push_back(Reg); + break; } } } @@ -862,9 +821,16 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // offset, make sure a register (or a spill slot) is available for the // register scavenger. Note that if we're indexing off the frame pointer, the // effective stack size is 4 bytes larger since the FP points to the stack - // slot of the previous FP. - bool BigStack = RS && - estimateStackSize(MF) + (hasFP(MF) ? 4 : 0) >= estimateRSStackSizeLimit(MF); + // slot of the previous FP. Also, if we have variable sized objects in the + // function, stack slot references will often be negative, and some of + // our instructions are positive-offset only, so conservatively consider + // that case to want a spill slot (or register) as well. + // FIXME: We could add logic to be more precise about negative offsets + // and which instructions will need a scratch register for them. Is it + // worth the effort and added fragility? + bool BigStack = + (RS && (estimateStackSize(MF) + (hasFP(MF) ? 4:0) >= + estimateRSStackSizeLimit(MF))) || MFI->hasVarSizedObjects(); bool ExtraCSSpill = false; if (BigStack || !CanEliminateFrame || cannotEliminateFrame(MF)) { @@ -957,7 +923,6 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // note: Thumb1 functions spill to R12, not the stack. Reserve a slot // closest to SP or frame pointer. const TargetRegisterClass *RC = ARM::GPRRegisterClass; - MachineFrameInfo *MFI = MF.getFrameInfo(); RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false)); @@ -1622,6 +1587,7 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = prior(MBB.end()); assert(MBBI->getDesc().isReturn() && "Can only insert epilog into returning blocks"); + unsigned RetOpcode = MBBI->getOpcode(); DebugLoc dl = MBBI->getDebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -1696,6 +1662,39 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea1Size()); } + if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND || + RetOpcode == ARM::TCRETURNri || RetOpcode == ARM::TCRETURNriND) { + // Tail call return: adjust the stack pointer and jump to callee. + MBBI = prior(MBB.end()); + MachineOperand &JumpTarget = MBBI->getOperand(0); + + // Jump to label or value in register. + if (RetOpcode == ARM::TCRETURNdi) { + BuildMI(MBB, MBBI, dl, + TII.get(STI.isThumb() ? ARM::TAILJMPdt : ARM::TAILJMPd)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + } else if (RetOpcode == ARM::TCRETURNdiND) { + BuildMI(MBB, MBBI, dl, + TII.get(STI.isThumb() ? ARM::TAILJMPdNDt : ARM::TAILJMPdND)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + } else if (RetOpcode == ARM::TCRETURNri) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPr)). 
+ addReg(JumpTarget.getReg(), RegState::Kill); + } else if (RetOpcode == ARM::TCRETURNriND) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPrND)). + addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = prior(MBBI); + for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) + NewMI->addOperand(MBBI->getOperand(i)); + + // Delete the pseudo instruction TCRETURN. + MBB.erase(MBBI); + } + if (VARegSaveSize) emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize); } diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index 2c9c82d..f7ee0d5 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -69,9 +69,6 @@ public: /// Code Generation virtual methods... const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; - const TargetRegisterClass* const* - getCalleeSavedRegClasses(const MachineFunction *MF = 0) const; - BitVector getReservedRegs(const MachineFunction &MF) const; /// getMatchingSuperRegClass - Return a subclass of the specified register @@ -81,14 +78,15 @@ public: getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned Idx) const; - /// canCombinedSubRegIndex - Given a register class and a list of sub-register - /// indices, return true if it's possible to combine the sub-register indices - /// into one that corresponds to a larger sub-register. Return the new sub- - /// register index by reference. Note the new index by be zero if the given - /// sub-registers combined to form the whole register. - virtual bool canCombinedSubRegIndex(const TargetRegisterClass *RC, - SmallVectorImpl<unsigned> &SubIndices, - unsigned &NewSubIdx) const; + /// canCombineSubRegIndices - Given a register class and a list of + /// subregister indices, return true if it's possible to combine the + /// subregister indices into one that corresponds to a larger + /// subregister. Return the new subregister index by reference. Note the + /// new index may be zero if the given subregisters can be combined to + /// form the whole register. 
+ virtual bool canCombineSubRegIndices(const TargetRegisterClass *RC, + SmallVectorImpl<unsigned> &SubIndices, + unsigned &NewSubIdx) const; const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; @@ -150,8 +148,8 @@ public: virtual bool canSimplifyCallFramePseudos(MachineFunction &MF) const; virtual void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; virtual unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, FrameIndexValue *Value = NULL, diff --git a/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp index f2730fc..7895cb0 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp @@ -55,6 +55,7 @@ namespace { const std::vector<MachineConstantPoolEntry> *MCPEs; const std::vector<MachineJumpTableEntry> *MJTEs; bool IsPIC; + bool IsThumb; void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<MachineModuleInfo>(); @@ -67,8 +68,8 @@ namespace { : MachineFunctionPass(&ID), JTI(0), II((const ARMInstrInfo *)tm.getInstrInfo()), TD(tm.getTargetData()), TM(tm), - MCE(mce), MCPEs(0), MJTEs(0), - IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} + MCE(mce), MCPEs(0), MJTEs(0), + IsPIC(TM.getRelocationModel() == Reloc::PIC_), IsThumb(false) {} /// getBinaryCodeForInstr - This function, generated by the /// CodeEmitterGenerator using TableGen, produces the binary encoding for @@ -139,6 +140,12 @@ namespace { void emitMiscInstruction(const MachineInstr &MI); + void emitNEONLaneInstruction(const MachineInstr &MI); + void emitNEONDupInstruction(const MachineInstr &MI); + void emitNEON1RegModImmInstruction(const MachineInstr &MI); + void emitNEON2RegInstruction(const MachineInstr &MI); + void emitNEON3RegInstruction(const MachineInstr &MI); + /// getMachineOpValue - Return binary encoding of operand. If the machine /// operand requires relocation, record the relocation and return zero. unsigned getMachineOpValue(const MachineInstr &MI,const MachineOperand &MO); @@ -147,7 +154,8 @@ namespace { } /// getMovi32Value - Return binary encoding of operand for movw/movt. If the - /// machine operand requires relocation, record the relocation and return zero. + /// machine operand requires relocation, record the relocation and return + /// zero. unsigned getMovi32Value(const MachineInstr &MI,const MachineOperand &MO, unsigned Reloc); unsigned getMovi32Value(const MachineInstr &MI, unsigned OpIdx, @@ -193,6 +201,7 @@ bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) { MJTEs = 0; if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables(); IsPIC = TM.getRelocationModel() == Reloc::PIC_; + IsThumb = MF.getInfo<ARMFunctionInfo>()->isThumbFunction(); JTI->Initialize(MF, IsPIC); MMI = &getAnalysis<MachineModuleInfo>(); MCE.setModuleInfo(MMI); @@ -347,7 +356,7 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) { MCE.processDebugLoc(MI.getDebugLoc(), true); - NumEmitted++; // Keep track of the # of mi's emitted + ++NumEmitted; // Keep track of the # of mi's emitted switch (MI.getDesc().TSFlags & ARMII::FormMask) { default: { llvm_unreachable("Unhandled instruction encoding format!"); @@ -407,6 +416,23 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) { case ARMII::VFPMiscFrm: emitMiscInstruction(MI); break; + // NEON instructions. 
+ case ARMII::NGetLnFrm: + case ARMII::NSetLnFrm: + emitNEONLaneInstruction(MI); + break; + case ARMII::NDupFrm: + emitNEONDupInstruction(MI); + break; + case ARMII::N1RegModImmFrm: + emitNEON1RegModImmInstruction(MI); + break; + case ARMII::N2RegFrm: + emitNEON2RegInstruction(MI); + break; + case ARMII::N3RegFrm: + emitNEON3RegInstruction(MI); + break; } MCE.processDebugLoc(MI.getDebugLoc(), false); } @@ -1539,4 +1565,144 @@ void ARMCodeEmitter::emitMiscInstruction(const MachineInstr &MI) { emitWordLE(Binary); } +static unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegD = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + RegD = ARMRegisterInfo::getRegisterNumbering(RegD); + Binary |= (RegD & 0xf) << ARMII::RegRdShift; + Binary |= ((RegD >> 4) & 1) << ARMII::D_BitShift; + return Binary; +} + +static unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegN = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + RegN = ARMRegisterInfo::getRegisterNumbering(RegN); + Binary |= (RegN & 0xf) << ARMII::RegRnShift; + Binary |= ((RegN >> 4) & 1) << ARMII::N_BitShift; + return Binary; +} + +static unsigned encodeNEONRm(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegM = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + RegM = ARMRegisterInfo::getRegisterNumbering(RegM); + Binary |= (RegM & 0xf); + Binary |= ((RegM >> 4) & 1) << ARMII::M_BitShift; + return Binary; +} + +/// convertNEONDataProcToThumb - Convert the ARM mode encoding for a NEON +/// data-processing instruction to the corresponding Thumb encoding. +static unsigned convertNEONDataProcToThumb(unsigned Binary) { + assert((Binary & 0xfe000000) == 0xf2000000 && + "not an ARM NEON data-processing instruction"); + unsigned UBit = (Binary >> 24) & 1; + return 0xef000000 | (UBit << 28) | (Binary & 0xffffff); +} + +void ARMCodeEmitter::emitNEONLaneInstruction(const MachineInstr &MI) { + unsigned Binary = getBinaryCodeForInstr(MI); + + unsigned RegTOpIdx, RegNOpIdx, LnOpIdx; + const TargetInstrDesc &TID = MI.getDesc(); + if ((TID.TSFlags & ARMII::FormMask) == ARMII::NGetLnFrm) { + RegTOpIdx = 0; + RegNOpIdx = 1; + LnOpIdx = 2; + } else { // ARMII::NSetLnFrm + RegTOpIdx = 2; + RegNOpIdx = 0; + LnOpIdx = 3; + } + + // Set the conditional execution predicate + Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift; + + unsigned RegT = MI.getOperand(RegTOpIdx).getReg(); + RegT = ARMRegisterInfo::getRegisterNumbering(RegT); + Binary |= (RegT << ARMII::RegRdShift); + Binary |= encodeNEONRn(MI, RegNOpIdx); + + unsigned LaneShift; + if ((Binary & (1 << 22)) != 0) + LaneShift = 0; // 8-bit elements + else if ((Binary & (1 << 5)) != 0) + LaneShift = 1; // 16-bit elements + else + LaneShift = 2; // 32-bit elements + + unsigned Lane = MI.getOperand(LnOpIdx).getImm() << LaneShift; + unsigned Opc1 = Lane >> 2; + unsigned Opc2 = Lane & 3; + assert((Opc1 & 3) == 0 && "out-of-range lane number operand"); + Binary |= (Opc1 << 21); + Binary |= (Opc2 << 5); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEONDupInstruction(const MachineInstr &MI) { + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= (IsThumb ? 
ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift; + + unsigned RegT = MI.getOperand(1).getReg(); + RegT = ARMRegisterInfo::getRegisterNumbering(RegT); + Binary |= (RegT << ARMII::RegRdShift); + Binary |= encodeNEONRn(MI, 0); + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEON1RegModImmInstruction(const MachineInstr &MI) { + unsigned Binary = getBinaryCodeForInstr(MI); + // Destination register is encoded in Dd. + Binary |= encodeNEONRd(MI, 0); + // Immediate fields: Op, Cmode, I, Imm3, Imm4 + unsigned Imm = MI.getOperand(1).getImm(); + unsigned Op = (Imm >> 12) & 1; + unsigned Cmode = (Imm >> 8) & 0xf; + unsigned I = (Imm >> 7) & 1; + unsigned Imm3 = (Imm >> 4) & 0x7; + unsigned Imm4 = Imm & 0xf; + Binary |= (I << 24) | (Imm3 << 16) | (Cmode << 8) | (Op << 5) | Imm4; + if (IsThumb) + Binary = convertNEONDataProcToThumb(Binary); + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEON2RegInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + unsigned Binary = getBinaryCodeForInstr(MI); + // Destination register is encoded in Dd; source register in Dm. + unsigned OpIdx = 0; + Binary |= encodeNEONRd(MI, OpIdx++); + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + Binary |= encodeNEONRm(MI, OpIdx); + if (IsThumb) + Binary = convertNEONDataProcToThumb(Binary); + // FIXME: This does not handle VDUPfdf or VDUPfqf. + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEON3RegInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + unsigned Binary = getBinaryCodeForInstr(MI); + // Destination register is encoded in Dd; source registers in Dn and Dm. + unsigned OpIdx = 0; + Binary |= encodeNEONRd(MI, OpIdx++); + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + Binary |= encodeNEONRn(MI, OpIdx++); + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + Binary |= encodeNEONRm(MI, OpIdx); + if (IsThumb) + Binary = convertNEONDataProcToThumb(Binary); + // FIXME: This does not handle VMOVDneon or VMOVQ. + emitWordLE(Binary); +} + #include "ARMGenCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index 13d8b74..65a3da6 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -337,7 +337,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { if (CPChange && ++NoCPIters > 30) llvm_unreachable("Constant Island pass failed to converge!"); DEBUG(dumpBBs()); - + // Clear NewWaterList now. If we split a block for branches, it should // appear as "new water" for the next iteration of constant pool placement. NewWaterList.clear(); @@ -361,8 +361,8 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { // After a while, this might be made debug-only, but it is not expensive. verify(MF); - // If LR has been forced spilled and no far jumps (i.e. BL) has been issued. - // Undo the spill / restore of LR if possible. + // If LR has been forced spilled and no far jump (i.e. BL) has been issued, + // undo the spill / restore of LR if possible. 
if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump()) MadeChange |= UndoLRSpillRestore(); @@ -407,7 +407,7 @@ void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF, std::vector<CPEntry> CPEs; CPEs.push_back(CPEntry(CPEMI, i)); CPEntries.push_back(CPEs); - NumCPEs++; + ++NumCPEs; DEBUG(errs() << "Moved CPI#" << i << " to end of function as #" << i << "\n"); } @@ -418,7 +418,8 @@ void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF, static bool BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. MachineFunction::iterator MBBI = MBB; - if (llvm::next(MBBI) == MBB->getParent()->end()) // Can't fall off end of function. + // Can't fall off end of function. + if (llvm::next(MBBI) == MBB->getParent()->end()) return false; MachineBasicBlock *NextBB = llvm::next(MBBI); @@ -491,6 +492,8 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, unsigned MBBSize = 0; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { + if (I->isDebugValue()) + continue; // Add instruction size to MBBSize. MBBSize += TII->GetInstSizeInBytes(I); @@ -722,7 +725,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { // correspond to anything in the source. unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B; BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB); - NumSplit++; + ++NumSplit; // Update the CFG. All succs of OrigBB are now succs of NewBB. while (!OrigBB->succ_empty()) { @@ -945,7 +948,7 @@ bool ARMConstantIslands::DecrementOldEntry(unsigned CPI, MachineInstr *CPEMI) { if (--CPE->RefCount == 0) { RemoveDeadCPEMI(CPEMI); CPE->CPEMI = NULL; - NumCPEs--; + --NumCPEs; return true; } return false; @@ -1246,7 +1249,7 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY)) .addImm(ID).addConstantPoolIndex(CPI).addImm(Size); CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1)); - NumCPEs++; + ++NumCPEs; BBOffsets[NewIsland->getNumber()] = BBOffsets[NewMBB->getNumber()]; // Compensate for .align 2 in thumb mode. @@ -1369,7 +1372,7 @@ ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br) { BBSizes[MBB->getNumber()] += 2; AdjustBBOffsetsAfter(MBB, 2); HasFarJump = true; - NumUBrFixed++; + ++NumUBrFixed; DEBUG(errs() << " Changed B to long jump " << *MI); @@ -1402,7 +1405,7 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) { MachineInstr *BMI = &MBB->back(); bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); - NumCBrFixed++; + ++NumCBrFixed; if (BMI != MI) { if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) && BMI->getOpcode() == Br.UncondBr) { @@ -1621,7 +1624,7 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { // constantpool tables? MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); if (MJTI == 0) return false; - + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { MachineInstr *MI = T2JumpTables[i]; @@ -1658,15 +1661,25 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { continue; unsigned IdxReg = MI->getOperand(1).getReg(); bool IdxRegKill = MI->getOperand(1).isKill(); + + // Scan backwards to find the instruction that defines the base + // register. Due to post-RA scheduling, we can't count on it + // immediately preceding the branch instruction. 
MachineBasicBlock::iterator PrevI = MI; - if (PrevI == MBB->begin()) + MachineBasicBlock::iterator B = MBB->begin(); + while (PrevI != B && !PrevI->definesRegister(BaseReg)) + --PrevI; + + // If for some reason we didn't find it, we can't do anything, so + // just skip this one. + if (!PrevI->definesRegister(BaseReg)) continue; - MachineInstr *AddrMI = --PrevI; + MachineInstr *AddrMI = PrevI; bool OptOk = true; - // Examine the instruction that calculate the jumptable entry address. - // If it's not the one just before the t2BR_JT, we won't delete it, then - // it's not worth doing the optimization. + // Examine the instruction that calculates the jumptable entry address. + // Make sure it only defines the base register and kills any uses + // other than the index register. for (unsigned k = 0, eee = AddrMI->getNumOperands(); k != eee; ++k) { const MachineOperand &MO = AddrMI->getOperand(k); if (!MO.isReg() || !MO.getReg()) @@ -1683,9 +1696,14 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { if (!OptOk) continue; - // The previous instruction should be a tLEApcrel or t2LEApcrelJT, we want + // Now scan back again to find the tLEApcrel or t2LEApcrelJT instruction + // that gave us the initial base register definition. + for (--PrevI; PrevI != B && !PrevI->definesRegister(BaseReg); --PrevI) + ; + + // The instruction should be a tLEApcrel or t2LEApcrelJT; we want // to delete it as well. - MachineInstr *LeaMI = --PrevI; + MachineInstr *LeaMI = PrevI; if ((LeaMI->getOpcode() != ARM::tLEApcrelJT && LeaMI->getOpcode() != ARM::t2LEApcrelJT) || LeaMI->getOperand(0).getReg() != BaseReg) @@ -1729,7 +1747,7 @@ bool ARMConstantIslands::ReorderThumb2JumpTables(MachineFunction &MF) { MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); if (MJTI == 0) return false; - + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { MachineInstr *MI = T2JumpTables[i]; @@ -1769,7 +1787,7 @@ AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { MachineFunction &MF = *BB->getParent(); - // If it's the destination block is terminated by an unconditional branch, + // If the destination block is terminated by an unconditional branch, // try to move it; otherwise, create a new block following the jump // table that branches back to the actual target. This is a very simple // heuristic. FIXME: We can definitely improve it. 
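The jump-table change above no longer assumes that the instruction computing the jump-table entry address sits immediately before the t2BR_JT; with post-RA scheduling in between, it scans backwards for the instruction that defines the base register. A minimal sketch of that scan idiom, with an invented helper name and the usual CodeGen types assumed to be in scope, looks like this:

    // Illustrative helper, not part of this commit: walk backwards from MI to
    // the start of its block and return the instruction defining BaseReg, or
    // null if no definition is found before the block start.
    static MachineInstr *findBaseRegDef(MachineBasicBlock::iterator MI,
                                        MachineBasicBlock *MBB,
                                        unsigned BaseReg) {
      MachineBasicBlock::iterator I = MI, B = MBB->begin();
      while (I != B && !I->definesRegister(BaseReg))
        --I;
      if (!I->definesRegister(BaseReg))
        return 0;            // reached the top of the block without a def
      MachineInstr *Def = I; // ilist iterator converts to MachineInstr*
      return Def;
    }

The hunk above repeats the same scan a second time to reach the tLEApcrel / t2LEApcrelJT that produced the initial base-register definition, so that it, too, can be removed when the transformation applies.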
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h index 6f4eddf..3119b54 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h @@ -15,6 +15,7 @@ #define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H #include "llvm/CodeGen/MachineConstantPool.h" +#include <cstddef> namespace llvm { diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index c87f5d7..9c62597 100644 --- a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -144,13 +144,15 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { MachineInstrBuilder Even = AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::VMOVQ)) - .addReg(EvenDst, getDefRegState(true) | getDeadRegState(DstIsDead)) - .addReg(EvenSrc, getKillRegState(SrcIsKill))); + .addReg(EvenDst, + getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(EvenSrc, getKillRegState(SrcIsKill))); MachineInstrBuilder Odd = AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::VMOVQ)) - .addReg(OddDst, getDefRegState(true) | getDeadRegState(DstIsDead)) - .addReg(OddSrc, getKillRegState(SrcIsKill))); + .addReg(OddDst, + getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(OddSrc, getKillRegState(SrcIsKill))); TransferImpOps(MI, Even, Odd); MI.eraseFromParent(); Modified = true; diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 9baef6b..c84d3ff 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "arm-isel" #include "ARM.h" #include "ARMAddressingModes.h" #include "ARMTargetMachine.h" @@ -35,11 +36,6 @@ using namespace llvm; -static cl::opt<bool> -UseRegSeq("neon-reg-sequence", cl::Hidden, - cl::desc("Use reg_sequence to model ld / st of multiple neon regs"), - cl::init(true)); - //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine /// instructions for SelectionDAG operations. @@ -147,6 +143,11 @@ private: unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); + /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, + /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be + /// generated to force the table registers to be consecutive. + SDNode *SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc); + /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM. SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned); @@ -173,24 +174,17 @@ private: char ConstraintCode, std::vector<SDValue> &OutOps); - /// PairDRegs - Form a quad register from a pair of D registers. - /// + // Form pairs of consecutive S, D, or Q registers. + SDNode *PairSRegs(EVT VT, SDValue V0, SDValue V1); SDNode *PairDRegs(EVT VT, SDValue V0, SDValue V1); - - /// PairDRegs - Form a quad register pair from a pair of Q registers. - /// SDNode *PairQRegs(EVT VT, SDValue V0, SDValue V1); - /// QuadDRegs - Form a quad register pair from a quad of D registers. - /// + // Form sequences of 4 consecutive S, D, or Q registers. 
+ SDNode *QuadSRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); SDNode *QuadDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); - - /// QuadQRegs - Form 4 consecutive Q registers. - /// SDNode *QuadQRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); - /// OctoDRegs - Form 8 consecutive D registers. - /// + // Form sequences of 8 consecutive D registers. SDNode *OctoDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3, SDValue V4, SDValue V5, SDValue V6, SDValue V7); }; @@ -544,10 +538,9 @@ bool ARMDAGToDAGISel::SelectAddrModePC(SDNode *Op, SDValue N, bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDNode *Op, SDValue N, SDValue &Base, SDValue &Offset){ // FIXME dl should come from the parent load or store, not the address - DebugLoc dl = Op->getDebugLoc(); if (N.getOpcode() != ISD::ADD) { ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N); - if (!NC || NC->getZExtValue() != 0) + if (!NC || !NC->isNullValue()) return false; Base = Offset = N; @@ -788,8 +781,9 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDNode *Op, SDValue N, if (N.getOpcode() == ISD::ADD) { if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); + // 8 bits. if (((RHSC & 0x3) == 0) && - ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) { // 8 bits. + ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) { Base = N.getOperand(0); OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); return true; @@ -798,7 +792,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDNode *Op, SDValue N, } else if (N.getOpcode() == ISD::SUB) { if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); - if (((RHSC & 0x3) == 0) && (RHSC >= 0 && RHSC < 0x400)) { // 8 bits. + // 8 bits. + if (((RHSC & 0x3) == 0) && (RHSC >= 0 && RHSC < 0x400)) { Base = N.getOperand(0); OffImm = CurDAG->getTargetConstant(-RHSC, MVT::i32); return true; @@ -960,22 +955,24 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { return NULL; } +/// PairSRegs - Form a D register from a pair of S registers. +/// +SDNode *ARMDAGToDAGISel::PairSRegs(EVT VT, SDValue V0, SDValue V1) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); +} + /// PairDRegs - Form a quad register from a pair of D registers. /// SDNode *ARMDAGToDAGISel::PairDRegs(EVT VT, SDValue V0, SDValue V1) { DebugLoc dl = V0.getNode()->getDebugLoc(); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); - if (llvm::ModelWithRegSequence()) { - const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); - } - SDValue Undef = - SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0); - SDNode *Pair = CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, dl, - VT, Undef, V0, SubReg0); - return CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, dl, - VT, SDValue(Pair, 0), V1, SubReg1); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); } /// PairQRegs - Form 4 consecutive D registers from a pair of Q registers. 
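With the reg_sequence option removed, REG_SEQUENCE is now the only way these register tuples are modeled. Here is a hedged, stand-alone sketch of the idiom (V0, V1, dl and CurDAG are assumed to be in scope, as they would be inside one of the selector methods; this is illustration, not code from the patch):

    // Sketch only: combine two v2f32 D-register values into one v4f32 Q
    // register with REG_SEQUENCE, which forces the register allocator to
    // assign consecutive D registers, then extract the low half back out.
    SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
    SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
    const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 };
    SDNode *Pair = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl,
                                          MVT::v4f32, Ops, 4);
    SDValue LowD = CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, MVT::v2f32,
                                                  SDValue(Pair, 0));

Compared with the old IMPLICIT_DEF + INSERT_SUBREG chain, a single REG_SEQUENCE describes all of the constituent registers in one node, which is what the VLD/VST, lane, and VTBL selection below build on.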
@@ -988,6 +985,19 @@ SDNode *ARMDAGToDAGISel::PairQRegs(EVT VT, SDValue V0, SDValue V1) { return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); } +/// QuadSRegs - Form 4 consecutive S registers. +/// +SDNode *ARMDAGToDAGISel::QuadSRegs(EVT VT, SDValue V0, SDValue V1, + SDValue V2, SDValue V3) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(ARM::ssub_2, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, MVT::i32); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8); +} + /// QuadDRegs - Form 4 consecutive D registers. /// SDNode *ARMDAGToDAGISel::QuadDRegs(EVT VT, SDValue V0, SDValue V1, @@ -1088,7 +1098,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, std::vector<EVT> ResTys(NumVecs, VT); ResTys.push_back(MVT::Other); SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 5); - if (!llvm::ModelWithRegSequence() || NumVecs < 2) + if (NumVecs < 2) return VLd; SDValue RegSeq; @@ -1129,24 +1139,17 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, Chain = SDValue(VLd, 2 * NumVecs); // Combine the even and odd subregs to produce the result. - if (llvm::ModelWithRegSequence()) { - if (NumVecs == 1) { - SDNode *Q = PairDRegs(VT, SDValue(VLd, 0), SDValue(VLd, 1)); - ReplaceUses(SDValue(N, 0), SDValue(Q, 0)); - } else { - SDValue QQ = SDValue(QuadDRegs(MVT::v4i64, - SDValue(VLd, 0), SDValue(VLd, 1), - SDValue(VLd, 2), SDValue(VLd, 3)), 0); - SDValue Q0 = CurDAG->getTargetExtractSubreg(ARM::qsub_0, dl, VT, QQ); - SDValue Q1 = CurDAG->getTargetExtractSubreg(ARM::qsub_1, dl, VT, QQ); - ReplaceUses(SDValue(N, 0), Q0); - ReplaceUses(SDValue(N, 1), Q1); - } + if (NumVecs == 1) { + SDNode *Q = PairDRegs(VT, SDValue(VLd, 0), SDValue(VLd, 1)); + ReplaceUses(SDValue(N, 0), SDValue(Q, 0)); } else { - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDNode *Q = PairDRegs(VT, SDValue(VLd, 2*Vec), SDValue(VLd, 2*Vec+1)); - ReplaceUses(SDValue(N, Vec), SDValue(Q, 0)); - } + SDValue QQ = SDValue(QuadDRegs(MVT::v4i64, + SDValue(VLd, 0), SDValue(VLd, 1), + SDValue(VLd, 2), SDValue(VLd, 3)), 0); + SDValue Q0 = CurDAG->getTargetExtractSubreg(ARM::qsub_0, dl, VT, QQ); + SDValue Q1 = CurDAG->getTargetExtractSubreg(ARM::qsub_1, dl, VT, QQ); + ReplaceUses(SDValue(N, 0), Q0); + ReplaceUses(SDValue(N, 1), Q1); } } else { // Otherwise, quad registers are loaded with two separate instructions, @@ -1169,37 +1172,27 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, SDNode *VLdB = CurDAG->getMachineNode(Opc, dl, ResTys, OpsB, 6); Chain = SDValue(VLdB, NumVecs+1); - if (llvm::ModelWithRegSequence()) { - SDValue V0 = SDValue(VLdA, 0); - SDValue V1 = SDValue(VLdB, 0); - SDValue V2 = SDValue(VLdA, 1); - SDValue V3 = SDValue(VLdB, 1); - SDValue V4 = SDValue(VLdA, 2); - SDValue V5 = SDValue(VLdB, 2); - SDValue V6 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), - 0) - : SDValue(VLdA, 3); - SDValue V7 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), - 0) - : SDValue(VLdB, 3); - SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V0, V1, V2, V3, - V4, V5, V6, V7), 0); - - // Extract out the 3 / 4 Q registers. 
- assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec, - dl, VT, RegSeq); - ReplaceUses(SDValue(N, Vec), Q); - } - } else { - // Combine the even and odd subregs to produce the result. - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDNode *Q = PairDRegs(VT, SDValue(VLdA, Vec), SDValue(VLdB, Vec)); - ReplaceUses(SDValue(N, Vec), SDValue(Q, 0)); - } + SDValue V0 = SDValue(VLdA, 0); + SDValue V1 = SDValue(VLdB, 0); + SDValue V2 = SDValue(VLdA, 1); + SDValue V3 = SDValue(VLdB, 1); + SDValue V4 = SDValue(VLdA, 2); + SDValue V5 = SDValue(VLdB, 2); + SDValue V6 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), 0) + : SDValue(VLdA, 3); + SDValue V7 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), 0) + : SDValue(VLdB, 3); + SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V0, V1, V2, V3, + V4, V5, V6, V7), 0); + + // Extract out the 3 / 4 Q registers. + assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec, + dl, VT, RegSeq); + ReplaceUses(SDValue(N, Vec), Q); } } ReplaceUses(SDValue(N, NumVecs), Chain); @@ -1209,7 +1202,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { - assert(NumVecs >=1 && NumVecs <= 4 && "VST NumVecs out-of-range"); + assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; @@ -1247,7 +1240,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, Ops.push_back(Align); if (is64BitVector) { - if (llvm::ModelWithRegSequence() && NumVecs >= 2) { + if (NumVecs >= 2) { SDValue RegSeq; SDValue V0 = N->getOperand(0+3); SDValue V1 = N->getOperand(1+3); @@ -1292,7 +1285,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, // Quad registers are directly supported for VST1 and VST2, // storing pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; - if (llvm::ModelWithRegSequence() && NumVecs == 2) { + if (NumVecs == 2) { // First extract the pair of Q registers. SDValue Q0 = N->getOperand(3); SDValue Q1 = N->getOperand(4); @@ -1330,76 +1323,48 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, // Otherwise, quad registers are stored with two separate instructions, // where one stores the even registers and the other stores the odd registers. - if (llvm::ModelWithRegSequence()) { - // Form the QQQQ REG_SEQUENCE. - SDValue V[8]; - for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { - V[i] = CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, - N->getOperand(Vec+3)); - V[i+1] = CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, - N->getOperand(Vec+3)); - } - if (NumVecs == 3) - V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - - SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], - V[4], V[5], V[6], V[7]), 0); - - // Store the even D registers. 
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - Ops.push_back(Reg0); // post-access address offset - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec*2, dl, - RegVT, RegSeq)); - Ops.push_back(Pred); - Ops.push_back(Reg0); // predicate register - Ops.push_back(Chain); - unsigned Opc = QOpcodes0[OpcodeIndex]; - SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); - Chain = SDValue(VStA, 1); - // Store the odd D registers. - Ops[0] = SDValue(VStA, 0); // MemAddr - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1+Vec*2, dl, - RegVT, RegSeq); - Ops[NumVecs+5] = Chain; - Opc = QOpcodes1[OpcodeIndex]; - SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); - Chain = SDValue(VStB, 1); - ReplaceUses(SDValue(N, 0), Chain); - return NULL; - } else { - Ops.push_back(Reg0); // post-access address offset - - // Store the even subregs. - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, - N->getOperand(Vec+3))); - Ops.push_back(Pred); - Ops.push_back(Reg0); // predicate register - Ops.push_back(Chain); - unsigned Opc = QOpcodes0[OpcodeIndex]; - SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); - Chain = SDValue(VStA, 1); - - // Store the odd subregs. - Ops[0] = SDValue(VStA, 0); // MemAddr - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, - N->getOperand(Vec+3)); - Ops[NumVecs+5] = Chain; - Opc = QOpcodes1[OpcodeIndex]; - SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); - Chain = SDValue(VStB, 1); - ReplaceUses(SDValue(N, 0), Chain); - return NULL; - } + // Form the QQQQ REG_SEQUENCE. + SDValue V[8]; + for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { + V[i] = CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, + N->getOperand(Vec+3)); + V[i+1] = CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, + N->getOperand(Vec+3)); + } + if (NumVecs == 3) + V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + + SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], + V[4], V[5], V[6], V[7]), 0); + + // Store the even D registers. + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + Ops.push_back(Reg0); // post-access address offset + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec*2, dl, + RegVT, RegSeq)); + Ops.push_back(Pred); + Ops.push_back(Reg0); // predicate register + Ops.push_back(Chain); + unsigned Opc = QOpcodes0[OpcodeIndex]; + SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), + MVT::Other, Ops.data(), NumVecs+6); + Chain = SDValue(VStA, 1); + + // Store the odd D registers. 
+ Ops[0] = SDValue(VStA, 0); // MemAddr + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1+Vec*2, dl, + RegVT, RegSeq); + Ops[NumVecs+5] = Chain; + Opc = QOpcodes1[OpcodeIndex]; + SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), + MVT::Other, Ops.data(), NumVecs+6); + Chain = SDValue(VStB, 1); + ReplaceUses(SDValue(N, 0), Chain); + return NULL; } SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, @@ -1421,13 +1386,11 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, // Quad registers are handled by load/store of subregs. Find the subreg info. unsigned NumElts = 0; - int SubregIdx = 0; bool Even = false; EVT RegVT = VT; if (!is64BitVector) { RegVT = GetNEONSubregVT(VT); NumElts = RegVT.getVectorNumElements(); - SubregIdx = (Lane < NumElts) ? ARM::dsub_0 : ARM::dsub_1; Even = Lane < NumElts; } @@ -1455,35 +1418,26 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned Opc = 0; if (is64BitVector) { Opc = DOpcodes[OpcodeIndex]; - if (llvm::ModelWithRegSequence()) { - SDValue RegSeq; - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - if (NumVecs == 2) { - RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); - } else { - SDValue V2 = N->getOperand(2+3); - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); - } - - // Now extract the D registers back out. - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, - RegSeq)); - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, - RegSeq)); - if (NumVecs > 2) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT, - RegSeq)); - if (NumVecs > 3) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT, - RegSeq)); + SDValue RegSeq; + SDValue V0 = N->getOperand(0+3); + SDValue V1 = N->getOperand(1+3); + if (NumVecs == 2) { + RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); } else { - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(N->getOperand(Vec+3)); + SDValue V2 = N->getOperand(2+3); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : N->getOperand(3+3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); } + + // Now extract the D registers back out. + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, RegSeq)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, RegSeq)); + if (NumVecs > 2) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT,RegSeq)); + if (NumVecs > 3) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT,RegSeq)); } else { // Check if this is loading the even or odd subreg of a Q register. if (Lane < NumElts) { @@ -1493,31 +1447,24 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, Opc = QOpcodes1[OpcodeIndex]; } - if (llvm::ModelWithRegSequence()) { - SDValue RegSeq; - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - if (NumVecs == 2) { - RegSeq = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); - } else { - SDValue V2 = N->getOperand(2+3); - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); - RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); - } - - // Extract the subregs of the input vector. - unsigned SubIdx = Even ? 
ARM::dsub_0 : ARM::dsub_1; - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(SubIdx+Vec*2, dl, RegVT, - RegSeq)); + SDValue RegSeq; + SDValue V0 = N->getOperand(0+3); + SDValue V1 = N->getOperand(1+3); + if (NumVecs == 2) { + RegSeq = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); } else { - // Extract the subregs of the input vector. - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(SubregIdx, dl, RegVT, - N->getOperand(Vec+3))); + SDValue V2 = N->getOperand(2+3); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : N->getOperand(3+3); + RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); } + + // Extract the subregs of the input vector. + unsigned SubIdx = Even ? ARM::dsub_0 : ARM::dsub_1; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(CurDAG->getTargetExtractSubreg(SubIdx+Vec*2, dl, RegVT, + RegSeq)); } Ops.push_back(getI32Imm(Lane)); Ops.push_back(Pred); @@ -1531,76 +1478,97 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, ResTys.push_back(MVT::Other); SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(),NumVecs+6); - if (llvm::ModelWithRegSequence()) { - // Form a REG_SEQUENCE to force register allocation. - SDValue RegSeq; - if (is64BitVector) { - SDValue V0 = SDValue(VLdLn, 0); - SDValue V1 = SDValue(VLdLn, 1); - if (NumVecs == 2) { - RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); - } else { - SDValue V2 = SDValue(VLdLn, 2); - // If it's a vld3, form a quad D-register but discard the last part. - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : SDValue(VLdLn, 3); - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); - } + // Form a REG_SEQUENCE to force register allocation. + SDValue RegSeq; + if (is64BitVector) { + SDValue V0 = SDValue(VLdLn, 0); + SDValue V1 = SDValue(VLdLn, 1); + if (NumVecs == 2) { + RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); } else { - // For 128-bit vectors, take the 64-bit results of the load and insert them - // as subregs into the result. - SDValue V[8]; - for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { - if (Even) { - V[i] = SDValue(VLdLn, Vec); - V[i+1] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - } else { - V[i] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - V[i+1] = SDValue(VLdLn, Vec); - } + SDValue V2 = SDValue(VLdLn, 2); + // If it's a vld3, form a quad D-register but discard the last part. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : SDValue(VLdLn, 3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + } + } else { + // For 128-bit vectors, take the 64-bit results of the load and insert + // them as subregs into the result. 
+ SDValue V[8]; + for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { + if (Even) { + V[i] = SDValue(VLdLn, Vec); + V[i+1] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + } else { + V[i] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + V[i+1] = SDValue(VLdLn, Vec); } - if (NumVecs == 3) - V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - - if (NumVecs == 2) - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V[0], V[1], V[2], V[3]), 0); - else - RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], - V[4], V[5], V[6], V[7]), 0); } + if (NumVecs == 3) + V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); - unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - ReplaceUses(SDValue(N, Vec), - CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, RegSeq)); - ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, NumVecs)); - return NULL; - } - - // For a 64-bit vector load to D registers, nothing more needs to be done. - if (is64BitVector) - return VLdLn; - - // For 128-bit vectors, take the 64-bit results of the load and insert them - // as subregs into the result. - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue QuadVec = CurDAG->getTargetInsertSubreg(SubregIdx, dl, VT, - N->getOperand(Vec+3), - SDValue(VLdLn, Vec)); - ReplaceUses(SDValue(N, Vec), QuadVec); + if (NumVecs == 2) + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V[0], V[1], V[2], V[3]), 0); + else + RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], + V[4], V[5], V[6], V[7]), 0); } - Chain = SDValue(VLdLn, NumVecs); - ReplaceUses(SDValue(N, NumVecs), Chain); + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, RegSeq)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, NumVecs)); return NULL; } +SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, + unsigned Opc) { + assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range"); + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + unsigned FirstTblReg = IsExt ? 2 : 1; + + // Form a REG_SEQUENCE to force register allocation. + SDValue RegSeq; + SDValue V0 = N->getOperand(FirstTblReg + 0); + SDValue V1 = N->getOperand(FirstTblReg + 1); + if (NumVecs == 2) + RegSeq = SDValue(PairDRegs(MVT::v16i8, V0, V1), 0); + else { + SDValue V2 = N->getOperand(FirstTblReg + 2); + // If it's a vtbl3, form a quad D-register and leave the last part as + // an undef. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) + : N->getOperand(FirstTblReg + 3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + } + + // Now extract the D registers back out. 
+ SmallVector<SDValue, 6> Ops; + if (IsExt) + Ops.push_back(N->getOperand(1)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, RegSeq)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, RegSeq)); + if (NumVecs > 2) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT, RegSeq)); + if (NumVecs > 3) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT, RegSeq)); + + Ops.push_back(N->getOperand(FirstTblReg + NumVecs)); + Ops.push_back(getAL(CurDAG)); // predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register + return CurDAG->getMachineNode(Opc, dl, VT, Ops.data(), Ops.size()); +} + SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned) { if (!Subtarget->hasV6T2Ops()) @@ -1954,8 +1922,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); if (Subtarget->isThumb()) { - SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 5); + SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; + return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 6); } else { SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7); @@ -2015,7 +1983,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops,4); + return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32,Ops,4); } else { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), @@ -2029,7 +1997,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { if (Subtarget->isThumb()) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops,4); + return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32,Ops,4); } else { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), @@ -2211,6 +2179,22 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4); } + case ARMISD::BUILD_VECTOR: { + EVT VecVT = N->getValueType(0); + EVT EltVT = VecVT.getVectorElementType(); + unsigned NumElts = VecVT.getVectorNumElements(); + if (EltVT.getSimpleVT() == MVT::f64) { + assert(NumElts == 2 && "unexpected type for BUILD_VECTOR"); + return PairDRegs(VecVT, N->getOperand(0), N->getOperand(1)); + } + assert(EltVT.getSimpleVT() == MVT::f32 && + "unexpected type for BUILD_VECTOR"); + if (NumElts == 2) + return PairSRegs(VecVT, N->getOperand(0), N->getOperand(1)); + assert(NumElts == 4 && "unexpected type for BUILD_VECTOR"); + return QuadSRegs(VecVT, N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3)); + } case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: { @@ -2342,6 +2326,29 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { break; } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + break; + + case Intrinsic::arm_neon_vtbl2: + return SelectVTBL(N, false, 2, ARM::VTBL2); + 
case Intrinsic::arm_neon_vtbl3: + return SelectVTBL(N, false, 3, ARM::VTBL3); + case Intrinsic::arm_neon_vtbl4: + return SelectVTBL(N, false, 4, ARM::VTBL4); + + case Intrinsic::arm_neon_vtbx2: + return SelectVTBL(N, true, 2, ARM::VTBX2); + case Intrinsic::arm_neon_vtbx3: + return SelectVTBL(N, true, 3, ARM::VTBX3); + case Intrinsic::arm_neon_vtbx4: + return SelectVTBL(N, true, 4, ARM::VTBX4); + } + break; + } + case ISD::CONCAT_VECTORS: return SelectConcatVector(N); } @@ -2367,9 +2374,3 @@ FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel) { return new ARMDAGToDAGISel(TM, OptLevel); } - -/// ModelWithRegSequence - Return true if isel should use REG_SEQUENCE to model -/// operations involving sub-registers. -bool llvm::ModelWithRegSequence() { - return UseRegSeq; -} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index b8126a3..0091df7 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "arm-isel" #include "ARM.h" #include "ARMAddressingModes.h" #include "ARMConstantPoolValue.h" @@ -40,6 +41,7 @@ #include "llvm/MC/MCSectionMachO.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/VectorExtras.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -47,9 +49,27 @@ #include <sstream> using namespace llvm; +STATISTIC(NumTailCalls, "Number of tail calls"); + +// This option should go away when tail calls fully work. +static cl::opt<bool> +EnableARMTailCalls("arm-tail-calls", cl::Hidden, + cl::desc("Generate tail calls (TEMPORARY OPTION)."), + cl::init(true)); + static cl::opt<bool> EnableARMLongCalls("arm-long-calls", cl::Hidden, - cl::desc("Generate calls via indirect call instructions."), + cl::desc("Generate calls via indirect call instructions"), + cl::init(false)); + +static cl::opt<bool> +ARMInterworking("arm-interworking", cl::Hidden, + cl::desc("Enable / disable ARM interworking (for debugging only)"), + cl::init(true)); + +static cl::opt<bool> +EnableARMCodePlacement("arm-code-placement", cl::Hidden, + cl::desc("Enable code placement pass for ARM"), cl::init(false)); static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, @@ -94,10 +114,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, } setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); - if (llvm::ModelWithRegSequence()) - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); - else - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); @@ -393,13 +410,57 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // doesn't yet know how to not do that for SjLj. setExceptionSelectorRegister(ARM::R0); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); - setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); - - // If the subtarget does not have extract instructions, sign_extend_inreg - // needs to be expanded. 
Extract is available in ARM mode on v6 and up, - // and on most Thumb2 implementations. - if ((!Subtarget->isThumb() && !Subtarget->hasV6Ops()) - || (Subtarget->isThumb2() && !Subtarget->hasT2ExtractPack())) { + // Handle atomics directly for ARMv[67] (except for Thumb1), otherwise + // use the default expansion. + bool canHandleAtomics = + (Subtarget->hasV7Ops() || + (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only())); + if (canHandleAtomics) { + // membarrier needs custom lowering; the rest are legal and handled + // normally. + setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); + } else { + // Set them all for expansion, which will force libcalls. + setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); + // Since the libcalls include locking, fold in the fences + setShouldFoldAtomicFences(true); + } + // 64-bit versions are always libcalls (for now) + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Expand); + + // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. + if (!Subtarget->hasV6Ops()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); } @@ -412,8 +473,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // We want to custom lower some of our intrinsics. 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); - setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + if (Subtarget->isTargetDarwin()) { + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + } setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); @@ -474,28 +537,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) else setSchedulingPreference(Sched::Hybrid); - // FIXME: If-converter should use instruction latency to determine - // profitability rather than relying on fixed limits. - if (Subtarget->getCPUString() == "generic") { - // Generic (and overly aggressive) if-conversion limits. - setIfCvtBlockSizeLimit(10); - setIfCvtDupBlockSizeLimit(2); - } else if (Subtarget->hasV7Ops()) { - setIfCvtBlockSizeLimit(3); - setIfCvtDupBlockSizeLimit(1); - } else if (Subtarget->hasV6Ops()) { - setIfCvtBlockSizeLimit(2); - setIfCvtDupBlockSizeLimit(1); - } else { - setIfCvtBlockSizeLimit(3); - setIfCvtDupBlockSizeLimit(2); - } - maxStoresPerMemcpy = 1; //// temporary - rewrite interface to use type - // Do not enable CodePlacementOpt for now: it currently runs after the - // ARMConstantIslandPass and messes up branch relaxation and placement - // of constant islands. - // benefitFromCodePlacementOpt = true; + + // On ARM arguments smaller than 4 bytes are extended, so all arguments + // are at least 4 bytes aligned. + setMinStackArgumentAlignment(4); + + if (EnableARMCodePlacement) + benefitFromCodePlacementOpt = true; } const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -516,6 +565,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::CMPZ: return "ARMISD::CMPZ"; case ARMISD::CMPFP: return "ARMISD::CMPFP"; case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; + case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; case ARMISD::CMOV: return "ARMISD::CMOV"; case ARMISD::CNEG: return "ARMISD::CNEG"; @@ -537,6 +587,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; + case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; @@ -572,6 +624,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; + case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; + case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; case ARMISD::VDUP: return "ARMISD::VDUP"; case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; case ARMISD::VEXT: return "ARMISD::VEXT"; @@ -581,6 +635,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VZIP: return "ARMISD::VZIP"; case ARMISD::VUZP: return "ARMISD::VUZP"; case ARMISD::VTRN: return "ARMISD::VTRN"; + case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::FMAX: return "ARMISD::FMAX"; case ARMISD::FMIN: return "ARMISD::FMIN"; } @@ -603,15 +658,33 @@ TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const { /// getFunctionAlignment - Return the Log2 alignment of this function. 
unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const { - return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 0 : 1; + return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 1 : 2; } Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { - for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { + unsigned NumVals = N->getNumValues(); + if (!NumVals) + return Sched::RegPressure; + + for (unsigned i = 0; i != NumVals; ++i) { EVT VT = N->getValueType(i); if (VT.isFloatingPoint() || VT.isVector()) return Sched::Latency; } + + if (!N->isMachineOpcode()) + return Sched::RegPressure; + + // Load are scheduled for latency even if there instruction itinerary + // is not available. + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrDesc &TID = TII->get(N->getMachineOpcode()); + if (TID.mayLoad()) + return Sched::Latency; + + const InstrItineraryData &Itins = getTargetMachine().getInstrItineraryData(); + if (!Itins.isEmpty() && Itins.getStageLatency(TID.getSchedClass()) > 2) + return Sched::Latency; return Sched::RegPressure; } @@ -964,11 +1037,28 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - // ARM target does not yet support tail call optimization. - isTailCall = false; + MachineFunction &MF = DAG.getMachineFunction(); + bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool IsSibCall = false; + // Temporarily disable tail calls so things don't break. + if (!EnableARMTailCalls) + isTailCall = false; + if (isTailCall) { + // Check if it's really possible to do a tail call. + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), + Outs, OutVals, Ins, DAG); + // We don't support GuaranteedTailCallOpt for ARM, only automatically + // detected sibcalls. + if (isTailCall) { + ++NumTailCalls; + IsSibCall = true; + } + } // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; @@ -981,9 +1071,14 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); + // For tail calls, memory operands are available in our caller's stack. + if (IsSibCall) + NumBytes = 0; + // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + if (!IsSibCall) + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); @@ -996,7 +1091,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = Outs[realArgIdx].Val; + SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; // Promote the value if needed. 
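One hunk above is easy to misread: getFunctionAlignment returns a log2 value, so the change raises Thumb functions from 1-byte to 2-byte alignment and ARM functions from 2-byte to 4-byte alignment. A throwaway illustration (the helper is invented for this note and assumes the usual ARM target headers):

    // Illustration only: turn the hook's log2 result into bytes.
    // After this patch: Thumb -> 1u << 1 == 2, ARM -> 1u << 2 == 4.
    static unsigned functionAlignmentInBytes(const ARMTargetLowering &TLI,
                                             const Function *F) {
      return 1u << TLI.getFunctionAlignment(F);
    }

Since the new switches above are cl::opt flags, they can also be toggled from llc for experimentation, e.g. -arm-tail-calls=false, -arm-interworking=false, or -arm-code-placement.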
@@ -1044,7 +1139,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, } } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - } else { + } else if (!IsSibCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -1059,10 +1154,32 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); + // Tail call byval lowering might overwrite argument registers so in case of + // tail call optimization the copies to registers are lowered later. + if (!isTailCall) + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + // For tail calls lower the arguments to the 'real' stack slot. + if (isTailCall) { + // Force all the incoming stack arguments to be loaded from the stack + // before any new outgoing arguments are stored to the stack, because the + // outgoing stack slots may alias the incoming argument stack slots, and + // the alias isn't otherwise explicit. This is slightly more conservative + // than necessary, because it means that each store effectively depends + // on every argument instead of just those arguments it would clobber. + + // Do not flag preceeding copytoreg stuff together with the following stuff. + InFlag = SDValue(); + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + InFlag =SDValue(); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every @@ -1071,7 +1188,6 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, bool isDirect = false; bool isARMFunc = false; bool isLocalARMFunc = false; - MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); if (EnableARMLongCalls) { @@ -1117,7 +1233,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, getTargetMachine().getRelocationModel() != Reloc::Static; isARMFunc = !Subtarget->isThumb() || isStub; // ARM call to a local ARM function is predicable. - isLocalARMFunc = !Subtarget->isThumb() && !isExt; + isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking); // tBX takes a register source operand. if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); @@ -1134,7 +1250,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, Callee = DAG.getNode(ARMISD::PIC_ADD, dl, getPointerTy(), Callee, PICLabel); } else - Callee = DAG.getTargetGlobalAddress(GV, getPointerTy()); + Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { isDirect = true; bool isStub = Subtarget->isTargetDarwin() && @@ -1171,11 +1287,6 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, ? (isLocalARMFunc ? 
ARMISD::CALL_PRED : ARMISD::CALL) : ARMISD::CALL_NOLINK; } - if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb1Only()) { - // implicit def LR - LR mustn't be allocated as GRP:$dst of CALL_NOLINK - Chain = DAG.getCopyToReg(Chain, dl, ARM::LR, DAG.getUNDEF(MVT::i32),InFlag); - InFlag = Chain.getValue(1); - } std::vector<SDValue> Ops; Ops.push_back(Chain); @@ -1189,9 +1300,13 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (InFlag.getNode()) Ops.push_back(InFlag); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + if (isTailCall) + return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + // Returns a chain and a flag for retval copy to use. - Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag), - &Ops[0], Ops.size()); + Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size()); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), @@ -1205,10 +1320,203 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, dl, DAG, InVals); } +/// MatchingStackOffset - Return true if the given stack call argument is +/// already available in the same position (relatively) of the caller's +/// incoming argument stack. +static +bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, + MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, + const ARMInstrInfo *TII) { + unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + int FI = INT_MAX; + if (Arg.getOpcode() == ISD::CopyFromReg) { + unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); + if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) + return false; + MachineInstr *Def = MRI->getVRegDef(VR); + if (!Def) + return false; + if (!Flags.isByVal()) { + if (!TII->isLoadFromStackSlot(Def, FI)) + return false; + } else { + return false; + } + } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { + if (Flags.isByVal()) + // ByVal argument is passed in as a pointer but it's now being + // dereferenced. e.g. + // define @foo(%struct.X* %A) { + // tail call @bar(%struct.X* byval %A) + // } + return false; + SDValue Ptr = Ld->getBasePtr(); + FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); + if (!FINode) + return false; + FI = FINode->getIndex(); + } else + return false; + + assert(FI != INT_MAX); + if (!MFI->isFixedObjectIndex(FI)) + return false; + return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); +} + +/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// for tail call optimization. Targets which want to do tail call +/// optimization should implement this function. +bool +ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const { + const Function *CallerF = DAG.getMachineFunction().getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. + + // Do not sibcall optimize vararg calls unless the call site is not passing + // any arguments. 
+ if (isVarArg && !Outs.empty()) + return false; + + // Also avoid sibcall optimization if either caller or callee uses struct + // return semantics. + if (isCalleeStructRet || isCallerStructRet) + return false; + + // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo:: + // emitEpilogue is not ready for them. + // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take + // LR. This means if we need to reload LR, it takes an extra instructions, + // which outweighs the value of the tail call; but here we don't know yet + // whether LR is going to be used. Probably the right approach is to + // generate the tail call here and turn it back into CALL/RET in + // emitEpilogue if LR is used. + if (Subtarget->isThumb1Only()) + return false; + + // For the moment, we can only do this to functions defined in this + // compilation, or to indirect calls. A Thumb B to an ARM function, + // or vice versa, is not easily fixed up in the linker unlike BL. + // (We could do this by loading the address of the callee into a register; + // that is an extra instruction over the direct call and burns a register + // as well, so is not likely to be a win.) + + // It might be safe to remove this restriction on non-Darwin. + + // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, + // but we need to make sure there are enough registers; the only valid + // registers are the 4 used for parameters. We don't currently do this + // case. + if (isa<ExternalSymbolSDNode>(Callee)) + return false; + + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + if (GV->isDeclaration() || GV->isWeakForLinker()) + return false; + } + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector<CCValAssign, 16> RVLocs1; + CCState CCInfo1(CalleeCC, false, getTargetMachine(), + RVLocs1, *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg)); + + SmallVector<CCValAssign, 16> RVLocs2; + CCState CCInfo2(CallerCC, false, getTargetMachine(), + RVLocs2, *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg)); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // If the callee takes no arguments then go on to check the results of the + // call. + if (!Outs.empty()) { + // Check if stack adjustment is needed. For now, do not do this if any + // argument is passed on the stack. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), + ArgLocs, *DAG.getContext()); + CCInfo.AnalyzeCallOperands(Outs, + CCAssignFnForNode(CalleeCC, false, isVarArg)); + if (CCInfo.getNextStackOffset()) { + MachineFunction &MF = DAG.getMachineFunction(); + + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. 
+ MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const ARMInstrInfo *TII = + ((ARMTargetMachine&)getTargetMachine()).getInstrInfo(); + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); + i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + EVT RegVT = VA.getLocVT(); + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + if (VA.getLocInfo() == CCValAssign::Indirect) + return false; + if (VA.needsCustom()) { + // f64 and vector types are split into multiple registers or + // register/stack-slot combinations. The types will not match + // the registers; give up on memory f64 refs until we figure + // out what to do about this. + if (!VA.isRegLoc()) + return false; + if (!ArgLocs[++i].isRegLoc()) + return false; + if (RegVT == MVT::v2f64) { + if (!ArgLocs[++i].isRegLoc()) + return false; + if (!ArgLocs[++i].isRegLoc()) + return false; + } + } else if (!VA.isRegLoc()) { + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, + MFI, MRI, TII)) + return false; + } + } + } + } + + return true; +} + SDValue ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location. @@ -1239,7 +1547,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Arg = Outs[realRVLocIdx].Val; + SDValue Arg = OutVals[realRVLocIdx]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); @@ -1477,7 +1785,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, // pair. This is always cheaper. if (Subtarget->useMovt()) { return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, - DAG.getTargetGlobalAddress(GV, PtrVT)); + DAG.getTargetGlobalAddress(GV, dl, PtrVT)); } else { SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); @@ -1552,9 +1860,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - SDValue Val = Subtarget->isThumb() ? 
- DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::SP, MVT::i32) : - DAG.getConstant(0, MVT::i32); + SDValue Val = DAG.getConstant(0, MVT::i32); return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1), Val); } @@ -1568,8 +1874,7 @@ ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) - const { + const ARMSubtarget *Subtarget) const { unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); DebugLoc dl = Op.getDebugLoc(); switch (IntNo) { @@ -1597,7 +1902,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, PseudoSourceValue::getConstantPool(), 0, false, false, 0); - SDValue Chain = Result.getValue(1); if (RelocM == Reloc::PIC_) { SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); @@ -1609,25 +1913,21 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, } static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { + const ARMSubtarget *Subtarget) { DebugLoc dl = Op.getDebugLoc(); SDValue Op5 = Op.getOperand(5); - SDValue Res; unsigned isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue(); - if (isDeviceBarrier) { - if (Subtarget->hasV7Ops()) - Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0)); - else - Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); - } else { - if (Subtarget->hasV7Ops()) - Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); - else - Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); - } - return Res; + // v6 and v7 can both handle barriers directly, but need handled a bit + // differently. Thumb1 and pre-v6 ARM mode use a libcall instead and should + // never get here. + unsigned Opc = isDeviceBarrier ? ARMISD::SYNCBARRIER : ARMISD::MEMBARRIER; + if (Subtarget->hasV7Ops()) + return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0)); + else if (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only()) + return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(0, MVT::i32)); + assert(0 && "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); + return SDValue(); } static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { @@ -1712,7 +2012,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue ArgValue2; if (NextVA.isMemLoc()) { MachineFrameInfo *MFI = MF.getFrameInfo(); - int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true, false); + int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true); // Create load node to retrieve arguments from the stack. 
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); @@ -1768,8 +2068,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, VA = ArgLocs[++i]; // skip ahead to next loc SDValue ArgValue2; if (VA.isMemLoc()) { - int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), - true, false); + int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, PseudoSourceValue::getFixedStack(FI), 0, @@ -1836,8 +2135,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); unsigned ArgSize = VA.getLocVT().getSizeInBits()/8; - int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), - true, false); + int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), true); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); @@ -1868,7 +2166,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, AFI->setVarArgsFrameIndex( MFI->CreateFixedObject(VARegSaveSize, ArgOffset + VARegSaveSize - VARegSize, - true, false)); + true)); SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), getPointerTy()); @@ -1884,8 +2182,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - PseudoSourceValue::getFixedStack(AFI->getVarArgsFrameIndex()), 0, - false, false, 0); + PseudoSourceValue::getFixedStack(AFI->getVarArgsFrameIndex()), + 0, false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, DAG.getConstant(4, getPointerTy())); @@ -1895,8 +2193,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, &MemOps[0], MemOps.size()); } else // This will point to the next argument passed via stack. - AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, - true, false)); + AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true)); } return Chain; @@ -1922,7 +2219,7 @@ static bool isFloatingPointZero(SDValue Op) { /// the given operands. SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &ARMCC, SelectionDAG &DAG, + SDValue &ARMcc, SelectionDAG &DAG, DebugLoc dl) const { if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { unsigned C = RHSC->getZExtValue(); @@ -1974,13 +2271,14 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, CompareType = ARMISD::CMPZ; break; } - ARMCC = DAG.getConstant(CondCode, MVT::i32); + ARMcc = DAG.getConstant(CondCode, MVT::i32); return DAG.getNode(CompareType, dl, MVT::Flag, LHS, RHS); } /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 
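/// A compare against +0.0 uses the one-operand CMPFPw0 form instead of CMPFP;
/// the trailing FMSTAT then copies the VFP status flags into CPSR so the
/// ordinary conditional instructions can test them.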
-static SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, - DebugLoc dl) { +SDValue +ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, + DebugLoc dl) const { SDValue Cmp; if (!isFloatingPointZero(RHS)) Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Flag, LHS, RHS); @@ -1999,59 +2297,184 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); if (LHS.getValueType() == MVT::i32) { - SDValue ARMCC; + SDValue ARMcc; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); - SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl); - return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC, CCR,Cmp); + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp); } ARMCC::CondCodes CondCode, CondCode2; FPCCToARMCC(CC, CondCode, CondCode2); - SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32); - SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, - ARMCC, CCR, Cmp); + ARMcc, CCR, Cmp); if (CondCode2 != ARMCC::AL) { - SDValue ARMCC2 = DAG.getConstant(CondCode2, MVT::i32); + SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32); // FIXME: Needs another CMP because flag can have but one use. SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); Result = DAG.getNode(ARMISD::CMOV, dl, VT, - Result, TrueVal, ARMCC2, CCR, Cmp2); + Result, TrueVal, ARMcc2, CCR, Cmp2); } return Result; } +/// canChangeToInt - Given the fp compare operand, return true if it is suitable +/// to morph to an integer compare sequence. +static bool canChangeToInt(SDValue Op, bool &SeenZero, + const ARMSubtarget *Subtarget) { + SDNode *N = Op.getNode(); + if (!N->hasOneUse()) + // Otherwise it requires moving the value from fp to integer registers. + return false; + if (!N->getNumValues()) + return false; + EVT VT = Op.getValueType(); + if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) + // f32 case is generally profitable. f64 case only makes sense when vcmpe + + // vmrs are very slow, e.g. cortex-a8. 
+ return false; + + if (isFloatingPointZero(Op)) { + SeenZero = true; + return true; + } + return ISD::isNormalLoad(N); +} + +static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { + if (isFloatingPointZero(Op)) + return DAG.getConstant(0, MVT::i32); + + if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) + return DAG.getLoad(MVT::i32, Op.getDebugLoc(), + Ld->getChain(), Ld->getBasePtr(), + Ld->getSrcValue(), Ld->getSrcValueOffset(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->getAlignment()); + + llvm_unreachable("Unknown VFP cmp argument!"); +} + +static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, + SDValue &RetVal1, SDValue &RetVal2) { + if (isFloatingPointZero(Op)) { + RetVal1 = DAG.getConstant(0, MVT::i32); + RetVal2 = DAG.getConstant(0, MVT::i32); + return; + } + + if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { + SDValue Ptr = Ld->getBasePtr(); + RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(), + Ld->getChain(), Ptr, + Ld->getSrcValue(), Ld->getSrcValueOffset(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->getAlignment()); + + EVT PtrType = Ptr.getValueType(); + unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); + SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(), + PtrType, Ptr, DAG.getConstant(4, PtrType)); + RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(), + Ld->getChain(), NewPtr, + Ld->getSrcValue(), Ld->getSrcValueOffset() + 4, + Ld->isVolatile(), Ld->isNonTemporal(), + NewAlign); + return; + } + + llvm_unreachable("Unknown VFP cmp argument!"); +} + +/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some +/// f32 and even f64 comparisons to integer ones. +SDValue +ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); + DebugLoc dl = Op.getDebugLoc(); + + bool SeenZero = false; + if (canChangeToInt(LHS, SeenZero, Subtarget) && + canChangeToInt(RHS, SeenZero, Subtarget) && + // If one of the operand is zero, it's safe to ignore the NaN case. + (FiniteOnlyFPMath() || SeenZero)) { + // If unsafe fp math optimization is enabled and there are no othter uses of + // the CMP operands, and the condition code is EQ oe NE, we can optimize it + // to an integer comparison. 
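    // Conceptually, for f32 operands this turns
    //   brcond (setoeq f32 %a, %b), dest
    // into an integer compare of the raw bit patterns,
    //   brcond (seteq i32 bitcast(%a), bitcast(%b)), dest
    // which is only sound when NaN and signed-zero corner cases can be ignored,
    // hence the unsafe-fp-math / known-zero guard above. The f64 case compares
    // the two 32-bit halves through the BCC_i64 node instead.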
+ if (CC == ISD::SETOEQ) + CC = ISD::SETEQ; + else if (CC == ISD::SETUNE) + CC = ISD::SETNE; + + SDValue ARMcc; + if (LHS.getValueType() == MVT::f32) { + LHS = bitcastf32Toi32(LHS, DAG); + RHS = bitcastf32Toi32(RHS, DAG); + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, + Chain, Dest, ARMcc, CCR, Cmp); + } + + SDValue LHS1, LHS2; + SDValue RHS1, RHS2; + expandf64Toi32(LHS, DAG, LHS1, LHS2); + expandf64Toi32(RHS, DAG, RHS1, RHS2); + ARMCC::CondCodes CondCode = IntCCToARMCC(CC); + ARMcc = DAG.getConstant(CondCode, MVT::i32); + SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag); + SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; + return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7); + } + + return SDValue(); +} + SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); + SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); - SDValue LHS = Op.getOperand(2); - SDValue RHS = Op.getOperand(3); - SDValue Dest = Op.getOperand(4); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); DebugLoc dl = Op.getDebugLoc(); if (LHS.getValueType() == MVT::i32) { - SDValue ARMCC; + SDValue ARMcc; + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); - SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, - Chain, Dest, ARMCC, CCR,Cmp); + Chain, Dest, ARMcc, CCR, Cmp); } assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + + if (UnsafeFPMath && + (CC == ISD::SETEQ || CC == ISD::SETOEQ || + CC == ISD::SETNE || CC == ISD::SETUNE)) { + SDValue Result = OptimizeVFPBrcond(Op, DAG); + if (Result.getNode()) + return Result; + } + ARMCC::CondCodes CondCode, CondCode2; FPCCToARMCC(CC, CondCode, CondCode2); + SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); - SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag); - SDValue Ops[] = { Chain, Dest, ARMCC, CCR, Cmp }; + SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5); if (CondCode2 != ARMCC::AL) { - ARMCC = DAG.getConstant(CondCode2, MVT::i32); - SDValue Ops[] = { Res, Dest, ARMCC, CCR, Res.getValue(1) }; + ARMcc = DAG.getConstant(CondCode2, MVT::i32); + SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5); } return Res; @@ -2132,7 +2555,7 @@ static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(Opc, dl, VT, Op); } -static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Implement fcopysign with a fabs and a conditional fneg. 
SDValue Tmp0 = Op.getOperand(0); SDValue Tmp1 = Op.getOperand(1); @@ -2140,10 +2563,11 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT SrcVT = Tmp1.getValueType(); SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0); - SDValue Cmp = getVFPCmp(Tmp1, DAG.getConstantFP(0.0, SrcVT), DAG, dl); - SDValue ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32); + SDValue ARMcc = DAG.getConstant(ARMCC::LT, MVT::i32); + SDValue FP0 = DAG.getConstantFP(0.0, SrcVT); + SDValue Cmp = getVFPCmp(Tmp1, FP0, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); - return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp); + return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMcc, CCR, Cmp); } SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ @@ -2206,7 +2630,8 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) { DAG.getConstant(0, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(1, MVT::i32)); - return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); + return DAG.getNode(ISD::BIT_CONVERT, dl, DstVT, + DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); } // Turn f64->i64 into VMOVRRD. @@ -2221,51 +2646,18 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) { } /// getZeroVector - Returns a vector of specified type with all zero elements. -/// +/// Zero vectors are used to represent vector negation and in those cases +/// will be implemented with the NEON VNEG instruction. However, VNEG does +/// not support i64 elements, so sometimes the zero vectors will need to be +/// explicitly constructed. Regardless, use a canonical VMOV to create the +/// zero vector. static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - - // Zero vectors are used to represent vector negation and in those cases - // will be implemented with the NEON VNEG instruction. However, VNEG does - // not support i64 elements, so sometimes the zero vectors will need to be - // explicitly constructed. For those cases, and potentially other uses in - // the future, always build zero vectors as <16 x i8> or <8 x i8> bitcasted - // to their dest type. This ensures they get CSE'd. - SDValue Vec; - SDValue Cst = DAG.getTargetConstant(0, MVT::i8); - SmallVector<SDValue, 8> Ops; - MVT TVT; - - if (VT.getSizeInBits() == 64) { - Ops.assign(8, Cst); TVT = MVT::v8i8; - } else { - Ops.assign(16, Cst); TVT = MVT::v16i8; - } - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size()); - - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); -} - -/// getOnesVector - Returns a vector of specified type with all bits set. -/// -static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { - assert(VT.isVector() && "Expected a vector type"); - - // Always build ones vectors as <16 x i8> or <8 x i8> bitcasted to their - // dest type. This ensures they get CSE'd. - SDValue Vec; - SDValue Cst = DAG.getTargetConstant(0xFF, MVT::i8); - SmallVector<SDValue, 8> Ops; - MVT TVT; - - if (VT.getSizeInBits() == 64) { - Ops.assign(8, Cst); TVT = MVT::v8i8; - } else { - Ops.assign(16, Cst); TVT = MVT::v16i8; - } - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size()); - - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); + // The canonical modified immediate encoding of a zero vector is....0! + SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32); + EVT VmovVT = VT.is128BitVector() ? 
MVT::v4i32 : MVT::v2i32; + SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov); } /// LowerShiftRightParts - Lower SRA_PARTS, which returns two @@ -2279,7 +2671,7 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMCC; + SDValue ARMcc; unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); @@ -2295,9 +2687,9 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE, - ARMCC, DAG, dl); + ARMcc, DAG, dl); SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC, + SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp); SDValue Ops[2] = { Lo, Hi }; @@ -2315,7 +2707,7 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMCC; + SDValue ARMcc; assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, @@ -2329,9 +2721,9 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE, - ARMCC, DAG, dl); + ARMcc, DAG, dl); SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMCC, + SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc, CCR, Cmp); SDValue Ops[2] = { Lo, Hi }; @@ -2516,89 +2908,138 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { return Result; } -/// isVMOVSplat - Check if the specified splat value corresponds to an immediate -/// VMOV instruction, and if so, return the constant being splatted. -static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef, - unsigned SplatBitSize, SelectionDAG &DAG) { +/// isNEONModifiedImm - Check if the specified splat value corresponds to a +/// valid vector constant for a NEON instruction with a "modified immediate" +/// operand (e.g., VMOV). If so, return the encoded value. +static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, + unsigned SplatBitSize, SelectionDAG &DAG, + EVT &VT, bool is128Bits, bool isVMOV) { + unsigned OpCmode, Imm; + + // SplatBitSize is set to the smallest size that splats the vector, so a + // zero vector will always have SplatBitSize == 8. However, NEON modified + // immediate instructions others than VMOV do not support the 8-bit encoding + // of a zero vector, and the default encoding of zero is supposed to be the + // 32-bit version. + if (SplatBits == 0) + SplatBitSize = 32; + switch (SplatBitSize) { case 8: - // Any 1-byte value is OK. + if (!isVMOV) + return SDValue(); + // Any 1-byte value is OK. Op=0, Cmode=1110. assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); - return DAG.getTargetConstant(SplatBits, MVT::i8); + OpCmode = 0xe; + Imm = SplatBits; + VT = is128Bits ? MVT::v16i8 : MVT::v8i8; + break; case 16: // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 
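    // For example, splats of 0x0042 or 0x4200 are representable but 0x0101 is
    // not; the 0x4200 case encodes as ARM_AM::createNEONModImm(0xa, 0x42),
    // i.e. Cmode=101x with the nonzero byte as the 8-bit immediate.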
- if ((SplatBits & ~0xff) == 0 || - (SplatBits & ~0xff00) == 0) - return DAG.getTargetConstant(SplatBits, MVT::i16); - break; + VT = is128Bits ? MVT::v8i16 : MVT::v4i16; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x00nn: Op=x, Cmode=100x. + OpCmode = 0x8; + Imm = SplatBits; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0xnn00: Op=x, Cmode=101x. + OpCmode = 0xa; + Imm = SplatBits >> 8; + break; + } + return SDValue(); case 32: // NEON's 32-bit VMOV supports splat values where: // * only one byte is nonzero, or // * the least significant byte is 0xff and the second byte is nonzero, or // * the least significant 2 bytes are 0xff and the third is nonzero. - if ((SplatBits & ~0xff) == 0 || - (SplatBits & ~0xff00) == 0 || - (SplatBits & ~0xff0000) == 0 || - (SplatBits & ~0xff000000) == 0) - return DAG.getTargetConstant(SplatBits, MVT::i32); + VT = is128Bits ? MVT::v4i32 : MVT::v2i32; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x000000nn: Op=x, Cmode=000x. + OpCmode = 0; + Imm = SplatBits; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0x0000nn00: Op=x, Cmode=001x. + OpCmode = 0x2; + Imm = SplatBits >> 8; + break; + } + if ((SplatBits & ~0xff0000) == 0) { + // Value = 0x00nn0000: Op=x, Cmode=010x. + OpCmode = 0x4; + Imm = SplatBits >> 16; + break; + } + if ((SplatBits & ~0xff000000) == 0) { + // Value = 0xnn000000: Op=x, Cmode=011x. + OpCmode = 0x6; + Imm = SplatBits >> 24; + break; + } if ((SplatBits & ~0xffff) == 0 && - ((SplatBits | SplatUndef) & 0xff) == 0xff) - return DAG.getTargetConstant(SplatBits | 0xff, MVT::i32); + ((SplatBits | SplatUndef) & 0xff) == 0xff) { + // Value = 0x0000nnff: Op=x, Cmode=1100. + OpCmode = 0xc; + Imm = SplatBits >> 8; + SplatBits |= 0xff; + break; + } if ((SplatBits & ~0xffffff) == 0 && - ((SplatBits | SplatUndef) & 0xffff) == 0xffff) - return DAG.getTargetConstant(SplatBits | 0xffff, MVT::i32); + ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { + // Value = 0x00nnffff: Op=x, Cmode=1101. + OpCmode = 0xd; + Imm = SplatBits >> 16; + SplatBits |= 0xffff; + break; + } // Note: there are a few 32-bit splat values (specifically: 00ffff00, // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not // VMOV.I32. A (very) minor optimization would be to replicate the value // and fall through here to test for a valid 64-bit splat. But, then the // caller would also need to check and handle the change in size. - break; + return SDValue(); case 64: { + if (!isVMOV) + return SDValue(); // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. uint64_t BitMask = 0xff; uint64_t Val = 0; + unsigned ImmMask = 1; + Imm = 0; for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { - if (((SplatBits | SplatUndef) & BitMask) == BitMask) + if (((SplatBits | SplatUndef) & BitMask) == BitMask) { Val |= BitMask; - else if ((SplatBits & BitMask) != 0) + Imm |= ImmMask; + } else if ((SplatBits & BitMask) != 0) { return SDValue(); + } BitMask <<= 8; + ImmMask <<= 1; } - return DAG.getTargetConstant(Val, MVT::i64); - } - - default: - llvm_unreachable("unexpected size for isVMOVSplat"); + // Op=1, Cmode=1110. + OpCmode = 0x1e; + SplatBits = Val; + VT = is128Bits ? MVT::v2i64 : MVT::v1i64; break; } - return SDValue(); -} - -/// getVMOVImm - If this is a build_vector of constants which can be -/// formed by using a VMOV instruction of the specified element size, -/// return the constant being splatted. The ByteSize field indicates the -/// number of bytes of each element [1248]. 
-SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { - BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, - HasAnyUndefs, ByteSize * 8)) - return SDValue(); - - if (SplatBitSize > ByteSize * 8) + default: + llvm_unreachable("unexpected size for isNEONModifiedImm"); return SDValue(); + } - return isVMOVSplat(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), - SplatBitSize, DAG); + unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); + return DAG.getTargetConstant(EncodedVal, MVT::i32); } static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT, @@ -2789,43 +3230,6 @@ static bool isVZIP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT, return true; } - -static SDValue BuildSplat(SDValue Val, EVT VT, SelectionDAG &DAG, DebugLoc dl) { - // Canonicalize all-zeros and all-ones vectors. - ConstantSDNode *ConstVal = cast<ConstantSDNode>(Val.getNode()); - if (ConstVal->isNullValue()) - return getZeroVector(VT, DAG, dl); - if (ConstVal->isAllOnesValue()) - return getOnesVector(VT, DAG, dl); - - EVT CanonicalVT; - if (VT.is64BitVector()) { - switch (Val.getValueType().getSizeInBits()) { - case 8: CanonicalVT = MVT::v8i8; break; - case 16: CanonicalVT = MVT::v4i16; break; - case 32: CanonicalVT = MVT::v2i32; break; - case 64: CanonicalVT = MVT::v1i64; break; - default: llvm_unreachable("unexpected splat element type"); break; - } - } else { - assert(VT.is128BitVector() && "unknown splat vector size"); - switch (Val.getValueType().getSizeInBits()) { - case 8: CanonicalVT = MVT::v16i8; break; - case 16: CanonicalVT = MVT::v8i16; break; - case 32: CanonicalVT = MVT::v4i32; break; - case 64: CanonicalVT = MVT::v2i64; break; - default: llvm_unreachable("unexpected splat element type"); break; - } - } - - // Build a canonical splat for this value. - SmallVector<SDValue, 8> Ops; - Ops.assign(CanonicalVT.getVectorNumElements(), Val); - SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, &Ops[0], - Ops.size()); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Res); -} - // If this is a case we can't handle, return null and let the default // expansion code take care of it. static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { @@ -2838,10 +3242,26 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { - SDValue Val = isVMOVSplat(SplatBits.getZExtValue(), - SplatUndef.getZExtValue(), SplatBitSize, DAG); - if (Val.getNode()) - return BuildSplat(Val, VT, DAG, dl); + // Check if an immediate VMOV works. + EVT VmovVT; + SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, VmovVT, VT.is128BitVector(), true); + if (Val.getNode()) { + SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov); + } + + // Try an immediate VMVN. 
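      // For example, a 32-bit splat of 0xffffff00 has no VMOV encoding, but
      // its bitwise complement 0x000000ff does, so the value can be
      // materialized with a VMVN of that encoding instead.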
+ uint64_t NegatedImm = (SplatBits.getZExtValue() ^ + ((1LL << SplatBitSize) - 1)); + Val = isNEONModifiedImm(NegatedImm, + SplatUndef.getZExtValue(), SplatBitSize, + DAG, VmovVT, VT.is128BitVector(), false); + if (Val.getNode()) { + SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov); + } } } @@ -2883,21 +3303,17 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ARMISD::VDUP, dl, VT, Value); // Vectors with 32- or 64-bit elements can be built by directly assigning - // the subregisters. + // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands + // will be legalized. if (EltSize >= 32) { // Do the expansion with floating-point types, since that is what the VFP // registers are defined to use, and since i64 is not legal. EVT EltVT = EVT::getFloatingPointVT(EltSize); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); - SDValue Val = DAG.getUNDEF(VecVT); - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Elt = Op.getOperand(i); - if (Elt.getOpcode() == ISD::UNDEF) - continue; - Elt = DAG.getNode(ISD::BIT_CONVERT, dl, EltVT, Elt); - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Val, Elt, - DAG.getConstant(i, MVT::i32)); - } + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, EltVT, Op.getOperand(i))); + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val); } @@ -2934,7 +3350,9 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, bool ReverseVEXT; unsigned Imm, WhichResult; - return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + return (EltSize >= 32 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16) || @@ -3032,59 +3450,62 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // of the same time so that they get CSEd properly. SVN->getMask(ShuffleMask); - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { - int Lane = SVN->getSplatIndex(); - // If this is undef splat, generate it via "just" vdup, if possible. - if (Lane == -1) Lane = 0; - - if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { - return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize <= 32) { + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. 
+ if (Lane == -1) Lane = 0; + + if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { + return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + } + return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, + DAG.getConstant(Lane, MVT::i32)); } - return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, - DAG.getConstant(Lane, MVT::i32)); - } - bool ReverseVEXT; - unsigned Imm; - if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { - if (ReverseVEXT) - std::swap(V1, V2); - return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, - DAG.getConstant(Imm, MVT::i32)); - } - - if (isVREVMask(ShuffleMask, VT, 64)) - return DAG.getNode(ARMISD::VREV64, dl, VT, V1); - if (isVREVMask(ShuffleMask, VT, 32)) - return DAG.getNode(ARMISD::VREV32, dl, VT, V1); - if (isVREVMask(ShuffleMask, VT, 16)) - return DAG.getNode(ARMISD::VREV16, dl, VT, V1); - - // Check for Neon shuffles that modify both input vectors in place. - // If both results are used, i.e., if there are two shuffles with the same - // source operands and with masks corresponding to both results of one of - // these operations, DAG memoization will ensure that a single node is - // used for both shuffles. - unsigned WhichResult; - if (isVTRNMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVUZPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVZIPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); + bool ReverseVEXT; + unsigned Imm; + if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { + if (ReverseVEXT) + std::swap(V1, V2); + return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, + DAG.getConstant(Imm, MVT::i32)); + } - if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); + if (isVREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(ARMISD::VREV64, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(ARMISD::VREV32, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(ARMISD::VREV16, dl, VT, V1); + + // Check for Neon shuffles that modify both input vectors in place. + // If both results are used, i.e., if there are two shuffles with the same + // source operands and with masks corresponding to both results of one of + // these operations, DAG memoization will ensure that a single node is + // used for both shuffles. 
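    // For example, with v2i32 operands a VTRN of {V1, V2} yields two results;
    // a shuffle mask of <0,2> selects result 0 and <1,3> selects result 1,
    // and both shuffles resolve to the same two-result VTRN node below.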
+ unsigned WhichResult; + if (isVTRNMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVUZPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVZIPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + + if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + } // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. @@ -3108,8 +3529,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } - // Implement shuffles with 32- or 64-bit elements as subreg copies. - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. if (EltSize >= 32) { // Do the expansion with floating-point types, since that is what the VFP // registers are defined to use, and since i64 is not legal. @@ -3117,17 +3537,17 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); V1 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V1); V2 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V2); - SDValue Val = DAG.getUNDEF(VecVT); + SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < NumElts; ++i) { if (ShuffleMask[i] < 0) - continue; - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, - ShuffleMask[i] < (int)NumElts ? V1 : V2, - DAG.getConstant(ShuffleMask[i] & (NumElts-1), - MVT::i32)); - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Val, - Elt, DAG.getConstant(i, MVT::i32)); + Ops.push_back(DAG.getUNDEF(EltVT)); + else + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, + ShuffleMask[i] < (int)NumElts ? V1 : V2, + DAG.getConstant(ShuffleMask[i] & (NumElts-1), + MVT::i32))); } + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val); } @@ -3277,7 +3697,12 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, MF->insert(It, loop1MBB); MF->insert(It, loop2MBB); MF->insert(It, exitMBB); - exitMBB->transferSuccessors(BB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); // thisMBB: // ... @@ -3315,7 +3740,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, // ... BB = exitMBB; - MF->DeleteMachineInstr(MI); // The instruction is gone now. + MI->eraseFromParent(); // The instruction is gone now. 
return BB; } @@ -3358,7 +3783,12 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, loopMBB); MF->insert(It, exitMBB); - exitMBB->transferSuccessors(BB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = MF->getRegInfo(); unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass); @@ -3403,11 +3833,20 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, // ... BB = exitMBB; - MF->DeleteMachineInstr(MI); // The instruction is gone now. + MI->eraseFromParent(); // The instruction is gone now. return BB; } +static +MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) + if (*I != Succ) + return *I; + llvm_unreachable("Expecting a BB with two successors!"); +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -3488,22 +3927,21 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineFunction *F = BB->getParent(); MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) - .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); F->insert(It, copy0MBB); F->insert(It, sinkMBB); - // Update machine-CFG edges by first adding all successors of the current - // block to the new block which will contain the Phi node for the select. - for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), - E = BB->succ_end(); I != E; ++I) - sinkMBB->addSuccessor(*I); - // Next, remove all successors of the current block, and add the true - // and fallthrough blocks as its successors. - while (!BB->succ_empty()) - BB->removeSuccessor(BB->succ_begin()); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); + BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) + .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); + // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB @@ -3516,11 +3954,52 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; - BuildMI(BB, dl, TII->get(ARM::PHI), MI->getOperand(0).getReg()) + BuildMI(*BB, BB->begin(), dl, + TII->get(ARM::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; + } + + case ARM::BCCi64: + case ARM::BCCZi64: { + // Compare both parts that make up the double comparison separately for + // equality. 
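    // The expansion is roughly:
    //   cmp   lhs_lo, rhs_lo
    //   cmpeq lhs_hi, rhs_hi   @ predicated: only runs if the low halves match
    //   beq   destMBB
    //   b     exitMBB
    // (or cmp against #0 for BCCZi64), with destMBB and exitMBB swapped when
    // the original condition was NE rather than EQ.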
+ bool RHSisZero = MI->getOpcode() == ARM::BCCZi64; + + unsigned LHS1 = MI->getOperand(1).getReg(); + unsigned LHS2 = MI->getOperand(2).getReg(); + if (RHSisZero) { + AddDefaultPred(BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(LHS1).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(LHS2).addImm(0) + .addImm(ARMCC::EQ).addReg(ARM::CPSR); + } else { + unsigned RHS1 = MI->getOperand(3).getReg(); + unsigned RHS2 = MI->getOperand(4).getReg(); + AddDefaultPred(BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(LHS1).addReg(RHS1)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(LHS2).addReg(RHS2) + .addImm(ARMCC::EQ).addReg(ARM::CPSR); + } + + MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB(); + MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); + if (MI->getOperand(0).getImm() == ARMCC::NE) + std::swap(destMBB, exitMBB); + + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2B : ARM::B)) + .addMBB(exitMBB); + + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -3541,7 +4020,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(SrcReg); unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) ? ARM::tMOVtgpr2gpr : ARM::tMOVgpr2gpr; - BuildMI(BB, dl, TII->get(CopyOpc), ARM::SP) + BuildMI(*BB, MI, dl, TII->get(CopyOpc), ARM::SP) .addReg(SrcReg, getKillRegState(SrcIsKill)); } @@ -3573,7 +4052,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, NeedPred = true; NeedCC = true; NeedOp3 = true; break; } - MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(OpOpc), ARM::SP); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(OpOpc), ARM::SP); if (OpOpc == ARM::tAND) AddDefaultT1CC(MIB); MIB.addReg(ARM::SP); @@ -3589,10 +4068,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(DstReg); unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) ? ARM::tMOVgpr2tgpr : ARM::tMOVgpr2gpr; - BuildMI(BB, dl, TII->get(CopyOpc)) + BuildMI(*BB, MI, dl, TII->get(CopyOpc)) .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)) .addReg(ARM::SP); - MF->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } } @@ -3763,6 +4242,35 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, return SDValue(); } +/// PerformVDUPLANECombine - Target-specific dag combine xforms for +/// ARMISD::VDUPLANE. +static SDValue PerformVDUPLANECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is + // redundant. + SDValue Op = N->getOperand(0); + EVT VT = N->getValueType(0); + + // Ignore bit_converts. + while (Op.getOpcode() == ISD::BIT_CONVERT) + Op = Op.getOperand(0); + if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) + return SDValue(); + + // Make sure the VMOV element size is not bigger than the VDUPLANE elements. + unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits(); + // The canonical VMOV for a zero vector uses a 32-bit element size. 
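  // Since all of its bytes are identical, though, it can safely be treated as
  // an 8-bit splat for this check, which is what the EltSize = 8 override
  // below does when the decoded immediate value is zero.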
+ unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned EltBits; + if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) + EltSize = 8; + if (EltSize > VT.getVectorElementType().getSizeInBits()) + return SDValue(); + + SDValue Res = DCI.DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); + return DCI.CombineTo(N, Res, false); +} + /// getVShiftImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift operation, where all the elements of the /// build_vector must have the same constant integer value. @@ -3893,7 +4401,8 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { // Narrowing shifts require an immediate right shift. if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) break; - llvm_unreachable("invalid shift count for narrowing vector shift intrinsic"); + llvm_unreachable("invalid shift count for narrowing vector shift " + "intrinsic"); default: llvm_unreachable("unhandled vector shift"); @@ -4140,6 +4649,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SUB: return PerformSUBCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI); + case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: @@ -4156,14 +4666,13 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { if (!Subtarget->hasV6Ops()) // Pre-v6 does not support unaligned mem access. return false; - else { - // v6+ may or may not support unaligned mem access depending on the system - // configuration. - // FIXME: This is pretty conservative. Should we provide cmdline option to - // control the behaviour? - if (!Subtarget->isTargetDarwin()) - return false; - } + + // v6+ may or may not support unaligned mem access depending on the system + // configuration. + // FIXME: This is pretty conservative. Should we provide cmdline option to + // control the behaviour? + if (!Subtarget->isTargetDarwin()) + return false; switch (VT.getSimpleVT().SimpleTy) { default: @@ -4619,7 +5128,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } } if (StringRef("{cc}").equals_lower(Constraint)) - return std::make_pair(0U, ARM::CCRRegisterClass); + return std::make_pair(unsigned(ARM::CPSR), ARM::CCRRegisterClass); return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); } @@ -4669,7 +5178,6 @@ getRegClassForInlineAsmConstraint(const std::string &Constraint, /// vector. If it is invalid, don't add anything to Ops. void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Constraint, - bool hasMemory, std::vector<SDValue>&Ops, SelectionDAG &DAG) const { SDValue Result(0, 0); @@ -4818,8 +5326,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, Ops.push_back(Result); return; } - return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory, - Ops, DAG); + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } bool diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h index 9c7517c..128b72e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -53,6 +53,8 @@ namespace llvm { CMOV, // ARM conditional move instructions. CNEG, // ARM conditional negate instructions. 
+ BCC_i64, + RBIT, // ARM bitreverse instruction FTOSI, // FP to sint within a FP register. @@ -70,6 +72,8 @@ namespace llvm { EH_SJLJ_SETJMP, // SjLj exception handling setjmp. EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + TC_RETURN, // Tail call return pseudo. + THREAD_POINTER, DYN_ALLOC, // Dynamic allocation on the stack. @@ -120,6 +124,10 @@ namespace llvm { VGETLANEu, // zero-extend vector extract element VGETLANEs, // sign-extend vector extract element + // Vector move immediate and move negated immediate: + VMOVIMM, + VMVNIMM, + // Vector duplicate: VDUP, VDUPLANE, @@ -133,6 +141,13 @@ namespace llvm { VUZP, // unzip (deinterleave) VTRN, // transpose + // Operands of the standard BUILD_VECTOR node are not legalized, which + // is fine if BUILD_VECTORs are always lowered to shuffles or other + // operations, but for ARM some BUILD_VECTORs are legal as-is and their + // operands need to be legalized. Define an ARM-specific version of + // BUILD_VECTOR for this purpose. + BUILD_VECTOR, + // Floating-point max and min: FMAX, FMIN @@ -141,12 +156,6 @@ namespace llvm { /// Define some predicates that are used for node matching. namespace ARM { - /// getVMOVImm - If this is a build_vector of constants which can be - /// formed by using a VMOV instruction of the specified element size, - /// return the constant being splatted. The ByteSize field indicates the - /// number of bytes of each element [1248]. - SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG); - /// getVFPf32Imm / getVFPf64Imm - If the given fp immediate can be /// materialized with a VMOV.f32 / VMOV.f64 (i.e. fconsts / fconstd) /// instruction, returns its 8-bit integer representation. Otherwise, @@ -189,9 +198,9 @@ namespace llvm { bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const; /// isLegalICmpImmediate - Return true if the specified immediate is legal - /// icmp immediate, that is the target has icmp instructions which can compare - /// a register against the immediate without having to materialize the - /// immediate into a register. + /// icmp immediate, that is the target has icmp instructions which can + /// compare a register against the immediate without having to materialize + /// the immediate into a register. virtual bool isLegalICmpImmediate(int64_t Imm) const; /// getPreIndexedAddressParts - returns true by value, base pointer and @@ -232,7 +241,6 @@ namespace llvm { /// being processed is 'm'. 
virtual void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter, - bool hasMemory, std::vector<SDValue> &Ops, SelectionDAG &DAG) const; @@ -282,7 +290,8 @@ namespace llvm { SDValue &Root, SelectionDAG &DAG, DebugLoc dl) const; - CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const; + CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, + bool isVarArg) const; SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, DebugLoc dl, SelectionDAG &DAG, const CCValAssign &VA, @@ -303,6 +312,7 @@ namespace llvm { SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; @@ -327,18 +337,36 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; + /// IsEligibleForTailCallOptimization - Check whether the call is eligible + /// for tail call optimization. Targets which want to do tail call + /// optimization should implement this function. + bool IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const; virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl) const; + SDValue &ARMcc, SelectionDAG &DAG, DebugLoc dl) const; + SDValue getVFPCmp(SDValue LHS, SDValue RHS, + SelectionDAG &DAG, DebugLoc dl) const; + + SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const; MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB, diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td index d487df1..ac568e7 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -50,27 +50,23 @@ def VFPLdStMulFrm : Format<22>; def VFPMiscFrm : Format<23>; def ThumbFrm : Format<24>; - -def NEONFrm : Format<25>; -def NEONGetLnFrm : Format<26>; -def NEONSetLnFrm : Format<27>; -def NEONDupFrm : Format<28>; - -def MiscFrm : Format<29>; -def ThumbMiscFrm : Format<30>; - -def NLdStFrm : Format<31>; -def N1RegModImmFrm : Format<32>; -def N2RegFrm : Format<33>; -def NVCVTFrm : Format<34>; -def NVDupLnFrm : Format<35>; -def N2RegVShLFrm : Format<36>; -def N2RegVShRFrm : Format<37>; -def N3RegFrm : Format<38>; -def N3RegVShFrm : Format<39>; -def NVExtFrm : Format<40>; -def NVMulSLFrm : Format<41>; -def NVTBLFrm : Format<42>; +def MiscFrm : Format<25>; + +def NGetLnFrm : Format<26>; +def NSetLnFrm : Format<27>; +def NDupFrm : Format<28>; +def NLdStFrm : Format<29>; +def N1RegModImmFrm: Format<30>; +def N2RegFrm : Format<31>; +def 
NVCVTFrm : Format<32>; +def NVDupLnFrm : Format<33>; +def N2RegVShLFrm : Format<34>; +def N2RegVShRFrm : Format<35>; +def N3RegFrm : Format<36>; +def N3RegVShFrm : Format<37>; +def NVExtFrm : Format<38>; +def NVMulSLFrm : Format<39>; +def NVTBLFrm : Format<40>; // Misc flags. @@ -1653,17 +1649,17 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, list<dag> pattern> - : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, itin, + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NGetLnFrm, itin, opc, dt, asm, pattern>; class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, list<dag> pattern> - : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, itin, + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NSetLnFrm, itin, opc, dt, asm, pattern>; class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, list<dag> pattern> - : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, itin, + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NDupFrm, itin, opc, dt, asm, pattern>; // Vector Duplicate Lane (from scalar to all elements) diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp index 85f6b40..ba228ff 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -63,7 +63,7 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { void ARMInstrInfo:: reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo &TRI) const { DebugLoc dl = Orig->getDebugLoc(); unsigned Opcode = Orig->getOpcode(); switch (Opcode) { diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h index d4199d1..4563ffe 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -35,7 +35,7 @@ public: void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo &TRI) const; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. 
As /// such, whenever a client has an instance of instruction info, it should diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td index f3156d9..51fc152 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -38,6 +38,12 @@ def SDT_ARMBr2JT : SDTypeProfile<0, 4, [SDTCisPtrTy<0>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; +def SDT_ARMBCC_i64 : SDTypeProfile<0, 6, + [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, SDTCisVT<2, i32>, + SDTCisVT<3, i32>, SDTCisVT<4, i32>, + SDTCisVT<5, OtherVT>]>; + def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, @@ -53,6 +59,8 @@ def SDT_ARMSYNCBARRIERV7 : SDTypeProfile<0, 0, []>; def SDT_ARMMEMBARRIERV6 : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_ARMSYNCBARRIERV6 : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + // Node definitions. def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>; @@ -88,6 +96,9 @@ def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT, def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT, [SDNPHasChain]>; +def ARMBcci64 : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64, + [SDNPHasChain]>; + def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp, [SDNPOutFlag]>; @@ -117,6 +128,9 @@ def ARMSyncBarrierV6 : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERV6, def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; +def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, + [SDNPHasChain, SDNPOptInFlag, SDNPVariadic]>; + //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. // @@ -858,13 +872,13 @@ def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), Pseudo, IIC_iALUi, "adr$p\t$dst, #$label", []>; +} // neverHasSideEffects def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, nohash_imm:$id, pred:$p), Pseudo, IIC_iALUi, "adr$p\t$dst, #${label}_${id}", []> { let Inst{25} = 1; } -} // neverHasSideEffects //===----------------------------------------------------------------------===// // Control Flow Instructions. @@ -1026,6 +1040,74 @@ let isCall = 1, } } +// Tail calls. + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { + // Darwin versions. + let Defs = [R0, R1, R2, R3, R9, R12, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, + D27, D28, D29, D30, D31, PC], + Uses = [SP] in { + def TCRETURNdi : AInoP<(outs), (ins i32imm:$dst, variable_ops), + Pseudo, IIC_Br, + "@TC_RETURN","\t$dst", []>, Requires<[IsDarwin]>; + + def TCRETURNri : AInoP<(outs), (ins tcGPR:$dst, variable_ops), + Pseudo, IIC_Br, + "@TC_RETURN","\t$dst", []>, Requires<[IsDarwin]>; + + def TAILJMPd : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), + IIC_Br, "b\t$dst @ TAILCALL", + []>, Requires<[IsDarwin]>; + + def TAILJMPdt: ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), + IIC_Br, "b.w\t$dst @ TAILCALL", + []>, Requires<[IsDarwin]>; + + def TAILJMPr : AXI<(outs), (ins tcGPR:$dst, variable_ops), + BrMiscFrm, IIC_Br, "bx\t$dst @ TAILCALL", + []>, Requires<[IsDarwin]> { + let Inst{7-4} = 0b0001; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; + let Inst{31-28} = 0b1110; + } + } + + // Non-Darwin versions (the difference is R9). 
+ let Defs = [R0, R1, R2, R3, R12, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, + D27, D28, D29, D30, D31, PC], + Uses = [SP] in { + def TCRETURNdiND : AInoP<(outs), (ins i32imm:$dst, variable_ops), + Pseudo, IIC_Br, + "@TC_RETURN","\t$dst", []>, Requires<[IsNotDarwin]>; + + def TCRETURNriND : AInoP<(outs), (ins tcGPR:$dst, variable_ops), + Pseudo, IIC_Br, + "@TC_RETURN","\t$dst", []>, Requires<[IsNotDarwin]>; + + def TAILJMPdND : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), + IIC_Br, "b\t$dst @ TAILCALL", + []>, Requires<[IsARM, IsNotDarwin]>; + + def TAILJMPdNDt : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), + IIC_Br, "b.w\t$dst @ TAILCALL", + []>, Requires<[IsThumb, IsNotDarwin]>; + + def TAILJMPrND : AXI<(outs), (ins tcGPR:$dst, variable_ops), + BrMiscFrm, IIC_Br, "bx\t$dst @ TAILCALL", + []>, Requires<[IsNotDarwin]> { + let Inst{7-4} = 0b0001; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; + let Inst{31-28} = 0b1110; + } + } +} + let isBranch = 1, isTerminator = 1 in { // B is "predicable" since it can be xformed into a Bcc. let isBarrier = 1 in { @@ -1397,6 +1479,14 @@ def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr, let Inst{25} = 0; } +// A version for the smaller set of tail call registers. +let neverHasSideEffects = 1 in +def MOVr_TC : AsI1<0b1101, (outs tcGPR:$dst), (ins tcGPR:$src), DPFrm, + IIC_iMOVr, "mov", "\t$dst, $src", []>, UnaryDP { + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; +} + def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm, IIC_iMOVsr, "mov", "\t$dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP { @@ -1604,13 +1694,19 @@ def RSCSrs : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), } // (sub X, imm) gets canonicalized to (add X, -imm). Match this form. +// The assume-no-carry-in form uses the negation of the input since add/sub +// assume opposite meanings of the carry flag (i.e., carry == !borrow). +// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory +// details. def : ARMPat<(add GPR:$src, so_imm_neg:$imm), (SUBri GPR:$src, so_imm_neg:$imm)>; - -//def : ARMPat<(addc GPR:$src, so_imm_neg:$imm), -// (SUBSri GPR:$src, so_imm_neg:$imm)>; -//def : ARMPat<(adde GPR:$src, so_imm_neg:$imm), -// (SBCri GPR:$src, so_imm_neg:$imm)>; +def : ARMPat<(addc GPR:$src, so_imm_neg:$imm), + (SUBSri GPR:$src, so_imm_neg:$imm)>; +// The with-carry-in form matches bitwise not instead of the negation. +// Effectively, the inverse interpretation of the carry flag already accounts +// for part of the negation. +def : ARMPat<(adde GPR:$src, so_imm_not:$imm), + (SBCri GPR:$src, so_imm_not:$imm)>; // Note: These are implemented in C++ code, because they have to generate // ADD/SUBrs instructions, which use a complex pattern that a xform function @@ -2198,6 +2294,22 @@ defm CMNz : AI1_cmp_irs<0b1011, "cmn", def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm), (CMNzri GPR:$src, so_imm_neg:$imm)>; +// Pseudo i64 compares for some floating point compares. 
+let usesCustomInserter = 1, isBranch = 1, isTerminator = 1, + Defs = [CPSR] in { +def BCCi64 : PseudoInst<(outs), + (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst), + IIC_Br, + "${:comment} B\t$dst GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, imm:$cc", + [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>; + +def BCCZi64 : PseudoInst<(outs), + (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), + IIC_Br, + "${:comment} B\t$dst GPR:$lhs1, GPR:$lhs2, 0, 0, imm:$cc", + [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>; +} // usesCustomInserter + // Conditional moves // FIXME: should be able to write a pattern for ARMcmov, but can't use @@ -2530,31 +2642,30 @@ let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, - D31 ] in { + D31 ], hasSideEffects = 1, isBarrier = 1 in { def Int_eh_sjlj_setjmp : XI<(outs), (ins GPR:$src, GPR:$val), AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, NoItinerary, - "str\tsp, [$src, #+8] ${:comment} eh_setjmp begin\n\t" - "add\t$val, pc, #8\n\t" - "str\t$val, [$src, #+4]\n\t" - "mov\tr0, #0\n\t" - "add\tpc, pc, #0\n\t" - "mov\tr0, #1 ${:comment} eh_setjmp end", "", + "add\t$val, pc, #8\t${:comment} eh_setjmp begin\n\t" + "str\t$val, [$src, #+4]\n\t" + "mov\tr0, #0\n\t" + "add\tpc, pc, #0\n\t" + "mov\tr0, #1 ${:comment} eh_setjmp end", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, Requires<[IsARM, HasVFP2]>; } let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ] in { + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], + hasSideEffects = 1, isBarrier = 1 in { def Int_eh_sjlj_setjmp_nofp : XI<(outs), (ins GPR:$src, GPR:$val), AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, NoItinerary, - "str\tsp, [$src, #+8] ${:comment} eh_setjmp begin\n\t" - "add\t$val, pc, #8\n\t" - "str\t$val, [$src, #+4]\n\t" - "mov\tr0, #0\n\t" - "add\tpc, pc, #0\n\t" - "mov\tr0, #1 ${:comment} eh_setjmp end", "", + "add\t$val, pc, #8\n ${:comment} eh_setjmp begin\n\t" + "str\t$val, [$src, #+4]\n\t" + "mov\tr0, #0\n\t" + "add\tpc, pc, #0\n\t" + "mov\tr0, #1 ${:comment} eh_setjmp end", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, Requires<[IsARM, NoVFP]>; } @@ -2621,6 +2732,24 @@ def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), // TODO: add,sub,and, 3-instr forms? 
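The carry-flag reasoning in the comments above (carry == !borrow, so the no-carry-in forms use the arithmetic negation of the immediate while the with-carry-in forms use its bitwise NOT) also drives the equivalent Thumb2 patterns later in this diff. As an illustration only, and not part of the patch, a minimal standalone C++ sketch of the two's-complement identities involved:

#include <cassert>
#include <cstdint>

// ARM-style SBC: subtract with the carry flag acting as NOT borrow,
// i.e. result = x - y - (1 - carry).
static uint32_t sbc(uint32_t x, uint32_t y, bool carry) {
  return x - y - (carry ? 0u : 1u);
}

int main() {
  const uint32_t samples[] = {0u, 1u, 0x80u, 0x7fffffffu, 0x80000000u, 0xffffffffu};
  for (uint32_t x : samples) {
    for (uint32_t imm : samples) {
      // Assume-no-carry-in form: (add x, -imm) has the same value as (sub x, imm).
      assert(x + (0u - imm) == x - imm);
      for (int c = 0; c <= 1; ++c) {
        bool carry = (c != 0);
        // With-carry-in form: (adde x, ~imm) has the same value as SBC(x, imm),
        // which is why those patterns match the bitwise NOT rather than the negation.
        assert(x + ~imm + (carry ? 1u : 0u) == sbc(x, imm, carry));
      }
    }
  }
  return 0;
}

The sketch only checks the value identities; the flag behaviour itself is governed by the carry == !borrow convention of the AddWithCarry() definition cited above.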
+// Tail calls +def : ARMPat<(ARMtcret tcGPR:$dst), + (TCRETURNri tcGPR:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), + (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), + (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret tcGPR:$dst), + (TCRETURNriND tcGPR:$dst)>, Requires<[IsNotDarwin]>; + +def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), + (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; + +def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), + (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; // Direct calls def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>, diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td index 197ec16..7f7eb98 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -65,6 +65,10 @@ def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; +def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; +def NEONvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>; +def NEONvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>; + def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; // VDUPLANE can produce a quad-register result from a double-register source, @@ -94,21 +98,26 @@ def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>, def NEONfmax : SDNode<"ARMISD::FMAX", SDTARMFMAX>; def NEONfmin : SDNode<"ARMISD::FMIN", SDTARMFMAX>; +def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{ + ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); + unsigned EltBits; + uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); + return (EltBits == 32 && EltVal == 0); +}]>; + +def NEONimmAllOnesV: PatLeaf<(NEONvmovImm (i32 timm)), [{ + ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); + unsigned EltBits; + uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits); + return (EltBits == 8 && EltVal == 0xff); +}]>; + //===----------------------------------------------------------------------===// // NEON operand definitions //===----------------------------------------------------------------------===// -def h8imm : Operand<i8> { - let PrintMethod = "printHex8ImmOperand"; -} -def h16imm : Operand<i16> { - let PrintMethod = "printHex16ImmOperand"; -} -def h32imm : Operand<i32> { - let PrintMethod = "printHex32ImmOperand"; -} -def h64imm : Operand<i64> { - let PrintMethod = "printHex64ImmOperand"; +def nModImm : Operand<i32> { + let PrintMethod = "printNEONModImmOperand"; } //===----------------------------------------------------------------------===// @@ -812,11 +821,6 @@ def DSubReg_f64_reg : SDNodeXForm<imm, [{ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), MVT::i32); }]>; -def DSubReg_f64_other_reg : SDNodeXForm<imm, [{ - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - return CurDAG->getTargetConstant(ARM::dsub_0 + (1 - N->getZExtValue()), - MVT::i32); -}]>; // Extract S sub-registers of Q/D registers. 
def SSubReg_f32_reg : SDNodeXForm<imm, [{ @@ -2282,7 +2286,7 @@ def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, NEONvceq, 1>; // For disassembly only. defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", - "$dst, $src, #0">; + "$dst, $src, #0">; // VCGE : Vector Compare Greater Than or Equal defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, @@ -2332,10 +2336,10 @@ defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, // Vector Bitwise Operations. -def vnot8 : PatFrag<(ops node:$in), - (xor node:$in, (bitconvert (v8i8 immAllOnesV)))>; -def vnot16 : PatFrag<(ops node:$in), - (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>; +def vnotd : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v8i8 NEONimmAllOnesV)))>; +def vnotq : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 NEONimmAllOnesV)))>; // VAND : Vector Bitwise AND @@ -2361,36 +2365,58 @@ def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, "vbic", "$dst, $src1, $src2", "", [(set DPR:$dst, (v2i32 (and DPR:$src1, - (vnot8 DPR:$src2))))]>; + (vnotd DPR:$src2))))]>; def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, "vbic", "$dst, $src1, $src2", "", [(set QPR:$dst, (v4i32 (and QPR:$src1, - (vnot16 QPR:$src2))))]>; + (vnotq QPR:$src2))))]>; // VORN : Vector Bitwise OR NOT def VORNd : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, "vorn", "$dst, $src1, $src2", "", [(set DPR:$dst, (v2i32 (or DPR:$src1, - (vnot8 DPR:$src2))))]>; + (vnotd DPR:$src2))))]>; def VORNq : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, "vorn", "$dst, $src1, $src2", "", [(set QPR:$dst, (v4i32 (or QPR:$src1, - (vnot16 QPR:$src2))))]>; + (vnotq QPR:$src2))))]>; + +// VMVN : Vector Bitwise NOT (Immediate) + +let isReMaterializable = 1 in { +def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmvn", "i16", "$dst, $SIMM", "", + [(set DPR:$dst, (v4i16 (NEONvmvnImm timm:$SIMM)))]>; +def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmvn", "i16", "$dst, $SIMM", "", + [(set QPR:$dst, (v8i16 (NEONvmvnImm timm:$SIMM)))]>; + +def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmvn", "i32", "$dst, $SIMM", "", + [(set DPR:$dst, (v2i32 (NEONvmvnImm timm:$SIMM)))]>; +def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, + "vmvn", "i32", "$dst, $SIMM", "", + [(set QPR:$dst, (v4i32 (NEONvmvnImm timm:$SIMM)))]>; +} // VMVN : Vector Bitwise NOT def VMVNd : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0, (outs DPR:$dst), (ins DPR:$src), IIC_VSUBiD, "vmvn", "$dst, $src", "", - [(set DPR:$dst, (v2i32 (vnot8 DPR:$src)))]>; + [(set DPR:$dst, (v2i32 (vnotd DPR:$src)))]>; def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0, (outs QPR:$dst), (ins QPR:$src), IIC_VSUBiD, "vmvn", "$dst, $src", "", - [(set QPR:$dst, (v4i32 (vnot16 QPR:$src)))]>; -def : Pat<(v2i32 (vnot8 DPR:$src)), (VMVNd DPR:$src)>; -def : Pat<(v4i32 (vnot16 QPR:$src)), (VMVNq QPR:$src)>; + [(set QPR:$dst, (v4i32 (vnotq QPR:$src)))]>; +def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>; +def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>; // VBSL : Vector Bitwise 
Select def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), @@ -2399,14 +2425,14 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), "vbsl", "$dst, $src2, $src3", "$src1 = $dst", [(set DPR:$dst, (v2i32 (or (and DPR:$src2, DPR:$src1), - (and DPR:$src3, (vnot8 DPR:$src1)))))]>; + (and DPR:$src3, (vnotd DPR:$src1)))))]>; def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, IIC_VCNTiQ, "vbsl", "$dst, $src2, $src3", "$src1 = $dst", [(set QPR:$dst, (v4i32 (or (and QPR:$src2, QPR:$src1), - (and QPR:$src3, (vnot16 QPR:$src1)))))]>; + (and QPR:$src3, (vnotq QPR:$src1)))))]>; // VBIF : Vector Bitwise Insert if False // like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", @@ -2740,20 +2766,19 @@ defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, // Vector Negate. -def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; -def vneg8 : PatFrag<(ops node:$in), - (sub (bitconvert (v8i8 immAllZerosV)), node:$in)>; -def vneg16 : PatFrag<(ops node:$in), - (sub (bitconvert (v16i8 immAllZerosV)), node:$in)>; +def vnegd : PatFrag<(ops node:$in), + (sub (bitconvert (v2i32 NEONimmAllZerosV)), node:$in)>; +def vnegq : PatFrag<(ops node:$in), + (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>; class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (Ty (vneg8 DPR:$src)))]>; + [(set DPR:$dst, (Ty (vnegd DPR:$src)))]>; class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (Ty (vneg16 QPR:$src)))]>; + [(set QPR:$dst, (Ty (vnegq QPR:$src)))]>; // VNEG : Vector Negate (integer) def VNEGs8d : VNEGD<0b00, "vneg", "s8", v8i8>; @@ -2773,12 +2798,12 @@ def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, "vneg", "f32", "$dst, $src", "", [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>; -def : Pat<(v8i8 (vneg8 DPR:$src)), (VNEGs8d DPR:$src)>; -def : Pat<(v4i16 (vneg8 DPR:$src)), (VNEGs16d DPR:$src)>; -def : Pat<(v2i32 (vneg8 DPR:$src)), (VNEGs32d DPR:$src)>; -def : Pat<(v16i8 (vneg16 QPR:$src)), (VNEGs8q QPR:$src)>; -def : Pat<(v8i16 (vneg16 QPR:$src)), (VNEGs16q QPR:$src)>; -def : Pat<(v4i32 (vneg16 QPR:$src)), (VNEGs32q QPR:$src)>; +def : Pat<(v8i8 (vnegd DPR:$src)), (VNEGs8d DPR:$src)>; +def : Pat<(v4i16 (vnegd DPR:$src)), (VNEGs16d DPR:$src)>; +def : Pat<(v2i32 (vnegd DPR:$src)), (VNEGs32d DPR:$src)>; +def : Pat<(v16i8 (vnegq QPR:$src)), (VNEGs8q QPR:$src)>; +def : Pat<(v8i16 (vnegq QPR:$src)), (VNEGs16q QPR:$src)>; +def : Pat<(v4i32 (vnegq QPR:$src)), (VNEGs32q QPR:$src)>; // VQNEG : Vector Saturating Negate defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, @@ -2832,77 +2857,42 @@ def VMOVQQQQ : PseudoInst<(outs QQQQPR:$dst), (ins QQQQPR:$src), // VMOV : Vector Move (Immediate) -// VMOV_get_imm8 xform function: convert build_vector to VMOV.i8 imm. -def VMOV_get_imm8 : SDNodeXForm<build_vector, [{ - return ARM::getVMOVImm(N, 1, *CurDAG); -}]>; -def vmovImm8 : PatLeaf<(build_vector), [{ - return ARM::getVMOVImm(N, 1, *CurDAG).getNode() != 0; -}], VMOV_get_imm8>; - -// VMOV_get_imm16 xform function: convert build_vector to VMOV.i16 imm. 
-def VMOV_get_imm16 : SDNodeXForm<build_vector, [{ - return ARM::getVMOVImm(N, 2, *CurDAG); -}]>; -def vmovImm16 : PatLeaf<(build_vector), [{ - return ARM::getVMOVImm(N, 2, *CurDAG).getNode() != 0; -}], VMOV_get_imm16>; - -// VMOV_get_imm32 xform function: convert build_vector to VMOV.i32 imm. -def VMOV_get_imm32 : SDNodeXForm<build_vector, [{ - return ARM::getVMOVImm(N, 4, *CurDAG); -}]>; -def vmovImm32 : PatLeaf<(build_vector), [{ - return ARM::getVMOVImm(N, 4, *CurDAG).getNode() != 0; -}], VMOV_get_imm32>; - -// VMOV_get_imm64 xform function: convert build_vector to VMOV.i64 imm. -def VMOV_get_imm64 : SDNodeXForm<build_vector, [{ - return ARM::getVMOVImm(N, 8, *CurDAG); -}]>; -def vmovImm64 : PatLeaf<(build_vector), [{ - return ARM::getVMOVImm(N, 8, *CurDAG).getNode() != 0; -}], VMOV_get_imm64>; - -// Note: Some of the cmode bits in the following VMOV instructions need to -// be encoded based on the immed values. - let isReMaterializable = 1 in { def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst), - (ins h8imm:$SIMM), IIC_VMOVImm, + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i8", "$dst, $SIMM", "", - [(set DPR:$dst, (v8i8 vmovImm8:$SIMM))]>; + [(set DPR:$dst, (v8i8 (NEONvmovImm timm:$SIMM)))]>; def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst), - (ins h8imm:$SIMM), IIC_VMOVImm, + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i8", "$dst, $SIMM", "", - [(set QPR:$dst, (v16i8 vmovImm8:$SIMM))]>; + [(set QPR:$dst, (v16i8 (NEONvmovImm timm:$SIMM)))]>; -def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,?}, 0, 0, {?}, 1, (outs DPR:$dst), - (ins h16imm:$SIMM), IIC_VMOVImm, +def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i16", "$dst, $SIMM", "", - [(set DPR:$dst, (v4i16 vmovImm16:$SIMM))]>; -def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,?}, 0, 1, {?}, 1, (outs QPR:$dst), - (ins h16imm:$SIMM), IIC_VMOVImm, + [(set DPR:$dst, (v4i16 (NEONvmovImm timm:$SIMM)))]>; +def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i16", "$dst, $SIMM", "", - [(set QPR:$dst, (v8i16 vmovImm16:$SIMM))]>; + [(set QPR:$dst, (v8i16 (NEONvmovImm timm:$SIMM)))]>; -def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, {?}, 1, (outs DPR:$dst), - (ins h32imm:$SIMM), IIC_VMOVImm, +def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i32", "$dst, $SIMM", "", - [(set DPR:$dst, (v2i32 vmovImm32:$SIMM))]>; -def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, {?}, 1, (outs QPR:$dst), - (ins h32imm:$SIMM), IIC_VMOVImm, + [(set DPR:$dst, (v2i32 (NEONvmovImm timm:$SIMM)))]>; +def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i32", "$dst, $SIMM", "", - [(set QPR:$dst, (v4i32 vmovImm32:$SIMM))]>; + [(set QPR:$dst, (v4i32 (NEONvmovImm timm:$SIMM)))]>; def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst), - (ins h64imm:$SIMM), IIC_VMOVImm, + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i64", "$dst, $SIMM", "", - [(set DPR:$dst, (v1i64 vmovImm64:$SIMM))]>; + [(set DPR:$dst, (v1i64 (NEONvmovImm timm:$SIMM)))]>; def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst), - (ins h64imm:$SIMM), IIC_VMOVImm, + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i64", "$dst, $SIMM", "", - [(set QPR:$dst, (v2i64 vmovImm64:$SIMM))]>; + [(set QPR:$dst, (v2i64 (NEONvmovImm timm:$SIMM)))]>; } // isReMaterializable // VMOV : Vector Get 
Lane (move scalar to ARM core register) @@ -3122,17 +3112,6 @@ def VDUPfqf : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 1, 0, IIC_VMOVD, "vdup", "32", "$dst, ${src:lane}", "", [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>; -def : Pat<(v2i64 (NEONvduplane (v2i64 QPR:$src), imm:$lane)), - (INSERT_SUBREG QPR:$src, - (i64 (EXTRACT_SUBREG QPR:$src, - (DSubReg_f64_reg imm:$lane))), - (DSubReg_f64_other_reg imm:$lane))>; -def : Pat<(v2f64 (NEONvduplane (v2f64 QPR:$src), imm:$lane)), - (INSERT_SUBREG QPR:$src, - (f64 (EXTRACT_SUBREG QPR:$src, - (DSubReg_f64_reg imm:$lane))), - (DSubReg_f64_other_reg imm:$lane))>; - // VMOVN : Vector Narrowing Move defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVD, "vmovn", "i", int_arm_neon_vmovn>; @@ -3319,22 +3298,16 @@ let hasExtraSrcRegAllocReq = 1 in { def VTBL2 : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst), (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTB2, - "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl2 - DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; + "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", []>; def VTBL3 : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst), (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTB3, - "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl3 - DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; + "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "", []>; def VTBL4 : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$dst), (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTB4, - "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl4 DPR:$tbl1, DPR:$tbl2, - DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>; + "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", []>; } // hasExtraSrcRegAllocReq = 1 // VTBX : Vector Table Extension @@ -3348,23 +3321,18 @@ let hasExtraSrcRegAllocReq = 1 in { def VTBX2 : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTBX2, - "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx2 - DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; + "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", []>; def VTBX3 : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTBX3, - "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "$orig = $dst", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx3 DPR:$orig, DPR:$tbl1, - DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; + "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", + "$orig = $dst", []>; def VTBX4 : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTBX4, "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", - "$orig = $dst", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx4 DPR:$orig, DPR:$tbl1, - DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>; + "$orig = $dst", []>; } // hasExtraSrcRegAllocReq = 1 //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td index 40f924b..bc0790d 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -894,11 +894,11 @@ def tLEApcrel : T1I<(outs tGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, "adr$p\t$dst, #$label", []>, 
T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 +} // neverHasSideEffects def tLEApcrelJT : T1I<(outs tGPR:$dst), (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi, "adr$p\t$dst, #${label}_${id}", []>, T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 -} // neverHasSideEffects //===----------------------------------------------------------------------===// // TLS Instructions @@ -923,18 +923,18 @@ let isCall = 1, // except for our own input by listing the relevant registers in Defs. By // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved resgisters, which is exactly what we want. -// The current SP is passed in $val, and we reuse the reg as a scratch. +// $val is a scratch register for our use. let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R12 ] in { + [ R0, R1, R2, R3, R4, R5, R6, R7, R12 ], hasSideEffects = 1, + isBarrier = 1 in { def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, - "str\t$val, [$src, #8]\t${:comment} begin eh.setjmp\n" - "\tmov\t$val, pc\n" - "\tadds\t$val, #7\n" - "\tstr\t$val, [$src, #4]\n" - "\tmovs\tr0, #0\n" - "\tb\t1f\n" - "\tmovs\tr0, #1\t${:comment} end eh.setjmp\n" + "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" + "adds\t$val, #7\n\t" + "str\t$val, [$src, #4]\n\t" + "movs\tr0, #0\n\t" + "b\t1f\n\t" + "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" "1:", "", [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>; } @@ -1037,7 +1037,8 @@ def : T1Pat<(i32 imm0_255_comp:$src), // scheduling. let isReMaterializable = 1 in def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), - NoItinerary, "${:comment} ldr.n\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", + NoItinerary, + "${:comment} ldr.n\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), imm:$cp))]>, Requires<[IsThumb1Only]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td index b91c089..bbe675e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -122,6 +122,10 @@ def imm0_255_neg : PatLeaf<(i32 imm), [{ return (uint32_t)(-N->getZExtValue()) < 255; }], imm_neg_XFORM>; +def imm0_255_not : PatLeaf<(i32 imm), [{ + return (uint32_t)(~N->getZExtValue()) < 255; +}], imm_comp_XFORM>; + // Define Thumb2 specific addressing modes. // t2addrmode_imm12 := reg + imm12 @@ -637,8 +641,7 @@ multiclass T2I_st<bits<2> opcod, string opc, PatFrag opnode> { multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> { def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, opc, ".w\t$dst, $src", - [(set GPR:$dst, (opnode GPR:$src))]>, - Requires<[HasT2ExtractPack]> { + [(set GPR:$dst, (opnode GPR:$src))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; @@ -649,8 +652,7 @@ multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> { } def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi, opc, ".w\t$dst, $src, ror $rot", - [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>, - Requires<[HasT2ExtractPack]> { + [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; @@ -661,8 +663,8 @@ multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> { } } -// SXTB16 and UXTB16 do not need the .w qualifier. 
-multiclass T2I_unary_rrot_nw<bits<3> opcod, string opc, PatFrag opnode> { +// UXTB16 - Requres T2ExtractPack, does not need the .w qualifier. +multiclass T2I_unary_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> { def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, opc, "\t$dst, $src", [(set GPR:$dst, (opnode GPR:$src))]>, @@ -689,9 +691,9 @@ multiclass T2I_unary_rrot_nw<bits<3> opcod, string opc, PatFrag opnode> { } } -// DO variant - disassembly only, no pattern - -multiclass T2I_unary_rrot_DO<bits<3> opcod, string opc> { +// SXTB16 - Requres T2ExtractPack, does not need the .w qualifier, no pattern +// supported yet. +multiclass T2I_unary_rrot_sxtb16<bits<3> opcod, string opc> { def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, opc, "\t$dst, $src", []> { let Inst{31-27} = 0b11111; @@ -787,6 +789,7 @@ def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; } +} // neverHasSideEffects def t2LEApcrelJT : T2XI<(outs GPR:$dst), (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi, "adr$p.w\t$dst, #${label}_${id}", []> { @@ -798,7 +801,6 @@ def t2LEApcrelJT : T2XI<(outs GPR:$dst), let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; } -} // neverHasSideEffects // ADD r, sp, {so_imm|i12} def t2ADDrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), @@ -1330,7 +1332,7 @@ defm t2SXTB : T2I_unary_rrot<0b100, "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>; defm t2SXTH : T2I_unary_rrot<0b000, "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>; -defm t2SXTB16 : T2I_unary_rrot_DO<0b010, "sxtb16">; +defm t2SXTB16 : T2I_unary_rrot_sxtb16<0b010, "sxtb16">; defm t2SXTAB : T2I_bin_rrot<0b100, "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; @@ -1347,13 +1349,13 @@ defm t2UXTB : T2I_unary_rrot<0b101, "uxtb", UnOpFrag<(and node:$Src, 0x000000FF)>>; defm t2UXTH : T2I_unary_rrot<0b001, "uxth", UnOpFrag<(and node:$Src, 0x0000FFFF)>>; -defm t2UXTB16 : T2I_unary_rrot_nw<0b011, "uxtb16", +defm t2UXTB16 : T2I_unary_rrot_uxtb16<0b011, "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; def : T2Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF), - (t2UXTB16r_rot GPR:$Src, 24)>; + (t2UXTB16r_rot GPR:$Src, 24)>, Requires<[HasT2ExtractPack]>; def : T2Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF), - (t2UXTB16r_rot GPR:$Src, 8)>; + (t2UXTB16r_rot GPR:$Src, 8)>, Requires<[HasT2ExtractPack]>; defm t2UXTAB : T2I_bin_rrot<0b101, "uxtab", BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; @@ -1393,13 +1395,32 @@ defm t2RSBS : T2I_rbin_s_is <0b1110, "rsb", BinOpFrag<(subc node:$LHS, node:$RHS)>>; // (sub X, imm) gets canonicalized to (add X, -imm). Match this form. +// The assume-no-carry-in form uses the negation of the input since add/sub +// assume opposite meanings of the carry flag (i.e., carry == !borrow). +// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory +// details. +// The AddedComplexity preferences the first variant over the others since +// it can be shrunk to a 16-bit wide encoding, while the others cannot. 
+let AddedComplexity = 1 in +def : T2Pat<(add GPR:$src, imm0_255_neg:$imm), + (t2SUBri GPR:$src, imm0_255_neg:$imm)>; +def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm), + (t2SUBri GPR:$src, t2_so_imm_neg:$imm)>; +def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm), + (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>; +let AddedComplexity = 1 in +def : T2Pat<(addc GPR:$src, imm0_255_neg:$imm), + (t2SUBSri GPR:$src, imm0_255_neg:$imm)>; +def : T2Pat<(addc GPR:$src, t2_so_imm_neg:$imm), + (t2SUBSri GPR:$src, t2_so_imm_neg:$imm)>; +// The with-carry-in form matches bitwise not instead of the negation. +// Effectively, the inverse interpretation of the carry flag already accounts +// for part of the negation. let AddedComplexity = 1 in -def : T2Pat<(add GPR:$src, imm0_255_neg:$imm), - (t2SUBri GPR:$src, imm0_255_neg:$imm)>; -def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm), - (t2SUBri GPR:$src, t2_so_imm_neg:$imm)>; -def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm), - (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>; +def : T2Pat<(adde GPR:$src, imm0_255_not:$imm), + (t2SBCSri GPR:$src, imm0_255_not:$imm)>; +def : T2Pat<(adde GPR:$src, t2_so_imm_not:$imm), + (t2SBCSri GPR:$src, t2_so_imm_not:$imm)>; // Select Bytes -- for disassembly only @@ -2389,37 +2410,36 @@ let isCall = 1, // except for our own input by listing the relevant registers in Defs. By // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved resgisters, which is exactly what we want. -// The current SP is passed in $val, and we reuse the reg as a scratch. +// $val is a scratch register for our use. let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, - D31 ] in { + D31 ], hasSideEffects = 1, isBarrier = 1 in { def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins GPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, - "str\t$val, [$src, #8]\t${:comment} begin eh.setjmp\n" - "\tmov\t$val, pc\n" - "\tadds\t$val, #7\n" - "\tstr\t$val, [$src, #4]\n" - "\tmovs\tr0, #0\n" - "\tb\t1f\n" - "\tmovs\tr0, #1\t${:comment} end eh.setjmp\n" + "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" + "adds\t$val, #7\n\t" + "str\t$val, [$src, #4]\n\t" + "movs\tr0, #0\n\t" + "b\t1f\n\t" + "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" "1:", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src, tGPR:$val))]>, Requires<[IsThumb2, HasVFP2]>; } let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ] in { + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], + hasSideEffects = 1, isBarrier = 1 in { def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins GPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, - "str\t$val, [$src, #8]\t${:comment} begin eh.setjmp\n" - "\tmov\t$val, pc\n" - "\tadds\t$val, #7\n" - "\tstr\t$val, [$src, #4]\n" - "\tmovs\tr0, #0\n" - "\tb\t1f\n" - "\tmovs\tr0, #1\t${:comment} end eh.setjmp\n" + "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" + "adds\t$val, #7\n\t" + "str\t$val, [$src, #4]\n\t" + "movs\tr0, #0\n\t" + "b\t1f\n\t" + "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" "1:", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src, tGPR:$val))]>, Requires<[IsThumb2, NoVFP]>; @@ -2438,7 +2458,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, hasExtraDefRegAllocReq = 1 in def t2LDM_RET : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), IIC_Br, - 
"ldm${addr:submode}${p}${addr:wide}\t$addr, $dsts", + "ldm${addr:submode}${p}${addr:wide}\t$addr!, $dsts", "$addr.addr = $wb", []> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b00; @@ -2529,6 +2549,7 @@ def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br, // IT block +let Defs = [ITSTATE] in def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask), AddrModeNone, Size2Bytes, IIC_iALUx, "it$mask\t$cc", "", []> { @@ -2691,7 +2712,8 @@ def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), // scheduling. let canFoldAsLoad = 1, isReMaterializable = 1 in def t2LDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), - NoItinerary, "${:comment} ldr.w\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", + NoItinerary, + "${:comment} ldr.w\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), imm:$cp))]>, Requires<[IsThumb2]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td index 54474cf..84c23e1 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -255,25 +255,25 @@ def VCVTSD : VFPAI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm, // Between half-precision and single-precision. For disassembly only. -def VCVTBSH : ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), +def VCVTBSH: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$dst, $a", [/* For disassembly only; pattern left blank */]>; def : ARMPat<(f32_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; -def VCVTBHS : ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), +def VCVTBHS: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$dst, $a", [/* For disassembly only; pattern left blank */]>; def : ARMPat<(f16_to_f32 GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def VCVTTSH : ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), +def VCVTTSH: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$dst, $a", [/* For disassembly only; pattern left blank */]>; -def VCVTTHS : ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), +def VCVTTHS: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$dst, $a", [/* For disassembly only; pattern left blank */]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.h b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h index ff332b7..f5d9eff 100644 --- a/contrib/llvm/lib/Target/ARM/ARMJITInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h @@ -143,7 +143,8 @@ namespace llvm { JumpTableId2AddrMap[JTI] = Addr; } - /// getPCLabelAddr - Retrieve the address of the PC label of the specified id. + /// getPCLabelAddr - Retrieve the address of the PC label of the + /// specified id. 
intptr_t getPCLabelAddr(unsigned Id) const { DenseMap<unsigned, intptr_t>::const_iterator I = PCLabelMap.find(Id); assert(I != PCLabelMap.end()); diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 8585c1e..f80e316 100644 --- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -74,11 +74,14 @@ namespace { private: struct MemOpQueueEntry { int Offset; + unsigned Reg; + bool isKill; unsigned Position; MachineBasicBlock::iterator MBBI; bool Merged; - MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i) - : Offset(o), Position(p), MBBI(i), Merged(false) {} + MemOpQueueEntry(int o, unsigned r, bool k, unsigned p, + MachineBasicBlock::iterator i) + : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {} }; typedef SmallVector<MemOpQueueEntry,8> MemOpQueue; typedef MemOpQueue::iterator MemOpQueueIter; @@ -128,30 +131,30 @@ namespace { static int getLoadStoreMultipleOpcode(int Opcode) { switch (Opcode) { case ARM::LDR: - NumLDMGened++; + ++NumLDMGened; return ARM::LDM; case ARM::STR: - NumSTMGened++; + ++NumSTMGened; return ARM::STM; case ARM::t2LDRi8: case ARM::t2LDRi12: - NumLDMGened++; + ++NumLDMGened; return ARM::t2LDM; case ARM::t2STRi8: case ARM::t2STRi12: - NumSTMGened++; + ++NumSTMGened; return ARM::t2STM; case ARM::VLDRS: - NumVLDMGened++; + ++NumVLDMGened; return ARM::VLDMS; case ARM::VSTRS: - NumVSTMGened++; + ++NumVSTMGened; return ARM::VSTMS; case ARM::VLDRD: - NumVLDMGened++; + ++NumVLDMGened; return ARM::VLDMD; case ARM::VSTRD: - NumVSTMGened++; + ++NumVSTMGened; return ARM::VSTMD; default: llvm_unreachable("Unhandled opcode!"); } @@ -264,45 +267,59 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, // MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on // success. -void ARMLoadStoreOpt:: -MergeOpsUpdate(MachineBasicBlock &MBB, - MemOpQueue &memOps, - unsigned memOpsBegin, - unsigned memOpsEnd, - unsigned insertAfter, - int Offset, - unsigned Base, - bool BaseKill, - int Opcode, - ARMCC::CondCodes Pred, - unsigned PredReg, - unsigned Scratch, - DebugLoc dl, - SmallVector<MachineBasicBlock::iterator, 4> &Merges) { +void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, + MemOpQueue &memOps, + unsigned memOpsBegin, unsigned memOpsEnd, + unsigned insertAfter, int Offset, + unsigned Base, bool BaseKill, + int Opcode, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned Scratch, + DebugLoc dl, + SmallVector<MachineBasicBlock::iterator, 4> &Merges) { // First calculate which of the registers should be killed by the merged // instruction. 
- SmallVector<std::pair<unsigned, bool>, 8> Regs; const unsigned insertPos = memOps[insertAfter].Position; + + SmallSet<unsigned, 4> UnavailRegs; + SmallSet<unsigned, 4> KilledRegs; + DenseMap<unsigned, unsigned> Killer; + for (unsigned i = 0; i < memOpsBegin; ++i) { + if (memOps[i].Position < insertPos && memOps[i].isKill) { + unsigned Reg = memOps[i].Reg; + if (memOps[i].Merged) + UnavailRegs.insert(Reg); + else { + KilledRegs.insert(Reg); + Killer[Reg] = i; + } + } + } + for (unsigned i = memOpsEnd, e = memOps.size(); i != e; ++i) { + if (memOps[i].Position < insertPos && memOps[i].isKill) { + unsigned Reg = memOps[i].Reg; + KilledRegs.insert(Reg); + Killer[Reg] = i; + } + } + + SmallVector<std::pair<unsigned, bool>, 8> Regs; for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { - const MachineOperand &MO = memOps[i].MBBI->getOperand(0); - unsigned Reg = MO.getReg(); - bool isKill = MO.isKill(); + unsigned Reg = memOps[i].Reg; + if (UnavailRegs.count(Reg)) + // Register is killed before and it's not easy / possible to update the + // kill marker on already merged instructions. Abort. + return; // If we are inserting the merged operation after an unmerged operation that // uses the same register, make sure to transfer any kill flag. - for (unsigned j = memOpsEnd, e = memOps.size(); !isKill && j != e; ++j) - if (memOps[j].Position<insertPos) { - const MachineOperand &MOJ = memOps[j].MBBI->getOperand(0); - if (MOJ.getReg() == Reg && MOJ.isKill()) - isKill = true; - } - + bool isKill = memOps[i].isKill || KilledRegs.count(Reg); Regs.push_back(std::make_pair(Reg, isKill)); } // Try to do the merge. MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI; - Loc++; + ++Loc; if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Regs)) return; @@ -311,13 +328,13 @@ MergeOpsUpdate(MachineBasicBlock &MBB, Merges.push_back(prior(Loc)); for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { // Remove kill flags from any unmerged memops that come before insertPos. - if (Regs[i-memOpsBegin].second) - for (unsigned j = memOpsEnd, e = memOps.size(); j != e; ++j) - if (memOps[j].Position<insertPos) { - MachineOperand &MOJ = memOps[j].MBBI->getOperand(0); - if (MOJ.getReg() == Regs[i-memOpsBegin].first && MOJ.isKill()) - MOJ.setIsKill(false); - } + if (Regs[i-memOpsBegin].second) { + unsigned Reg = Regs[i-memOpsBegin].first; + if (KilledRegs.count(Reg)) { + unsigned j = Killer[Reg]; + memOps[j].MBBI->getOperand(0).setIsKill(false); + } + } MBB.erase(memOps[i].MBBI); memOps[i].Merged = true; } @@ -517,8 +534,11 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, } // Try merging with the previous instruction. - if (MBBI != MBB.begin()) { + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + if (MBBI != BeginMBBI) { MachineBasicBlock::iterator PrevMBBI = prior(MBBI); + while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) + --PrevMBBI; if (isAM4) { if (Mode == ARM_AM::ia && isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { @@ -541,8 +561,11 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, } // Try merging with the next instruction. 
- if (!DoMerge && MBBI != MBB.end()) { + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (!DoMerge && MBBI != EndMBBI) { MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; if (isAM4) { if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { @@ -669,8 +692,11 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); // Try merging with the previous instruction. - if (MBBI != MBB.begin()) { + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + if (MBBI != BeginMBBI) { MachineBasicBlock::iterator PrevMBBI = prior(MBBI); + while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) + --PrevMBBI; if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) { DoMerge = true; AddSub = ARM_AM::sub; @@ -685,8 +711,11 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, } // Try merging with the next instruction. - if (!DoMerge && MBBI != MBB.end()) { + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (!DoMerge && MBBI != EndMBBI) { MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; if (!isAM5 && isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) { DoMerge = true; @@ -759,18 +788,21 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, /// isMemoryOp - Returns true if instruction is a memory operations (that this /// pass is capable of operating on). static bool isMemoryOp(const MachineInstr *MI) { - if (MI->hasOneMemOperand()) { - const MachineMemOperand *MMO = *MI->memoperands_begin(); + // When no memory operands are present, conservatively assume unaligned, + // volatile, unfoldable. + if (!MI->hasOneMemOperand()) + return false; - // Don't touch volatile memory accesses - we may be changing their order. - if (MMO->isVolatile()) - return false; + const MachineMemOperand *MMO = *MI->memoperands_begin(); - // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is - // not. - if (MMO->getAlignment() < 4) - return false; - } + // Don't touch volatile memory accesses - we may be changing their order. + if (MMO->isVolatile()) + return false; + + // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is + // not. + if (MMO->getAlignment() < 4) + return false; // str <undef> could probably be eliminated entirely, but for now we just want // to avoid making a mess of it. @@ -898,6 +930,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum) return false; + MachineBasicBlock::iterator NewBBI = MBBI; bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8; bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8; bool EvenDeadKill = isLd ? @@ -942,6 +975,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); ++NumSTRD2STM; } + NewBBI = llvm::prior(MBBI); } else { // Split into two instructions. 
assert((!isT2 || !OffReg) && @@ -962,14 +996,15 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, OddReg, OddDeadKill, false, BaseReg, false, BaseUndef, OffReg, false, OffUndef, Pred, PredReg, TII, isT2); + NewBBI = llvm::prior(MBBI); InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, false, BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef, Pred, PredReg, TII, isT2); } else { if (OddReg == EvenReg && EvenDeadKill) { - // If the two source operands are the same, the kill marker is probably - // on the first one. e.g. + // If the two source operands are the same, the kill marker is + // probably on the first one. e.g. // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0 EvenDeadKill = false; OddDeadKill = true; @@ -978,6 +1013,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, EvenReg, EvenDeadKill, EvenUndef, BaseReg, false, BaseUndef, OffReg, false, OffUndef, Pred, PredReg, TII, isT2); + NewBBI = llvm::prior(MBBI); InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, OddReg, OddDeadKill, OddUndef, BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef, @@ -989,8 +1025,9 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, ++NumSTRD2STR; } - MBBI = prior(MBBI); MBB.erase(MI); + MBBI = NewBBI; + return true; } return false; } @@ -1023,6 +1060,9 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { if (isMemOp) { int Opcode = MBBI->getOpcode(); unsigned Size = getLSMultipleTransferSize(MBBI); + const MachineOperand &MO = MBBI->getOperand(0); + unsigned Reg = MO.getReg(); + bool isKill = MO.isDef() ? false : MO.isKill(); unsigned Base = MBBI->getOperand(1).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg); @@ -1044,8 +1084,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { CurrSize = Size; CurrPred = Pred; CurrPredReg = PredReg; - MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI)); - NumMemOps++; + MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI)); + ++NumMemOps; Advance = true; } else { if (Clobber) { @@ -1057,15 +1097,17 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // No need to match PredReg. // Continue adding to the queue. if (Offset > MemOps.back().Offset) { - MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI)); - NumMemOps++; + MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, + Position, MBBI)); + ++NumMemOps; Advance = true; } else { for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) { if (Offset < I->Offset) { - MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI)); - NumMemOps++; + MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill, + Position, MBBI)); + ++NumMemOps; Advance = true; break; } else if (Offset == I->Offset) { @@ -1078,7 +1120,12 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { } } - if (Advance) { + if (MBBI->isDebugValue()) { + ++MBBI; + if (MBBI == E) + // Reach the end of the block, try merging the memory instructions. + TryMerge = true; + } else if (Advance) { ++Position; ++MBBI; if (MBBI == E) @@ -1279,7 +1326,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, // some day. 
SmallSet<unsigned, 4> AddedRegPressure; while (++I != E) { - if (MemOps.count(&*I)) + if (I->isDebugValue() || MemOps.count(&*I)) continue; const TargetInstrDesc &TID = I->getDesc(); if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects()) @@ -1411,7 +1458,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, std::sort(Ops.begin(), Ops.end(), OffsetCompare()); // The loads / stores of the same base are in order. Scan them from first to - // last and check for the followins: + // last and check for the following: // 1. Any def of base. // 2. Any gaps. while (Ops.size() > 1) { @@ -1474,7 +1521,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, } else { // This is the new location for the loads / stores. MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp; - while (InsertPos != MBB->end() && MemOps.count(InsertPos)) + while (InsertPos != MBB->end() + && (MemOps.count(InsertPos) || InsertPos->isDebugValue())) ++InsertPos; // If we are moving a pair of loads / stores, see if it makes sense @@ -1562,7 +1610,9 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { break; } - MI2LocMap[MI] = Loc++; + if (!MI->isDebugValue()) + MI2LocMap[MI] = ++Loc; + if (!isMemoryOp(MI)) continue; unsigned PredReg = 0; diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 0134276..7e57a1c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -88,6 +88,9 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// VarArgsFrameIndex - FrameIndex for start of varargs area. int VarArgsFrameIndex; + /// HasITBlocks - True if IT blocks have been inserted. + bool HasITBlocks; + public: ARMFunctionInfo() : isThumb(false), @@ -97,7 +100,8 @@ public: FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0), - JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0) {} + JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0), + HasITBlocks(false) {} explicit ARMFunctionInfo(MachineFunction &MF) : isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()), @@ -108,7 +112,8 @@ public: GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32), SpilledCSRegs(MF.getTarget().getRegisterInfo()->getNumRegs()), - JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0) {} + JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0), + HasITBlocks(false) {} bool isThumbFunction() const { return isThumb; } bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; } @@ -229,6 +234,9 @@ public: int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } + + bool hasITBlocks() const { return HasITBlocks; } + void setHasITBlocks(bool h) { HasITBlocks = h; } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td index 6beca8b..d020f3c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -153,11 +153,11 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>; // Pseudo 256-bit registers to represent pairs of Q registers. These should // never be present in the emitted code. -// These are used for NEON load / store instructions, e.g. 
vld4, vst3. -// NOTE: It's possible to define more QQ registers since technical the -// starting D register number doesn't have to be multiple of 4. e.g. -// D1, D2, D3, D4 would be a legal quad. But that would make the sub-register -// stuffs very messy. +// These are used for NEON load / store instructions, e.g., vld4, vst3. +// NOTE: It's possible to define more QQ registers since technically the +// starting D register number doesn't have to be multiple of 4, e.g., +// D1, D2, D3, D4 would be a legal quad, but that would make the subregister +// stuff very messy. let SubRegIndices = [qsub_0, qsub_1] in { let CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1), (ssub_4 qsub_1, ssub_0), (ssub_5 qsub_1, ssub_1), @@ -183,7 +183,8 @@ let CompositeIndices = [(qsub_2 qqsub_1, qsub_0), (qsub_3 qqsub_1, qsub_1), (ssub_8 qqsub_1, ssub_0), (ssub_9 qqsub_1, ssub_1), (ssub_10 qqsub_1, ssub_2), (ssub_11 qqsub_1, ssub_3), (ssub_12 qqsub_1, ssub_4), (ssub_13 qqsub_1, ssub_5), - (ssub_14 qqsub_1, ssub_6), (ssub_15 qqsub_1, ssub_7)] in { + (ssub_14 qqsub_1, ssub_6), (ssub_15 qqsub_1, ssub_7)] in +{ def QQQQ0 : ARMReg<0, "qqqq0", [QQ0, QQ1]>; def QQQQ1 : ARMReg<1, "qqqq1", [QQ2, QQ3]>; } @@ -196,9 +197,9 @@ def QQQQ3 : ARMReg<3, "qqqq3", [QQ6, QQ7]>; } // Current Program Status Register. -def CPSR : ARMReg<0, "cpsr">; - -def FPSCR : ARMReg<1, "fpscr">; +def CPSR : ARMReg<0, "cpsr">; +def FPSCR : ARMReg<1, "fpscr">; +def ITSTATE : ARMReg<2, "itstate">; // Register classes. // @@ -348,6 +349,73 @@ def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> { }]; } +// For tail calls, we can't use callee-saved registers, as they are restored +// to the saved value before the tail call, which would clobber a call address. +// Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of +// this class and the preceding one(!) This is what we want. +def tcGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R9, R12]> { + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + // R9 is available. + static const unsigned ARM_GPR_R9_TC[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R9, ARM::R12 }; + // R9 is not available. + static const unsigned ARM_GPR_NOR9_TC[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R12 }; + + // For Thumb1 mode, we don't want to allocate hi regs at all, as we + // don't know how to spill them. If we make our prologue/epilogue code + // smarter at some point, we can go back to using the above allocation + // orders for the Thumb1 instructions that know how to use hi regs. + static const unsigned THUMB_GPR_AO_TC[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + + tcGPRClass::iterator + tcGPRClass::allocation_order_begin(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); + if (Subtarget.isThumb1Only()) + return THUMB_GPR_AO_TC; + if (Subtarget.isTargetDarwin()) { + if (Subtarget.isR9Reserved()) + return ARM_GPR_NOR9_TC; + else + return ARM_GPR_R9_TC; + } else + // R9 is either callee-saved or reserved; can't use it. 
+ return ARM_GPR_NOR9_TC; + } + + tcGPRClass::iterator + tcGPRClass::allocation_order_end(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); + GPRClass::iterator I; + + if (Subtarget.isThumb1Only()) { + I = THUMB_GPR_AO_TC + (sizeof(THUMB_GPR_AO_TC)/sizeof(unsigned)); + return I; + } + + if (Subtarget.isTargetDarwin()) { + if (Subtarget.isR9Reserved()) + I = ARM_GPR_NOR9_TC + (sizeof(ARM_GPR_NOR9_TC)/sizeof(unsigned)); + else + I = ARM_GPR_R9_TC + (sizeof(ARM_GPR_R9_TC)/sizeof(unsigned)); + } else + // R9 is either callee-saved or reserved; can't use it. + I = ARM_GPR_NOR9_TC + (sizeof(ARM_GPR_NOR9_TC)/sizeof(unsigned)); + return I; + } + }]; +} + + // Scalar single precision floating point register class.. def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, @@ -479,4 +547,3 @@ def QQQQPR : RegisterClass<"ARM", [v8i64], // Condition code registers. def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>; - diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td index bbfc0b2..282abca 100644 --- a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td @@ -1,10 +1,10 @@ //=- ARMScheduleA8.td - ARM Cortex-A8 Scheduling Definitions -*- tablegen -*-=// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file defines the itinerary class data for the ARM Cortex A8 processors. @@ -32,50 +32,50 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData<IIC_iALUx , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>, // // Binary Instructions that produce a result - InstrItinData<IIC_iALUi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, - InstrItinData<IIC_iALUr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>, - InstrItinData<IIC_iALUsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, - InstrItinData<IIC_iALUsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>, + InstrItinData<IIC_iALUi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iALUr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>, + InstrItinData<IIC_iALUsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iALUsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>, // // Unary Instructions that produce a result - InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, - InstrItinData<IIC_iUNAsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iUNAsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iUNAsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iUNAsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, // // Compare instructions - InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, - InstrItinData<IIC_iCMPr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>, - InstrItinData<IIC_iCMPsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iCMPsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, 
[A8_Pipe0, A8_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iCMPsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMPsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, // // Move instructions, unconditional - InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>, - InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, - InstrItinData<IIC_iMOVsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, - InstrItinData<IIC_iMOVsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>, + InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>, // // Move instructions, conditional - InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, - InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, - InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, + InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>, // Integer multiply pipeline // Result written in E5, but that is relative to the last cycle of multicycle, // so we use 6 for those cases // InstrItinData<IIC_iMUL16 , [InstrStage<1, [A8_Pipe0]>], [5, 1, 1]>, - InstrItinData<IIC_iMAC16 , [InstrStage<1, [A8_Pipe1], 0>, + InstrItinData<IIC_iMAC16 , [InstrStage<1, [A8_Pipe1], 0>, InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, - InstrItinData<IIC_iMUL32 , [InstrStage<1, [A8_Pipe1], 0>, + InstrItinData<IIC_iMUL32 , [InstrStage<1, [A8_Pipe1], 0>, InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>, - InstrItinData<IIC_iMAC32 , [InstrStage<1, [A8_Pipe1], 0>, + InstrItinData<IIC_iMAC32 , [InstrStage<1, [A8_Pipe1], 0>, InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, - InstrItinData<IIC_iMUL64 , [InstrStage<2, [A8_Pipe1], 0>, + InstrItinData<IIC_iMUL64 , [InstrStage<2, [A8_Pipe1], 0>, InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, - InstrItinData<IIC_iMAC64 , [InstrStage<2, [A8_Pipe1], 0>, + InstrItinData<IIC_iMAC64 , [InstrStage<2, [A8_Pipe1], 0>, InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, - + // Integer load pipeline // // loads have an extra cycle of latency, but are fully pipelined @@ -166,7 +166,7 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<2, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0]>]>, - + // Branch // // no delay slots, so the latency of a branch is unimportant @@ -276,14 +276,14 @@ def CortexA8Itineraries : ProcessorItineraries< // // Single-precision FP Load // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>]>, // // Double-precision FP Load // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpLoad64, [InstrStage<2, [A8_Issue], 0>, + InstrItinData<IIC_fpLoad64, [InstrStage<2, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0], 0>, InstrStage<1, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -292,7 +292,7 
@@ def CortexA8Itineraries : ProcessorItineraries< // // FP Load Multiple // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpLoadm, [InstrStage<3, [A8_Issue], 0>, + InstrItinData<IIC_fpLoadm, [InstrStage<3, [A8_Issue], 0>, InstrStage<2, [A8_Pipe0], 0>, InstrStage<2, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -301,14 +301,14 @@ def CortexA8Itineraries : ProcessorItineraries< // // Single-precision FP Store // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>]>, // // Double-precision FP Store // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpStore64,[InstrStage<2, [A8_Issue], 0>, + InstrItinData<IIC_fpStore64,[InstrStage<2, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0], 0>, InstrStage<1, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -317,7 +317,7 @@ def CortexA8Itineraries : ProcessorItineraries< // // FP Store Multiple // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData<IIC_fpStorem, [InstrStage<3, [A8_Issue], 0>, + InstrItinData<IIC_fpStorem, [InstrStage<3, [A8_Issue], 0>, InstrStage<2, [A8_Pipe0], 0>, InstrStage<2, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -329,35 +329,35 @@ def CortexA8Itineraries : ProcessorItineraries< // // VLD1 // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD1, [InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_VLD1, [InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>]>, // // VLD2 // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD2, [InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_VLD2, [InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 2, 1]>, // // VLD3 // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD3, [InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_VLD3, [InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 2, 2, 1]>, // // VLD4 // FIXME: We don't model this instruction properly - InstrItinData<IIC_VLD4, [InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_VLD4, [InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 2, 2, 2, 1]>, // // VST // FIXME: We don't model this instruction properly - InstrItinData<IIC_VST, [InstrStage<1, [A8_Issue], 0>, + InstrItinData<IIC_VST, [InstrStage<1, [A8_Issue], 0>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>]>, @@ -600,7 +600,7 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData<IIC_VTB4, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, - InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 3, 1]>, + InstrStage<2, [A8_NLSPipe]>],[4, 2, 2, 3, 3, 1]>, // // VTBX InstrItinData<IIC_VTBX1, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -610,9 +610,9 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData<IIC_VTBX3, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, - InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 1]>, + InstrStage<2, [A8_NLSPipe]>],[4, 1, 2, 2, 3, 1]>, 
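An aside on reading these itinerary entries: the trailing bracketed list (e.g. [4, 1, 2, 2, 3, 1] on the IIC_VTBX3 entry just above) gives per-operand cycles, defs first and then uses, which the scheduler turns into def-to-use latencies. The sketch below is a standalone toy with invented names (ToyItinerary, defToUseLatency), not LLVM's InstrItineraryData API; it only illustrates the arithmetic, under the assumption that latency is roughly the def cycle minus the use cycle.

```cpp
#include <cstdio>
#include <vector>

// Toy stand-in for an itinerary entry's operand-cycle list. Names are
// hypothetical; LLVM's real data lives in InstrItinData/InstrItineraryData.
struct ToyItinerary {
  std::vector<int> OperandCycles; // defs first, then uses
};

// Approximate latency from DefIdx of the producer to UseIdx of the consumer:
// the cycle the value is produced minus the cycle the consumer reads it,
// clamped at 1 so a dependence never has zero latency in this toy model.
static int defToUseLatency(const ToyItinerary &Prod, unsigned DefIdx,
                           const ToyItinerary &Cons, unsigned UseIdx) {
  int DefCycle = Prod.OperandCycles[DefIdx];
  int UseCycle = Cons.OperandCycles[UseIdx];
  int Latency = DefCycle - UseCycle + 1;
  return Latency > 1 ? Latency : 1;
}

int main() {
  // IIC_VTBX3-like entry: result available in cycle 4, sources read in 1,2,2,3,1.
  ToyItinerary VTBX3 = {{4, 1, 2, 2, 3, 1}};
  // IIC_iALUr-like entry: result in cycle 2, both sources read in cycle 2.
  ToyItinerary ALUr  = {{2, 2, 2}};
  std::printf("VTBX3 -> ALUr latency: %d\n",
              defToUseLatency(VTBX3, /*DefIdx=*/0, ALUr, /*UseIdx=*/1));
  return 0;
}
```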
InstrItinData<IIC_VTBX4, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, - InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]> + InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]> ]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td index 75320d9..df2f896 100644 --- a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td @@ -1,10 +1,10 @@ //=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file defines the itinerary class data for the ARM Cortex A9 processors. @@ -16,7 +16,6 @@ // Reference Manual". // // Functional units -def A9_Issue : FuncUnit; // issue def A9_Pipe0 : FuncUnit; // pipeline 0 def A9_Pipe1 : FuncUnit; // pipeline 1 def A9_LSPipe : FuncUnit; // LS pipe @@ -27,7 +26,121 @@ def A9_DRegsN : FuncUnit; // FP register set, NEON side // Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1 // def CortexA9Itineraries : ProcessorItineraries< - [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1, A9_Issue], [ + [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1], [ + // Two fully-pipelined integer ALU pipelines + // FIXME: There are no operand latencies for these instructions at all! + // + // Move instructions, unconditional + InstrItinData<IIC_iMOVi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1]>, + InstrItinData<IIC_iMOVr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [1, 1]>, + InstrItinData<IIC_iMOVsr , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>, + // + // No operand cycles + InstrItinData<IIC_iALUx , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>, + // + // Binary Instructions that produce a result + InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2, 2]>, + InstrItinData<IIC_iALUsi, [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1]>, + InstrItinData<IIC_iALUsr,[InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 2, 1, 1]>, + // + // Unary Instructions that produce a result + InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iUNAsi , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iUNAsr , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>, + // + // Compare instructions + InstrItinData<IIC_iCMPi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>, + InstrItinData<IIC_iCMPr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 2]>, + InstrItinData<IIC_iCMPsi , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMPsr , [InstrStage<3, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>, + // + // Move instructions, conditional + InstrItinData<IIC_iCMOVi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2]>, + InstrItinData<IIC_iCMOVr , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>], [2, 1]>, + InstrItinData<IIC_iCMOVsr , [InstrStage<2, [A9_Pipe0, A9_Pipe1]>], [2, 1, 1]>, + + // Integer multiply pipeline + // + InstrItinData<IIC_iMUL16 , [InstrStage<1, [A9_Pipe1], 0>, + InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>, + InstrItinData<IIC_iMAC16 , [InstrStage<1, 
[A9_Pipe1], 0>, + InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>, + InstrItinData<IIC_iMUL32 , [InstrStage<1, [A9_Pipe1], 0>, + InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>, + InstrItinData<IIC_iMAC32 , [InstrStage<1, [A9_Pipe1], 0>, + InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>, + InstrItinData<IIC_iMUL64 , [InstrStage<2, [A9_Pipe1], 0>, + InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>, + InstrItinData<IIC_iMAC64 , [InstrStage<2, [A9_Pipe1], 0>, + InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>, + // Integer load pipeline + // FIXME: The timings are some rough approximations + // + // Immediate offset + InstrItinData<IIC_iLoadi , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [3, 1]>, + // + // Register offset + InstrItinData<IIC_iLoadr , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>, + // + // Scaled register offset + InstrItinData<IIC_iLoadsi , [InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_LSPipe]>], [4, 1, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iLoadiu , [InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_LSPipe]>], [3, 2, 1]>, + // + // Register offset with update + InstrItinData<IIC_iLoadru , [InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_LSPipe]>], [3, 2, 1, 1]>, + // + // Scaled register offset with update + InstrItinData<IIC_iLoadsiu , [InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_LSPipe]>], [4, 3, 1, 1]>, + // + // Load multiple + InstrItinData<IIC_iLoadm , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>]>, + + // Integer store pipeline + /// + // Immediate offset + InstrItinData<IIC_iStorei , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [3, 1]>, + // + // Register offset + InstrItinData<IIC_iStorer , [InstrStage<1, [ A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>, + // + // Scaled register offset + InstrItinData<IIC_iStoresi , [InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_LSPipe]>], [3, 1, 1]>, + // + // Immediate offset with update + InstrItinData<IIC_iStoreiu , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [2, 3, 1]>, + // + // Register offset with update + InstrItinData<IIC_iStoreru , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>], [2, 3, 1, 1]>, + // + // Scaled register offset with update + InstrItinData<IIC_iStoresiu, [InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_LSPipe]>], [3, 3, 1, 1]>, + // + // Store multiple + InstrItinData<IIC_iStorem , [InstrStage<1, [A9_Pipe1]>, + InstrStage<1, [A9_LSPipe]>]>, + // Branch + // + // no delay slots, so the latency of a branch is unimportant + InstrItinData<IIC_Br , [InstrStage<1, [A9_Pipe0, A9_Pipe1]>]>, + // VFP and NEON shares the same register file. This means that every VFP // instruction should wait for full completion of the consecutive NEON // instruction and vice-versa. We model this behavior with two artificial FUs: @@ -39,8 +152,8 @@ def CortexA9Itineraries : ProcessorItineraries< // register file writeback!). // Every NEON instruction does the same but with FUs swapped. // - // Since the reserved FU cannot be acquired this models precisly "cross-domain" - // stalls. + // Since the reserved FU cannot be acquired, this models precisely + // "cross-domain" stalls. // VFP // Issue through integer pipeline, and execute in NEON unit. 
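The comment above is the key to the A9 itinerary changes in this hunk and the ones that follow: a VFP entry requires A9_DRegsVFP for one cycle and reserves A9_DRegsN until writeback, NEON entries do the mirror image, and because a reserved unit cannot be acquired, an instruction from the other domain stalls until the reservation expires. A rough standalone sketch of that scoreboard behaviour follows (ToyScoreboard is a hypothetical type for illustration, not LLVM's hazard recognizer):

```cpp
#include <algorithm>
#include <cstdio>

// Two artificial "register file side" units, mirroring A9_DRegsVFP/A9_DRegsN.
enum Domain { VFP = 0, NEON = 1 };

struct ToyScoreboard {
  // Cycle until which each side of the register file is reserved.
  int ReservedUntil[2] = {0, 0};

  // Issue an instruction of domain D at or after cycle Now. It must acquire
  // its own side (so it stalls while that side is reserved by the other
  // domain) and then reserves the opposite side for WritebackCycles.
  int issue(Domain D, int Now, int WritebackCycles) {
    int Start = std::max(Now, ReservedUntil[D]);     // cross-domain stall
    Domain Other = (D == VFP) ? NEON : VFP;
    ReservedUntil[Other] = Start + WritebackCycles;  // reserve the other side
    return Start;
  }
};

int main() {
  ToyScoreboard SB;
  // A VFP MAC-style op: reserves the NEON side for 9 cycles.
  int VfpIssue  = SB.issue(VFP,  /*Now=*/0, /*WritebackCycles=*/9);
  // A NEON op right behind it stalls until that reservation expires.
  int NeonIssue = SB.issue(NEON, /*Now=*/1, /*WritebackCycles=*/7);
  std::printf("VFP op issues at %d, NEON op stalls until %d\n",
              VfpIssue, NeonIssue);
  return 0;
}
```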
@@ -48,21 +161,21 @@ def CortexA9Itineraries : ProcessorItineraries< // FP Special Register to Integer Register File Move InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>]>, // // Single-precision FP Unary InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Double-precision FP Unary InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // @@ -70,124 +183,124 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 4 cycles InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Double-precision FP Compare InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra latency cycles since wbck is 4 cycles InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Single to Double FP Convert InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Double to Single FP Convert InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Single to Half FP Convert InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Half to Single FP Convert InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1]>, // // Single-Precision FP to Integer Convert InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Double-Precision FP to Integer Convert InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Integer to Single-Precision FP Convert InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Integer to Double-Precision FP Convert InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + 
InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Single-precision FP ALU InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, // // Double-precision FP ALU InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, // // Single-precision FP Multiply InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<6, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [5, 1, 1]>, // // Double-precision FP Multiply InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<7, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 1, 1]>, // // Single-precision FP MAC InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<9, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [8, 0, 1, 1]>, // // Double-precision FP MAC InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<10, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [9, 0, 1, 1]>, // // Single-precision FP DIV InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<16, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<10, [A9_NPipe]>], [15, 1, 1]>, // // Double-precision FP DIV InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<26, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<20, [A9_NPipe]>], [25, 1, 1]>, // // Single-precision FP SQRT InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<18, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<13, [A9_NPipe]>], [17, 1]>, + InstrStage<1, [A9_Pipe1]>, + InstrStage<13, [A9_NPipe]>], [17, 1]>, // // Double-precision FP SQRT InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<33, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<28, [A9_NPipe]>], [32, 1]>, // @@ -195,92 +308,79 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_fpMOVIS, [InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra 1 latency cycle since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Integer to Double-precision Move InstrItinData<IIC_fpMOVID, [InstrStage<1, [A9_DRegsVFP], 0, Required>, // Extra 1 latency cycle since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1, 1]>, // // Single-precision to Integer Move InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // 
Double-precision to Integer Move InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1, 1]>, // // Single-precision FP Load - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // Double-precision FP Load - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // FP Load Multiple - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData<IIC_fpLoadm, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // Single-precision FP Store - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // Double-precision FP Store - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // FP Store Multiple - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // NEON // Issue through integer pipeline, and execute in NEON unit. - // FIXME: Neon pipeline and LdSt unit are multiplexed. + // FIXME: Neon pipeline and LdSt unit are multiplexed. // Add some syntactic sugar to model this! 
// VLD1 // FIXME: We don't model this instruction properly InstrItinData<IIC_VLD1, [InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // VLD2 @@ -288,9 +388,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VLD2, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>], [2, 2, 1]>, // // VLD3 @@ -298,9 +397,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VLD3, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>], [2, 2, 2, 1]>, // // VLD4 @@ -308,9 +406,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VLD4, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>], [2, 2, 2, 2, 1]>, // // VST @@ -318,121 +415,120 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VST, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // Double-register Integer Unary InstrItinData<IIC_VUNAiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2]>, // // Quad-register Integer Unary InstrItinData<IIC_VUNAiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2]>, // // Double-register Integer Q-Unary InstrItinData<IIC_VQUNAiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Quad-register Integer CountQ-Unary InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Double-register Integer Binary InstrItinData<IIC_VBINiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, 
A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, // // Quad-register Integer Binary InstrItinData<IIC_VBINiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, // // Double-register Integer Subtract InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 1]>, // // Quad-register Integer Subtract InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 1]>, // // Double-register Integer Shift InstrItinData<IIC_VSHLiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 1, 1]>, // // Quad-register Integer Shift InstrItinData<IIC_VSHLiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 1, 1]>, // // Double-register Integer Shift (4 cycle) InstrItinData<IIC_VSHLi4D, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, // // Quad-register Integer Shift (4 cycle) InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, // // Double-register Integer Binary (4 cycle) InstrItinData<IIC_VBINi4D, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2, 2]>, // // Quad-register Integer Binary (4 cycle) InstrItinData<IIC_VBINi4Q, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2, 2]>, // // Double-register Integer Subtract (4 cycle) InstrItinData<IIC_VSUBiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2, 1]>, // // Quad-register Integer Subtract (4 cycle) InstrItinData<IIC_VSUBiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2, 1]>, // @@ -440,7 +536,7 @@ def CortexA9Itineraries : ProcessorItineraries< 
InstrItinData<IIC_VCNTiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, // // Quad-register Integer Count @@ -449,35 +545,35 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VCNTiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [4, 2, 2]>, // // Double-register Absolute Difference and Accumulate InstrItinData<IIC_VABAD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [6, 3, 2, 1]>, // // Quad-register Absolute Difference and Accumulate InstrItinData<IIC_VABAQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>, // // Double-register Integer Pair Add Long InstrItinData<IIC_VPALiD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [6, 3, 1]>, // // Quad-register Integer Pair Add Long InstrItinData<IIC_VPALiQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 3, 1]>, // @@ -485,14 +581,14 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [6, 2, 2]>, // // Quad-register Integer Multiply (.8, .16) InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [7, 2, 2]>, // @@ -500,56 +596,56 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [7, 2, 1]>, // // Quad-register Integer Multiply (.32) InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<4, [A9_NPipe]>], [9, 2, 1]>, // // Double-register Integer Multiply-Accumulate (.8, .16) InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [6, 3, 2, 2]>, // // 
Double-register Integer Multiply-Accumulate (.32) InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [7, 3, 2, 1]>, // // Quad-register Integer Multiply-Accumulate (.8, .16) InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [7, 3, 2, 2]>, // // Quad-register Integer Multiply-Accumulate (.32) InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<4, [A9_NPipe]>], [9, 3, 2, 1]>, // // Move Immediate InstrItinData<IIC_VMOVImm, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3]>, // // Double-register Permute Move InstrItinData<IIC_VMOVD, [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_LSPipe]>], [2, 1]>, // // Quad-register Permute Move @@ -558,42 +654,42 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VMOVQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<4, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1]>, // // Integer to Single-precision Move InstrItinData<IIC_VMOVIS , [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1]>, // // Integer to Double-precision Move InstrItinData<IIC_VMOVID , [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1, 1]>, // // Single-precision to Integer Move InstrItinData<IIC_VMOVSI , [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1]>, // // Double-precision to Integer Move InstrItinData<IIC_VMOVDI , [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 2, 1]>, // // Integer to Lane Move InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_DRegsN], 0, Required>, // FIXME: all latencies are arbitrary, no information is available InstrStage<4, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, 
[A9_NPipe]>], [3, 1, 1]>, // @@ -601,7 +697,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VUNAD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [5, 2]>, // // Quad-register FP Unary @@ -610,7 +706,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VUNAQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 2]>, // // Double-register FP Binary @@ -619,7 +715,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VBIND, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [5, 2, 2]>, // // Quad-register FP Binary @@ -630,14 +726,14 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VBINQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 2, 2]>, // // Double-register FP Multiple-Accumulate InstrItinData<IIC_VMACD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>, // // Quad-register FP Multiple-Accumulate @@ -646,28 +742,28 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VMACQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>, // // Double-register Reciprical Step InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 2, 2]>, // // Quad-register Reciprical Step InstrItinData<IIC_VRECSQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<4, [A9_NPipe]>], [8, 2, 2]>, // // Double-register Permute InstrItinData<IIC_VPERMD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 2, 1, 1]>, // // Quad-register Permute @@ -676,7 +772,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VPERMQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 3, 1, 1]>, // // Quad-register Permute (3 cycle issue) @@ -685,7 +781,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VPERMQ3, 
[InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<3, [A9_LSPipe]>], [4, 4, 1, 1]>, // @@ -693,57 +789,57 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VEXTD, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1, 1]>, // // Quad-register VEXT InstrItinData<IIC_VEXTQ, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 9 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1, 1]>, // // VTB InstrItinData<IIC_VTB1, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 2, 1]>, InstrItinData<IIC_VTB2, [InstrStage<2, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 2, 2, 1]>, InstrItinData<IIC_VTB3, [InstrStage<2, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 1]>, InstrItinData<IIC_VTB4, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 3, 1]>, // // VTBX InstrItinData<IIC_VTBX1, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1, 2, 1]>, InstrItinData<IIC_VTBX2, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1, 2, 2, 1]>, InstrItinData<IIC_VTBX3, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<3, [A9_NPipe]>], [4, 1, 2, 2, 3, 1]>, InstrItinData<IIC_VTBX4, [InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> + InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> ]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td index f813022..08b560c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td @@ -16,7 +16,7 @@ // Functional Units def V6_Pipe : FuncUnit; // pipeline -// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual". 
+// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual" // def ARMV6Itineraries : ProcessorItineraries< [V6_Pipe], [ diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h index 8332bba..e7d92ed 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h @@ -54,6 +54,9 @@ protected: /// the VML[AS] instructions are slow (if so, don't use them). bool SlowVMLx; + /// SlowFPBrcc - True if floating point compare + branch is slow. + bool SlowFPBrcc; + /// IsThumb - True if we are in thumb mode, false if in ARM mode. bool IsThumb; @@ -133,6 +136,7 @@ protected: bool hasDivide() const { return HasHardwareDivide; } bool hasT2ExtractPack() const { return HasT2ExtractPack; } bool useVMLx() const {return hasVFP2() && !SlowVMLx; } + bool isFPBrccSlow() const { return SlowFPBrcc; } bool hasFP16() const { return HasFP16; } diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp index b4a9252..09203f9 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -60,8 +60,10 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT, const std::string &FS) : ARMBaseTargetMachine(T, TT, FS, false), InstrInfo(Subtarget), DataLayout(Subtarget.isAPCS_ABI() ? - std::string("e-p:32:32-f64:32:32-i64:32:32-n32") : - std::string("e-p:32:32-f64:64:64-i64:64:64-n32")), + std::string("e-p:32:32-f64:32:32-i64:32:32-" + "v128:32:128-v64:32:64-n32") : + std::string("e-p:32:32-f64:64:64-i64:64:64-" + "v128:64:128-v64:64:64-n32")), TLInfo(*this), TSInfo(*this) { } @@ -74,9 +76,11 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT, : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))), DataLayout(Subtarget.isAPCS_ABI() ? std::string("e-p:32:32-f64:32:32-i64:32:32-" - "i16:16:32-i8:8:32-i1:8:32-a:0:32-n32") : + "i16:16:32-i8:8:32-i1:8:32-" + "v128:32:128-v64:32:64-a:0:32-n32") : std::string("e-p:32:32-f64:64:64-i64:64:64-" - "i16:16:32-i8:8:32-i1:8:32-a:0:32-n32")), + "i16:16:32-i8:8:32-i1:8:32-" + "v128:64:128-v64:64:64-a:0:32-n32")), TLInfo(*this), TSInfo(*this) { } @@ -98,6 +102,7 @@ bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM, // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) PM.add(createARMLoadStoreOptimizationPass(true)); + return true; } @@ -115,21 +120,20 @@ bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM, // proper scheduling. PM.add(createARMExpandPseudoPass()); - return true; -} - -bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { - // FIXME: temporarily disabling load / store optimization pass for Thumb1. 
if (OptLevel != CodeGenOpt::None) { if (!Subtarget.isThumb1Only()) PM.add(createIfConverterPass()); } - - if (Subtarget.isThumb2()) { + if (Subtarget.isThumb2()) PM.add(createThumb2ITBlockPass()); + + return true; +} + +bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + if (Subtarget.isThumb2()) PM.add(createThumb2SizeReductionPass()); - } PM.add(createARMConstantIslandPass()); return true; diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index bfa89c4..4b08324 100644 --- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -88,7 +88,7 @@ private: /// its register number, or -1 if there is no match. To allow return values /// to be used directly in register lists, arm registers have values between /// 0 and 15. - int MatchRegisterName(const StringRef &Name); + int MatchRegisterName(StringRef Name); /// } @@ -97,7 +97,7 @@ public: ARMAsmParser(const Target &T, MCAsmParser &_Parser) : TargetAsmParser(T), Parser(_Parser) {} - virtual bool ParseInstruction(const StringRef &Name, SMLoc NameLoc, + virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands); virtual bool ParseDirective(AsmToken DirectiveID); @@ -425,7 +425,7 @@ bool ARMAsmParser::ParseMemory(OwningPtr<ARMOperand> &Op) { const AsmToken &NextTok = Parser.getTok(); if (NextTok.isNot(AsmToken::EndOfStatement)) { if (NextTok.isNot(AsmToken::Comma)) - return Error(NextTok.getLoc(), "',' expected"); + return Error(NextTok.getLoc(), "',' expected"); Parser.Lex(); // Eat comma token. if(ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, ShiftAmount, Offset, OffsetIsReg, OffsetRegNum, @@ -488,7 +488,7 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative, const AsmToken &Tok = Parser.getTok(); if (ParseShift(ShiftType, ShiftAmount, E)) - return Error(Tok.getLoc(), "shift expected"); + return Error(Tok.getLoc(), "shift expected"); OffsetRegShifted = true; } } @@ -517,7 +517,7 @@ bool ARMAsmParser::ParseShift(ShiftType &St, const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) return true; - const StringRef &ShiftName = Tok.getString(); + StringRef ShiftName = Tok.getString(); if (ShiftName == "lsl" || ShiftName == "LSL") St = Lsl; else if (ShiftName == "lsr" || ShiftName == "LSR") @@ -549,7 +549,7 @@ bool ARMAsmParser::ParseShift(ShiftType &St, } /// A hack to allow some testing, to be replaced by a real table gen version. -int ARMAsmParser::MatchRegisterName(const StringRef &Name) { +int ARMAsmParser::MatchRegisterName(StringRef Name) { if (Name == "r0" || Name == "R0") return 0; else if (Name == "r1" || Name == "R1") @@ -593,7 +593,7 @@ MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCInst &Inst) { ARMOperand &Op0 = *(ARMOperand*)Operands[0]; assert(Op0.Kind == ARMOperand::Token && "First operand not a Token"); - const StringRef &Mnemonic = Op0.getToken(); + StringRef Mnemonic = Op0.getToken(); if (Mnemonic == "add" || Mnemonic == "stmfd" || Mnemonic == "str" || @@ -658,14 +658,13 @@ bool ARMAsmParser::ParseOperand(OwningPtr<ARMOperand> &Op) { } /// Parse an arm instruction mnemonic followed by its operands. 
-bool ARMAsmParser::ParseInstruction(const StringRef &Name, SMLoc NameLoc, +bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands) { OwningPtr<ARMOperand> Op; ARMOperand::CreateToken(Op, Name, NameLoc); Operands.push_back(Op.take()); - SMLoc Loc = Parser.getTok().getLoc(); if (getLexer().isNot(AsmToken::EndOfStatement)) { // Read the first operand. @@ -762,16 +761,11 @@ bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) { const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) return Error(L, "unexpected token in .syntax directive"); - const StringRef &Mode = Tok.getString(); - bool unified_syntax; - if (Mode == "unified" || Mode == "UNIFIED") { + StringRef Mode = Tok.getString(); + if (Mode == "unified" || Mode == "UNIFIED") Parser.Lex(); - unified_syntax = true; - } - else if (Mode == "divided" || Mode == "DIVIDED") { + else if (Mode == "divided" || Mode == "DIVIDED") Parser.Lex(); - unified_syntax = false; - } else return Error(L, "unrecognized syntax mode in .syntax directive"); @@ -791,15 +785,10 @@ bool ARMAsmParser::ParseDirectiveCode(SMLoc L) { if (Tok.isNot(AsmToken::Integer)) return Error(L, "unexpected token in .code directive"); int64_t Val = Parser.getTok().getIntVal(); - bool thumb_mode; - if (Val == 16) { + if (Val == 16) Parser.Lex(); - thumb_mode = true; - } - else if (Val == 32) { + else if (Val == 32) Parser.Lex(); - thumb_mode = false; - } else return Error(L, "invalid operand to .code directive"); diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp index d95efdb..946f474 100644 --- a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp @@ -175,23 +175,8 @@ namespace { raw_ostream &O); void printVFPf64ImmOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); - - void printHex8ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xff); - } - void printHex16ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffff); - } - void printHex32ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffffffff); - } - void printHex64ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm()); - } + void printNEONModImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, @@ -322,7 +307,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, unsigned DRegLo = TM.getRegisterInfo()->getSubReg(Reg, ARM::dsub_0); unsigned DRegHi = TM.getRegisterInfo()->getSubReg(Reg, ARM::dsub_1); O << '{' - << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi) + << getRegisterName(DRegLo) << ", " << getRegisterName(DRegHi) << '}'; } else if (Modifier && strcmp(Modifier, "lane") == 0) { unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg); @@ -618,7 +603,7 @@ void ARMAsmPrinter::printAddrMode6Operand(const MachineInstr *MI, int Op, O << "[" << getRegisterName(MO1.getReg()); if (MO2.getImm()) { // FIXME: Both darwin as and GNU as violate ARM docs here. 
- O << ", :" << MO2.getImm(); + O << ", :" << (MO2.getImm() << 3); } O << "]"; } @@ -1039,6 +1024,14 @@ void ARMAsmPrinter::printVFPf64ImmOperand(const MachineInstr *MI, int OpNum, } } +void ARMAsmPrinter::printNEONModImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + unsigned EncodedImm = MI->getOperand(OpNum).getImm(); + unsigned EltBits; + uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); + O << "#0x" << utohexstr(Val); +} + bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { @@ -1064,20 +1057,10 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, printOperand(MI, OpNum, O); return false; case 'Q': - if (TM.getTargetData()->isLittleEndian()) - break; - // Fallthrough case 'R': - if (TM.getTargetData()->isBigEndian()) - break; - // Fallthrough - case 'H': // Write second word of DI / DF reference. - // Verify that this operand has two consecutive registers. - if (!MI->getOperand(OpNum).isReg() || - OpNum+1 == MI->getNumOperands() || - !MI->getOperand(OpNum+1).isReg()) - return true; - ++OpNum; // Return the high-part. + case 'H': + report_fatal_error("llvm does not support 'Q', 'R', and 'H' modifiers!"); + return true; } } @@ -1384,11 +1367,11 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { } else if (MO.isGlobal()) { MCSymbol *Symbol = MCInstLowering.GetGlobalAddressSymbol(MO); const MCSymbolRefExpr *SymRef1 = - MCSymbolRefExpr::Create(Symbol, - MCSymbolRefExpr::VK_ARM_LO16, OutContext); + MCSymbolRefExpr::Create(Symbol, + MCSymbolRefExpr::VK_ARM_LO16, OutContext); const MCSymbolRefExpr *SymRef2 = - MCSymbolRefExpr::Create(Symbol, - MCSymbolRefExpr::VK_ARM_HI16, OutContext); + MCSymbolRefExpr::Create(Symbol, + MCSymbolRefExpr::VK_ARM_HI16, OutContext); V1 = MCOperand::CreateExpr(SymRef1); V2 = MCOperand::CreateExpr(SymRef2); } else { diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp index 2b94b76..edc9345 100644 --- a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp @@ -442,7 +442,7 @@ void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum, O << "[" << getRegisterName(MO1.getReg()); if (MO2.getImm()) { // FIXME: Both darwin as and GNU as violate ARM docs here. 
- O << ", :" << MO2.getImm(); + O << ", :" << (MO2.getImm() << 3); } O << "]"; } @@ -779,22 +779,10 @@ void ARMInstPrinter::printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, O << '#' << MI->getOperand(OpNum).getImm(); } -void ARMInstPrinter::printHex8ImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xff); -} - -void ARMInstPrinter::printHex16ImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffff); -} - -void ARMInstPrinter::printHex32ImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffffffff); -} - -void ARMInstPrinter::printHex64ImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm()); +void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned EncodedImm = MI->getOperand(OpNum).getImm(); + unsigned EltBits; + uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); + O << "#0x" << utohexstr(Val); } diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h index be0b7c1..ddf5047 100644 --- a/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h +++ b/contrib/llvm/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h @@ -104,10 +104,7 @@ public: void printNoHashImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printHex8ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printHex16ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printHex32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printHex64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); // FIXME: Implement. diff --git a/contrib/llvm/lib/Target/ARM/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/CMakeLists.txt index 29e66e1..0df3466 100644 --- a/contrib/llvm/lib/Target/ARM/CMakeLists.txt +++ b/contrib/llvm/lib/Target/ARM/CMakeLists.txt @@ -33,6 +33,7 @@ add_llvm_target(ARMCodeGen NEONPreAllocPass.cpp Thumb1InstrInfo.cpp Thumb1RegisterInfo.cpp + Thumb2HazardRecognizer.cpp Thumb2ITBlockPass.cpp Thumb2InstrInfo.cpp Thumb2RegisterInfo.cpp diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp index adb7795..a07ff28 100644 --- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp @@ -34,7 +34,7 @@ /// Uses and Defs by this instr. 
For the Uses part, the pred:$p operand is /// defined with two components: /// -/// def pred { // Operand PredicateOperand +/// def pred { // Operand PredicateOperand /// ValueType Type = OtherVT; /// string PrintMethod = "printPredicateOperand"; /// string AsmOperandLowerMethod = ?; @@ -54,7 +54,7 @@ /// /// For the Defs part, in the simple case of only cc_out:$s, we have: /// -/// def cc_out { // Operand OptionalDefOperand +/// def cc_out { // Operand OptionalDefOperand /// ValueType Type = OtherVT; /// string PrintMethod = "printSBitModifierOperand"; /// string AsmOperandLowerMethod = ?; @@ -765,7 +765,7 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, || Opcode == ARM::SMC || Opcode == ARM::SVC) && "Unexpected Opcode"); - assert(NumOps >= 1 && OpInfo[0].RegClass == 0 && "Reg operand expected"); + assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Reg operand expected"); int Imm32 = 0; if (Opcode == ARM::SMC) { @@ -1106,7 +1106,7 @@ static bool DisassembleDPSoRegFrm(MCInst &MI, unsigned Opcode, uint32_t insn, assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && (OpInfo[OpIdx+1].RegClass == ARM::GPRRegClassID) && - (OpInfo[OpIdx+2].RegClass == 0) && + (OpInfo[OpIdx+2].RegClass < 0) && "Expect 3 reg operands"); // Register-controlled shifts have Inst{7} = 0 and Inst{4} = 1. @@ -1201,7 +1201,7 @@ static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return false; assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && - (OpInfo[OpIdx+1].RegClass == 0) && + (OpInfo[OpIdx+1].RegClass < 0) && "Expect 1 reg operand followed by 1 imm operand"); ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; @@ -1323,7 +1323,7 @@ static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return false; assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && - (OpInfo[OpIdx+1].RegClass == 0) && + (OpInfo[OpIdx+1].RegClass < 0) && "Expect 1 reg operand followed by 1 imm operand"); ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; @@ -1494,7 +1494,7 @@ static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // If there is still an operand info left which is an immediate operand, add // an additional imm5 LSL/ASR operand. - if (ThreeReg && OpInfo[OpIdx].RegClass == 0 + if (ThreeReg && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Extract the 5-bit immediate field Inst{11-7}. unsigned ShiftAmt = (insn >> ARMII::ShiftShift) & 0x1F; @@ -1540,7 +1540,7 @@ static bool DisassembleExtFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // If there is still an operand info left which is an immediate operand, add // an additional rotate immediate operand. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Extract the 2-bit rotate field Inst{11-10}. 
unsigned rot = (insn >> ARMII::ExtRotImmShift) & 3; @@ -1725,7 +1725,7 @@ static bool DisassembleVFPConv1Frm(MCInst &MI, unsigned Opcode, uint32_t insn, "Tied to operand expected"); MI.addOperand(MI.getOperand(0)); - assert(OpInfo[2].RegClass == 0 && !OpInfo[2].isPredicate() && + assert(OpInfo[2].RegClass < 0 && !OpInfo[2].isPredicate() && !OpInfo[2].isOptionalDef() && "Imm operand expected"); MI.addOperand(MCOperand::CreateImm(fbits)); @@ -1984,7 +1984,7 @@ static bool DisassembleVFPMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, ++OpIdx; // Extract/decode the f64/f32 immediate. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // The asm syntax specifies the before-expanded <imm>. // Not VFPExpandImm(slice(insn,19,16) << 4 | slice(insn, 3, 0), @@ -2077,42 +2077,12 @@ static unsigned decodeLaneIndex(uint32_t insn) { // imm3 = Inst{18-16}, imm4 = Inst{3-0} // Ref: Table A7-15 Modified immediate values for Advanced SIMD instructions. static uint64_t decodeN1VImm(uint32_t insn, ElemSize esize) { + unsigned char op = (insn >> 5) & 1; unsigned char cmode = (insn >> 8) & 0xF; unsigned char Imm8 = ((insn >> 24) & 1) << 7 | ((insn >> 16) & 7) << 4 | (insn & 0xF); - uint64_t Imm64 = 0; - - switch (esize) { - case ESize8: - Imm64 = Imm8; - break; - case ESize16: - Imm64 = Imm8 << 8*(cmode >> 1 & 1); - break; - case ESize32: { - if (cmode == 12) - Imm64 = (Imm8 << 8) | 0xFF; - else if (cmode == 13) - Imm64 = (Imm8 << 16) | 0xFFFF; - else { - // Imm8 to be shifted left by how many bytes... - Imm64 = Imm8 << 8*(cmode >> 1 & 3); - } - break; - } - case ESize64: { - for (unsigned i = 0; i < 8; ++i) - if ((Imm8 >> i) & 1) - Imm64 |= (uint64_t)0xFF << 8*i; - break; - } - default: - assert(0 && "Unreachable code!"); - return 0; - } - - return Imm64; + return (op << 12) | (cmode << 8) | Imm8; } // A8.6.339 VMUL, VMULL (by scalar) @@ -2303,7 +2273,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && - OpInfo[OpIdx + 1].RegClass == 0 && "Addrmode #6 Operands expected"); + OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected"); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, Rn))); MI.addOperand(MCOperand::CreateImm(0)); // Alignment ignored? @@ -2320,7 +2290,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, "Reg operand expected"); RegClass = OpInfo[OpIdx].RegClass; - while (OpIdx < NumOps && OpInfo[OpIdx].RegClass == RegClass) { + while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { MI.addOperand(MCOperand::CreateReg( getRegisterEnum(B, RegClass, Rd, UseDRegPair(Opcode)))); @@ -2329,7 +2299,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } // Handle possible lane index. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { MI.addOperand(MCOperand::CreateImm(decodeLaneIndex(insn))); ++OpIdx; @@ -2340,7 +2310,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, // possible TIED_TO DPR/QPR's (ignored), then possible lane index. 
RegClass = OpInfo[0].RegClass; - while (OpIdx < NumOps && OpInfo[OpIdx].RegClass == RegClass) { + while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { MI.addOperand(MCOperand::CreateReg( getRegisterEnum(B, RegClass, Rd, UseDRegPair(Opcode)))); @@ -2355,7 +2325,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && - OpInfo[OpIdx + 1].RegClass == 0 && "Addrmode #6 Operands expected"); + OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected"); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, Rn))); MI.addOperand(MCOperand::CreateImm(0)); // Alignment ignored? @@ -2366,7 +2336,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, ++OpIdx; } - while (OpIdx < NumOps && OpInfo[OpIdx].RegClass == RegClass) { + while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { assert(TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1 && "Tied to operand expected"); MI.addOperand(MCOperand::CreateReg(0)); @@ -2374,7 +2344,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } // Handle possible lane index. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { MI.addOperand(MCOperand::CreateImm(decodeLaneIndex(insn))); ++OpIdx; @@ -2438,7 +2408,7 @@ static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, assert(NumOps >= 2 && (OpInfo[0].RegClass == ARM::DPRRegClassID || OpInfo[0].RegClass == ARM::QPRRegClassID) && - (OpInfo[1].RegClass == 0) && + (OpInfo[1].RegClass < 0) && "Expect 1 reg operand followed by 1 imm operand"); // Qd/Dd = Inst{22:15-12} => NEON Rd @@ -2552,7 +2522,7 @@ static bool DisassembleNVdVmOptImm(MCInst &MI, unsigned Opc, uint32_t insn, } // Add the imm operand, if required. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { unsigned imm = 0xFFFFFFFF; @@ -2632,7 +2602,7 @@ static bool DisassembleNVectorShift(MCInst &MI, unsigned Opcode, uint32_t insn, decodeNEONRm(insn)))); ++OpIdx; - assert(OpInfo[OpIdx].RegClass == 0 && "Imm operand expected"); + assert(OpInfo[OpIdx].RegClass < 0 && "Imm operand expected"); // Add the imm operand. @@ -2762,7 +2732,7 @@ static bool DisassembleNVdVnVmOptImm(MCInst &MI, unsigned Opcode, uint32_t insn, getRegisterEnum(B, OpInfo[OpIdx].RegClass, m))); ++OpIdx; - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Add the imm operand. unsigned Imm = 0; @@ -2869,15 +2839,9 @@ static bool DisassembleNVTBLFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } -static bool DisassembleNEONFrm(MCInst &MI, unsigned Opcode, uint32_t insn, - unsigned short NumOps, unsigned &NumOpsAdded, BO) { - assert(0 && "Unreachable code!"); - return false; -} - // Vector Get Lane (move scalar to ARM core register) Instructions. 
// VGETLNi32, VGETLNs16, VGETLNs8, VGETLNu16, VGETLNu8: Rt Dn index -static bool DisassembleNEONGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, +static bool DisassembleNGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { const TargetInstrDesc &TID = ARMInsts[Opcode]; @@ -2887,7 +2851,7 @@ static bool DisassembleNEONGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, assert(TID.getNumDefs() == 1 && NumOps >= 3 && OpInfo[0].RegClass == ARM::GPRRegClassID && OpInfo[1].RegClass == ARM::DPRRegClassID && - OpInfo[2].RegClass == 0 && + OpInfo[2].RegClass < 0 && "Expect >= 3 operands with one dst operand"); ElemSize esize = @@ -2911,7 +2875,7 @@ static bool DisassembleNEONGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Vector Set Lane (move ARM core register to scalar) Instructions. // VSETLNi16, VSETLNi32, VSETLNi8: Dd Dd (TIED_TO) Rt index -static bool DisassembleNEONSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, +static bool DisassembleNSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { const TargetInstrDesc &TID = ARMInsts[Opcode]; @@ -2923,7 +2887,7 @@ static bool DisassembleNEONSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, OpInfo[1].RegClass == ARM::DPRRegClassID && TID.getOperandConstraint(1, TOI::TIED_TO) != -1 && OpInfo[2].RegClass == ARM::GPRRegClassID && - OpInfo[3].RegClass == 0 && + OpInfo[3].RegClass < 0 && "Expect >= 3 operands with one dst operand"); ElemSize esize = @@ -2950,7 +2914,7 @@ static bool DisassembleNEONSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Vector Duplicate Instructions (from ARM core register to all elements). // VDUP8d, VDUP16d, VDUP32d, VDUP8q, VDUP16q, VDUP32q: Qd/Dd Rt -static bool DisassembleNEONDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn, +static bool DisassembleNDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; @@ -3090,13 +3054,6 @@ static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return false; } -static bool DisassembleThumbMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, - unsigned short NumOps, unsigned &NumOpsAdded, BO) { - - assert(0 && "Unexpected thumb misc. instruction!"); - return false; -} - /// FuncPtrs - FuncPtrs maps ARMFormat to its corresponding DisassembleFP. /// We divide the disassembly task into different categories, with each one /// corresponding to a specific instruction encoding format. There could be @@ -3128,12 +3085,10 @@ static const DisassembleFP FuncPtrs[] = { &DisassembleVFPLdStMulFrm, &DisassembleVFPMiscFrm, &DisassembleThumbFrm, - &DisassembleNEONFrm, - &DisassembleNEONGetLnFrm, - &DisassembleNEONSetLnFrm, - &DisassembleNEONDupFrm, &DisassembleMiscFrm, - &DisassembleThumbMiscFrm, + &DisassembleNGetLnFrm, + &DisassembleNSetLnFrm, + &DisassembleNDupFrm, // VLD and VST (including one lane) Instructions. &DisassembleNLdSt, @@ -3233,7 +3188,8 @@ bool ARMBasicMCBuilder::DoPredicateOperands(MCInst& MI, unsigned Opcode, // a pair of TargetOperandInfos with isPredicate() property. 
if (NumOpsRemaining >= 2 && OpInfo[Idx].isPredicate() && OpInfo[Idx+1].isPredicate() && - OpInfo[Idx].RegClass == 0 && OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) + OpInfo[Idx].RegClass < 0 && + OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) { // If we are inside an IT block, get the IT condition bits maintained via // ARMBasicMCBuilder::ITState[7:0], through ARMBasicMCBuilder::GetITCond(). @@ -3265,7 +3221,8 @@ bool ARMBasicMCBuilder::TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode, // a pair of TargetOperandInfos with isPredicate() property. if (NumOpsRemaining >= 2 && OpInfo[Idx].isPredicate() && OpInfo[Idx+1].isPredicate() && - OpInfo[Idx].RegClass == 0 && OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) + OpInfo[Idx].RegClass < 0 && + OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) { // If we are inside an IT block, get the IT condition bits maintained via // ARMBasicMCBuilder::ITState[7:0], through ARMBasicMCBuilder::GetITCond(). diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h index b1d90df..7d21256 100644 --- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h @@ -137,25 +137,25 @@ static inline void setSlice(uint32_t &Bits, unsigned From, unsigned To, /// Various utilities for checking the target specific flags. /// A unary data processing instruction doesn't have an Rn operand. -static inline bool isUnaryDP(unsigned TSFlags) { +static inline bool isUnaryDP(uint64_t TSFlags) { return (TSFlags & ARMII::UnaryDP); } /// This four-bit field describes the addressing mode used. /// See also ARMBaseInstrInfo.h. -static inline unsigned getAddrMode(unsigned TSFlags) { +static inline unsigned getAddrMode(uint64_t TSFlags) { return (TSFlags & ARMII::AddrModeMask); } /// {IndexModePre, IndexModePost} /// Only valid for load and store ops. /// See also ARMBaseInstrInfo.h. -static inline unsigned getIndexMode(unsigned TSFlags) { +static inline unsigned getIndexMode(uint64_t TSFlags) { return (TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; } /// Pre-/post-indexed operations define an extra $base_wb in the OutOperandList. -static inline bool isPrePostLdSt(unsigned TSFlags) { +static inline bool isPrePostLdSt(uint64_t TSFlags) { return (TSFlags & ARMII::IndexModeMask) != 0; } diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h index 4b2e308..4b7a0bf 100644 --- a/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h @@ -395,7 +395,7 @@ static bool DisassembleThumb1General(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, getT1tRm(insn)))); } else { - assert(OpInfo[OpIdx].RegClass == 0 && + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); MI.addOperand(MCOperand::CreateImm(UseRt ? 
getT1Imm8(insn) @@ -531,7 +531,7 @@ static bool DisassembleThumb1LdPC(MCInst &MI, unsigned Opcode, uint32_t insn, if (!OpInfo) return false; assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && - (OpInfo[1].RegClass == 0 && + (OpInfo[1].RegClass < 0 && !OpInfo[1].isPredicate() && !OpInfo[1].isOptionalDef()) && "Invalid arguments"); @@ -598,7 +598,7 @@ static bool DisassembleThumb1LdSt(unsigned opA, MCInst &MI, unsigned Opcode, assert(OpIdx < NumOps && "More operands expected"); - if (OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() && + if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { MI.addOperand(MCOperand::CreateImm(Imm5 ? getT1Imm5(insn) : 0)); @@ -632,7 +632,7 @@ static bool DisassembleThumb1LdStSP(MCInst &MI, unsigned Opcode, uint32_t insn, assert(NumOps >= 3 && OpInfo[0].RegClass == ARM::tGPRRegClassID && OpInfo[1].RegClass == ARM::GPRRegClassID && - (OpInfo[2].RegClass == 0 && + (OpInfo[2].RegClass < 0 && !OpInfo[2].isPredicate() && !OpInfo[2].isOptionalDef()) && "Invalid arguments"); @@ -658,7 +658,7 @@ static bool DisassembleThumb1AddPCi(MCInst &MI, unsigned Opcode, uint32_t insn, if (!OpInfo) return false; assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && - (OpInfo[1].RegClass == 0 && + (OpInfo[1].RegClass < 0 && !OpInfo[1].isPredicate() && !OpInfo[1].isOptionalDef()) && "Invalid arguments"); @@ -685,7 +685,7 @@ static bool DisassembleThumb1AddSPi(MCInst &MI, unsigned Opcode, uint32_t insn, assert(NumOps >= 3 && OpInfo[0].RegClass == ARM::tGPRRegClassID && OpInfo[1].RegClass == ARM::GPRRegClassID && - (OpInfo[2].RegClass == 0 && + (OpInfo[2].RegClass < 0 && !OpInfo[2].isPredicate() && !OpInfo[2].isOptionalDef()) && "Invalid arguments"); @@ -761,7 +761,7 @@ static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn, // Predicate operands are handled elsewhere. if (NumOps == 2 && OpInfo[0].isPredicate() && OpInfo[1].isPredicate() && - OpInfo[0].RegClass == 0 && OpInfo[1].RegClass == ARM::CCRRegClassID) { + OpInfo[0].RegClass < 0 && OpInfo[1].RegClass == ARM::CCRRegClassID) { return true; } @@ -808,7 +808,7 @@ static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn, } assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && - (OpInfo[1].RegClass==0 || OpInfo[1].RegClass==ARM::tGPRRegClassID) + (OpInfo[1].RegClass < 0 || OpInfo[1].RegClass==ARM::tGPRRegClassID) && "Expect >=2 operands"); // Add the destination operand. 
@@ -913,7 +913,7 @@ static bool DisassembleThumb1CondBr(MCInst &MI, unsigned Opcode, uint32_t insn, const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; if (!OpInfo) return false; - assert(NumOps == 3 && OpInfo[0].RegClass == 0 && + assert(NumOps == 3 && OpInfo[0].RegClass < 0 && OpInfo[1].isPredicate() && OpInfo[2].RegClass == ARM::CCRRegClassID && "Exactly 3 operands expected"); @@ -939,7 +939,7 @@ static bool DisassembleThumb1Br(MCInst &MI, unsigned Opcode, uint32_t insn, const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; if (!OpInfo) return false; - assert(NumOps == 1 && OpInfo[0].RegClass == 0 && "1 imm operand expected"); + assert(NumOps == 1 && OpInfo[0].RegClass < 0 && "1 imm operand expected"); unsigned Imm11 = getT1Imm11(insn); @@ -1239,7 +1239,7 @@ static bool DisassembleThumb2LdStDual(MCInst &MI, unsigned Opcode, && OpInfo[0].RegClass == ARM::GPRRegClassID && OpInfo[1].RegClass == ARM::GPRRegClassID && OpInfo[2].RegClass == ARM::GPRRegClassID - && OpInfo[3].RegClass == 0 + && OpInfo[3].RegClass < 0 && "Expect >= 4 operands and first 3 as reg operands"); // Add the <Rt> <Rt2> operands. @@ -1322,8 +1322,8 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, assert(NumOps == 4 && OpInfo[0].RegClass == ARM::GPRRegClassID && OpInfo[1].RegClass == ARM::GPRRegClassID - && OpInfo[2].RegClass == 0 - && OpInfo[3].RegClass == 0 + && OpInfo[2].RegClass < 0 + && OpInfo[3].RegClass < 0 && "Exactlt 4 operands expect and first two as reg operands"); // Only need to populate the src reg operand. MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -1375,7 +1375,7 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, if (NumOps == OpIdx) return true; - if (OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { if (Thumb2ShiftOpcode(Opcode)) @@ -1440,7 +1440,7 @@ static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, } // The modified immediate operand should come next. - assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 && + assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); @@ -1555,7 +1555,7 @@ static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode, ++OpIdx; } - assert(OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); @@ -1772,7 +1772,7 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); } else { - assert(OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); int Offset = 0; @@ -1792,7 +1792,7 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn, } ++OpIdx; - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 && + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Fills in the shift amount for t2PLDs, t2PLDWs, t2PLIs. 
MI.addOperand(MCOperand::CreateImm(slice(insn, 5, 4))); @@ -1818,7 +1818,7 @@ static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode, assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::GPRRegClassID && - OpInfo[1].RegClass == 0 && + OpInfo[1].RegClass < 0 && "Expect >= 2 operands, first as reg, and second as imm operand"); // Build the register operand, followed by the (+/-)imm12 immediate. @@ -1930,7 +1930,7 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, ++OpIdx; } - assert(OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); @@ -1981,7 +1981,7 @@ static bool DisassembleThumb2DPReg(MCInst &MI, unsigned Opcode, uint32_t insn, decodeRm(insn)))); ++OpIdx; - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Add the rotation amount immediate. MI.addOperand(MCOperand::CreateImm(decodeRotate(insn))); diff --git a/contrib/llvm/lib/Target/ARM/NEONMoveFix.cpp b/contrib/llvm/lib/Target/ARM/NEONMoveFix.cpp index 0a4400c..bbdd3c7 100644 --- a/contrib/llvm/lib/Target/ARM/NEONMoveFix.cpp +++ b/contrib/llvm/lib/Target/ARM/NEONMoveFix.cpp @@ -105,8 +105,8 @@ bool NEONMoveFixPass::InsertMoves(MachineBasicBlock &MBB) { unsigned MOReg = MO.getReg(); Defs[MOReg] = MI; - // Catch subregs as well. - for (const unsigned *R = TRI->getSubRegisters(MOReg); *R; ++R) + // Catch aliases as well. + for (const unsigned *R = TRI->getAliasSet(MOReg); *R; ++R) Defs[*R] = MI; } } diff --git a/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp b/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp index a725898..f67717c 100644 --- a/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/contrib/llvm/lib/Target/ARM/NEONPreAllocPass.cpp @@ -407,7 +407,7 @@ NEONPreAllocPass::FormsRegSequence(MachineInstr *MI, "expected a virtual register"); // Extracting from a Q or QQ register. MachineInstr *DefMI = MRI->getVRegDef(VirtReg); - if (!DefMI || !DefMI->isExtractSubreg()) + if (!DefMI || !DefMI->isCopy() || !DefMI->getOperand(1).getSubReg()) return false; VirtReg = DefMI->getOperand(1).getReg(); if (LastSrcReg && LastSrcReg != VirtReg) @@ -418,7 +418,7 @@ NEONPreAllocPass::FormsRegSequence(MachineInstr *MI, RC != ARM::QQPRRegisterClass && RC != ARM::QQQQPRRegisterClass) return false; - unsigned SubIdx = DefMI->getOperand(2).getImm(); + unsigned SubIdx = DefMI->getOperand(1).getSubReg(); if (LastSubIdx) { if (LastSubIdx != SubIdx-Stride) return false; @@ -434,22 +434,21 @@ NEONPreAllocPass::FormsRegSequence(MachineInstr *MI, // FIXME: Update the uses of EXTRACT_SUBREG from REG_SEQUENCE is // currently required for correctness. e.g. - // %reg1041;<def> = REG_SEQUENCE %reg1040<kill>, 5, %reg1035<kill>, 6 + // %reg1041<def> = REG_SEQUENCE %reg1040<kill>, 5, %reg1035<kill>, 6 // %reg1042<def> = EXTRACT_SUBREG %reg1041, 6 // %reg1043<def> = EXTRACT_SUBREG %reg1041, 5 // VST1q16 %reg1025<kill>, 0, %reg1043<kill>, %reg1042<kill>, - // reg1025 and reg1043 should be replaced with reg1041:6 and reg1041:5 + // reg1042 and reg1043 should be replaced with reg1041:6 and reg1041:5 // respectively. // We need to change how we model uses of REG_SEQUENCE. 
for (unsigned R = 0; R < NumRegs; ++R) { MachineOperand &MO = MI->getOperand(FirstOpnd + R); unsigned OldReg = MO.getReg(); MachineInstr *DefMI = MRI->getVRegDef(OldReg); - assert(DefMI->isExtractSubreg()); + assert(DefMI->isCopy()); MO.setReg(LastSrcReg); MO.setSubReg(SubIds[R]); - if (R != 0) - MO.setIsKill(false); + MO.setIsKill(false); // Delete the EXTRACT_SUBREG if its result is now dead. if (MRI->use_empty(OldReg)) DefMI->eraseFromParent(); @@ -467,43 +466,9 @@ bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) { unsigned FirstOpnd, NumRegs, Offset, Stride; if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs, Offset, Stride)) continue; - if (llvm::ModelWithRegSequence() && - FormsRegSequence(MI, FirstOpnd, NumRegs, Offset, Stride)) + if (FormsRegSequence(MI, FirstOpnd, NumRegs, Offset, Stride)) continue; - - MachineBasicBlock::iterator NextI = llvm::next(MBBI); - for (unsigned R = 0; R < NumRegs; ++R) { - MachineOperand &MO = MI->getOperand(FirstOpnd + R); - assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand"); - unsigned VirtReg = MO.getReg(); - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && - "expected a virtual register"); - - // For now, just assign a fixed set of adjacent registers. - // This leaves plenty of room for future improvements. - static const unsigned NEONDRegs[] = { - ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7 - }; - MO.setReg(NEONDRegs[Offset + R * Stride]); - - if (MO.isUse()) { - // Insert a copy from VirtReg. - TII->copyRegToReg(MBB, MBBI, MO.getReg(), VirtReg, - ARM::DPRRegisterClass, ARM::DPRRegisterClass, - DebugLoc()); - if (MO.isKill()) { - MachineInstr *CopyMI = prior(MBBI); - CopyMI->findRegisterUseOperand(VirtReg)->setIsKill(); - } - MO.setIsKill(); - } else if (MO.isDef() && !MO.isDead()) { - // Add a copy to VirtReg. - TII->copyRegToReg(MBB, NextI, VirtReg, MO.getReg(), - ARM::DPRRegisterClass, ARM::DPRRegisterClass, - DebugLoc()); - } - } + llvm_unreachable("expected a REG_SEQUENCE"); } return Modified; diff --git a/contrib/llvm/lib/Target/ARM/README.txt b/contrib/llvm/lib/Target/ARM/README.txt index 85d5ca0..0cb8ff0 100644 --- a/contrib/llvm/lib/Target/ARM/README.txt +++ b/contrib/llvm/lib/Target/ARM/README.txt @@ -590,3 +590,70 @@ than the Z bit, we'll need additional logic to reverse the conditionals associated with the comparison. Perhaps a pseudo-instruction for the comparison, with a post-codegen pass to clean up and handle the condition codes? See PR5694 for testcase. + +//===---------------------------------------------------------------------===// + +Given the following on armv5: +int test1(int A, int B) { + return (A&-8388481)|(B&8388480); +} + +We currently generate: + ldr r2, .LCPI0_0 + and r0, r0, r2 + ldr r2, .LCPI0_1 + and r1, r1, r2 + orr r0, r1, r0 + bx lr + +We should be able to replace the second ldr+and with a bic (i.e. reuse the +constant which was already loaded). Not sure what's necessary to do that. 
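+Something like the following (an untested sketch, reusing the constant that is
+already in r2) is what that replacement would look like; note that -8388481 is
+the bitwise complement of 8388480, so a bic against the loaded constant yields
+the second mask without a second load:
+
+	ldr r2, .LCPI0_0        @ -8388481
+	and r0, r0, r2          @ A & -8388481
+	bic r1, r1, r2          @ B & ~(-8388481) == B & 8388480
+	orr r0, r1, r0
+	bx lr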
+ +//===---------------------------------------------------------------------===// + +Given the following on ARMv7: +int test1(int A, int B) { + return (A&-8388481)|(B&8388480); +} + +We currently generate: + bfc r0, #7, #16 + movw r2, #:lower16:8388480 + movt r2, #:upper16:8388480 + and r1, r1, r2 + orr r0, r1, r0 + bx lr + +The following is much shorter: + lsr r1, r1, #7 + bfi r0, r1, #7, #16 + bx lr + + +//===---------------------------------------------------------------------===// + +The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal: + +int a(int x) { return __builtin_bswap32(x); } + +a: + mov r1, #255, 24 + mov r2, #255, 16 + and r1, r1, r0, lsr #8 + and r2, r2, r0, lsl #8 + orr r1, r1, r0, lsr #24 + orr r0, r2, r0, lsl #24 + orr r0, r0, r1 + bx lr + +Something like the following would be better (fewer instructions/registers): + eor r1, r0, r0, ror #16 + bic r1, r1, #0xff0000 + mov r1, r1, lsr #8 + eor r0, r1, r0, ror #8 + bx lr + +A custom Thumb version would also be a slight improvement over the generic +version. + +//===---------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index fae84d4..af630ac 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -33,64 +33,24 @@ unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const { return 0; } -bool Thumb1InstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - if (DestRC == ARM::GPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg); - return true; - } - } else if (DestRC == ARM::tGPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg); - return true; - } - } - - return false; -} - -bool Thumb1InstrInfo:: -canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const { - if (Ops.size() != 1) return false; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: break; - case ARM::tMOVr: - case ARM::tMOVtgpr2gpr: - case ARM::tMOVgpr2tgpr: - case ARM::tMOVgpr2gpr: { - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - !isARMLowRegister(SrcReg)) - // tSpill cannot take a high register operand. - return false; - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg) && - !isARMLowRegister(DstReg)) - // tRestore cannot target a high register operand. 
- return false; - } - return true; - } - } - - return false; +void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + bool tDest = ARM::tGPRRegClass.contains(DestReg); + bool tSrc = ARM::tGPRRegClass.contains(SrcReg); + unsigned Opc = ARM::tMOVgpr2gpr; + if (tDest && tSrc) + Opc = ARM::tMOVr; + else if (tSrc) + Opc = ARM::tMOVtgpr2gpr; + else if (tDest) + Opc = ARM::tMOVgpr2tgpr; + + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + assert(ARM::GPRRegClass.contains(DestReg, SrcReg) && + "Thumb1 can only copy GPR registers"); } void Thumb1InstrInfo:: @@ -175,10 +135,10 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, isKill = false; } - if (isKill) { + if (isKill) MBB.addLiveIn(Reg); - MIB.addReg(Reg, RegState::Kill); - } + + MIB.addReg(Reg, getKillRegState(isKill)); } return true; } @@ -221,46 +181,3 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } - -MachineInstr *Thumb1InstrInfo:: -foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, int FI) const { - if (Ops.size() != 1) return NULL; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - MachineInstr *NewMI = NULL; - switch (Opc) { - default: break; - case ARM::tMOVr: - case ARM::tMOVtgpr2gpr: - case ARM::tMOVgpr2tgpr: - case ARM::tMOVgpr2gpr: { - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - !isARMLowRegister(SrcReg)) - // tSpill cannot take a high register operand. - break; - NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0)); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg) && - !isARMLowRegister(DstReg)) - // tRestore cannot target a high register operand. 
- break; - bool isDead = MI->getOperand(0).isDead(); - NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore)) - .addReg(DstReg, - RegState::Define | getDeadRegState(isDead)) - .addFrameIndex(FI).addImm(0)); - } - break; - } - } - - return NewMI; -} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h index c937296..555135a 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -46,12 +46,10 @@ public: const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const; - bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FrameIndex, @@ -64,20 +62,6 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; - bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const; - - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const; - - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const { - return 0; - } }; } diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp index 2f635fe..39b70b4 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -68,21 +68,6 @@ void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg); } -const TargetRegisterClass* -Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, EVT VT) const { - if (isARMLowRegister(Reg)) - return ARM::tGPRRegisterClass; - switch (Reg) { - default: - break; - case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11: - case ARM::R12: case ARM::SP: case ARM::LR: case ARM::PC: - return ARM::GPRRegisterClass; - } - - return TargetRegisterInfo::getPhysicalRegisterRegClass(Reg, VT); -} - bool Thumb1RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const { const MachineFrameInfo *FFI = MF.getFrameInfo(); unsigned CFSize = FFI->getMaxCallFrameSize(); @@ -410,6 +395,8 @@ Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB, // before that instead and adjust the UseMI. bool done = false; for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) { + if (II->isDebugValue()) + continue; // If this instruction affects R12, adjust our restore point. for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) { const MachineOperand &MO = II->getOperand(i); diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h index 4eca367..9a0308af 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h @@ -38,9 +38,6 @@ public: unsigned PredReg = 0) const; /// Code Generation virtual methods... 
- const TargetRegisterClass * - getPhysicalRegisterRegClass(unsigned Reg, EVT VT = MVT::Other) const; - bool hasReservedCallFrame(MachineFunction &MF) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, @@ -51,7 +48,8 @@ public: // could not be handled directly in MI. int rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int Offset, - unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const; + unsigned MOVOpc, unsigned ADDriOpc, + unsigned SUBriOpc) const; bool saveScavengerRegister(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.cpp b/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.cpp new file mode 100644 index 0000000..172908d --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.cpp @@ -0,0 +1,53 @@ +//===-- Thumb2HazardRecognizer.cpp - Thumb2 postra hazard recognizer ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "Thumb2HazardRecognizer.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/ScheduleDAG.h" +using namespace llvm; + +ScheduleHazardRecognizer::HazardType +Thumb2HazardRecognizer::getHazardType(SUnit *SU) { + if (ITBlockSize) { + MachineInstr *MI = SU->getInstr(); + if (!MI->isDebugValue() && MI != ITBlockMIs[ITBlockSize-1]) + return Hazard; + } + + return PostRAHazardRecognizer::getHazardType(SU); +} + +void Thumb2HazardRecognizer::Reset() { + ITBlockSize = 0; + PostRAHazardRecognizer::Reset(); +} + +void Thumb2HazardRecognizer::EmitInstruction(SUnit *SU) { + MachineInstr *MI = SU->getInstr(); + unsigned Opcode = MI->getOpcode(); + if (ITBlockSize) { + --ITBlockSize; + } else if (Opcode == ARM::t2IT) { + unsigned Mask = MI->getOperand(1).getImm(); + unsigned NumTZ = CountTrailingZeros_32(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + ITBlockSize = 4 - NumTZ; + MachineBasicBlock::iterator I = MI; + for (unsigned i = 0; i < ITBlockSize; ++i) { + // Advance to the next instruction, skipping any dbg_value instructions. + do { + ++I; + } while (I->isDebugValue()); + ITBlockMIs[ITBlockSize-1-i] = &*I; + } + } + + PostRAHazardRecognizer::EmitInstruction(SU); +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.h b/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.h new file mode 100644 index 0000000..4726658 --- /dev/null +++ b/contrib/llvm/lib/Target/ARM/Thumb2HazardRecognizer.h @@ -0,0 +1,40 @@ +//===-- Thumb2HazardRecognizer.h - Thumb2 Hazard Recognizers ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines hazard recognizers for scheduling Thumb2 functions on +// ARM processors. +// +//===----------------------------------------------------------------------===// + +#ifndef THUMB2HAZARDRECOGNIZER_H +#define THUMB2HAZARDRECOGNIZER_H + +#include "llvm/CodeGen/PostRAHazardRecognizer.h" + +namespace llvm { + +class MachineInstr; + +class Thumb2HazardRecognizer : public PostRAHazardRecognizer { + unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled. 
+ MachineInstr *ITBlockMIs[4]; + +public: + Thumb2HazardRecognizer(const InstrItineraryData &ItinData) : + PostRAHazardRecognizer(ItinData) {} + + virtual HazardType getHazardType(SUnit *SU); + virtual void Reset(); + virtual void EmitInstruction(SUnit *SU); +}; + + +} // end namespace llvm + +#endif // THUMB2HAZARDRECOGNIZER_H diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp index f36d4ef..cd15bbe 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -14,17 +14,23 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" using namespace llvm; -STATISTIC(NumITs, "Number of IT blocks inserted"); +STATISTIC(NumITs, "Number of IT blocks inserted"); +STATISTIC(NumMovedInsts, "Number of predicated instructions moved"); namespace { - struct Thumb2ITBlockPass : public MachineFunctionPass { + class Thumb2ITBlockPass : public MachineFunctionPass { + bool PreRegAlloc; + + public: static char ID; Thumb2ITBlockPass() : MachineFunctionPass(&ID) {} const Thumb2InstrInfo *TII; + const TargetRegisterInfo *TRI; ARMFunctionInfo *AFI; virtual bool runOnMachineFunction(MachineFunction &Fn); @@ -34,61 +40,167 @@ namespace { } private: - bool InsertITBlocks(MachineBasicBlock &MBB); + bool MoveCopyOutOfITBlock(MachineInstr *MI, + ARMCC::CondCodes CC, ARMCC::CondCodes OCC, + SmallSet<unsigned, 4> &Defs, + SmallSet<unsigned, 4> &Uses); + bool InsertITInstructions(MachineBasicBlock &MBB); }; char Thumb2ITBlockPass::ID = 0; } -static ARMCC::CondCodes getPredicate(const MachineInstr *MI, unsigned &PredReg){ - unsigned Opc = MI->getOpcode(); - if (Opc == ARM::tBcc || Opc == ARM::t2Bcc) - return ARMCC::AL; - return llvm::getInstrPredicate(MI, PredReg); +/// TrackDefUses - Tracking what registers are being defined and used by +/// instructions in the IT block. This also tracks "dependencies", i.e. uses +/// in the IT block that are defined before the IT instruction. 
+static void TrackDefUses(MachineInstr *MI, + SmallSet<unsigned, 4> &Defs, + SmallSet<unsigned, 4> &Uses, + const TargetRegisterInfo *TRI) { + SmallVector<unsigned, 4> LocalDefs; + SmallVector<unsigned, 4> LocalUses; + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP) + continue; + if (MO.isUse()) + LocalUses.push_back(Reg); + else + LocalDefs.push_back(Reg); + } + + for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) { + unsigned Reg = LocalUses[i]; + Uses.insert(Reg); + for (const unsigned *Subreg = TRI->getSubRegisters(Reg); + *Subreg; ++Subreg) + Uses.insert(*Subreg); + } + + for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { + unsigned Reg = LocalDefs[i]; + Defs.insert(Reg); + for (const unsigned *Subreg = TRI->getSubRegisters(Reg); + *Subreg; ++Subreg) + Defs.insert(*Subreg); + if (Reg == ARM::CPSR) + continue; + } +} + +bool +Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI, + ARMCC::CondCodes CC, ARMCC::CondCodes OCC, + SmallSet<unsigned, 4> &Defs, + SmallSet<unsigned, 4> &Uses) { + unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; + if (TII->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { + assert(SrcSubIdx == 0 && DstSubIdx == 0 && + "Sub-register indices still around?"); + // llvm models select's as two-address instructions. That means a copy + // is inserted before a t2MOVccr, etc. If the copy is scheduled in + // between selects we would end up creating multiple IT blocks. + + // First check if it's safe to move it. + if (Uses.count(DstReg) || Defs.count(SrcReg)) + return false; + + // Then peek at the next instruction to see if it's predicated on CC or OCC. + // If not, then there is nothing to be gained by moving the copy. + MachineBasicBlock::iterator I = MI; ++I; + MachineBasicBlock::iterator E = MI->getParent()->end(); + while (I != E && I->isDebugValue()) + ++I; + if (I != E) { + unsigned NPredReg = 0; + ARMCC::CondCodes NCC = llvm::getITInstrPredicate(I, NPredReg); + if (NCC == CC || NCC == OCC) + return true; + } + } + return false; } -bool Thumb2ITBlockPass::InsertITBlocks(MachineBasicBlock &MBB) { +bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { bool Modified = false; + SmallSet<unsigned, 4> Defs; + SmallSet<unsigned, 4> Uses; MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); while (MBBI != E) { MachineInstr *MI = &*MBBI; DebugLoc dl = MI->getDebugLoc(); unsigned PredReg = 0; - ARMCC::CondCodes CC = getPredicate(MI, PredReg); - + ARMCC::CondCodes CC = llvm::getITInstrPredicate(MI, PredReg); if (CC == ARMCC::AL) { ++MBBI; continue; } + Defs.clear(); + Uses.clear(); + TrackDefUses(MI, Defs, Uses, TRI); + // Insert an IT instruction. MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT)) .addImm(CC); + + // Add implicit use of ITSTATE to IT block instructions. + MI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/, + true/*isImp*/, false/*isKill*/)); + + MachineInstr *LastITMI = MI; + MachineBasicBlock::iterator InsertPos = MIB; ++MBBI; - // Finalize IT mask. + // Form IT block. ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC); unsigned Mask = 0, Pos = 3; // Branches, including tricky ones like LDM_RET, need to end an IT // block so check the instruction we just put in the block. 
- while (MBBI != E && Pos && - (!MI->getDesc().isBranch() && !MI->getDesc().isReturn())) { + for (; MBBI != E && Pos && + (!MI->getDesc().isBranch() && !MI->getDesc().isReturn()) ; ++MBBI) { + if (MBBI->isDebugValue()) + continue; + MachineInstr *NMI = &*MBBI; MI = NMI; - DebugLoc ndl = NMI->getDebugLoc(); + unsigned NPredReg = 0; - ARMCC::CondCodes NCC = getPredicate(NMI, NPredReg); - if (NCC == CC || NCC == OCC) + ARMCC::CondCodes NCC = llvm::getITInstrPredicate(NMI, NPredReg); + if (NCC == CC || NCC == OCC) { Mask |= (NCC & 1) << Pos; - else + // Add implicit use of ITSTATE. + NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/, + true/*isImp*/, false/*isKill*/)); + LastITMI = NMI; + } else { + if (NCC == ARMCC::AL && + MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) { + --MBBI; + MBB.remove(NMI); + MBB.insert(InsertPos, NMI); + ++NumMovedInsts; + continue; + } break; + } + TrackDefUses(NMI, Defs, Uses, TRI); --Pos; - ++MBBI; } + + // Finalize IT mask. Mask |= (1 << Pos); // Tag along (firstcond[0] << 4) with the mask. Mask |= (CC & 1) << 4; MIB.addImm(Mask); + + // Last instruction in IT block kills ITSTATE. + LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill(); + Modified = true; ++NumITs; } @@ -100,17 +212,21 @@ bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) { const TargetMachine &TM = Fn.getTarget(); AFI = Fn.getInfo<ARMFunctionInfo>(); TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo()); + TRI = TM.getRegisterInfo(); if (!AFI->isThumbFunction()) return false; bool Modified = false; - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) { + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ) { MachineBasicBlock &MBB = *MFI; - Modified |= InsertITBlocks(MBB); + ++MFI; + Modified |= InsertITInstructions(MBB); } + if (Modified) + AFI->setHasITBlocks(true); + return Modified; } diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 531d5e9..ee51727 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -17,15 +17,27 @@ #include "ARMAddressingModes.h" #include "ARMGenInstrInfo.inc" #include "ARMMachineFunctionInfo.h" +#include "Thumb2HazardRecognizer.h" +#include "Thumb2InstrInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/ADT/SmallVector.h" -#include "Thumb2InstrInfo.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; +static cl::opt<unsigned> +IfCvtLimit("thumb2-ifcvt-limit", cl::Hidden, + cl::desc("Thumb2 if-conversion limit (default 3)"), + cl::init(3)); + +static cl::opt<unsigned> +IfCvtDiamondLimit("thumb2-ifcvt-diamond-limit", cl::Hidden, + cl::desc("Thumb2 diamond if-conversion limit (default 3)"), + cl::init(3)); + Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI(*this, STI) { } @@ -35,33 +47,99 @@ unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const { return 0; } -bool -Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - if (DestRC == ARM::GPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg); - return 
true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg); - return true; - } - } else if (DestRC == ARM::tGPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg); - return true; +void +Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, + MachineBasicBlock *NewDest) const { + MachineBasicBlock *MBB = Tail->getParent(); + ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>(); + if (!AFI->hasITBlocks()) { + TargetInstrInfoImpl::ReplaceTailWithBranchTo(Tail, NewDest); + return; + } + + // If the first instruction of Tail is predicated, we may have to update + // the IT instruction. + unsigned PredReg = 0; + ARMCC::CondCodes CC = llvm::getInstrPredicate(Tail, PredReg); + MachineBasicBlock::iterator MBBI = Tail; + if (CC != ARMCC::AL) + // Expecting at least the t2IT instruction before it. + --MBBI; + + // Actually replace the tail. + TargetInstrInfoImpl::ReplaceTailWithBranchTo(Tail, NewDest); + + // Fix up IT. + if (CC != ARMCC::AL) { + MachineBasicBlock::iterator E = MBB->begin(); + unsigned Count = 4; // At most 4 instructions in an IT block. + while (Count && MBBI != E) { + if (MBBI->isDebugValue()) { + --MBBI; + continue; + } + if (MBBI->getOpcode() == ARM::t2IT) { + unsigned Mask = MBBI->getOperand(1).getImm(); + if (Count == 4) + MBBI->eraseFromParent(); + else { + unsigned MaskOn = 1 << Count; + unsigned MaskOff = ~(MaskOn - 1); + MBBI->getOperand(1).setImm((Mask & MaskOff) | MaskOn); + } + return; + } + --MBBI; + --Count; } + + // Ctrl flow can reach here if branch folding is run before IT block + // formation pass. } +} + +bool +Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const { + unsigned PredReg = 0; + return llvm::getITInstrPredicate(MBBI, PredReg) == ARMCC::AL; +} +bool Thumb2InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumInstrs) const { + return NumInstrs && NumInstrs <= IfCvtLimit; +} + +bool Thumb2InstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, + MachineBasicBlock &FMBB, unsigned NumF) const { + // FIXME: Catch optimization such as: + // r0 = movne + // r0 = moveq + return NumT && NumF && + NumT <= (IfCvtDiamondLimit) && NumF <= (IfCvtDiamondLimit); +} + +void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { // Handle SPR, DPR, and QPR copies. 
- return ARMBaseInstrInfo::copyRegToReg(MBB, I, DestReg, SrcReg, DestRC, SrcRC, DL); + if (!ARM::GPRRegClass.contains(DestReg, SrcReg)) + return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc); + + bool tDest = ARM::tGPRRegClass.contains(DestReg); + bool tSrc = ARM::tGPRRegClass.contains(SrcReg); + unsigned Opc = ARM::tMOVgpr2gpr; + if (tDest && tSrc) + Opc = ARM::tMOVr; + else if (tSrc) + Opc = ARM::tMOVtgpr2gpr; + else if (tDest) + Opc = ARM::tMOVgpr2tgpr; + + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } void Thumb2InstrInfo:: @@ -69,7 +147,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || + RC == ARM::tcGPRRegisterClass) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -94,7 +173,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || + RC == ARM::tcGPRRegisterClass) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -113,6 +193,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI); } +ScheduleHazardRecognizer *Thumb2InstrInfo:: +CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const { + return (ScheduleHazardRecognizer *)new Thumb2HazardRecognizer(II); +} + void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, @@ -131,14 +216,14 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, // Use a movw to materialize the 16-bit constant. BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg) .addImm(NumBytes) - .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + .addImm((unsigned)Pred).addReg(PredReg); Fits = true; } else if ((NumBytes & 0xffff) == 0) { // Use a movt to materialize the 32-bit constant. BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg) .addReg(DestReg) .addImm(NumBytes >> 16) - .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + .addImm((unsigned)Pred).addReg(PredReg); Fits = true; } @@ -502,3 +587,54 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, Offset = (isSub) ? -Offset : Offset; return Offset == 0; } + +/// scheduleTwoAddrSource - Schedule the copy / re-mat of the source of the +/// two-addrss instruction inserted by two-address pass. +void +Thumb2InstrInfo::scheduleTwoAddrSource(MachineInstr *SrcMI, + MachineInstr *UseMI, + const TargetRegisterInfo &TRI) const { + if (SrcMI->getOpcode() != ARM::tMOVgpr2gpr || + SrcMI->getOperand(1).isKill()) + return; + + unsigned PredReg = 0; + ARMCC::CondCodes CC = llvm::getInstrPredicate(UseMI, PredReg); + if (CC == ARMCC::AL || PredReg != ARM::CPSR) + return; + + // Schedule the copy so it doesn't come between previous instructions + // and UseMI which can form an IT block. 
+ unsigned SrcReg = SrcMI->getOperand(1).getReg(); + ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC); + MachineBasicBlock *MBB = UseMI->getParent(); + MachineBasicBlock::iterator MBBI = SrcMI; + unsigned NumInsts = 0; + while (--MBBI != MBB->begin()) { + if (MBBI->isDebugValue()) + continue; + + MachineInstr *NMI = &*MBBI; + ARMCC::CondCodes NCC = llvm::getInstrPredicate(NMI, PredReg); + if (!(NCC == CC || NCC == OCC) || + NMI->modifiesRegister(SrcReg, &TRI) || + NMI->definesRegister(ARM::CPSR)) + break; + if (++NumInsts == 4) + // Too many in a row! + return; + } + + if (NumInsts) { + MBB->remove(SrcMI); + MBB->insert(++MBBI, SrcMI); + } +} + +ARMCC::CondCodes +llvm::getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg) { + unsigned Opc = MI->getOpcode(); + if (Opc == ARM::tBcc || Opc == ARM::t2Bcc) + return ARMCC::AL; + return llvm::getInstrPredicate(MI, PredReg); +} diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 2948770..3a9f8b1 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -20,7 +20,8 @@ #include "Thumb2RegisterInfo.h" namespace llvm { - class ARMSubtarget; +class ARMSubtarget; +class ScheduleHazardRecognizer; class Thumb2InstrInfo : public ARMBaseInstrInfo { Thumb2RegisterInfo RI; @@ -31,12 +32,21 @@ public: // if there is not such an opcode. unsigned getUnindexedOpcode(unsigned Opc) const; - bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, + MachineBasicBlock *NewDest) const; + + bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; + + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs) const; + + bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTInstrs, + MachineBasicBlock &FMBB, unsigned NumFInstrs) const; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -50,12 +60,27 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; + /// scheduleTwoAddrSource - Schedule the copy / re-mat of the source of the + /// two-addrss instruction inserted by two-address pass. + void scheduleTwoAddrSource(MachineInstr *SrcMI, MachineInstr *UseMI, + const TargetRegisterInfo &TRI) const; + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// const Thumb2RegisterInfo &getRegisterInfo() const { return RI; } + + ScheduleHazardRecognizer * + CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const; }; + +/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical +/// to llvm::getInstrPredicate except it returns AL for conditional branch +/// instructions which are "predicated", but are not in IT blocks. 
+ARMCC::CondCodes getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+
+
 }
 
 #endif // THUMB2INSTRUCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
index 8fe2e42..ba392f3 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -451,11 +451,18 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
   if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
     return false;
 
-  const TargetInstrDesc &TID = MI->getDesc();
   unsigned Reg0 = MI->getOperand(0).getReg();
   unsigned Reg1 = MI->getOperand(1).getReg();
-  if (Reg0 != Reg1)
-    return false;
+  if (Reg0 != Reg1) {
+    // Try to commute the operands to make it a 2-address instruction.
+    unsigned CommOpIdx1, CommOpIdx2;
+    if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) ||
+        CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0)
+      return false;
+    MachineInstr *CommutedMI = TII->commuteInstruction(MI);
+    if (!CommutedMI)
+      return false;
+  }
   if (Entry.LowRegs2 && !isARMLowRegister(Reg0))
     return false;
   if (Entry.Imm2Limit) {
@@ -484,6 +491,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
 
   bool HasCC = false;
   bool CCDead = false;
+  const TargetInstrDesc &TID = MI->getDesc();
   if (TID.hasOptionalDef()) {
     unsigned NumOps = TID.getNumOperands();
     HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
@@ -689,7 +697,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
       goto ProcessNext;
     }
 
-    // Try to transform ro a 16-bit non-two-address instruction.
+    // Try to transform to a 16-bit non-two-address instruction.
    if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) {
       Modified = true;
       MachineBasicBlock::iterator I = prior(NextMII);
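
The Thumb2ITBlockPass hunk earlier in this patch assembles the immediate operand of the t2IT instruction incrementally: each additional instruction folded into the block contributes (NCC & 1) at the current mask position, a terminating 1 bit is OR'd in once the block ends, and firstcond[0] is tagged along in bit 4. The standalone C++ sketch below mirrors that arithmetic outside of LLVM so the encoding can be sanity-checked in isolation; the helper name encodeITMask, the starting bit position of 3, and the small driver are illustrative assumptions rather than LLVM API.

// Standalone sketch, not LLVM code: mirrors the IT-block mask arithmetic from
// the Thumb2ITBlockPass hunk in this patch.  firstCondLowBit stands in for
// (CC & 1) of the first predicated instruction; slotSameAsFirst[i] is true
// when the (i+2)-th instruction uses the same condition ("then") and false
// when it uses the opposite condition ("else").
#include <cassert>
#include <cstdio>
#include <vector>

static unsigned encodeITMask(unsigned firstCondLowBit,
                             const std::vector<bool> &slotSameAsFirst) {
  assert(slotSameAsFirst.size() <= 3 && "an IT block holds at most 4 instructions");
  firstCondLowBit &= 1;                   // only the low bit of the condition matters here
  unsigned Mask = 0;
  unsigned Pos = 3;                       // assumed start; the pass walks Pos downward
  for (bool Same : slotSameAsFirst) {
    unsigned Bit = Same ? firstCondLowBit : (firstCondLowBit ^ 1);
    Mask |= Bit << Pos;                   // Mask |= (NCC & 1) << Pos;
    --Pos;
  }
  Mask |= 1u << Pos;                      // finalize: Mask |= (1 << Pos);
  Mask |= firstCondLowBit << 4;           // tag along (firstcond[0] << 4)
  return Mask;
}

int main() {
  // ITTE EQ: first instruction EQ, second EQ ("then"), third NE ("else").
  // ARMCC::EQ is 0, so firstcond[0] == 0 and the low nibble should be 0b0110.
  unsigned M = encodeITMask(/*firstCondLowBit=*/0, {true, false});
  assert((M & 0xf) == 0x6 && (M >> 4) == 0);
  std::printf("t2IT mask operand: 0x%x\n", M);
  return 0;
}

An empty slot list yields 0b1000 in the low nibble, the single-instruction IT case, which is consistent with the terminating-bit rule. The (firstCondLowBit ^ 1) for an "else" slot works because opposite ARM condition codes differ only in their low bit, which is also why the pass itself can get away with OR-ing in (NCC & 1).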