diff options
Diffstat (limited to 'contrib/llvm/lib/Target/PowerPC')
50 files changed, 3663 insertions, 1007 deletions
diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 12ffbfd..11d2237 100644 --- a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -204,6 +204,17 @@ static const unsigned G8Regs[] = { PPC::X28, PPC::X29, PPC::X30, PPC::X31 }; +static const unsigned G80Regs[] = { + PPC::ZERO8, PPC::X1, PPC::X2, PPC::X3, + PPC::X4, PPC::X5, PPC::X6, PPC::X7, + PPC::X8, PPC::X9, PPC::X10, PPC::X11, + PPC::X12, PPC::X13, PPC::X14, PPC::X15, + PPC::X16, PPC::X17, PPC::X18, PPC::X19, + PPC::X20, PPC::X21, PPC::X22, PPC::X23, + PPC::X24, PPC::X25, PPC::X26, PPC::X27, + PPC::X28, PPC::X29, PPC::X30, PPC::X31 +}; + static const unsigned QFRegs[] = { PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, @@ -301,6 +312,12 @@ static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo, return decodeRegisterClass(Inst, RegNo, G8Regs); } +static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, G80Regs); +} + #define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass #define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index 609d959..baf5902 100644 --- a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -12,9 +12,9 @@ //===----------------------------------------------------------------------===// #include "PPCInstPrinter.h" -#include "PPCInstrInfo.h" #include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCPredicates.h" +#include "PPCInstrInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include 
"llvm/MC/MCInstrInfo.h" @@ -95,7 +95,8 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } - if (MI->getOpcode() == PPC::RLDICR) { + if (MI->getOpcode() == PPC::RLDICR || + MI->getOpcode() == PPC::RLDICR_32) { unsigned char SH = MI->getOperand(2).getImm(); unsigned char ME = MI->getOperand(3).getImm(); // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 5847b3a..bdad2fe 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -7,8 +7,10 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCFixupKinds.h" +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCELFObjectWriter.h" @@ -18,9 +20,7 @@ #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachO.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -113,8 +113,9 @@ public: return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind]; } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsResolved) const override { Value = adjustFixupValue(Fixup.getKind(), Value); if (!Value) return; // Doesn't change encoding. 
@@ -130,12 +131,11 @@ public: } } - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override { + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override { switch ((PPC::Fixups)Fixup.getKind()) { - default: break; + default: + return false; case PPC::fixup_ppc_br24: case PPC::fixup_ppc_br24abs: // If the target symbol has a local entry point we must not attempt @@ -148,10 +148,10 @@ public: // and thus the shift to pack it. unsigned Other = S->getOther() << 2; if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0) - IsResolved = false; + return true; } } - break; + return false; } } diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index fd279c6..1488bd5 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -7,9 +7,9 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCFixupKinds.h" #include "MCTargetDesc/PPCMCExpr.h" +#include "MCTargetDesc/PPCMCTargetDesc.h" #include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index ae43e59d..dce4439 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -17,35 +17,31 @@ namespace llvm { namespace PPC { enum Fixups { - // fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b' - // and 'bl'. + // 24-bit PC relative relocation for direct branches like 'b' and 'bl'. 
fixup_ppc_br24 = FirstTargetFixupKind, - - /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional - /// branches. + + /// 14-bit PC relative relocation for conditional branches. fixup_ppc_brcond14, - - /// fixup_ppc_br24abs - 24-bit absolute relocation for direct branches - /// like 'ba' and 'bla'. + + /// 24-bit absolute relocation for direct branches like 'ba' and 'bla'. fixup_ppc_br24abs, - /// fixup_ppc_brcond14abs - 14-bit absolute relocation for conditional - /// branches. + /// 14-bit absolute relocation for conditional branches. fixup_ppc_brcond14abs, - /// fixup_ppc_half16 - A 16-bit fixup corresponding to lo16(_foo) - /// or ha16(_foo) for instrs like 'li' or 'addis'. + /// A 16-bit fixup corresponding to lo16(_foo) or ha16(_foo) for instrs like + /// 'li' or 'addis'. fixup_ppc_half16, - - /// fixup_ppc_half16ds - A 14-bit fixup corresponding to lo16(_foo) with - /// implied 2 zero bits for instrs like 'std'. + + /// A 14-bit fixup corresponding to lo16(_foo) with implied 2 zero bits for + /// instrs like 'std'. fixup_ppc_half16ds, - /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call - /// to __tls_get_addr for the TLS general and local dynamic models, - /// or inserts the thread-pointer register number. + /// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the + /// TLS general and local dynamic models, or inserts the thread-pointer + /// register number. 
fixup_ppc_nofixup, - + // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index d8fab5b..d30bf1a 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -20,7 +20,7 @@ void PPCMCAsmInfoDarwin::anchor() { } PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) { if (is64Bit) { - PointerSize = CalleeSaveStackSlotSize = 8; + CodePointerSize = CalleeSaveStackSlotSize = 8; } IsLittleEndian = false; @@ -50,7 +50,7 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) { NeedsLocalForSize = true; if (is64Bit) { - PointerSize = CalleeSaveStackSlotSize = 8; + CodePointerSize = CalleeSaveStackSlotSize = 8; } IsLittleEndian = T.getArch() == Triple::ppc64le; diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 017d21a..92c8c22 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -11,22 +11,28 @@ // //===----------------------------------------------------------------------===// -#include "PPCInstrInfo.h" -#include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCFixupKinds.h" +#include "PPCInstrInfo.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Triple.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Endian.h" #include 
"llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetOpcodes.h" +#include <cassert> +#include <cstdint> + using namespace llvm; #define DEBUG_TYPE "mccodeemitter" @@ -34,10 +40,8 @@ using namespace llvm; STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); namespace { -class PPCMCCodeEmitter : public MCCodeEmitter { - PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete; - void operator=(const PPCMCCodeEmitter &) = delete; +class PPCMCCodeEmitter : public MCCodeEmitter { const MCInstrInfo &MCII; const MCContext &CTX; bool IsLittleEndian; @@ -46,8 +50,9 @@ public: PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) : MCII(mcii), CTX(ctx), IsLittleEndian(ctx.getAsmInfo()->isLittleEndian()) {} - - ~PPCMCCodeEmitter() override {} + PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete; + void operator=(const PPCMCCodeEmitter &) = delete; + ~PPCMCCodeEmitter() override = default; unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, @@ -103,6 +108,7 @@ public: uint64_t getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override { @@ -137,7 +143,7 @@ public: } break; default: - llvm_unreachable ("Invalid instruction size"); + llvm_unreachable("Invalid instruction size"); } ++MCNumEmitted; // Keep track of the # of mi's emitted. 
@@ -238,7 +244,6 @@ unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo, return RegBits; } - unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -266,7 +271,8 @@ unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo, unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 12; const MCOperand &MO = MI.getOperand(OpNo); - assert(MO.isImm()); + assert(MO.isImm() && !(MO.getImm() % 16) && + "Expecting an immediate that is a multiple of 16"); return ((getMachineOpValue(MI, MO, Fixups, STI) >> 4) & 0xFFF) | RegBits; } @@ -286,7 +292,6 @@ unsigned PPCMCCodeEmitter::getSPE8DisEncoding(const MCInst &MI, unsigned OpNo, return reverseBits(Imm | RegBits) >> 22; } - unsigned PPCMCCodeEmitter::getSPE4DisEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) @@ -302,7 +307,6 @@ unsigned PPCMCCodeEmitter::getSPE4DisEncoding(const MCInst &MI, unsigned OpNo, return reverseBits(Imm | RegBits) >> 22; } - unsigned PPCMCCodeEmitter::getSPE2DisEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) @@ -318,7 +322,6 @@ unsigned PPCMCCodeEmitter::getSPE2DisEncoding(const MCInst &MI, unsigned OpNo, return reverseBits(Imm | RegBits) >> 22; } - unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -383,7 +386,5 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, return MO.getImm(); } - - #define ENABLE_INSTR_PREDICATE_VERIFIER #include "PPCGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp index 6b97d4c..54f6643 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp +++ 
b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#include "PPCFixupKinds.h" #include "PPCMCExpr.h" +#include "PPCFixupKinds.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index bbd10e5..e8f220e 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -11,23 +11,30 @@ // //===----------------------------------------------------------------------===// -#include "PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCMCTargetDesc.h" #include "InstPrinter/PPCInstPrinter.h" -#include "PPCMCAsmInfo.h" +#include "MCTargetDesc/PPCMCAsmInfo.h" #include "PPCTargetStreamer.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/MC/MachineLocation.h" -#include "llvm/Support/ELF.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -41,9 +48,10 @@ using namespace llvm; #include "PPCGenRegisterInfo.inc" // Pin the vtable to this file. 
-PPCTargetStreamer::~PPCTargetStreamer() {} PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} +PPCTargetStreamer::~PPCTargetStreamer() = default; + static MCInstrInfo *createPPCMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitPPCMCInstrInfo(X); @@ -96,12 +104,14 @@ static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, } namespace { + class PPCTargetAsmStreamer : public PPCTargetStreamer { formatted_raw_ostream &OS; public: PPCTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) : PPCTargetStreamer(S), OS(OS) {} + void emitTCEntry(const MCSymbol &S) override { OS << "\t.tc "; OS << S.getName(); @@ -109,12 +119,15 @@ public: OS << S.getName(); OS << '\n'; } + void emitMachine(StringRef CPU) override { OS << "\t.machine " << CPU << '\n'; } + void emitAbiVersion(int AbiVersion) override { OS << "\t.abiversion " << AbiVersion << '\n'; } + void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override { const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo(); @@ -129,18 +142,22 @@ public: class PPCTargetELFStreamer : public PPCTargetStreamer { public: PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {} + MCELFStreamer &getStreamer() { return static_cast<MCELFStreamer &>(Streamer); } + void emitTCEntry(const MCSymbol &S) override { // Creates a R_PPC64_TOC relocation Streamer.EmitValueToAlignment(8); Streamer.EmitSymbolValue(&S, 8); } + void emitMachine(StringRef CPU) override { // FIXME: Is there anything to do in here or does this directive only // limit the parser? 
} + void emitAbiVersion(int AbiVersion) override { MCAssembler &MCA = getStreamer().getAssembler(); unsigned Flags = MCA.getELFHeaderEFlags(); @@ -148,6 +165,7 @@ public: Flags |= (AbiVersion & ELF::EF_PPC64_ABI); MCA.setELFHeaderEFlags(Flags); } + void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override { MCAssembler &MCA = getStreamer().getAssembler(); @@ -170,6 +188,7 @@ public: if ((Flags & ELF::EF_PPC64_ABI) == 0) MCA.setELFHeaderEFlags(Flags | 2); } + void emitAssignment(MCSymbol *S, const MCExpr *Value) override { auto *Symbol = cast<MCSymbolELF>(S); // When encoding an assignment to set symbol A to symbol B, also copy @@ -188,21 +207,26 @@ public: class PPCTargetMachOStreamer : public PPCTargetStreamer { public: PPCTargetMachOStreamer(MCStreamer &S) : PPCTargetStreamer(S) {} + void emitTCEntry(const MCSymbol &S) override { llvm_unreachable("Unknown pseudo-op: .tc"); } + void emitMachine(StringRef CPU) override { // FIXME: We should update the CPUType, CPUSubType in the Object file if // the new values are different from the defaults. 
} + void emitAbiVersion(int AbiVersion) override { llvm_unreachable("Unknown pseudo-op: .abiversion"); } + void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override { llvm_unreachable("Unknown pseudo-op: .localentry"); } }; -} + +} // end anonymous namespace static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 0989e0c..893233e 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -17,23 +17,22 @@ // GCC #defines PPC on Linux but we use it as our namespace name #undef PPC -#include "llvm/Support/DataTypes.h" #include "llvm/Support/MathExtras.h" +#include <cstdint> namespace llvm { + class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; class MCObjectWriter; class MCRegisterInfo; -class MCSubtargetInfo; class MCTargetOptions; class Target; class Triple; class StringRef; class raw_pwrite_stream; -class raw_ostream; Target &getThePPC32Target(); Target &getThePPC64Target(); @@ -83,7 +82,7 @@ static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) { return false; } -} // End llvm namespace +} // end namespace llvm // Generated files will use "namespace PPC". To avoid symbol clash, // undefine PPC here. PPC may be predefined on some hosts. 
@@ -103,4 +102,4 @@ static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) { #define GET_SUBTARGETINFO_ENUM #include "PPCGenSubtargetInfo.inc" -#endif +#endif // LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp index 1f38a8c..d550627 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -7,9 +7,10 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCFixupKinds.h" +#include "MCTargetDesc/PPCMCTargetDesc.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -18,7 +19,6 @@ #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" -#include "llvm/Support/MachO.h" using namespace llvm; @@ -151,7 +151,7 @@ static void makeRelocationInfo(MachO::any_relocation_info &MRE, // The bitfield offsets that work (as determined by trial-and-error) // are different than what is documented in the mach-o manuals. // This appears to be an endianness issue; reversing the order of the - // documented bitfields in <llvm/Support/MachO.h> fixes this (but + // documented bitfields in <llvm/BinaryFormat/MachO.h> fixes this (but // breaks x86/ARM assembly). MRE.r_word1 = ((Index << 8) | // was << 0 (IsPCRel << 7) | // was << 24 @@ -219,11 +219,11 @@ bool PPCMachObjectWriter::recordScatteredRelocation( const MCSymbol *SB = &B->getSymbol(); if (!SB->getFragment()) - report_fatal_error("symbol '" + B->getSymbol().getName() + + report_fatal_error("symbol '" + SB->getName() + "' can not be undefined in a subtraction expression"); - // FIXME: is Type correct? 
see include/llvm/Support/MachO.h - Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout); + // FIXME: is Type correct? see include/llvm/BinaryFormat/MachO.h + Value2 = Writer->getSymbolAddress(*SB, Layout); FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent()); } // FIXME: does FixedValue get used?? diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h index e01f49d..ad92ac8 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPC.h +++ b/contrib/llvm/lib/Target/PowerPC/PPC.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_POWERPC_PPC_H #define LLVM_LIB_TARGET_POWERPC_PPC_H +#include "llvm/Support/CodeGen.h" #include "MCTargetDesc/PPCMCTargetDesc.h" // GCC #defines PPC on Linux but we use it as our namespace name @@ -24,12 +25,11 @@ namespace llvm { class PPCTargetMachine; class PassRegistry; class FunctionPass; - class ImmutablePass; class MachineInstr; class AsmPrinter; class MCInst; - FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM); + FunctionPass *createPPCCTRLoops(); #ifndef NDEBUG FunctionPass *createPPCCTRLoopsVerify(); #endif @@ -42,14 +42,17 @@ namespace llvm { FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCQPXLoadSplatPass(); - FunctionPass *createPPCISelDag(PPCTargetMachine &TM); + FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL); FunctionPass *createPPCTLSDynamicCallPass(); FunctionPass *createPPCBoolRetToIntPass(); + FunctionPass *createPPCExpandISELPass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP, bool isDarwin); void initializePPCVSXFMAMutatePass(PassRegistry&); void initializePPCBoolRetToIntPass(PassRegistry&); + void initializePPCExpandISELPass(PassRegistry &); + void initializePPCTLSDynamicCallPass(PassRegistry &); extern char &PPCVSXFMAMutateID; namespace PPCII { diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp 
b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index f0e0ebc..841b8c5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -16,11 +16,11 @@ // //===----------------------------------------------------------------------===// -#include "PPC.h" -#include "PPCInstrInfo.h" #include "InstPrinter/PPCInstPrinter.h" #include "MCTargetDesc/PPCMCExpr.h" #include "MCTargetDesc/PPCMCTargetDesc.h" +#include "PPC.h" +#include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" @@ -29,6 +29,8 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/BinaryFormat/MachO.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -55,11 +57,9 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachO.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include <algorithm> #include <cassert> @@ -112,7 +112,9 @@ public: void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK); bool runOnMachineFunction(MachineFunction &MF) override { Subtarget = &MF.getSubtarget<PPCSubtarget>(); - return AsmPrinter::runOnMachineFunction(MF); + bool Changed = AsmPrinter::runOnMachineFunction(MF); + emitXRayTable(); + return Changed; } }; @@ -134,6 +136,7 @@ public: void EmitFunctionBodyStart() override; void EmitFunctionBodyEnd() override; + void EmitInstruction(const MachineInstr *MI) override; }; /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac @@ -402,7 +405,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const 
MachineInstr &MI) { .addImm(CallTarget & 0xFFFF)); // Save the current TOC pointer before the remote call. - int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40; + int TOCSaveOffset = Subtarget->getFrameLowering()->getTOCSaveOffset(); EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD) .addReg(PPC::X2) .addImm(TOCSaveOffset) @@ -1046,6 +1049,97 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); } +void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) { + if (!Subtarget->isPPC64()) + return PPCAsmPrinter::EmitInstruction(MI); + + switch (MI->getOpcode()) { + default: + return PPCAsmPrinter::EmitInstruction(MI); + case TargetOpcode::PATCHABLE_FUNCTION_ENTER: { + // .begin: + // b .end # lis 0, FuncId[16..32] + // nop # li 0, FuncId[0..15] + // std 0, -8(1) + // mflr 0 + // bl __xray_FunctionEntry + // mtlr 0 + // .end: + // + // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number + // of instructions change. + MCSymbol *BeginOfSled = OutContext.createTempSymbol(); + MCSymbol *EndOfSled = OutContext.createTempSymbol(); + OutStreamer->EmitLabel(BeginOfSled); + EmitToStreamer(*OutStreamer, + MCInstBuilder(PPC::B).addExpr( + MCSymbolRefExpr::create(EndOfSled, OutContext))); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(PPC::STD).addReg(PPC::X0).addImm(-8).addReg(PPC::X1)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR8).addReg(PPC::X0)); + EmitToStreamer(*OutStreamer, + MCInstBuilder(PPC::BL8_NOP) + .addExpr(MCSymbolRefExpr::create( + OutContext.getOrCreateSymbol("__xray_FunctionEntry"), + OutContext))); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0)); + OutStreamer->EmitLabel(EndOfSled); + recordSled(BeginOfSled, *MI, SledKind::FUNCTION_ENTER); + break; + } + case TargetOpcode::PATCHABLE_FUNCTION_EXIT: { + // .p2align 3 + // .begin: + // b(lr)? 
# lis 0, FuncId[16..32] + // nop # li 0, FuncId[0..15] + // std 0, -8(1) + // mflr 0 + // bl __xray_FunctionExit + // mtlr 0 + // .end: + // b(lr)? + // + // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number + // of instructions change. + const MachineInstr *Next = [&] { + MachineBasicBlock::const_iterator It(MI); + assert(It != MI->getParent()->end()); + ++It; + assert(It->isReturn()); + return &*It; + }(); + OutStreamer->EmitCodeAlignment(8); + MCSymbol *BeginOfSled = OutContext.createTempSymbol(); + OutStreamer->EmitLabel(BeginOfSled); + MCInst TmpInst; + LowerPPCMachineInstrToMCInst(Next, TmpInst, *this, false); + EmitToStreamer(*OutStreamer, TmpInst); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(PPC::STD).addReg(PPC::X0).addImm(-8).addReg(PPC::X1)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR8).addReg(PPC::X0)); + EmitToStreamer(*OutStreamer, + MCInstBuilder(PPC::BL8_NOP) + .addExpr(MCSymbolRefExpr::create( + OutContext.getOrCreateSymbol("__xray_FunctionExit"), + OutContext))); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0)); + recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT); + break; + } + case TargetOpcode::PATCHABLE_TAIL_CALL: + case TargetOpcode::PATCHABLE_RET: + // PPC's tail call instruction, e.g. PPC::TCRETURNdi8, doesn't really + // lower to a PPC::B instruction. The PPC::B instruction is generated + // before it, and handled by the normal case. + llvm_unreachable("Tail call is handled in the normal case. 
See comments" + "around this assert."); + } +} + void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) { if (static_cast<const PPCTargetMachine &>(TM).isELFv2ABI()) { PPCTargetStreamer *TS = diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp index 93c201d..55e105d 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp @@ -7,15 +7,15 @@ // //===----------------------------------------------------------------------===// // -// This file implements converting i1 values to i32 if they could be more +// This file implements converting i1 values to i32/i64 if they could be more // profitably allocated as GPRs rather than CRs. This pass will become totally // unnecessary if Register Bank Allocation and Global Instruction Selection ever // go upstream. // -// Presently, the pass converts i1 Constants, and Arguments to i32 if the +// Presently, the pass converts i1 Constants, and Arguments to i32/i64 if the // transitive closure of their uses includes only PHINodes, CallInsts, and // ReturnInsts. The rational is that arguments are generally passed and returned -// in GPRs rather than CRs, so casting them to i32 at the LLVM IR level will +// in GPRs rather than CRs, so casting them to i32/i64 at the LLVM IR level will // actually save casts at the Machine Instruction level. 
// // It might be useful to expand this pass to add bit-wise operations to the list @@ -33,11 +33,12 @@ //===----------------------------------------------------------------------===// #include "PPC.h" +#include "PPCTargetMachine.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" @@ -50,8 +51,9 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" -#include "llvm/Support/Casting.h" #include "llvm/Pass.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Casting.h" #include <cassert> using namespace llvm; @@ -87,17 +89,19 @@ class PPCBoolRetToInt : public FunctionPass { return Defs; } - // Translate a i1 value to an equivalent i32 value: - static Value *translate(Value *V) { - Type *Int32Ty = Type::getInt32Ty(V->getContext()); + // Translate a i1 value to an equivalent i32/i64 value: + Value *translate(Value *V) { + Type *IntTy = ST->isPPC64() ? Type::getInt64Ty(V->getContext()) + : Type::getInt32Ty(V->getContext()); + if (auto *C = dyn_cast<Constant>(V)) - return ConstantExpr::getZExt(C, Int32Ty); + return ConstantExpr::getZExt(C, IntTy); if (auto *P = dyn_cast<PHINode>(V)) { // Temporarily set the operands to 0. We'll fix this later in // runOnUse. - Value *Zero = Constant::getNullValue(Int32Ty); + Value *Zero = Constant::getNullValue(IntTy); PHINode *Q = - PHINode::Create(Int32Ty, P->getNumIncomingValues(), P->getName(), P); + PHINode::Create(IntTy, P->getNumIncomingValues(), P->getName(), P); for (unsigned i = 0; i < P->getNumOperands(); ++i) Q->addIncoming(Zero, P->getIncomingBlock(i)); return Q; @@ -109,7 +113,7 @@ class PPCBoolRetToInt : public FunctionPass { auto InstPt = A ? 
&*A->getParent()->getEntryBlock().begin() : I->getNextNode(); - return new ZExtInst(V, Int32Ty, "", InstPt); + return new ZExtInst(V, IntTy, "", InstPt); } typedef SmallPtrSet<const PHINode *, 8> PHINodeSet; @@ -185,6 +189,13 @@ class PPCBoolRetToInt : public FunctionPass { if (skipFunction(F)) return false; + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + auto &TM = TPC->getTM<PPCTargetMachine>(); + ST = TM.getSubtargetImpl(F); + PHINodeSet PromotablePHINodes = getPromotablePHINodes(F); B2IMap Bool2IntMap; bool Changed = false; @@ -205,7 +216,7 @@ class PPCBoolRetToInt : public FunctionPass { return Changed; } - static bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes, + bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes, B2IMap &BoolToIntMap) { auto Defs = findAllDefs(U); @@ -262,13 +273,16 @@ class PPCBoolRetToInt : public FunctionPass { AU.addPreserved<DominatorTreeWrapperPass>(); FunctionPass::getAnalysisUsage(AU); } + +private: + const PPCSubtarget *ST; }; } // end anonymous namespace char PPCBoolRetToInt::ID = 0; INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int", - "Convert i1 constants to i32 if they are returned", + "Convert i1 constants to i32/i64 if they are returned", false, false) FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp index ae76386..d0b66f9 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -15,8 +15,8 @@ // //===----------------------------------------------------------------------===// -#include "PPC.h" #include "MCTargetDesc/PPCPredicates.h" +#include "PPC.h" #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" #include "PPCSubtarget.h" @@ -78,7 +78,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { 
BlockSizes.resize(Fn.getNumBlockIDs()); auto GetAlignmentAdjustment = - [TII](MachineBasicBlock &MBB, unsigned Offset) -> unsigned { + [](MachineBasicBlock &MBB, unsigned Offset) -> unsigned { unsigned Align = MBB.getAlignment(); if (!Align) return 0; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp index 2c62a0f..53f33ac 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -23,14 +23,15 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" #include "PPC.h" +#include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" @@ -43,6 +44,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -81,10 +83,7 @@ namespace { public: static char ID; - PPCCTRLoops() : FunctionPass(ID), TM(nullptr) { - initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); - } - PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { + PPCCTRLoops() : FunctionPass(ID) { initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); } @@ -99,16 +98,18 @@ namespace { } private: - bool mightUseCTR(const Triple &TT, BasicBlock *BB); + bool mightUseCTR(BasicBlock *BB); bool convertToCTRLoop(Loop *L); private: - PPCTargetMachine *TM; + const PPCTargetMachine *TM; + const PPCSubtarget *STI; + const PPCTargetLowering *TLI; + const DataLayout 
*DL; + const TargetLibraryInfo *LibInfo; LoopInfo *LI; ScalarEvolution *SE; - const DataLayout *DL; DominatorTree *DT; - const TargetLibraryInfo *LibInfo; bool PreserveLCSSA; }; @@ -149,9 +150,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", false, false) -FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) { - return new PPCCTRLoops(TM); -} +FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); } #ifndef NDEBUG INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", @@ -169,6 +168,14 @@ bool PPCCTRLoops::runOnFunction(Function &F) { if (skipFunction(F)) return false; + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + TM = &TPC->getTM<PPCTargetMachine>(); + STI = TM->getSubtargetImpl(F); + TLI = STI->getTargetLowering(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -198,8 +205,7 @@ static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) { // Determining the address of a TLS variable results in a function call in // certain TLS models. -static bool memAddrUsesCTR(const PPCTargetMachine *TM, - const Value *MemAddr) { +static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) { const auto *GV = dyn_cast<GlobalValue>(MemAddr); if (!GV) { // Recurse to check for constants that refer to TLS global variables. @@ -213,35 +219,35 @@ static bool memAddrUsesCTR(const PPCTargetMachine *TM, if (!GV->isThreadLocal()) return false; - if (!TM) - return true; - TLSModel::Model Model = TM->getTLSModel(GV); + TLSModel::Model Model = TM.getTLSModel(GV); return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic; } -bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { +// Loop through the inline asm constraints and look for something that clobbers +// ctr. 
+static bool asmClobbersCTR(InlineAsm *IA) { + InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); + for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { + InlineAsm::ConstraintInfo &C = CIV[i]; + if (C.Type != InlineAsm::isInput) + for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) + if (StringRef(C.Codes[j]).equals_lower("{ctr}")) + return true; + } + return false; +} + +bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J) { if (CallInst *CI = dyn_cast<CallInst>(J)) { + // Inline ASM is okay, unless it clobbers the ctr register. if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) { - // Inline ASM is okay, unless it clobbers the ctr register. - InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); - for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { - InlineAsm::ConstraintInfo &C = CIV[i]; - if (C.Type != InlineAsm::isInput) - for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) - if (StringRef(C.Codes[j]).equals_lower("{ctr}")) - return true; - } - + if (asmClobbersCTR(IA)) + return true; continue; } - if (!TM) - return true; - const TargetLowering *TLI = - TM->getSubtargetImpl(*BB->getParent())->getTargetLowering(); - if (Function *F = CI->getCalledFunction()) { // Most intrinsics don't become function calls, but some might. // sin, cos, exp and log are always calls. @@ -298,15 +304,17 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { return true; else continue; // ISD::FCOPYSIGN is never a library call. 
- case Intrinsic::sqrt: Opcode = ISD::FSQRT; break; - case Intrinsic::floor: Opcode = ISD::FFLOOR; break; - case Intrinsic::ceil: Opcode = ISD::FCEIL; break; - case Intrinsic::trunc: Opcode = ISD::FTRUNC; break; - case Intrinsic::rint: Opcode = ISD::FRINT; break; - case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break; - case Intrinsic::round: Opcode = ISD::FROUND; break; - case Intrinsic::minnum: Opcode = ISD::FMINNUM; break; - case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break; + case Intrinsic::sqrt: Opcode = ISD::FSQRT; break; + case Intrinsic::floor: Opcode = ISD::FFLOOR; break; + case Intrinsic::ceil: Opcode = ISD::FCEIL; break; + case Intrinsic::trunc: Opcode = ISD::FTRUNC; break; + case Intrinsic::rint: Opcode = ISD::FRINT; break; + case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break; + case Intrinsic::round: Opcode = ISD::FROUND; break; + case Intrinsic::minnum: Opcode = ISD::FMINNUM; break; + case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break; + case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break; + case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break; } } @@ -315,7 +323,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { // (i.e. soft float or atomics). If adapting for targets that do, // additional care is required here. - LibFunc::Func Func; + LibFunc Func; if (!F->hasLocalLinkage() && F->hasName() && LibInfo && LibInfo->getLibFunc(F->getName(), Func) && LibInfo->hasOptimizedCodeGen(Func)) { @@ -329,58 +337,57 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { switch (Func) { default: return true; - case LibFunc::copysign: - case LibFunc::copysignf: + case LibFunc_copysign: + case LibFunc_copysignf: continue; // ISD::FCOPYSIGN is never a library call. 
- case LibFunc::copysignl: + case LibFunc_copysignl: return true; - case LibFunc::fabs: - case LibFunc::fabsf: - case LibFunc::fabsl: + case LibFunc_fabs: + case LibFunc_fabsf: + case LibFunc_fabsl: continue; // ISD::FABS is never a library call. - case LibFunc::sqrt: - case LibFunc::sqrtf: - case LibFunc::sqrtl: + case LibFunc_sqrt: + case LibFunc_sqrtf: + case LibFunc_sqrtl: Opcode = ISD::FSQRT; break; - case LibFunc::floor: - case LibFunc::floorf: - case LibFunc::floorl: + case LibFunc_floor: + case LibFunc_floorf: + case LibFunc_floorl: Opcode = ISD::FFLOOR; break; - case LibFunc::nearbyint: - case LibFunc::nearbyintf: - case LibFunc::nearbyintl: + case LibFunc_nearbyint: + case LibFunc_nearbyintf: + case LibFunc_nearbyintl: Opcode = ISD::FNEARBYINT; break; - case LibFunc::ceil: - case LibFunc::ceilf: - case LibFunc::ceill: + case LibFunc_ceil: + case LibFunc_ceilf: + case LibFunc_ceill: Opcode = ISD::FCEIL; break; - case LibFunc::rint: - case LibFunc::rintf: - case LibFunc::rintl: + case LibFunc_rint: + case LibFunc_rintf: + case LibFunc_rintl: Opcode = ISD::FRINT; break; - case LibFunc::round: - case LibFunc::roundf: - case LibFunc::roundl: + case LibFunc_round: + case LibFunc_roundf: + case LibFunc_roundl: Opcode = ISD::FROUND; break; - case LibFunc::trunc: - case LibFunc::truncf: - case LibFunc::truncl: + case LibFunc_trunc: + case LibFunc_truncf: + case LibFunc_truncl: Opcode = ISD::FTRUNC; break; - case LibFunc::fmin: - case LibFunc::fminf: - case LibFunc::fminl: + case LibFunc_fmin: + case LibFunc_fminf: + case LibFunc_fminl: Opcode = ISD::FMINNUM; break; - case LibFunc::fmax: - case LibFunc::fmaxf: - case LibFunc::fmaxl: + case LibFunc_fmax: + case LibFunc_fmaxf: + case LibFunc_fmaxl: Opcode = ISD::FMAXNUM; break; } } if (Opcode) { - auto &DL = CI->getModule()->getDataLayout(); - MVT VTy = TLI->getSimpleValueType(DL, CI->getArgOperand(0)->getType(), - true); + MVT VTy = TLI->getSimpleValueType( + *DL, CI->getArgOperand(0)->getType(), true); if (VTy == 
MVT::Other) return true; @@ -404,17 +411,17 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { CastInst *CI = cast<CastInst>(J); if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() || CI->getDestTy()->getScalarType()->isPPC_FP128Ty() || - isLargeIntegerTy(TT.isArch32Bit(), CI->getSrcTy()->getScalarType()) || - isLargeIntegerTy(TT.isArch32Bit(), CI->getDestTy()->getScalarType())) + isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) || + isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType())) return true; - } else if (isLargeIntegerTy(TT.isArch32Bit(), + } else if (isLargeIntegerTy(!TM->isPPC64(), J->getType()->getScalarType()) && (J->getOpcode() == Instruction::UDiv || J->getOpcode() == Instruction::SDiv || J->getOpcode() == Instruction::URem || J->getOpcode() == Instruction::SRem)) { return true; - } else if (TT.isArch32Bit() && + } else if (!TM->isPPC64() && isLargeIntegerTy(false, J->getType()->getScalarType()) && (J->getOpcode() == Instruction::Shl || J->getOpcode() == Instruction::AShr || @@ -426,16 +433,11 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { // On PowerPC, indirect jumps use the counter register. 
return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) { - if (!TM) - return true; - const TargetLowering *TLI = - TM->getSubtargetImpl(*BB->getParent())->getTargetLowering(); - if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries()) return true; } - if (TM->getSubtargetImpl(*BB->getParent())->getTargetLowering()->useSoftFloat()) { + if (STI->useSoftFloat()) { switch(J->getOpcode()) { case Instruction::FAdd: case Instruction::FSub: @@ -454,7 +456,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { } for (Value *Operand : J->operands()) - if (memAddrUsesCTR(TM, Operand)) + if (memAddrUsesCTR(*TM, Operand)) return true; } @@ -464,11 +466,6 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { bool PPCCTRLoops::convertToCTRLoop(Loop *L) { bool MadeChange = false; - const Triple TT = - Triple(L->getHeader()->getParent()->getParent()->getTargetTriple()); - if (!TT.isArch32Bit() && !TT.isArch64Bit()) - return MadeChange; // Unknown arch. type. - // Process nested loops first. for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { MadeChange |= convertToCTRLoop(*I); @@ -493,7 +490,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // want to use the counter register if the loop contains calls. for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); I != IE; ++I) - if (mightUseCTR(TT, *I)) + if (mightUseCTR(*I)) return MadeChange; SmallVector<BasicBlock*, 4> ExitingBlocks; @@ -515,7 +512,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { } else if (!SE->isLoopInvariant(EC, L)) continue; - if (SE->getTypeSizeInBits(EC->getType()) > (TT.isArch64Bit() ? 64 : 32)) + if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 
64 : 32)) continue; // We now have a loop-invariant count of loop iterations (which is not the @@ -569,7 +566,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // preheader, then we can use it (except if the preheader contains a use of // the CTR register because some such uses might be reordered by the // selection DAG after the mtctr instruction). - if (!Preheader || mightUseCTR(TT, Preheader)) + if (!Preheader || mightUseCTR(Preheader)) Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (!Preheader) return MadeChange; @@ -580,10 +577,9 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // selected branch. MadeChange = true; - SCEVExpander SCEVE(*SE, Preheader->getModule()->getDataLayout(), "loopcnt"); + SCEVExpander SCEVE(*SE, *DL, "loopcnt"); LLVMContext &C = SE->getContext(); - Type *CountType = TT.isArch64Bit() ? Type::getInt64Ty(C) : - Type::getInt32Ty(C); + Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C); if (!ExitCount->getType()->isPointerTy() && ExitCount->getType() != CountType) ExitCount = SE->getZeroExtendExpr(ExitCount, CountType); @@ -611,7 +607,10 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // The old condition may be dead now, and may have even created a dead PHI // (the original induction variable). RecursivelyDeleteTriviallyDeadInstructions(OldCond); - DeleteDeadPHIs(CountedExitBlock); + // Run through the basic blocks of the loop and see if any of them have dead + // PHIs that can be removed. 
+ for (auto I : L->blocks()) + DeleteDeadPHIs(I); ++NumCTRLoops; return MadeChange; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp index 6bd2296..811e4dd 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#include "PPC.h" #include "MCTargetDesc/PPCPredicates.h" +#include "PPC.h" #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" diff --git a/contrib/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp b/contrib/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp new file mode 100644 index 0000000..41e3190 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp @@ -0,0 +1,458 @@ +//===------------- PPCExpandISEL.cpp - Expand ISEL instruction ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// A pass that expands the ISEL instruction into an if-then-else sequence. +// This pass must be run post-RA since all operands must be physical registers. 
+// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "PPCInstrInfo.h" +#include "PPCSubtarget.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-expand-isel" + +STATISTIC(NumExpanded, "Number of ISEL instructions expanded"); +STATISTIC(NumRemoved, "Number of ISEL instructions removed"); +STATISTIC(NumFolded, "Number of ISEL instructions folded"); + +// If -ppc-gen-isel=false is set, we will disable generating the ISEL +// instruction on all PPC targets. Otherwise, if the user set option +// -misel or the platform supports ISEL by default, still generate the +// ISEL instruction, else expand it. +static cl::opt<bool> + GenerateISEL("ppc-gen-isel", + cl::desc("Enable generating the ISEL instruction."), + cl::init(true), cl::Hidden); + +namespace { +class PPCExpandISEL : public MachineFunctionPass { + DebugLoc dl; + MachineFunction *MF; + const TargetInstrInfo *TII; + bool IsTrueBlockRequired; + bool IsFalseBlockRequired; + MachineBasicBlock *TrueBlock; + MachineBasicBlock *FalseBlock; + MachineBasicBlock *NewSuccessor; + MachineBasicBlock::iterator TrueBlockI; + MachineBasicBlock::iterator FalseBlockI; + + typedef SmallVector<MachineInstr *, 4> BlockISELList; + typedef SmallDenseMap<int, BlockISELList> ISELInstructionList; + + // A map of MBB numbers to their lists of contained ISEL instructions. + ISELInstructionList ISELInstructions; + + /// Initialize the object. 
+ void initialize(MachineFunction &MFParam); + + void handleSpecialCases(BlockISELList &BIL, MachineBasicBlock *MBB); + void reorganizeBlockLayout(BlockISELList &BIL, MachineBasicBlock *MBB); + void populateBlocks(BlockISELList &BIL); + void expandMergeableISELs(BlockISELList &BIL); + void expandAndMergeISELs(); + + bool canMerge(MachineInstr *PrevPushedMI, MachineInstr *MI); + + /// Is this instruction an ISEL or ISEL8? + static bool isISEL(const MachineInstr &MI) { + return (MI.getOpcode() == PPC::ISEL || MI.getOpcode() == PPC::ISEL8); + } + + /// Is this instruction an ISEL8? + static bool isISEL8(const MachineInstr &MI) { + return (MI.getOpcode() == PPC::ISEL8); + } + + /// Are the two operands using the same register? + bool useSameRegister(const MachineOperand &Op1, const MachineOperand &Op2) { + return (Op1.getReg() == Op2.getReg()); + } + + /// + /// Collect all ISEL instructions from the current function. + /// + /// Walk the current function and collect all the ISEL instructions that are + /// found. The instructions are placed in the ISELInstructions vector. + /// + /// \return true if any ISEL instructions were found, false otherwise + /// + bool collectISELInstructions(); + +public: + static char ID; + PPCExpandISEL() : MachineFunctionPass(ID) { + initializePPCExpandISELPass(*PassRegistry::getPassRegistry()); + } + + /// + /// Determine whether to generate the ISEL instruction or expand it. + /// + /// Expand ISEL instruction into if-then-else sequence when one of + /// the following two conditions hold: + /// (1) -ppc-gen-isel=false + /// (2) hasISEL() return false + /// Otherwise, still generate ISEL instruction. + /// The -ppc-gen-isel option is set to true by default. Which means the ISEL + /// instruction is still generated by default on targets that support them. + /// + /// \return true if ISEL should be expanded into if-then-else code sequence; + /// false if ISEL instruction should be generated, i.e. not expaned. 
+ /// + static bool isExpandISELEnabled(const MachineFunction &MF); + +#ifndef NDEBUG + void DumpISELInstructions() const; +#endif + + bool runOnMachineFunction(MachineFunction &MF) override { + if (!isExpandISELEnabled(MF)) + return false; + + DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n"); + initialize(MF); + + if (!collectISELInstructions()) { + DEBUG(dbgs() << "No ISEL instructions in this function\n"); + return false; + } + +#ifndef NDEBUG + DumpISELInstructions(); +#endif + + expandAndMergeISELs(); + + return true; + } +}; +} // end anonymous namespace + +void PPCExpandISEL::initialize(MachineFunction &MFParam) { + MF = &MFParam; + TII = MF->getSubtarget().getInstrInfo(); + ISELInstructions.clear(); +} + +bool PPCExpandISEL::isExpandISELEnabled(const MachineFunction &MF) { + return !GenerateISEL || !MF.getSubtarget<PPCSubtarget>().hasISEL(); +} + +bool PPCExpandISEL::collectISELInstructions() { + for (MachineBasicBlock &MBB : *MF) { + BlockISELList thisBlockISELs; + for (MachineInstr &MI : MBB) + if (isISEL(MI)) + thisBlockISELs.push_back(&MI); + if (!thisBlockISELs.empty()) + ISELInstructions.insert(std::make_pair(MBB.getNumber(), thisBlockISELs)); + } + return !ISELInstructions.empty(); +} + +#ifndef NDEBUG +void PPCExpandISEL::DumpISELInstructions() const { + for (const auto &I : ISELInstructions) { + DEBUG(dbgs() << "BB#" << I.first << ":\n"); + for (const auto &VI : I.second) + DEBUG(dbgs() << " "; VI->print(dbgs())); + } +} +#endif + +/// Contiguous ISELs that have the same condition can be merged. +bool PPCExpandISEL::canMerge(MachineInstr *PrevPushedMI, MachineInstr *MI) { + // Same Condition Register? + if (!useSameRegister(PrevPushedMI->getOperand(3), MI->getOperand(3))) + return false; + + MachineBasicBlock::iterator PrevPushedMBBI = *PrevPushedMI; + MachineBasicBlock::iterator MBBI = *MI; + return (std::prev(MBBI) == PrevPushedMBBI); // Contiguous ISELs? 
+} + +void PPCExpandISEL::expandAndMergeISELs() { + for (auto &BlockList : ISELInstructions) { + DEBUG(dbgs() << "Expanding ISEL instructions in BB#" << BlockList.first + << "\n"); + + BlockISELList &CurrentISELList = BlockList.second; + auto I = CurrentISELList.begin(); + auto E = CurrentISELList.end(); + + while (I != E) { + BlockISELList SubISELList; + + SubISELList.push_back(*I++); + + // Collect the ISELs that can be merged together. + while (I != E && canMerge(SubISELList.back(), *I)) + SubISELList.push_back(*I++); + + expandMergeableISELs(SubISELList); + } + } +} + +void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL, + MachineBasicBlock *MBB) { + IsTrueBlockRequired = false; + IsFalseBlockRequired = false; + + auto MI = BIL.begin(); + while (MI != BIL.end()) { + assert(isISEL(**MI) && "Expecting an ISEL instruction"); + DEBUG(dbgs() << "ISEL: " << **MI << "\n"); + + MachineOperand &Dest = (*MI)->getOperand(0); + MachineOperand &TrueValue = (*MI)->getOperand(1); + MachineOperand &FalseValue = (*MI)->getOperand(2); + + // If at least one of the ISEL instructions satisfy the following + // condition, we need the True Block: + // The Dest Register and True Value Register are not the same + // Similarly, if at least one of the ISEL instructions satisfy the + // following condition, we need the False Block: + // The Dest Register and False Value Register are not the same. + + bool IsADDIInstRequired = !useSameRegister(Dest, TrueValue); + bool IsORIInstRequired = !useSameRegister(Dest, FalseValue); + + // Special case 1, all registers used by ISEL are the same one. + if (!IsADDIInstRequired && !IsORIInstRequired) { + DEBUG(dbgs() << "Remove redudant ISEL instruction."); + NumRemoved++; + (*MI)->eraseFromParent(); + // Setting MI to the erase result keeps the iterator valid and increased. + MI = BIL.erase(MI); + continue; + } + + // Special case 2, the two input registers used by ISEL are the same. 
+ // Note 1: We favor merging ISEL expansions over folding a single one. If + // the passed list has multiple merge-able ISEL's, we won't fold any. + // Note 2: There is no need to test for PPC::R0/PPC::X0 because PPC::ZERO/ + // PPC::ZERO8 will be used for the first operand if the value is meant to + // be zero. In this case, the useSameRegister method will return false, + // thereby preventing this ISEL from being folded. + + if (useSameRegister(TrueValue, FalseValue) && (BIL.size() == 1)) { + DEBUG(dbgs() << "Fold the ISEL instruction to an unconditonal copy."); + NumFolded++; + BuildMI(*MBB, (*MI), dl, TII->get(isISEL8(**MI) ? PPC::ADDI8 : PPC::ADDI)) + .add(Dest) + .add(TrueValue) + .add(MachineOperand::CreateImm(0)); + (*MI)->eraseFromParent(); + // Setting MI to the erase result keeps the iterator valid and increased. + MI = BIL.erase(MI); + continue; + } + + IsTrueBlockRequired |= IsADDIInstRequired; + IsFalseBlockRequired |= IsORIInstRequired; + MI++; + } +} + +void PPCExpandISEL::reorganizeBlockLayout(BlockISELList &BIL, + MachineBasicBlock *MBB) { + if (BIL.empty()) + return; + + assert((IsTrueBlockRequired || IsFalseBlockRequired) && + "Should have been handled by special cases earlier!"); + + MachineBasicBlock *Successor = nullptr; + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineBasicBlock::iterator MBBI = (*BIL.back()); + NewSuccessor = (MBBI != MBB->getLastNonDebugInstr() || !MBB->canFallThrough()) + // Another BB is needed to move the instructions that + // follow this ISEL. If the ISEL is the last instruction + // in a block that can't fall through, we also need a block + // to branch to. + ? MF->CreateMachineBasicBlock(LLVM_BB) + : nullptr; + + MachineFunction::iterator It = MBB->getIterator(); + ++It; // Point to the successor block of MBB. + + // If NewSuccessor is NULL then the last ISEL in this group is the last + // non-debug instruction in this block. 
Find the fall-through successor + // of this block to use when updating the CFG below. + if (!NewSuccessor) { + for (auto &Succ : MBB->successors()) { + if (MBB->isLayoutSuccessor(Succ)) { + Successor = Succ; + break; + } + } + } else + Successor = NewSuccessor; + + // The FalseBlock and TrueBlock are inserted after the MBB block but before + // its successor. + // Note this need to be done *after* the above setting the Successor code. + if (IsFalseBlockRequired) { + FalseBlock = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, FalseBlock); + } + + if (IsTrueBlockRequired) { + TrueBlock = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, TrueBlock); + } + + if (NewSuccessor) { + MF->insert(It, NewSuccessor); + + // Transfer the rest of this block into the new successor block. + NewSuccessor->splice(NewSuccessor->end(), MBB, + std::next(MachineBasicBlock::iterator(BIL.back())), + MBB->end()); + NewSuccessor->transferSuccessorsAndUpdatePHIs(MBB); + + // Copy the original liveIns of MBB to NewSuccessor. + for (auto &LI : MBB->liveins()) + NewSuccessor->addLiveIn(LI); + + // After splitting the NewSuccessor block, Regs defined but not killed + // in MBB should be treated as liveins of NewSuccessor. + // Note: Cannot use stepBackward instead since we are using the Reg + // liveness state at the end of MBB (liveOut of MBB) as the liveIn for + // NewSuccessor. Otherwise, will cause cyclic dependence. + LivePhysRegs LPR(*MF->getSubtarget<PPCSubtarget>().getRegisterInfo()); + SmallVector<std::pair<unsigned, const MachineOperand *>, 2> Clobbers; + for (MachineInstr &MI : *MBB) + LPR.stepForward(MI, Clobbers); + for (auto &LI : LPR) + NewSuccessor->addLiveIn(LI); + } else { + // Remove successor from MBB. + MBB->removeSuccessor(Successor); + } + + // Note that this needs to be done *after* transfering the successors from MBB + // to the NewSuccessor block, otherwise these blocks will also be transferred + // as successors! + MBB->addSuccessor(IsTrueBlockRequired ? 
TrueBlock : Successor); + MBB->addSuccessor(IsFalseBlockRequired ? FalseBlock : Successor); + + if (IsTrueBlockRequired) { + TrueBlockI = TrueBlock->begin(); + TrueBlock->addSuccessor(Successor); + } + + if (IsFalseBlockRequired) { + FalseBlockI = FalseBlock->begin(); + FalseBlock->addSuccessor(Successor); + } + + // Conditional branch to the TrueBlock or Successor + BuildMI(*MBB, BIL.back(), dl, TII->get(PPC::BC)) + .add(BIL.back()->getOperand(3)) + .addMBB(IsTrueBlockRequired ? TrueBlock : Successor); + + // Jump over the true block to the new successor if the condition is false. + BuildMI(*(IsFalseBlockRequired ? FalseBlock : MBB), + (IsFalseBlockRequired ? FalseBlockI : BIL.back()), dl, + TII->get(PPC::B)) + .addMBB(Successor); + + if (IsFalseBlockRequired) + FalseBlockI = FalseBlock->begin(); // get the position of PPC::B +} + +void PPCExpandISEL::populateBlocks(BlockISELList &BIL) { + for (auto &MI : BIL) { + assert(isISEL(*MI) && "Expecting an ISEL instruction"); + + MachineOperand &Dest = MI->getOperand(0); // location to store to + MachineOperand &TrueValue = MI->getOperand(1); // Value to store if + // condition is true + MachineOperand &FalseValue = MI->getOperand(2); // Value to store if + // condition is false + MachineOperand &ConditionRegister = MI->getOperand(3); // Condition + + DEBUG(dbgs() << "Dest: " << Dest << "\n"); + DEBUG(dbgs() << "TrueValue: " << TrueValue << "\n"); + DEBUG(dbgs() << "FalseValue: " << FalseValue << "\n"); + DEBUG(dbgs() << "ConditionRegister: " << ConditionRegister << "\n"); + + + // If the Dest Register and True Value Register are not the same one, we + // need the True Block. + bool IsADDIInstRequired = !useSameRegister(Dest, TrueValue); + bool IsORIInstRequired = !useSameRegister(Dest, FalseValue); + + if (IsADDIInstRequired) { + // Copy the result into the destination if the condition is true. + BuildMI(*TrueBlock, TrueBlockI, dl, + TII->get(isISEL8(*MI) ? 
PPC::ADDI8 : PPC::ADDI)) + .add(Dest) + .add(TrueValue) + .add(MachineOperand::CreateImm(0)); + + // Add the LiveIn registers required by true block. + TrueBlock->addLiveIn(TrueValue.getReg()); + } + + if (IsORIInstRequired) { + // Add the LiveIn registers required by false block. + FalseBlock->addLiveIn(FalseValue.getReg()); + } + + if (NewSuccessor) { + // Add the LiveIn registers required by NewSuccessor block. + NewSuccessor->addLiveIn(Dest.getReg()); + NewSuccessor->addLiveIn(TrueValue.getReg()); + NewSuccessor->addLiveIn(FalseValue.getReg()); + NewSuccessor->addLiveIn(ConditionRegister.getReg()); + } + + // Copy the value into the destination if the condition is false. + if (IsORIInstRequired) + BuildMI(*FalseBlock, FalseBlockI, dl, + TII->get(isISEL8(*MI) ? PPC::ORI8 : PPC::ORI)) + .add(Dest) + .add(FalseValue) + .add(MachineOperand::CreateImm(0)); + + MI->eraseFromParent(); // Remove the ISEL instruction. + + NumExpanded++; + } +} + +void PPCExpandISEL::expandMergeableISELs(BlockISELList &BIL) { + // At this stage all the ISELs of BIL are in the same MBB. 
+ MachineBasicBlock *MBB = BIL.back()->getParent(); + + handleSpecialCases(BIL, MBB); + reorganizeBlockLayout(BIL, MBB); + populateBlocks(BIL); +} + +INITIALIZE_PASS(PPCExpandISEL, DEBUG_TYPE, "PowerPC Expand ISEL Generation", + false, false) +char PPCExpandISEL::ID = 0; + +FunctionPass *llvm::createPPCExpandISELPass() { return new PPCExpandISEL(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 9b91b9a..bc99571 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -13,10 +13,10 @@ // //===----------------------------------------------------------------------===// -#include "PPC.h" #include "MCTargetDesc/PPCPredicates.h" -#include "PPCCallingConv.h" +#include "PPC.h" #include "PPCCCState.h" +#include "PPCCallingConv.h" #include "PPCISelLowering.h" #include "PPCMachineFunctionInfo.h" #include "PPCSubtarget.h" @@ -1330,7 +1330,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args, // Issue CALLSEQ_START. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TII.getCallFrameSetupOpcode())) - .addImm(NumBytes); + .addImm(NumBytes).addImm(0); // Prepare to assign register arguments. 
Every argument uses up a // GPR protocol register even if it's passed in a floating-point @@ -2246,6 +2246,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, } case PPC::EXTSW: + case PPC::EXTSW_32: case PPC::EXTSW_32_64: { if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8) return false; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index f9ea871..b49c334 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -433,25 +433,21 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, unsigned MaxAlign = MFI.getMaxAlignment(); // algmt required by data in frame unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1; - const PPCRegisterInfo *RegInfo = - static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); - - // If we are a leaf function, and use up to 224 bytes of stack space, - // don't have a frame pointer, calls, or dynamic alloca then we do not need - // to adjust the stack pointer (we fit in the Red Zone). - // The 32-bit SVR4 ABI has no Red Zone. However, it can still generate - // stackless code if all local vars are reg-allocated. - bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone); + const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + unsigned LR = RegInfo->getRARegister(); - if (!DisableRedZone && - (Subtarget.isPPC64() || // 32-bit SVR4, no stack- - !Subtarget.isSVR4ABI() || // allocated locals. - FrameSize == 0) && - FrameSize <= 224 && // Fits in red zone. - !MFI.hasVarSizedObjects() && // No dynamic alloca. - !MFI.adjustsStack() && // No calls. - !MustSaveLR(MF, LR) && - !RegInfo->hasBasePointer(MF)) { // No special alignment. + bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone); + bool CanUseRedZone = !MFI.hasVarSizedObjects() && // No dynamic alloca. 
+ !MFI.adjustsStack() && // No calls. + !MustSaveLR(MF, LR) && // No need to save LR. + !RegInfo->hasBasePointer(MF); // No special alignment. + + // Note: for PPC32 SVR4ABI (Non-DarwinABI), we can still generate stackless + // code if all local vars are reg-allocated. + bool FitsInRedZone = FrameSize <= Subtarget.getRedZoneSize(); + + // Check whether we can skip adjusting the stack pointer (by using red zone) + if (!DisableRedZone && CanUseRedZone && FitsInRedZone) { // No need for frame if (UpdateMF) MFI.setStackSize(0); @@ -519,11 +515,10 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { unsigned FPReg = is31 ? PPC::R31 : PPC::R1; unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1; - const PPCRegisterInfo *RegInfo = - static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); bool HasBP = RegInfo->hasBasePointer(MF); unsigned BPReg = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg; - unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg; + unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FP8Reg; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) @@ -616,8 +611,7 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, return true; // Get the list of callee-saved registers for the target. - const PPCRegisterInfo *RegInfo = - static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MBB->getParent()); // Get all the available registers in the block. @@ -663,8 +657,7 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, // and the stack frame is large, we need two scratch registers. 
bool PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const { - const PPCRegisterInfo *RegInfo = - static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); MachineFunction &MF = *(MBB->getParent()); bool HasBP = RegInfo->hasBasePointer(MF); unsigned FrameSize = determineFrameLayout(MF, false); @@ -694,10 +687,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo &MFI = MF.getFrameInfo(); - const PPCInstrInfo &TII = - *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo()); - const PPCRegisterInfo *RegInfo = - static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); + const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); @@ -1221,10 +1212,8 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, if (MBBI != MBB.end()) dl = MBBI->getDebugLoc(); - const PPCInstrInfo &TII = - *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo()); - const PPCRegisterInfo *RegInfo = - static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); + const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); // Get alignment info so we know how to restore the SP. 
const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1549,8 +1538,7 @@ void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const { if (MBBI != MBB.end()) dl = MBBI->getDebugLoc(); - const PPCInstrInfo &TII = - *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo()); + const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); // Create branch instruction for pseudo tail call return instruction unsigned RetOpcode = MBBI->getOpcode(); @@ -1588,8 +1576,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, RegScavenger *RS) const { TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); - const PPCRegisterInfo *RegInfo = - static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); // Save and clear the LR state. PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); @@ -1791,8 +1778,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, HasGPSaveArea = true; } - const PPCRegisterInfo *RegInfo = - static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); if (RegInfo->hasBasePointer(MF)) { int FI = PFI->getBasePointerSaveIndex(); assert(FI && "No Base Pointer Save Slot!"); @@ -1880,8 +1866,13 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, } if (HasVRSaveArea) { - // Insert alignment padding, we need 16-byte alignment. - LowerBound = (LowerBound - 15) & ~(15); + // Insert alignment padding, we need 16-byte alignment. Note: for postive + // number the alignment formula is : y = (x + (n-1)) & (~(n-1)). But since + // we are using negative number here (the stack grows downward). We should + // use formula : y = x & (~(n-1)). Where x is the size before aligning, n + // is the alignment size ( n = 16 here) and y is the size after aligning. 
+ assert(LowerBound <= 0 && "Expect LowerBound have a non-positive value!"); + LowerBound &= ~(15); for (unsigned i = 0, e = VRegs.size(); i != e; ++i) { int FI = VRegs[i].getFrameIdx(); @@ -1913,12 +1904,13 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, MachineFrameInfo &MFI = MF.getFrameInfo(); if (MFI.hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) || hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) { - const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; - const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC; - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + const TargetRegisterClass &GPRC = PPC::GPRCRegClass; + const TargetRegisterClass &G8RC = PPC::G8RCRegClass; + const TargetRegisterClass &RC = Subtarget.isPPC64() ? G8RC : GPRC; + const TargetRegisterInfo &TRI = *Subtarget.getRegisterInfo(); + unsigned Size = TRI.getSpillSize(RC); + unsigned Align = TRI.getSpillAlignment(RC); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); // Might we have over-aligned allocas? bool HasAlVars = MFI.hasVarSizedObjects() && @@ -1926,9 +1918,7 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, // These kinds of spills might need two registers. 
if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars) - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); } } @@ -1945,8 +1935,7 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, return false; MachineFunction *MF = MBB.getParent(); - const PPCInstrInfo &TII = - *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo()); + const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc DL; bool CRSpilled = false; MachineInstrBuilder CRMIB; @@ -2087,8 +2076,7 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return false; MachineFunction *MF = MBB.getParent(); - const PPCInstrInfo &TII = - *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo()); + const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); bool CR2Spilled = false; bool CR3Spilled = false; bool CR4Spilled = false; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 1e51c1f..901539b 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -12,34 +12,76 @@ // //===----------------------------------------------------------------------===// -#include "PPC.h" +#include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCPredicates.h" +#include "PPC.h" +#include "PPCISelLowering.h" #include "PPCMachineFunctionInfo.h" +#include "PPCSubtarget.h" #include "PPCTargetMachine.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include 
"llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/Constants.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Module.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <limits> +#include <memory> +#include <new> +#include <tuple> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "ppc-codegen" +STATISTIC(NumSextSetcc, + "Number of (sext(setcc)) nodes expanded into GPR sequence."); +STATISTIC(NumZextSetcc, + "Number of (zext(setcc)) nodes expanded into GPR sequence."); +STATISTIC(SignExtensionsAdded, + "Number of sign extensions for compare inputs added."); +STATISTIC(ZeroExtensionsAdded, + "Number of zero extensions for compare inputs added."); +STATISTIC(NumLogicOpsOnComparison, + "Number of logical ops on i1 values calculated in GPR."); +STATISTIC(OmittedForNonExtendUses, + "Number of compares not eliminated as they have non-extending uses."); + // FIXME: Remove this once the bug has been fixed! 
cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug", cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden); @@ -60,6 +102,7 @@ static cl::opt<bool> EnableBranchHint( cl::Hidden); namespace { + //===--------------------------------------------------------------------===// /// PPCDAGToDAGISel - PPC specific code to select PPC machine /// instructions for SelectionDAG operations. @@ -69,9 +112,10 @@ namespace { const PPCSubtarget *PPCSubTarget; const PPCTargetLowering *PPCLowering; unsigned GlobalBaseReg; + public: - explicit PPCDAGToDAGISel(PPCTargetMachine &tm) - : SelectionDAGISel(tm), TM(tm) {} + explicit PPCDAGToDAGISel(PPCTargetMachine &tm, CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), TM(tm) {} bool runOnMachineFunction(MachineFunction &MF) override { // Make sure we re-emit a set of the global base reg if necessary @@ -134,7 +178,7 @@ namespace { /// a base register plus a signed 16-bit displacement [r+imm]. bool SelectAddrImm(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 0); } /// SelectAddrImmOffs - Return true if the operand is valid for a preinc @@ -167,7 +211,11 @@ namespace { /// a base register plus a signed 16-bit displacement that is a multiple of 4. /// Suitable for use by STD and friends. bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 4); + } + + bool SelectAddrImmX16(SDValue N, SDValue &Disp, SDValue &Base) { + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 16); } // Select an address into a single register. 
@@ -184,7 +232,6 @@ namespace { bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) override { - switch(ConstraintID) { default: errs() << "ConstraintID: " << ConstraintID << "\n"; @@ -223,7 +270,34 @@ namespace { #include "PPCGenDAGISel.inc" private: + // Conversion type for interpreting results of a 32-bit instruction as + // a 64-bit value or vice versa. + enum ExtOrTruncConversion { Ext, Trunc }; + + // Modifiers to guide how an ISD::SETCC node's result is to be computed + // in a GPR. + // ZExtOrig - use the original condition code, zero-extend value + // ZExtInvert - invert the condition code, zero-extend value + // SExtOrig - use the original condition code, sign-extend value + // SExtInvert - invert the condition code, sign-extend value + enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert }; + bool trySETCC(SDNode *N); + bool tryEXTEND(SDNode *N); + bool tryLogicOpOfCompares(SDNode *N); + SDValue computeLogicOpInGPR(SDValue LogicOp); + SDValue signExtendInputIfNeeded(SDValue Input); + SDValue zeroExtendInputIfNeeded(SDValue Input); + SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv); + SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get64BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get64BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts); void PeepholePPC64(); void PeepholePPC64ZExt(); @@ -235,9 +309,11 @@ private: bool AllUsersSelectZero(SDNode *N); void SwapAllSelectUsers(SDNode *N); + bool isOffsetMultipleOf(SDNode *N, unsigned Val) const; void transferMemOperands(SDNode *N, SDNode *Result); }; -} + +} // end anonymous namespace /// 
InsertVRSaveCode - Once the entire function has been instruction selected, /// all virtual registers are created and all machine instructions are built, @@ -303,7 +379,6 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) { } } - /// getGlobalBaseReg - Output the instructions required to put the /// base address to use for accessing globals into a register. /// @@ -349,26 +424,6 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { .getNode(); } -/// isIntS16Immediate - This method tests to see if the node is either a 32-bit -/// or 64-bit immediate, and if the value can be accurately represented as a -/// sign extension from a 16-bit value. If so, this returns true and the -/// immediate. -static bool isIntS16Immediate(SDNode *N, short &Imm) { - if (N->getOpcode() != ISD::Constant) - return false; - - Imm = (short)cast<ConstantSDNode>(N)->getZExtValue(); - if (N->getValueType(0) == MVT::i32) - return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); - else - return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); -} - -static bool isIntS16Immediate(SDValue Op, short &Imm) { - return isIntS16Immediate(Op.getNode(), Imm); -} - - /// isInt32Immediate - This method tests to see if the node is a 32-bit constant /// operand. If so Imm will receive the 32-bit value. 
static bool isInt32Immediate(SDNode *N, unsigned &Imm) { @@ -515,12 +570,12 @@ bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) { SDValue Op1 = N->getOperand(1); SDLoc dl(N); - APInt LKZ, LKO, RKZ, RKO; - CurDAG->computeKnownBits(Op0, LKZ, LKO); - CurDAG->computeKnownBits(Op1, RKZ, RKO); + KnownBits LKnown, RKnown; + CurDAG->computeKnownBits(Op0, LKnown); + CurDAG->computeKnownBits(Op1, RKnown); - unsigned TargetMask = LKZ.getZExtValue(); - unsigned InsertMask = RKZ.getZExtValue(); + unsigned TargetMask = LKnown.Zero.getZExtValue(); + unsigned InsertMask = RKnown.Zero.getZExtValue(); if ((TargetMask | InsertMask) == 0xFFFFFFFF) { unsigned Op0Opc = Op0.getOpcode(); @@ -563,9 +618,9 @@ bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) { // The AND mask might not be a constant, and we need to make sure that // if we're going to fold the masking with the insert, all bits not // know to be zero in the mask are known to be one. - APInt MKZ, MKO; - CurDAG->computeKnownBits(Op1.getOperand(1), MKZ, MKO); - bool CanFoldMask = InsertMask == MKO.getZExtValue(); + KnownBits MKnown; + CurDAG->computeKnownBits(Op1.getOperand(1), MKnown); + bool CanFoldMask = InsertMask == MKnown.One.getZExtValue(); unsigned SHOpc = Op1.getOperand(0).getOpcode(); if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && CanFoldMask && @@ -659,7 +714,10 @@ static uint64_t Rot64(uint64_t Imm, unsigned R) { static unsigned getInt64Count(int64_t Imm) { unsigned Count = getInt64CountDirect(Imm); - if (Count == 1) + + // If the instruction count is 1 or 2, we do not need further analysis + // since rotate + load constant requires at least 2 instructions. 
+ if (Count <= 2) return Count; for (unsigned r = 1; r < 63; ++r) { @@ -769,7 +827,10 @@ static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl, static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) { unsigned Count = getInt64CountDirect(Imm); - if (Count == 1) + + // If the instruction count is 1 or 2, we do not need further analysis + // since rotate + load constant requires at least 2 instructions. + if (Count <= 2) return getInt64Direct(CurDAG, dl, Imm); unsigned RMin = 0; @@ -833,6 +894,7 @@ static SDNode *getInt64(SelectionDAG *CurDAG, SDNode *N) { } namespace { + class BitPermutationSelector { struct ValueBit { SDValue V; @@ -898,14 +960,12 @@ class BitPermutationSelector { // associated with each) used to choose the lowering method. struct ValueRotInfo { SDValue V; - unsigned RLAmt; - unsigned NumGroups; - unsigned FirstGroupStartIdx; - bool Repl32; + unsigned RLAmt = std::numeric_limits<unsigned>::max(); + unsigned NumGroups = 0; + unsigned FirstGroupStartIdx = std::numeric_limits<unsigned>::max(); + bool Repl32 = false; - ValueRotInfo() - : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX), - Repl32(false) {} + ValueRotInfo() = default; // For sorting (in reverse order) by NumGroups, and then by // FirstGroupStartIdx. 
@@ -1985,7 +2045,8 @@ public: return RNLM; } }; -} // anonymous namespace + +} // end anonymous namespace bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) { if (N->getValueType(0) != MVT::i32 && @@ -2057,7 +2118,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, getI32Imm(Imm & 0xFFFF, dl)), 0); Opc = PPC::CMPLW; } else { - short SImm; + int16_t SImm; if (isIntS16Immediate(RHS, SImm)) return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, getI32Imm((int)SImm & 0xFFFF, @@ -2104,7 +2165,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, getI64Imm(Imm & 0xFFFF, dl)), 0); Opc = PPC::CMPLD; } else { - short SImm; + int16_t SImm; if (isIntS16Immediate(RHS, SImm)) return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, getI64Imm(SImm & 0xFFFF, dl)), @@ -2443,6 +2504,525 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { return true; } +// Is this opcode a bitwise logical operation? +static bool isLogicOp(unsigned Opc) { + return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR; +} + +/// If this node is a sign/zero extension of an integer comparison, +/// it can usually be computed in GPR's rather than using comparison +/// instructions and ISEL. We only do this on 64-bit targets for now +/// as the code is specialized for 64-bit (it uses 64-bit instructions +/// and assumes 64-bit registers). +bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) { + if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64()) + return false; + assert((N->getOpcode() == ISD::ZERO_EXTEND || + N->getOpcode() == ISD::SIGN_EXTEND) && + "Expecting a zero/sign extend node!"); + + SDValue WideRes; + // If we are zero-extending the result of a logical operation on i1 + // values, we can keep the values in GPRs. 
+ if (isLogicOp(N->getOperand(0).getOpcode()) && + N->getOperand(0).getValueType() == MVT::i1 && + N->getOpcode() == ISD::ZERO_EXTEND) + WideRes = computeLogicOpInGPR(N->getOperand(0)); + else if (N->getOperand(0).getOpcode() != ISD::SETCC) + return false; + else + WideRes = + getSETCCInGPR(N->getOperand(0), + N->getOpcode() == ISD::SIGN_EXTEND ? + SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig); + + if (!WideRes) + return false; + + SDLoc dl(N); + bool Inputs32Bit = N->getOperand(0).getOperand(0).getValueType() == MVT::i32; + bool Output32Bit = N->getValueType(0) == MVT::i32; + + NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0; + NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1; + + SDValue ConvOp = WideRes; + if (Inputs32Bit != Output32Bit) + ConvOp = addExtOrTrunc(WideRes, Inputs32Bit ? ExtOrTruncConversion::Ext : + ExtOrTruncConversion::Trunc); + ReplaceNode(N, ConvOp.getNode()); + + return true; +} + +// Lower a logical operation on i1 values into a GPR sequence if possible. +// The result can be kept in a GPR if requested. +// Three types of inputs can be handled: +// - SETCC +// - TRUNCATE +// - Logical operation (AND/OR/XOR) +// There is also a special case that is handled (namely a complement operation +// achieved with xor %a, -1). +SDValue PPCDAGToDAGISel::computeLogicOpInGPR(SDValue LogicOp) { + assert(isLogicOp(LogicOp.getOpcode()) && + "Can only handle logic operations here."); + assert(LogicOp.getValueType() == MVT::i1 && + "Can only handle logic operations on i1 values here."); + SDLoc dl(LogicOp); + SDValue LHS, RHS; + + // Special case: xor %a, -1 + bool IsBitwiseNegation = isBitwiseNot(LogicOp); + + // Produces a GPR sequence for each operand of the binary logic operation. + // For SETCC, it produces the respective comparison, for TRUNCATE it truncates + // the value in a GPR and for logic operations, it will recursively produce + // a GPR sequence for the operation. 
+ auto getLogicOperand = [&] (SDValue Operand) -> SDValue { + unsigned OperandOpcode = Operand.getOpcode(); + if (OperandOpcode == ISD::SETCC) + return getSETCCInGPR(Operand, SetccInGPROpts::ZExtOrig); + else if (OperandOpcode == ISD::TRUNCATE) { + SDValue InputOp = Operand.getOperand(0); + EVT InVT = InputOp.getValueType(); + return + SDValue(CurDAG->getMachineNode(InVT == MVT::i32 ? PPC::RLDICL_32 : + PPC::RLDICL, dl, InVT, InputOp, + getI64Imm(0, dl), getI64Imm(63, dl)), 0); + } else if (isLogicOp(OperandOpcode)) + return computeLogicOpInGPR(Operand); + return SDValue(); + }; + LHS = getLogicOperand(LogicOp.getOperand(0)); + RHS = getLogicOperand(LogicOp.getOperand(1)); + + // If a GPR sequence can't be produced for the LHS we can't proceed. + // Not producing a GPR sequence for the RHS is only a problem if this isn't + // a bitwise negation operation. + if (!LHS || (!RHS && !IsBitwiseNegation)) + return SDValue(); + + NumLogicOpsOnComparison++; + + // We will use the inputs as 64-bit values. + if (LHS.getValueType() == MVT::i32) + LHS = addExtOrTrunc(LHS, ExtOrTruncConversion::Ext); + if (!IsBitwiseNegation && RHS.getValueType() == MVT::i32) + RHS = addExtOrTrunc(RHS, ExtOrTruncConversion::Ext); + + unsigned NewOpc; + switch (LogicOp.getOpcode()) { + default: llvm_unreachable("Unknown logic operation."); + case ISD::AND: NewOpc = PPC::AND8; break; + case ISD::OR: NewOpc = PPC::OR8; break; + case ISD::XOR: NewOpc = PPC::XOR8; break; + } + + if (IsBitwiseNegation) { + RHS = getI64Imm(1, dl); + NewOpc = PPC::XORI8; + } + + return SDValue(CurDAG->getMachineNode(NewOpc, dl, MVT::i64, LHS, RHS), 0); + +} + +/// Try performing logical operations on results of comparisons in GPRs. +/// It is typically preferred from a performance perspective over performing +/// the operations on individual bits in the CR. We only do this on 64-bit +/// targets for now as the code is specialized for 64-bit (it uses 64-bit +/// instructions and assumes 64-bit registers). 
+bool PPCDAGToDAGISel::tryLogicOpOfCompares(SDNode *N) { + if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64()) + return false; + if (N->getValueType(0) != MVT::i1) + return false; + assert(isLogicOp(N->getOpcode()) && + "Expected a logic operation on setcc results."); + SDValue LoweredLogical = computeLogicOpInGPR(SDValue(N, 0)); + if (!LoweredLogical) + return false; + + SDLoc dl(N); + bool IsBitwiseNegate = LoweredLogical.getMachineOpcode() == PPC::XORI8; + unsigned SubRegToExtract = IsBitwiseNegate ? PPC::sub_eq : PPC::sub_gt; + SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32); + SDValue LHS = LoweredLogical.getOperand(0); + SDValue RHS = LoweredLogical.getOperand(1); + SDValue WideOp; + SDValue OpToConvToRecForm; + + // Look through any 32-bit to 64-bit implicit extend nodes to find the opcode + // that is input to the XORI. + if (IsBitwiseNegate && + LoweredLogical.getOperand(0).getMachineOpcode() == PPC::INSERT_SUBREG) + OpToConvToRecForm = LoweredLogical.getOperand(0).getOperand(1); + else if (IsBitwiseNegate) + // If the input to the XORI isn't an extension, that's what we're after. + OpToConvToRecForm = LoweredLogical.getOperand(0); + else + // If this is not an XORI, it is a reg-reg logical op and we can convert it + // to record-form. + OpToConvToRecForm = LoweredLogical; + + // Get the record-form version of the node we're looking to use to get the + // CR result from. + uint16_t NonRecOpc = OpToConvToRecForm.getMachineOpcode(); + int NewOpc = PPCInstrInfo::getRecordFormOpcode(NonRecOpc); + + // Convert the right node to record-form. This is either the logical we're + // looking at or it is the input node to the negation (if we're looking at + // a bitwise negation). + if (NewOpc != -1 && IsBitwiseNegate) { + // The input to the XORI has a record-form. Use it. + assert(LoweredLogical.getConstantOperandVal(1) == 1 && + "Expected a PPC::XORI8 only for bitwise negation."); + // Emit the record-form instruction. 
+ std::vector<SDValue> Ops; + for (int i = 0, e = OpToConvToRecForm.getNumOperands(); i < e; i++) + Ops.push_back(OpToConvToRecForm.getOperand(i)); + + WideOp = + SDValue(CurDAG->getMachineNode(NewOpc, dl, + OpToConvToRecForm.getValueType(), + MVT::Glue, Ops), 0); + } else { + assert((NewOpc != -1 || !IsBitwiseNegate) && + "No record form available for AND8/OR8/XOR8?"); + WideOp = + SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl, + MVT::i64, MVT::Glue, LHS, RHS), 0); + } + + // Select this node to a single bit from CR0 set by the record-form node + // just created. For bitwise negation, use the EQ bit which is the equivalent + // of negating the result (i.e. it is a bit set when the result of the + // operation is zero). + SDValue SRIdxVal = + CurDAG->getTargetConstant(SubRegToExtract, dl, MVT::i32); + SDValue CRBit = + SDValue(CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, + MVT::i1, CR0Reg, SRIdxVal, + WideOp.getValue(1)), 0); + ReplaceNode(N, CRBit.getNode()); + return true; +} + +/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it. +/// Useful when emitting comparison code for 32-bit values without using +/// the compare instruction (which only considers the lower 32-bits). +SDValue PPCDAGToDAGISel::signExtendInputIfNeeded(SDValue Input) { + assert(Input.getValueType() == MVT::i32 && + "Can only sign-extend 32-bit values here."); + unsigned Opc = Input.getOpcode(); + + // The value was sign extended and then truncated to 32-bits. No need to + // sign extend it again. + if (Opc == ISD::TRUNCATE && + (Input.getOperand(0).getOpcode() == ISD::AssertSext || + Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND)) + return Input; + + LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input); + // The input is a sign-extending load. No reason to sign-extend. 
+ if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD) + return Input; + + ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input); + // We don't sign-extend constants and already sign-extended values. + if (InputConst || Opc == ISD::AssertSext || Opc == ISD::SIGN_EXTEND_INREG || + Opc == ISD::SIGN_EXTEND) + return Input; + + SDLoc dl(Input); + SignExtensionsAdded++; + return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32, dl, MVT::i32, Input), 0); +} + +/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it. +/// Useful when emitting comparison code for 32-bit values without using +/// the compare instruction (which only considers the lower 32-bits). +SDValue PPCDAGToDAGISel::zeroExtendInputIfNeeded(SDValue Input) { + assert(Input.getValueType() == MVT::i32 && + "Can only zero-extend 32-bit values here."); + LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input); + unsigned Opc = Input.getOpcode(); + + // No need to zero-extend loaded values (unless they're loaded with + // a sign-extending load). + if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD) + return Input; + + ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input); + bool InputZExtConst = InputConst && InputConst->getSExtValue() >= 0; + // An ISD::TRUNCATE will be lowered to an EXTRACT_SUBREG so we have + // to conservatively actually clear the high bits. We also don't need to + // zero-extend constants or values that are already zero-extended. + if (InputZExtConst || Opc == ISD::AssertZext || Opc == ISD::ZERO_EXTEND) + return Input; + + SDLoc dl(Input); + ZeroExtensionsAdded++; + return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32, dl, MVT::i32, Input, + getI64Imm(0, dl), getI64Imm(32, dl)), + 0); +} + +// Handle a 32-bit value in a 64-bit register and vice-versa. 
These are of +// course not actual zero/sign extensions that will generate machine code, +// they're just a way to reinterpret a 32 bit value in a register as a +// 64 bit value and vice-versa. +SDValue PPCDAGToDAGISel::addExtOrTrunc(SDValue NatWidthRes, + ExtOrTruncConversion Conv) { + SDLoc dl(NatWidthRes); + + // For reinterpreting 32-bit values as 64 bit values, we generate + // INSERT_SUBREG IMPLICIT_DEF:i64, <input>, TargetConstant:i32<1> + if (Conv == ExtOrTruncConversion::Ext) { + SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0); + SDValue SubRegIdx = + CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); + return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64, + ImDef, NatWidthRes, SubRegIdx), 0); + } + + assert(Conv == ExtOrTruncConversion::Trunc && + "Unknown convertion between 32 and 64 bit values."); + // For reinterpreting 64-bit values as 32-bit values, we just need to + // EXTRACT_SUBREG (i.e. extract the low word). + SDValue SubRegIdx = + CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); + return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32, + NatWidthRes, SubRegIdx), 0); +} + +/// Produces a zero-extended result of comparing two 32-bit values according to +/// the passed condition code. +SDValue PPCDAGToDAGISel::get32BitZExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + bool IsRHSZero = RHSValue == 0; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5) + // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5) + SDValue Xor = IsRHSZero ? 
LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); + SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl), + getI32Imm(31, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, + ShiftOps), 0); + } + case ISD::SETNE: { + // (zext (setcc %a, %b, setne)) -> (xor (lshr (cntlzw (xor %a, %b)), 5), 1) + // (zext (setcc %a, 0, setne)) -> (xor (lshr (cntlzw %a), 5), 1) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); + SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl), + getI32Imm(31, dl) }; + SDValue Shift = + SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0); + return SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift, + getI32Imm(1, dl)), 0); + } + } +} + +/// Produces a sign-extended result of comparing two 32-bit values according to +/// the passed condition code. +SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + bool IsRHSZero = RHSValue == 0; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (sext (setcc %a, %b, seteq)) -> + // (ashr (shl (ctlz (xor %a, %b)), 58), 63) + // (sext (setcc %a, 0, seteq)) -> + // (ashr (shl (ctlz %a), 58), 63) + SDValue CountInput = IsRHSZero ? 
LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Cntlzw = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0); + SDValue SHLOps[] = { Cntlzw, getI32Imm(58, dl), getI32Imm(0, dl) }; + SDValue Sldi = + SDValue(CurDAG->getMachineNode(PPC::RLDICR_32, dl, MVT::i32, SHLOps), 0); + return SDValue(CurDAG->getMachineNode(PPC::SRADI_32, dl, MVT::i32, Sldi, + getI32Imm(63, dl)), 0); + } + case ISD::SETNE: { + // Bitwise xor the operands, count leading zeros, shift right by 5 bits and + // flip the bit, finally take 2's complement. + // (sext (setcc %a, %b, setne)) -> + // (neg (xor (lshr (ctlz (xor %a, %b)), 5), 1)) + // Same as above, but the first xor is not needed. + // (sext (setcc %a, 0, setne)) -> + // (neg (xor (lshr (ctlz %a), 5), 1)) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); + SDValue ShiftOps[] = + { Clz, getI32Imm(27, dl), getI32Imm(5, dl), getI32Imm(31, dl) }; + SDValue Shift = + SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0); + SDValue Xori = + SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift, + getI32Imm(1, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Xori), 0); + } + } +} + +/// Produces a zero-extended result of comparing two 64-bit values according to +/// the passed condition code. +SDValue PPCDAGToDAGISel::get64BitZExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + bool IsRHSZero = RHSValue == 0; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6) + // (zext (setcc %a, 0, seteq)) -> (lshr (ctlz %a), 6) + SDValue Xor = IsRHSZero ? 
LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZD, dl, MVT::i64, Xor), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Clz, + getI64Imm(58, dl), getI64Imm(63, dl)), + 0); + } + } +} + +/// Produces a sign-extended result of comparing two 64-bit values according to +/// the passed condition code. +SDValue PPCDAGToDAGISel::get64BitSExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + bool IsRHSZero = RHSValue == 0; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1) + // (sext (setcc %a, %b, seteq)) -> (sube addc.reg, addc.reg, addc.CA) + // {addcz.reg, addcz.CA} = (addcarry %a, -1) + // (sext (setcc %a, 0, seteq)) -> (sube addcz.reg, addcz.reg, addcz.CA) + SDValue AddInput = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue Addic = + SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue, + AddInput, getI32Imm(~0U, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic, + Addic, Addic.getValue(1)), 0); + } + } +} + +/// Does this SDValue have any uses for which keeping the value in a GPR is +/// appropriate. This is meant to be used on values that have type i1 since +/// it is somewhat meaningless to ask if values of other types can be kept in +/// GPR's. +static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) { + assert(Compare.getOpcode() == ISD::SETCC && + "An ISD::SETCC node required here."); + + // For values that have a single use, the caller should obviously already have + // checked if that use is an extending use. We check the other uses here. + if (Compare.hasOneUse()) + return true; + // We want the value in a GPR if it is being extended, used for a select, or + // used in logical operations. 
+  for (auto CompareUse : Compare.getNode()->uses())
+    if (CompareUse->getOpcode() != ISD::SIGN_EXTEND &&
+        CompareUse->getOpcode() != ISD::ZERO_EXTEND &&
+        CompareUse->getOpcode() != ISD::SELECT &&
+        !isLogicOp(CompareUse->getOpcode())) {
+      OmittedForNonExtendUses++;
+      return false;
+    }
+  return true;
+}
+
+/// Returns an equivalent of a SETCC node but with the result the same width as
+/// the inputs. This can also be used for SELECT_CC if either the true or false
+/// value is a power of two while the other is zero.
+SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare,
+                                       SetccInGPROpts ConvOpts) {
+  assert((Compare.getOpcode() == ISD::SETCC ||
+          Compare.getOpcode() == ISD::SELECT_CC) &&
+          "An ISD::SETCC node required here.");
+
+  // Don't convert this comparison to a GPR sequence because there are uses
+  // of the i1 result (i.e. uses that require the result in the CR).
+  if ((Compare.getOpcode() == ISD::SETCC) && !allUsesExtend(Compare, CurDAG))
+    return SDValue();
+
+  SDValue LHS = Compare.getOperand(0);
+  SDValue RHS = Compare.getOperand(1);
+
+  // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC.
+  int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2;
+  ISD::CondCode CC =
+    cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get();
+  EVT InputVT = LHS.getValueType();
+  if (InputVT != MVT::i32 && InputVT != MVT::i64)
+    return SDValue();
+
+  if (ConvOpts == SetccInGPROpts::ZExtInvert ||
+      ConvOpts == SetccInGPROpts::SExtInvert)
+    CC = ISD::getSetCCInverse(CC, true);
+
+  bool Inputs32Bit = InputVT == MVT::i32;
+  if (ISD::isSignedIntSetCC(CC) && Inputs32Bit) {
+    LHS = signExtendInputIfNeeded(LHS);
+    RHS = signExtendInputIfNeeded(RHS);
+  } else if (ISD::isUnsignedIntSetCC(CC) && Inputs32Bit) {
+    LHS = zeroExtendInputIfNeeded(LHS);
+    RHS = zeroExtendInputIfNeeded(RHS);
+  }
+
+  SDLoc dl(Compare);
+  ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+  int64_t RHSValue = RHSConst ?
RHSConst->getSExtValue() : INT64_MAX; + bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig || + ConvOpts == SetccInGPROpts::SExtInvert; + + if (IsSext && Inputs32Bit) + return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl); + else if (Inputs32Bit) + return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl); + else if (IsSext) + return get64BitSExtCompare(LHS, RHS, CC, RHSValue, dl); + return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); +} + +/// Does this node represent a load/store node whose address can be represented +/// with a register plus an immediate that's a multiple of \p Val: +bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const { + LoadSDNode *LDN = dyn_cast<LoadSDNode>(N); + StoreSDNode *STN = dyn_cast<StoreSDNode>(N); + SDValue AddrOp; + if (LDN) + AddrOp = LDN->getOperand(1); + else if (STN) + AddrOp = STN->getOperand(2); + + short Imm = 0; + if (AddrOp.getOpcode() == ISD::ADD) + return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val); + + // If the address comes from the outside, the offset will be zero. + return AddrOp.getOpcode() == ISD::CopyFromReg; +} + void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); @@ -2450,7 +3030,6 @@ void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); } - // Select - Convert the specified operand from a target-independent to a // target-specific node if it hasn't already been changed. 
void PPCDAGToDAGISel::Select(SDNode *N) { @@ -2474,19 +3053,24 @@ void PPCDAGToDAGISel::Select(SDNode *N) { switch (N->getOpcode()) { default: break; - case ISD::Constant: { + case ISD::Constant: if (N->getValueType(0) == MVT::i64) { ReplaceNode(N, getInt64(CurDAG, N)); return; } break; - } - case ISD::SETCC: { + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + if (tryEXTEND(N)) + return; + break; + + case ISD::SETCC: if (trySETCC(N)) return; break; - } + case PPCISD::GlobalBaseReg: ReplaceNode(N, getGlobalBaseReg()); return; @@ -2502,11 +3086,10 @@ void PPCDAGToDAGISel::Select(SDNode *N) { return; } - case PPCISD::READ_TIME_BASE: { + case PPCISD::READ_TIME_BASE: ReplaceNode(N, CurDAG->getMachineNode(PPC::ReadTB, dl, MVT::i32, MVT::i32, MVT::Other, N->getOperand(0))); return; - } case PPCISD::SRA_ADDZE: { SDValue N0 = N->getOperand(0); @@ -2626,6 +3209,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } case ISD::AND: { + if (tryLogicOpOfCompares(N)) + return; + unsigned Imm, Imm2, SH, MB, ME; uint64_t Imm64; @@ -2690,6 +3276,19 @@ void PPCDAGToDAGISel::Select(SDNode *N) { CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops); return; } + // If this is a negated 64-bit zero-extension mask, + // i.e. the immediate is a sequence of ones from most significant side + // and all zero for reminder, we should use rldicr. + if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) && + isMask_64(~Imm64)) { + SDValue Val = N->getOperand(0); + MB = 63 - countTrailingOnes(~Imm64); + SH = 0; + SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) }; + CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops); + return; + } + // AND X, 0 -> 0, not "rlwinm 32". 
if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) { ReplaceUses(SDValue(N, 0), N->getOperand(1)); @@ -2732,15 +3331,18 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (tryBitfieldInsert(N)) return; - short Imm; + if (tryLogicOpOfCompares(N)) + return; + + int16_t Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && isIntS16Immediate(N->getOperand(1), Imm)) { - APInt LHSKnownZero, LHSKnownOne; - CurDAG->computeKnownBits(N->getOperand(0), LHSKnownZero, LHSKnownOne); + KnownBits LHSKnown; + CurDAG->computeKnownBits(N->getOperand(0), LHSKnown); // If this is equivalent to an add, then we can fold it with the // FrameIndex calculation. - if ((LHSKnownZero.getZExtValue()|~(uint64_t)Imm) == ~0ULL) { + if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)Imm) == ~0ULL) { selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm); return; } @@ -2749,8 +3351,13 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. break; } + case ISD::XOR: { + if (tryLogicOpOfCompares(N)) + return; + break; + } case ISD::ADD: { - short Imm; + int16_t Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && isIntS16Immediate(N->getOperand(1), Imm)) { selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm); @@ -2911,8 +3518,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) { CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops); return; } - break; + case ISD::VECTOR_SHUFFLE: if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64)) { @@ -2940,7 +3547,11 @@ void PPCDAGToDAGISel::Select(SDNode *N) { SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) { SDValue Chain = LD->getChain(); SDValue Ops[] = { Base, Offset, Chain }; - CurDAG->SelectNodeTo(N, PPC::LXVDSX, N->getValueType(0), Ops); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = LD->getMemOperand(); + SDNode *NewN = CurDAG->SelectNodeTo(N, PPC::LXVDSX, + N->getValueType(0), Ops); + cast<MachineSDNode>(NewN)->setMemRefs(MemOp, 
MemOp + 1); return; } } @@ -3088,7 +3699,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { SDValue(Tmp, 0), GA)); return; } - case PPCISD::PPC32_PICGOT: { + case PPCISD::PPC32_PICGOT: // Generate a PIC-safe GOT reference. assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() && "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4"); @@ -3096,7 +3707,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::i32); return; - } + case PPCISD::VADD_SPLAT: { // This expands into one of three sequences, depending on whether // the first operand is odd or even, positive or negative. @@ -3139,7 +3750,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { SDValue TmpVal = SDValue(Tmp, 0); ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal)); return; - } else if (Elt > 0) { // Elt is odd and positive, in the range [17,31]. // @@ -3154,7 +3764,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { ReplaceNode(N, CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0), SDValue(Tmp2, 0))); return; - } else { // Elt is odd and negative, in the range [-31,-17]. // @@ -3199,7 +3808,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) { EVT VT = N->getValueType(0); SDValue RHS, LHS; - bool BytesFound[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + bool BytesFound[8] = {false, false, false, false, false, false, false, false}; uint64_t Mask = 0, Alt = 0; auto IsByteSelectCC = [this](SDValue O, unsigned &b, @@ -3436,11 +4045,13 @@ void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) { O0.getNode(), O1.getNode()); }; + // FIXME: When the semantics of the interaction between select and undef + // are clearly defined, it may turn out to be unnecessary to break here. 
SDValue TrueRes = TryFold(ConstTrue); - if (!TrueRes) + if (!TrueRes || TrueRes.isUndef()) break; SDValue FalseRes = TryFold(ConstFalse); - if (!FalseRes) + if (!FalseRes || FalseRes.isUndef()) break; // For us to materialize these using one instruction, we must be able to @@ -3499,7 +4110,6 @@ void PPCDAGToDAGISel::PreprocessISelDAG() { /// PostprocessISelDAG - Perform some late peephole optimizations /// on the DAG representation. void PPCDAGToDAGISel::PostprocessISelDAG() { - // Skip peepholes at -O0. if (TM.getOptLevel() == CodeGenOpt::None) return; @@ -3515,10 +4125,6 @@ void PPCDAGToDAGISel::PostprocessISelDAG() { // be folded with the isel so that we don't need to materialize a register // containing zero. bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) { - // If we're not using isel, then this does not matter. - if (!PPCSubTarget->hasISEL()) - return false; - for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { SDNode *User = *UI; @@ -4520,10 +5126,10 @@ void PPCDAGToDAGISel::PeepholePPC64() { } } - /// createPPCISelDag - This pass converts a legalized DAG into a /// PowerPC-specific DAG, ready for instruction scheduling. 
/// -FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) { - return new PPCDAGToDAGISel(TM); +FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new PPCDAGToDAGISel(TM, OptLevel); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 2b9195b..b3a3c73 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -13,37 +13,87 @@ #include "PPCISelLowering.h" #include "MCTargetDesc/PPCPredicates.h" -#include "PPCCallingConv.h" +#include "PPC.h" #include "PPCCCState.h" +#include "PPCCallingConv.h" +#include "PPCFrameLowering.h" +#include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" #include "PPCPerfectShuffle.h" +#include "PPCRegisterInfo.h" +#include "PPCSubtarget.h" #include "PPCTargetMachine.h" -#include "PPCTargetObjectFile.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" 
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> #include <list> +#include <utility> +#include <vector> using namespace llvm; @@ -86,7 +136,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, addRegisterClass(MVT::f64, &PPC::F8RCRegClass); } - // PowerPC has an i16 but no i8 (or i1) SEXTLOAD + // Match BITREVERSE to customized fast code sequence in the td file. + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); + + // PowerPC has an i16 but no i8 (or i1) SEXTLOAD. 
for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); @@ -125,7 +179,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom); } - // PowerPC does not support direct load / store of condition registers + // PowerPC does not support direct load/store of condition registers. setOperationAction(ISD::LOAD, MVT::i1, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); @@ -154,11 +208,23 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand); setOperationAction(ISD::FREM, MVT::ppcf128, Expand); - // PowerPC has no SREM/UREM instructions - setOperationAction(ISD::SREM, MVT::i32, Expand); - setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::SREM, MVT::i64, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); + // PowerPC has no SREM/UREM instructions unless we are on P9 + // On P9 we may use a hardware instruction to compute the remainder. + // The instructions are not legalized directly because in the cases where the + // result of both the remainder and the division is required it is more + // efficient to compute the remainder from the result of the division rather + // than use the remainder instruction. + if (Subtarget.isISA3_0()) { + setOperationAction(ISD::SREM, MVT::i32, Custom); + setOperationAction(ISD::UREM, MVT::i32, Custom); + setOperationAction(ISD::SREM, MVT::i64, Custom); + setOperationAction(ISD::UREM, MVT::i64, Custom); + } else { + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + } // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. 
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); @@ -360,6 +426,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // To handle counter-based loop conditions. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + // Comparisons that require checking two conditions. setCondCodeAction(ISD::SETULT, MVT::f32, Expand); setCondCodeAction(ISD::SETULT, MVT::f64, Expand); @@ -484,7 +555,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FABS, VT, Expand); - setOperationAction(ISD::FPOWI, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); @@ -634,6 +704,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SRA, MVT::v2i64, Legal); setOperationAction(ISD::SRL, MVT::v2i64, Legal); + // 128 bit shifts can be accomplished via 3 instructions for SHL and + // SRL, but not for SRA because of the instructions available: + // VS{RL} and VS{RL}O. 
However due to direct move costs, it's not worth + // doing + setOperationAction(ISD::SHL, MVT::v1i128, Expand); + setOperationAction(ISD::SRL, MVT::v1i128, Expand); + setOperationAction(ISD::SRA, MVT::v1i128, Expand); + setOperationAction(ISD::SETCC, MVT::v2i64, Legal); } else { @@ -687,6 +765,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (Subtarget.hasP9Vector()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + // 128 bit shifts can be accomplished via 3 instructions for SHL and + // SRL, but not for SRA because of the instructions available: + // VS{RL} and VS{RL}O. + setOperationAction(ISD::SHL, MVT::v1i128, Legal); + setOperationAction(ISD::SRL, MVT::v1i128, Legal); + setOperationAction(ISD::SRA, MVT::v1i128, Expand); } } @@ -728,7 +813,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FABS , MVT::v4f64, Legal); setOperationAction(ISD::FSIN , MVT::v4f64, Expand); setOperationAction(ISD::FCOS , MVT::v4f64, Expand); - setOperationAction(ISD::FPOWI , MVT::v4f64, Expand); setOperationAction(ISD::FPOW , MVT::v4f64, Expand); setOperationAction(ISD::FLOG , MVT::v4f64, Expand); setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand); @@ -774,7 +858,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FABS , MVT::v4f32, Legal); setOperationAction(ISD::FSIN , MVT::v4f32, Expand); setOperationAction(ISD::FCOS , MVT::v4f32, Expand); - setOperationAction(ISD::FPOWI , MVT::v4f32, Expand); setOperationAction(ISD::FPOW , MVT::v4f32, Expand); setOperationAction(ISD::FLOG , MVT::v4f32, Expand); setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand); @@ -873,6 +956,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setStackPointerRegisterToSaveRestore(isPPC64 ? 
PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget.hasFPCVT()) @@ -971,6 +1057,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, MaxStoresPerMemset = 128; MaxStoresPerMemcpy = 128; MaxStoresPerMemmove = 128; + MaxLoadsPerMemcmp = 128; + } else { + MaxLoadsPerMemcmp = 8; + MaxLoadsPerMemcmpOptSize = 4; } } @@ -1042,6 +1132,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; case PPCISD::XXINSERT: return "PPCISD::XXINSERT"; + case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE"; + case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; case PPCISD::CMPB: return "PPCISD::CMPB"; case PPCISD::Hi: return "PPCISD::Hi"; @@ -1080,6 +1172,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::LXSIZX: return "PPCISD::LXSIZX"; case PPCISD::STXSIX: return "PPCISD::STXSIX"; case PPCISD::VEXTS: return "PPCISD::VEXTS"; + case PPCISD::SExtVElems: return "PPCISD::SExtVElems"; case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; @@ -1523,21 +1616,47 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { return true; } -bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, - unsigned &InsertAtByte, bool &Swap, bool IsLE) { - - // Check that the mask is shuffling words - for (unsigned i = 0; i < 4; ++i) { - unsigned B0 = N->getMaskElt(i*4); - unsigned B1 = N->getMaskElt(i*4+1); - unsigned B2 = N->getMaskElt(i*4+2); - unsigned B3 = N->getMaskElt(i*4+3); - if (B0 % 4) +/// Check that the mask is shuffling N byte 
elements. Within each N byte +/// element of the mask, the indices could be either in increasing or +/// decreasing order as long as they are consecutive. +/// \param[in] N the shuffle vector SD Node to analyze +/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/ +/// Word/DoubleWord/QuadWord). +/// \param[in] StepLen the delta indices number among the N byte element, if +/// the mask is in increasing/decreasing order then it is 1/-1. +/// \return true iff the mask is shuffling N byte elements. +static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width, + int StepLen) { + assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) && + "Unexpected element width."); + assert((StepLen == 1 || StepLen == -1) && "Unexpected element width."); + + unsigned NumOfElem = 16 / Width; + unsigned MaskVal[16]; // Width is never greater than 16 + for (unsigned i = 0; i < NumOfElem; ++i) { + MaskVal[0] = N->getMaskElt(i * Width); + if ((StepLen == 1) && (MaskVal[0] % Width)) { return false; - if (B1 != B0+1 || B2 != B1+1 || B3 != B2+1) + } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) { return false; + } + + for (unsigned int j = 1; j < Width; ++j) { + MaskVal[j] = N->getMaskElt(i * Width + j); + if (MaskVal[j] != MaskVal[j-1] + StepLen) { + return false; + } + } } + return true; +} + +bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + unsigned &InsertAtByte, bool &Swap, bool IsLE) { + if (!isNByteElemShuffleMask(N, 4, 1)) + return false; + // Now we look at mask elements 0,4,8,12 unsigned M0 = N->getMaskElt(0) / 4; unsigned M1 = N->getMaskElt(4) / 4; @@ -1608,6 +1727,158 @@ bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, return false; } +bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE) { + assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); + // Ensure each byte index of the word is consecutive. 
+ if (!isNByteElemShuffleMask(N, 4, 1)) + return false; + + // Now we look at mask elements 0,4,8,12, which are the beginning of words. + unsigned M0 = N->getMaskElt(0) / 4; + unsigned M1 = N->getMaskElt(4) / 4; + unsigned M2 = N->getMaskElt(8) / 4; + unsigned M3 = N->getMaskElt(12) / 4; + + // If both vector operands for the shuffle are the same vector, the mask will + // contain only elements from the first one and the second one will be undef. + if (N->getOperand(1).isUndef()) { + assert(M0 < 4 && "Indexing into an undef vector?"); + if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4) + return false; + + ShiftElts = IsLE ? (4 - M0) % 4 : M0; + Swap = false; + return true; + } + + // Ensure each word index of the ShuffleVector Mask is consecutive. + if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8) + return false; + + if (IsLE) { + if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) { + // Input vectors don't need to be swapped if the leading element + // of the result is one of the 3 left elements of the second vector + // (or if there is no shift to be done at all). + Swap = false; + ShiftElts = (8 - M0) % 8; + } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) { + // Input vectors need to be swapped if the leading element + // of the result is one of the 3 left elements of the first vector + // (or if we're shifting by 4 - thereby simply swapping the vectors). + Swap = true; + ShiftElts = (4 - M0) % 4; + } + + return true; + } else { // BE + if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) { + // Input vectors don't need to be swapped if the leading element + // of the result is one of the 4 elements of the first vector. + Swap = false; + ShiftElts = M0; + } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) { + // Input vectors need to be swapped if the leading element + // of the result is one of the 4 elements of the right vector. 
+ Swap = true; + ShiftElts = M0 - 4; + } + + return true; + } +} + +bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) { + assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); + + if (!isNByteElemShuffleMask(N, Width, -1)) + return false; + + for (int i = 0; i < 16; i += Width) + if (N->getMaskElt(i) != i + Width - 1) + return false; + + return true; +} + +bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) { + return isXXBRShuffleMaskHelper(N, 2); +} + +bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) { + return isXXBRShuffleMaskHelper(N, 4); +} + +bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) { + return isXXBRShuffleMaskHelper(N, 8); +} + +bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) { + return isXXBRShuffleMaskHelper(N, 16); +} + +/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap +/// if the inputs to the instruction should be swapped and set \p DM to the +/// value for the immediate. +/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI +/// AND element 0 of the result comes from the first input (LE) or second input +/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered. +/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle +/// mask. +bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, + bool &Swap, bool IsLE) { + assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); + + // Ensure each byte index of the double word is consecutive. + if (!isNByteElemShuffleMask(N, 8, 1)) + return false; + + unsigned M0 = N->getMaskElt(0) / 8; + unsigned M1 = N->getMaskElt(8) / 8; + assert(((M0 | M1) < 4) && "A mask element out of bounds?"); + + // If both vector operands for the shuffle are the same vector, the mask will + // contain only elements from the first one and the second one will be undef. + if (N->getOperand(1).isUndef()) { + if ((M0 | M1) < 2) { + DM = IsLE ? 
(((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1); + Swap = false; + return true; + } else + return false; + } + + if (IsLE) { + if (M0 > 1 && M1 < 2) { + Swap = false; + } else if (M0 < 2 && M1 > 1) { + M0 = (M0 + 2) % 4; + M1 = (M1 + 2) % 4; + Swap = true; + } else + return false; + + // Note: if control flow comes here that means Swap is already set above + DM = (((~M1) & 1) << 1) + ((~M0) & 1); + return true; + } else { // BE + if (M0 < 2 && M1 > 1) { + Swap = false; + } else if (M0 > 1 && M1 < 2) { + M0 = (M0 + 2) % 4; + M1 = (M1 + 2) % 4; + Swap = true; + } else + return false; + + // Note: if control flow comes here that means Swap is already set above + DM = (M0 << 1) + (M1 & 1); + return true; + } +} + + /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, @@ -1643,7 +1914,6 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { // If the element isn't a constant, bail fully out. if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue(); - if (!UniquedVals[i&(Multiple-1)].getNode()) UniquedVals[i&(Multiple-1)] = N->getOperand(i); else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i)) @@ -1763,17 +2033,17 @@ int PPC::isQVALIGNIShuffleMask(SDNode *N) { /// or 64-bit immediate, and if the value can be accurately represented as a /// sign extension from a 16-bit value. If so, this returns true and the /// immediate. 
-static bool isIntS16Immediate(SDNode *N, short &Imm) { +bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) { if (!isa<ConstantSDNode>(N)) return false; - Imm = (short)cast<ConstantSDNode>(N)->getZExtValue(); + Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue(); if (N->getValueType(0) == MVT::i32) return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); else return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); } -static bool isIntS16Immediate(SDValue Op, short &Imm) { +bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) { return isIntS16Immediate(Op.getNode(), Imm); } @@ -1783,7 +2053,7 @@ static bool isIntS16Immediate(SDValue Op, short &Imm) { bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG) const { - short imm = 0; + int16_t imm = 0; if (N.getOpcode() == ISD::ADD) { if (isIntS16Immediate(N.getOperand(1), imm)) return false; // r+i @@ -1800,17 +2070,14 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are provably // disjoint. - APInt LHSKnownZero, LHSKnownOne; - APInt RHSKnownZero, RHSKnownOne; - DAG.computeKnownBits(N.getOperand(0), - LHSKnownZero, LHSKnownOne); - - if (LHSKnownZero.getBoolValue()) { - DAG.computeKnownBits(N.getOperand(1), - RHSKnownZero, RHSKnownOne); + KnownBits LHSKnown, RHSKnown; + DAG.computeKnownBits(N.getOperand(0), LHSKnown); + + if (LHSKnown.Zero.getBoolValue()) { + DAG.computeKnownBits(N.getOperand(1), RHSKnown); // If all of the bits are known zero on the LHS or RHS, the add won't // carry. 
- if (~(LHSKnownZero | RHSKnownZero) == 0) { + if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) { Base = N.getOperand(0); Index = N.getOperand(1); return true; @@ -1863,12 +2130,12 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better -/// represented as reg+reg. If Aligned is true, only accept displacements -/// suitable for STD and friends, i.e. multiples of 4. +/// represented as reg+reg. If \p Alignment is non-zero, only accept +/// displacements that are multiples of that value. bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, - bool Aligned) const { + unsigned Alignment) const { // FIXME dl should come from parent load or store, not from address SDLoc dl(N); // If this can be more profitably realized as r+r, fail. @@ -1876,9 +2143,9 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, return false; if (N.getOpcode() == ISD::ADD) { - short imm = 0; + int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); @@ -1900,16 +2167,16 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, return true; // [&g+r] } } else if (N.getOpcode() == ISD::OR) { - short imm = 0; + int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. 
- APInt LHSKnownZero, LHSKnownOne; - DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne); + KnownBits LHSKnown; + DAG.computeKnownBits(N.getOperand(0), LHSKnown); - if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { + if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't // carry. if (FrameIndexSDNode *FI = @@ -1928,8 +2195,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" - short Imm; - if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { + int16_t Imm; + if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); @@ -1939,7 +2206,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // Handle 32-bit sext immediates with LIS + addr mode. if ((CN->getValueType(0) == MVT::i32 || (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && - (!Aligned || (CN->getZExtValue() & 3) == 0)) { + (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. @@ -1973,10 +2240,15 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, if (SelectAddressRegReg(N, Base, Index, DAG)) return true; - // If the operand is an addition, always emit this as [r+r], since this is - // better (for code size, and execution, as the memop does the add for free) - // than emitting an explicit add. - if (N.getOpcode() == ISD::ADD) { + // If the address is the result of an add, we will utilize the fact that the + // address calculation includes an implicit add. However, we can reduce + // register pressure if we do not materialize a constant just for use as the + // index register. 
We only get rid of the add if it is not an add of a + // value and a 16-bit signed constant and both have a single use. + int16_t imm = 0; + if (N.getOpcode() == ISD::ADD && + (!isIntS16Immediate(N.getOperand(1), imm) || + !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) { Base = N.getOperand(0); Index = N.getOperand(1); return true; @@ -2026,7 +2298,6 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, } if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { - // Common code will reject creating a pre-inc form if the base pointer // is a frame index, or if N is a store and the base pointer is either // the same as or a predecessor of the value being stored. Check for @@ -2050,14 +2321,14 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, // LDU/STU can only handle immediates that are a multiple of 4. if (VT != MVT::i64) { - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0)) return false; } else { // LDU/STU need an address with at least 4-byte alignment. if (Alignment < 4) return false; - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4)) return false; } @@ -2277,7 +2548,6 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { - // FIXME: TLS addresses currently use medium model code sequences, // which is the most useful form. Eventually support for small and // large models could be added if users need it, at the cost of @@ -2300,8 +2570,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, PPCII::MO_TPREL_HA); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_LO); - SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, - is64bit ? MVT::i64 : MVT::i32); + SDValue TLSReg = is64bit ? 
DAG.getRegister(PPC::X13, MVT::i64) + : DAG.getRegister(PPC::R2, MVT::i32); + SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); } @@ -2602,10 +2873,9 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__trampoline_setup", PtrVT), - std::move(Args)); + CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( + CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.second; @@ -2737,7 +3007,7 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, return false; } -bool +bool llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, @@ -2752,7 +3022,7 @@ llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT, unsigned RegNum = State.getFirstUnallocated(ArgRegs); int RegsLeft = NumArgRegs - RegNum; - // Skip if there is not enough registers left for long double type (4 gpr regs + // Skip if there is not enough registers left for long double type (4 gpr regs // in soft float mode) and put long double argument on the stack. 
if (RegNum != NumArgRegs && RegsLeft < 4) { for (int i = 0; i < RegsLeft; i++) { @@ -4066,7 +4336,7 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget, static bool hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) { - if (CS->arg_size() != CallerFn->getArgumentList().size()) + if (CS->arg_size() != CallerFn->arg_size()) return false; ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin(); @@ -4222,11 +4492,12 @@ namespace { struct TailCallArgumentInfo { SDValue Arg; SDValue FrameIdxOp; - int FrameIdx; + int FrameIdx = 0; - TailCallArgumentInfo() : FrameIdx(0) {} + TailCallArgumentInfo() = default; }; -} + +} // end anonymous namespace /// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot. static void StoreTailCallArgumentsToStackSlot( @@ -4406,7 +4677,6 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys, ImmutableCallSite *CS, const PPCSubtarget &Subtarget) { - bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); @@ -4602,7 +4872,6 @@ SDValue PPCTargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - SmallVector<CCValAssign, 16> RVLocs; CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); @@ -4649,7 +4918,6 @@ SDValue PPCTargetLowering::FinishCall( SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff, unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins, SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const { - std::vector<EVT> NodeTys; SmallVector<SDValue, 8> Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, @@ -4909,8 +5177,7 
@@ SDValue PPCTargetLowering::LowerCall_32SVR4( // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be moved somewhere else @@ -4960,9 +5227,8 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( Flags, DAG, dl); // This must go outside the CALLSEQ_START..END. - SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1), - SDLoc(MemcpyCall)); + SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0, + SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); Chain = CallSeqStart = NewCallSeqStart; @@ -5043,9 +5309,9 @@ SDValue PPCTargetLowering::createMemcpyOutsideCallSeq( CallSeqStart.getNode()->getOperand(0), Flags, DAG, dl); // The MEMCPY must go outside the CALLSEQ_START..END. - SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1), - SDLoc(MemcpyCall)); + int64_t FrameSize = CallSeqStart.getConstantOperandVal(1); + SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0, + SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); return NewCallSeqStart; @@ -5059,7 +5325,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const { - bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); @@ -5105,10 +5370,30 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( }; const unsigned NumGPRs = array_lengthof(GPR); - const unsigned NumFPRs = 13; + const unsigned NumFPRs = useSoftFloat() ? 
0 : 13; const unsigned NumVRs = array_lengthof(VR); const unsigned NumQFPRs = NumFPRs; + // On ELFv2, we can avoid allocating the parameter area if all the arguments + // can be passed to the callee in registers. + // For the fast calling convention, there is another check below. + // Note: We should keep consistent with LowerFormalArguments_64SVR4() + bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast; + if (!HasParameterArea) { + unsigned ParamAreaSize = NumGPRs * PtrByteSize; + unsigned AvailableFPRs = NumFPRs; + unsigned AvailableVRs = NumVRs; + unsigned NumBytesTmp = NumBytes; + for (unsigned i = 0; i != NumOps; ++i) { + if (Outs[i].Flags.isNest()) continue; + if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags, + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytesTmp, AvailableFPRs, AvailableVRs, + Subtarget.hasQPX())) + HasParameterArea = true; + } + } + // When using the fast calling convention, we don't provide backing for // arguments that will be in registers. unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0; @@ -5176,13 +5461,18 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( unsigned NumBytesActuallyUsed = NumBytes; - // The prolog code of the callee may store up to 8 GPR argument registers to + // In the old ELFv1 ABI, + // the prolog code of the callee may store up to 8 GPR argument registers to // the stack, allowing va_start to index over them in memory if its varargs. // Because we cannot tell if this is needed on the caller side, we have to // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. - // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area. - NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); + // In the ELFv2 ABI, we allocate the parameter area iff a callee + // really requires memory operands, e.g. a vararg function. 
+ if (HasParameterArea) + NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); + else + NumBytes = LinkageSize; // Tail call needs the stack to be aligned. if (getTargetMachine().Options.GuaranteedTailCallOpt && @@ -5204,8 +5494,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getIntPtrConstant(NumBytes, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else @@ -5401,6 +5690,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( if (CallConv == CallingConv::Fast) ComputePtrOff(); + assert(HasParameterArea && + "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, false, MemOpChains, TailCallArguments, dl); @@ -5486,6 +5777,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); } + assert(HasParameterArea && + "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, false, MemOpChains, TailCallArguments, dl); @@ -5520,6 +5813,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // GPRs when within range. For now, we always put the value in both // locations (or even all three). if (isVarArg) { + assert(HasParameterArea && + "Parameter area must exist if we have a varargs call."); // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. 
SDValue Store = @@ -5552,6 +5847,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( if (CallConv == CallingConv::Fast) ComputePtrOff(); + assert(HasParameterArea && + "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); @@ -5572,6 +5869,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( case MVT::v4i1: { bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; if (isVarArg) { + assert(HasParameterArea && + "Parameter area must exist if we have a varargs call."); // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. SDValue Store = @@ -5604,6 +5903,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( if (CallConv == CallingConv::Fast) ComputePtrOff(); + assert(HasParameterArea && + "Parameter area must exist to pass an argument in memory."); LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); @@ -5618,7 +5919,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( } } - assert(NumBytesActuallyUsed == ArgOffset); + assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) && + "mismatch in size of parameter area"); (void)NumBytesActuallyUsed; if (!MemOpChains.empty()) @@ -5673,7 +5975,6 @@ SDValue PPCTargetLowering::LowerCall_Darwin( const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const { - unsigned NumOps = Outs.size(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -5752,8 +6053,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else @@ -6065,7 +6365,6 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { - SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); @@ -6133,7 +6432,7 @@ PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); - // Get the corect type for integers. + // Get the correct type for integers. EVT IntVT = Op.getValueType(); // Get the inputs. @@ -6150,7 +6449,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, // When we pop the dynamic allocation we need to restore the SP link. SDLoc dl(Op); - // Get the corect type for pointers. + // Get the correct type for pointers. EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Construct the stack pointer operand. @@ -6225,7 +6524,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Size = Op.getOperand(1); SDLoc dl(Op); - // Get the corect type for pointers. + // Get the correct type for pointers. EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Negate the size. SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, @@ -6356,6 +6655,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { default: break; // SETUO etc aren't handled by fsel. 
case ISD::SETNE: std::swap(TV, FV); + LLVM_FALLTHROUGH; case ISD::SETEQ: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); @@ -6367,6 +6667,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETULT: case ISD::SETLT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt + LLVM_FALLTHROUGH; case ISD::SETOGE: case ISD::SETGE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits @@ -6375,6 +6676,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETUGT: case ISD::SETGT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt + LLVM_FALLTHROUGH; case ISD::SETOLE: case ISD::SETLE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits @@ -6388,8 +6690,9 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { default: break; // SETUO etc aren't handled by fsel. 
case ISD::SETNE: std::swap(TV, FV); + LLVM_FALLTHROUGH; case ISD::SETEQ: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); @@ -6399,25 +6702,25 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); case ISD::SETULT: case ISD::SETLT: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOGE: case ISD::SETGE: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); case ISD::SETUGT: case ISD::SETGT: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOLE: case ISD::SETLE: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); @@ -6585,6 +6888,7 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, // Given the head of the old 
chain, ResChain, insert a token factor containing // it and NewResChain, and make users of ResChain now be users of that token // factor. +// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead. void PPCTargetLowering::spliceIntoChain(SDValue ResChain, SDValue NewResChain, SelectionDAG &DAG) const { @@ -7585,6 +7889,53 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } + + if (Subtarget.hasVSX() && + PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { + if (Swap) + std::swap(V1, V2); + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); + SDValue Conv2 = + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2); + + SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); + } + + if (Subtarget.hasVSX() && + PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { + if (Swap) + std::swap(V1, V2); + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); + SDValue Conv2 = + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? 
V1 : V2); + + SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI); + } + + if (Subtarget.hasP9Vector()) { + if (PPC::isXXBRHShuffleMask(SVOp)) { + SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); + SDValue ReveHWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v8i16, Conv); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord); + } else if (PPC::isXXBRWShuffleMask(SVOp)) { + SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); + SDValue ReveWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v4i32, Conv); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord); + } else if (PPC::isXXBRDShuffleMask(SVOp)) { + SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); + SDValue ReveDWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Conv); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord); + } else if (PPC::isXXBRQShuffleMask(SVOp)) { + SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1); + SDValue ReveQWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v1i128, Conv); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord); + } + } + if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); @@ -7612,7 +7963,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap); } - } if (Subtarget.hasQPX()) { @@ -7792,24 +8142,39 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, bool &isDot, const PPCSubtarget &Subtarget) { unsigned IntrinsicID = - cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); + cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); CompareOpc = -1; isDot = false; switch 
(IntrinsicID) { - default: return false; - // Comparison predicates. - case Intrinsic::ppc_altivec_vcmpbfp_p: CompareOpc = 966; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break; + default: + return false; + // Comparison predicates. + case Intrinsic::ppc_altivec_vcmpbfp_p: + CompareOpc = 966; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpeqfp_p: + CompareOpc = 198; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpequb_p: + CompareOpc = 6; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpequh_p: + CompareOpc = 70; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpequw_p: + CompareOpc = 134; + isDot = true; + break; case Intrinsic::ppc_altivec_vcmpequd_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 199; - isDot = 1; + isDot = true; } else return false; - break; case Intrinsic::ppc_altivec_vcmpneb_p: case Intrinsic::ppc_altivec_vcmpneh_p: @@ -7818,45 +8183,80 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpnezh_p: case Intrinsic::ppc_altivec_vcmpnezw_p: if (Subtarget.hasP9Altivec()) { - switch(IntrinsicID) { - default: llvm_unreachable("Unknown comparison intrinsic."); - case Intrinsic::ppc_altivec_vcmpneb_p: CompareOpc = 7; break; - case Intrinsic::ppc_altivec_vcmpneh_p: CompareOpc = 71; break; - case Intrinsic::ppc_altivec_vcmpnew_p: CompareOpc = 135; break; - case Intrinsic::ppc_altivec_vcmpnezb_p: CompareOpc = 263; break; - case Intrinsic::ppc_altivec_vcmpnezh_p: CompareOpc = 327; break; - case Intrinsic::ppc_altivec_vcmpnezw_p: CompareOpc = 391; break; + switch (IntrinsicID) { + default: + llvm_unreachable("Unknown comparison intrinsic."); + case Intrinsic::ppc_altivec_vcmpneb_p: + CompareOpc = 7; 
+ break; + case Intrinsic::ppc_altivec_vcmpneh_p: + CompareOpc = 71; + break; + case Intrinsic::ppc_altivec_vcmpnew_p: + CompareOpc = 135; + break; + case Intrinsic::ppc_altivec_vcmpnezb_p: + CompareOpc = 263; + break; + case Intrinsic::ppc_altivec_vcmpnezh_p: + CompareOpc = 327; + break; + case Intrinsic::ppc_altivec_vcmpnezw_p: + CompareOpc = 391; + break; } - isDot = 1; + isDot = true; } else return false; - break; - case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgefp_p: + CompareOpc = 454; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpgtfp_p: + CompareOpc = 710; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpgtsb_p: + CompareOpc = 774; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpgtsh_p: + CompareOpc = 838; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpgtsw_p: + CompareOpc = 902; + isDot = true; + break; case Intrinsic::ppc_altivec_vcmpgtsd_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 967; - isDot = 1; + isDot = true; } else return false; - break; - case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtub_p: + CompareOpc = 518; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpgtuh_p: + CompareOpc = 582; + isDot = true; + break; + case Intrinsic::ppc_altivec_vcmpgtuw_p: + CompareOpc = 646; + isDot = true; + break; case Intrinsic::ppc_altivec_vcmpgtud_p: if (Subtarget.hasP8Altivec()) { CompareOpc = 711; - isDot = 1; + 
isDot = true; } else return false; - break; - // VSX predicate comparisons use the same infrastructure + + // VSX predicate comparisons use the same infrastructure case Intrinsic::ppc_vsx_xvcmpeqdp_p: case Intrinsic::ppc_vsx_xvcmpgedp_p: case Intrinsic::ppc_vsx_xvcmpgtdp_p: @@ -7865,33 +8265,51 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_vsx_xvcmpgtsp_p: if (Subtarget.hasVSX()) { switch (IntrinsicID) { - case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break; - case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break; - case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break; - case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break; - case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break; - case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break; + case Intrinsic::ppc_vsx_xvcmpeqdp_p: + CompareOpc = 99; + break; + case Intrinsic::ppc_vsx_xvcmpgedp_p: + CompareOpc = 115; + break; + case Intrinsic::ppc_vsx_xvcmpgtdp_p: + CompareOpc = 107; + break; + case Intrinsic::ppc_vsx_xvcmpeqsp_p: + CompareOpc = 67; + break; + case Intrinsic::ppc_vsx_xvcmpgesp_p: + CompareOpc = 83; + break; + case Intrinsic::ppc_vsx_xvcmpgtsp_p: + CompareOpc = 75; + break; } - isDot = 1; - } - else + isDot = true; + } else return false; - break; - // Normal Comparisons. - case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break; + // Normal Comparisons. 
+ case Intrinsic::ppc_altivec_vcmpbfp: + CompareOpc = 966; + break; + case Intrinsic::ppc_altivec_vcmpeqfp: + CompareOpc = 198; + break; + case Intrinsic::ppc_altivec_vcmpequb: + CompareOpc = 6; + break; + case Intrinsic::ppc_altivec_vcmpequh: + CompareOpc = 70; + break; + case Intrinsic::ppc_altivec_vcmpequw: + CompareOpc = 134; + break; case Intrinsic::ppc_altivec_vcmpequd: - if (Subtarget.hasP8Altivec()) { + if (Subtarget.hasP8Altivec()) CompareOpc = 199; - isDot = 0; - } else + else return false; - break; case Intrinsic::ppc_altivec_vcmpneb: case Intrinsic::ppc_altivec_vcmpneh: @@ -7899,43 +8317,67 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpnezb: case Intrinsic::ppc_altivec_vcmpnezh: case Intrinsic::ppc_altivec_vcmpnezw: - if (Subtarget.hasP9Altivec()) { + if (Subtarget.hasP9Altivec()) switch (IntrinsicID) { - default: llvm_unreachable("Unknown comparison intrinsic."); - case Intrinsic::ppc_altivec_vcmpneb: CompareOpc = 7; break; - case Intrinsic::ppc_altivec_vcmpneh: CompareOpc = 71; break; - case Intrinsic::ppc_altivec_vcmpnew: CompareOpc = 135; break; - case Intrinsic::ppc_altivec_vcmpnezb: CompareOpc = 263; break; - case Intrinsic::ppc_altivec_vcmpnezh: CompareOpc = 327; break; - case Intrinsic::ppc_altivec_vcmpnezw: CompareOpc = 391; break; + default: + llvm_unreachable("Unknown comparison intrinsic."); + case Intrinsic::ppc_altivec_vcmpneb: + CompareOpc = 7; + break; + case Intrinsic::ppc_altivec_vcmpneh: + CompareOpc = 71; + break; + case Intrinsic::ppc_altivec_vcmpnew: + CompareOpc = 135; + break; + case Intrinsic::ppc_altivec_vcmpnezb: + CompareOpc = 263; + break; + case Intrinsic::ppc_altivec_vcmpnezh: + CompareOpc = 327; + break; + case Intrinsic::ppc_altivec_vcmpnezw: + CompareOpc = 391; + break; } - isDot = 0; - } else + else return false; break; - case Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; isDot = 0; 
break; - case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgefp: + CompareOpc = 454; + break; + case Intrinsic::ppc_altivec_vcmpgtfp: + CompareOpc = 710; + break; + case Intrinsic::ppc_altivec_vcmpgtsb: + CompareOpc = 774; + break; + case Intrinsic::ppc_altivec_vcmpgtsh: + CompareOpc = 838; + break; + case Intrinsic::ppc_altivec_vcmpgtsw: + CompareOpc = 902; + break; case Intrinsic::ppc_altivec_vcmpgtsd: - if (Subtarget.hasP8Altivec()) { + if (Subtarget.hasP8Altivec()) CompareOpc = 967; - isDot = 0; - } else + else return false; - break; - case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtub: + CompareOpc = 518; + break; + case Intrinsic::ppc_altivec_vcmpgtuh: + CompareOpc = 582; + break; + case Intrinsic::ppc_altivec_vcmpgtuw: + CompareOpc = 646; + break; case Intrinsic::ppc_altivec_vcmpgtud: - if (Subtarget.hasP8Altivec()) { + if (Subtarget.hasP8Altivec()) CompareOpc = 711; - isDot = 0; - } else + else return false; - break; } return true; @@ -7950,9 +8392,9 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (IntrinsicID == Intrinsic::thread_pointer) { // Reads the thread pointer register, used for __builtin_thread_pointer. - bool is64bit = Subtarget.isPPC64(); - return DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, - is64bit ? 
MVT::i64 : MVT::i32); + if (Subtarget.isPPC64()) + return DAG.getRegister(PPC::X13, MVT::i64); + return DAG.getRegister(PPC::R2, MVT::i32); } // If this is a lowered altivec predicate compare, CompareOpc is set to the @@ -8019,6 +8461,40 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return Flags; } +SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to + // the beginning of the argument list. + int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1; + SDLoc DL(Op); + switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) { + case Intrinsic::ppc_cfence: { + assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument."); + assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); + return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, + Op.getOperand(ArgStart + 1)), + Op.getOperand(0)), + 0); + } + default: + break; + } + return SDValue(); +} + +SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const { + // Check for a DIV with the same operands as this REM. + for (auto UI : Op.getOperand(1)->uses()) { + if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) || + (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV)) + if (UI->getOperand(0) == Op.getOperand(0) && + UI->getOperand(1) == Op.getOperand(1)) + return SDValue(); + } + return Op; +} + SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -8044,7 +8520,7 @@ SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, } SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { SDLoc dl(Op); // Create a stack slot that is 16-byte aligned. 
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); @@ -8484,6 +8960,12 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { // Frame & Return address. case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + + case ISD::INTRINSIC_VOID: + return LowerINTRINSIC_VOID(Op, DAG); + case ISD::SREM: + case ISD::UREM: + return LowerREM(Op, DAG); } } @@ -8575,9 +9057,9 @@ static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) { // The mappings for emitLeading/TrailingFence is taken from // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html -Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) @@ -8585,15 +9067,22 @@ Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, return nullptr; } -Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { - if (IsLoad && isAcquireOrStronger(Ord)) +Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { + if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { + // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and + // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html + // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. + if (isa<LoadInst>(Inst) && Subtarget.isPPC64()) + return Builder.CreateCall( + Intrinsic::getDeclaration( + Builder.GetInsertBlock()->getParent()->getParent(), + Intrinsic::ppc_cfence, {Inst->getType()}), + {Inst}); + // FIXME: Can use isync for rmw operation. 
return callIntrinsic(Builder, Intrinsic::ppc_lwsync); - // FIXME: this is too conservative, a dependent branch + isync is enough. - // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and - // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html - // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. + } return nullptr; } @@ -8889,6 +9378,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -8902,7 +9392,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - assert(RC->hasType(MVT::i32) && "Invalid destination!"); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); @@ -8985,7 +9475,6 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); - const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); @@ -9174,10 +9663,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineFunction *F = BB->getParent(); - if (Subtarget.hasISEL() && - (MI.getOpcode() == PPC::SELECT_CC_I4 || + if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == PPC::SELECT_CC_I8 || - MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) { + MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8) { SmallVector<MachineOperand, 2> Cond; if (MI.getOpcode() == PPC::SELECT_CC_I4 || MI.getOpcode() == 
PPC::SELECT_CC_I8) @@ -9417,7 +9905,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BB = EmitAtomicBinary(MI, BB, 4, 0); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64) BB = EmitAtomicBinary(MI, BB, 8, 0); - else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 || MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 || (Subtarget.hasPartwordAtomics() && @@ -10028,14 +10515,12 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { return false; } - /// This function is called when we have proved that a SETCC node can be replaced /// by subtraction (and other supporting instructions) so that the result of /// comparison is kept in a GPR instead of CR. This function is purely for /// codegen purposes and has some flags to guide the codegen process. static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement, bool Swap, SDLoc &DL, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); // Zero extend the operands to the largest legal integer. Originally, they @@ -10068,7 +10553,6 @@ static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement, SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); SelectionDAG &DAG = DCI.DAG; @@ -10155,17 +10639,16 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, } else { // This is neither a signed nor an unsigned comparison, just make sure // that the high bits are equal. - APInt Op1Zero, Op1One; - APInt Op2Zero, Op2One; - DAG.computeKnownBits(N->getOperand(0), Op1Zero, Op1One); - DAG.computeKnownBits(N->getOperand(1), Op2Zero, Op2One); + KnownBits Op1Known, Op2Known; + DAG.computeKnownBits(N->getOperand(0), Op1Known); + DAG.computeKnownBits(N->getOperand(1), Op2Known); // We don't really care about what is known about the first bit (if // anything), so clear it in all masks prior to comparing them. 
- Op1Zero.clearBit(0); Op1One.clearBit(0); - Op2Zero.clearBit(0); Op2One.clearBit(0); + Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0); + Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0); - if (Op1Zero != Op2Zero || Op1One != Op2One) + if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One) return SDValue(); } } @@ -10842,6 +11325,132 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +// This function adds the required vector_shuffle needed to get +// the elements of the vector extract in the correct position +// as specified by the CorrectElems encoding. +static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG, + SDValue Input, uint64_t Elems, + uint64_t CorrectElems) { + SDLoc dl(N); + + unsigned NumElems = Input.getValueType().getVectorNumElements(); + SmallVector<int, 16> ShuffleMask(NumElems, -1); + + // Knowing the element indices being extracted from the original + // vector and the order in which they're being inserted, just put + // them at element indices required for the instruction. + for (unsigned i = 0; i < N->getNumOperands(); i++) { + if (DAG.getDataLayout().isLittleEndian()) + ShuffleMask[CorrectElems & 0xF] = Elems & 0xF; + else + ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4; + CorrectElems = CorrectElems >> 8; + Elems = Elems >> 8; + } + + SDValue Shuffle = + DAG.getVectorShuffle(Input.getValueType(), dl, Input, + DAG.getUNDEF(Input.getValueType()), ShuffleMask); + + EVT Ty = N->getValueType(0); + SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle); + return BV; +} + +// Look for build vector patterns where input operands come from sign +// extended vector_extract elements of specific indices. If the correct indices +// aren't used, add a vector shuffle to fix up the indices and create a new +// PPCISD:SExtVElems node which selects the vector sign extend instructions +// during instruction selection. 
+static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) { + // This array encodes the indices that the vector sign extend instructions + // extract from when extending from one type to another for both BE and LE. + // The right nibble of each byte corresponds to the LE indices. + // and the left nibble of each byte corresponds to the BE indices. + // For example: 0x3074B8FC byte->word + // For LE: the allowed indices are: 0x0,0x4,0x8,0xC + // For BE: the allowed indices are: 0x3,0x7,0xB,0xF + // For example: 0x000070F8 byte->double word + // For LE: the allowed indices are: 0x0,0x8 + // For BE: the allowed indices are: 0x7,0xF + uint64_t TargetElems[] = { + 0x3074B8FC, // b->w + 0x000070F8, // b->d + 0x10325476, // h->w + 0x00003074, // h->d + 0x00001032, // w->d + }; + + uint64_t Elems = 0; + int Index; + SDValue Input; + + auto isSExtOfVecExtract = [&](SDValue Op) -> bool { + if (!Op) + return false; + if (Op.getOpcode() != ISD::SIGN_EXTEND) + return false; + + SDValue Extract = Op.getOperand(0); + if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return false; + + ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1)); + if (!ExtOp) + return false; + + Index = ExtOp->getZExtValue(); + if (Input && Input != Extract.getOperand(0)) + return false; + + if (!Input) + Input = Extract.getOperand(0); + + Elems = Elems << 8; + Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4; + Elems |= Index; + + return true; + }; + + // If the build vector operands aren't sign extended vector extracts, + // of the same input vector, then return. + for (unsigned i = 0; i < N->getNumOperands(); i++) { + if (!isSExtOfVecExtract(N->getOperand(i))) { + return SDValue(); + } + } + + // If the vector extract indices are not correct, add the appropriate + // vector_shuffle.
+ int TgtElemArrayIdx; + int InputSize = Input.getValueType().getScalarSizeInBits(); + int OutputSize = N->getValueType(0).getScalarSizeInBits(); + if (InputSize + OutputSize == 40) + TgtElemArrayIdx = 0; + else if (InputSize + OutputSize == 72) + TgtElemArrayIdx = 1; + else if (InputSize + OutputSize == 48) + TgtElemArrayIdx = 2; + else if (InputSize + OutputSize == 80) + TgtElemArrayIdx = 3; + else if (InputSize + OutputSize == 96) + TgtElemArrayIdx = 4; + else + return SDValue(); + + uint64_t CorrectElems = TargetElems[TgtElemArrayIdx]; + CorrectElems = DAG.getDataLayout().isLittleEndian() + ? CorrectElems & 0x0F0F0F0F0F0F0F0F + : CorrectElems & 0xF0F0F0F0F0F0F0F0; + if (Elems != CorrectElems) { + return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems); + } + + // Regular lowering will catch cases where a shuffle is not needed. + return SDValue(); +} + SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const { assert(N->getOpcode() == ISD::BUILD_VECTOR && @@ -10869,6 +11478,15 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, if (Reduced) return Reduced; + // If we're building a vector out of extended elements from another vector + // we have P9 vector integer extend instructions. 
+ if (Subtarget.hasP9Altivec()) { + Reduced = combineBVOfVecSExt(N, DAG); + if (Reduced) + return Reduced; + } + + + if (N->getValueType(0) != MVT::v2f64) + return SDValue(); @@ -11053,6 +11671,14 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, } MVT VecTy = N->getValueType(0).getSimpleVT(); + + // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is + // aligned and the type is a vector with elements up to 4 bytes + if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) + && VecTy.getScalarSizeInBits() <= 32 ) { + return SDValue(); + } + + SDValue LoadOps[] = { Chain, Base }; SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl, DAG.getVTList(MVT::v2f64, MVT::Other), @@ -11117,6 +11743,13 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, SDValue Src = N->getOperand(SrcOpnd); MVT VecTy = Src.getValueType().getSimpleVT(); + // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the store is + // aligned and the type is a vector with elements up to 4 bytes + if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) + && VecTy.getScalarSizeInBits() <= 32 ) { + return SDValue(); + } + // All stores are done as v2f64 and possible bit cast. + if (VecTy != MVT::v2f64) { Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src); @@ -11141,6 +11774,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, SDLoc dl(N); switch (N->getOpcode()) { default: break; + case ISD::SHL: + return combineSHL(N, DCI); + case ISD::SRA: + return combineSRA(N, DCI); + case ISD::SRL: + return combineSRL(N, DCI); case PPCISD::SHL: if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. return N->getOperand(0); @@ -11227,9 +11866,20 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (BSwapOp.getValueType() == MVT::i16) BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp); + // If the type of BSWAP operand is wider than stored memory width + // it needs to be shifted to the right side before STBRX.
+ EVT mVT = cast<StoreSDNode>(N)->getMemoryVT(); + if (Op1VT.bitsGT(mVT)) { + int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits(); + BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp, + DAG.getConstant(Shift, dl, MVT::i32)); + // Need to truncate if this is a bswap of i64 stored as i32/i16. + if (Op1VT == MVT::i64) + BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp); + } + SDValue Ops[] = { - N->getOperand(0), BSwapOp, N->getOperand(2), - DAG.getValueType(N->getOperand(1).getValueType()) + N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT) }; return DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other), @@ -11570,7 +12220,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } break; - case ISD::INTRINSIC_W_CHAIN: { + case ISD::INTRINSIC_W_CHAIN: // For little endian, VSX loads require generating lxvd2x/xxswapd. // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. if (Subtarget.needsSwapsForVSXMemOps()) { @@ -11583,8 +12233,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } } break; - } - case ISD::INTRINSIC_VOID: { + case ISD::INTRINSIC_VOID: // For little endian, VSX stores require generating xxswapd/stxvd2x. // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. if (Subtarget.needsSwapsForVSXMemOps()) { @@ -11597,7 +12246,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } } break; - } case ISD::BSWAP: // Turn BSWAP (LOAD) -> lhbrx/lwbrx. if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && @@ -11635,9 +12283,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // Return N so it doesn't get rechecked! return SDValue(N, 0); } - break; - case PPCISD::VCMP: { + case PPCISD::VCMP: // If a VCMPo node already exists with exactly the same operands as this // node, use its result instead of this node (VCMPo computes both a CR6 and // a normal output). 
@@ -11687,7 +12334,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(VCMPoNode, 0); } break; - } case ISD::BRCOND: { SDValue Cond = N->getOperand(1); SDValue Target = N->getOperand(2); @@ -11845,17 +12491,17 @@ PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, //===----------------------------------------------------------------------===// void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, + KnownBits &Known, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { - KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0); + Known.resetAll(); switch (Op.getOpcode()) { default: break; case PPCISD::LBRX: { // lhbrx is known to have the top bits cleared out. if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16) - KnownZero = 0xFFFF0000; + Known.Zero = 0xFFFF0000; break; } case ISD::INTRINSIC_WO_CHAIN: { @@ -11877,7 +12523,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case Intrinsic::ppc_altivec_vcmpgtuh_p: case Intrinsic::ppc_altivec_vcmpgtuw_p: case Intrinsic::ppc_altivec_vcmpgtud_p: - KnownZero = ~1U; // All bits but the low one are known to be zero. + Known.Zero = ~1U; // All bits but the low one are known to be zero. 
break; } } @@ -12295,7 +12941,6 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { - switch (Intrinsic) { case Intrinsic::ppc_qpx_qvlfd: case Intrinsic::ppc_qpx_qvlfs: @@ -12753,7 +13398,6 @@ void PPCTargetLowering::insertSSPDeclarations(Module &M) const { } bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - if (!VT.isSimple() || !Subtarget.hasVSX()) return false; @@ -12768,3 +13412,58 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return Imm.isPosZero(); } } + +// For vector shift operation op, fold +// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y) +static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N, + SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned OpSizeInBits = VT.getScalarSizeInBits(); + unsigned Opcode = N->getOpcode(); + unsigned TargetOpcode; + + switch (Opcode) { + default: + llvm_unreachable("Unexpected shift operation"); + case ISD::SHL: + TargetOpcode = PPCISD::SHL; + break; + case ISD::SRL: + TargetOpcode = PPCISD::SRL; + break; + case ISD::SRA: + TargetOpcode = PPCISD::SRA; + break; + } + + if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) && + N1->getOpcode() == ISD::AND) + if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) + if (Mask->getZExtValue() == OpSizeInBits - 1) + return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0)); + + return SDValue(); +} + +SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} + +SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} + 
+SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h index 05acd25..49d7d82 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -17,13 +17,26 @@ #include "PPC.h" #include "PPCInstrInfo.h" -#include "PPCRegisterInfo.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Type.h" #include "llvm/Target/TargetLowering.h" +#include <utility> namespace llvm { + namespace PPCISD { + enum NodeType : unsigned { // Start the numbering where the builtin ops and target ops leave off. FIRST_NUMBER = ISD::BUILTIN_OP_END, @@ -54,6 +67,10 @@ namespace llvm { /// VSFRC that is sign-extended from ByteWidth to a 64-byte integer. VEXTS, + /// SExtVElems, takes an input vector of a smaller type and sign + /// extends to an output vector of a larger type. + SExtVElems, + /// Reciprocal estimate instructions (unary FP ops). FRE, FRSQRTE, @@ -73,10 +90,18 @@ namespace llvm { /// XXINSERT, + /// XXREVERSE - The PPC VSX reverse instruction + /// + XXREVERSE, + /// VECSHL - The PPC VSX shift left instruction /// VECSHL, + /// XXPERMDI - The PPC XXPERMDI instruction + /// + XXPERMDI, + /// The CMPB instruction (takes two operands of i32 or i64). CMPB, @@ -104,9 +129,13 @@ namespace llvm { /// at function entry, used for PIC code. 
GlobalBaseReg, - /// These nodes represent the 32-bit PPC shifts that operate on 6-bit - /// shift amounts. These nodes are generated by the multi-precision shift - /// code. + /// These nodes represent PPC shifts. + /// + /// For scalar types, only the last `n + 1` bits of the shift amounts + /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc. + /// for exact behaviors. + /// + /// For vector types, only the last n bits are used. See vsld. SRL, SRA, SHL, /// The combination of sra[wd]i and addze used to implemented signed @@ -398,10 +427,12 @@ namespace llvm { /// the last operand. TOC_ENTRY }; - } + + } // end namespace PPCISD /// Define some predicates that are used for node matching. namespace PPC { + /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUHUM instruction. bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, @@ -431,7 +462,32 @@ namespace llvm { /// a VMRGEW or VMRGOW instruction bool isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, unsigned ShuffleKind, SelectionDAG &DAG); - + /// isXXSLDWIShuffleMask - Return true if this is a shuffle mask suitable + /// for a XXSLDWI instruction. + bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE); + + /// isXXBRHShuffleMask - Return true if this is a shuffle mask suitable + /// for a XXBRH instruction. + bool isXXBRHShuffleMask(ShuffleVectorSDNode *N); + + /// isXXBRWShuffleMask - Return true if this is a shuffle mask suitable + /// for a XXBRW instruction. + bool isXXBRWShuffleMask(ShuffleVectorSDNode *N); + + /// isXXBRDShuffleMask - Return true if this is a shuffle mask suitable + /// for a XXBRD instruction. + bool isXXBRDShuffleMask(ShuffleVectorSDNode *N); + + /// isXXBRQShuffleMask - Return true if this is a shuffle mask suitable + /// for a XXBRQ instruction. 
+ bool isXXBRQShuffleMask(ShuffleVectorSDNode *N); + + /// isXXPERMDIShuffleMask - Return true if this is a shuffle mask suitable + /// for a XXPERMDI instruction. + bool isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE); + /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the /// shift amount, otherwise return -1. int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, @@ -465,7 +521,8 @@ namespace llvm { /// If this is a qvaligni shuffle mask, return the shift /// amount, otherwise return -1. int isQVALIGNIShuffleMask(SDNode *N); - } + + } // end namespace PPC class PPCTargetLowering : public TargetLowering { const PPCSubtarget &Subtarget; @@ -492,6 +549,7 @@ namespace llvm { return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } + bool useSoftFloat() const override; MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { @@ -514,6 +572,10 @@ namespace llvm { return true; } + bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { + return VT.isScalarInteger(); + } + bool supportSplitCSR(MachineFunction *MF) const override { return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && @@ -554,7 +616,7 @@ namespace llvm { /// is not better represented as reg+reg. If Aligned is true, only accept /// displacements suitable for STD and friends, i.e. multiples of 4. bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, - SelectionDAG &DAG, bool Aligned) const; + SelectionDAG &DAG, unsigned Alignment) const; /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. 
@@ -585,8 +647,8 @@ namespace llvm { SelectionDAG &DAG) const override; void computeKnownBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, + KnownBits &Known, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const override; @@ -596,10 +658,10 @@ namespace llvm { return true; } - Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; - Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; + Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, @@ -694,6 +756,10 @@ namespace llvm { bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + bool convertSelectOfConstantsToMath() const override { + return true; + } + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, @@ -785,15 +851,13 @@ namespace llvm { SDValue Chain; SDValue ResChain; MachinePointerInfo MPI; - bool IsDereferenceable; - bool IsInvariant; - unsigned Alignment; + bool IsDereferenceable = false; + bool IsInvariant = false; + unsigned Alignment = 0; AAMDNodes AAInfo; - const MDNode *Ranges; + const MDNode *Ranges = nullptr; - ReuseLoadInfo() - : IsDereferenceable(false), IsInvariant(false), Alignment(0), - Ranges(nullptr) {} + ReuseLoadInfo() = default; MachineMemOperand::Flags MMOFlags() const { MachineMemOperand::Flags F = MachineMemOperand::MONone; @@ -878,6 +942,8 @@ namespace llvm { SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue 
LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; @@ -906,15 +972,13 @@ namespace llvm { const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override; - SDValue - LowerCall(TargetLowering::CallLoweringInfo &CLI, - SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; - bool - CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const override; + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, @@ -978,6 +1042,9 @@ namespace llvm { SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const; SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const; /// ConvertSETCCToSubtract - looks at SETCC that compares ints. 
It replaces /// SETCC with integer subtraction when (1) there is a legal way of doing it @@ -994,14 +1061,16 @@ namespace llvm { CCAssignFn *useFastISelCCs(unsigned Flag) const; SDValue - combineElementTruncationToVectorTruncation(SDNode *N, - DAGCombinerInfo &DCI) const; + combineElementTruncationToVectorTruncation(SDNode *N, + DAGCombinerInfo &DCI) const; }; namespace PPC { + FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo); - } + + } // end namespace PPC bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, @@ -1026,6 +1095,10 @@ namespace llvm { CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State); -} -#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H + bool isIntS16Immediate(SDNode *N, int16_t &Imm); + bool isIntS16Immediate(SDValue Op, int16_t &Imm); + +} // end namespace llvm + +#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index fbec878..e2af5e5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -253,11 +253,11 @@ def LDAT : X_RD5_RS5_IM5<31, 614, (outs g8rc:$rD), (ins g8rc:$rA, u5imm:$FC), Requires<[IsISA3_0]>; } -let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in +let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in def STDCX : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst), "stdcx. 
$rS, $dst", IIC_LdStSTDCX, []>, isDOT; -let mayStore = 1, hasSideEffects = 0 in +let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC), "stdat $rS, $rA, $FC", IIC_LdStStore>, isPPC64, Requires<[IsISA3_0]>; @@ -634,10 +634,19 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS), "extsw", "$rA, $rS", IIC_IntSimple, [(set i64:$rA, (sext i32:$rS))]>, isPPC64; +let isCodeGenOnly = 1 in +def EXTSW_32 : XForm_11<31, 986, (outs gprc:$rA), (ins gprc:$rS), + "extsw $rA, $rS", IIC_IntSimple, + []>, isPPC64; defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH), "sradi", "$rA, $rS, $SH", IIC_IntRotateDI, [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64; +// For fast-isel: +let isCodeGenOnly = 1 in +def SRADI_32 : XSForm_1<31, 413, (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH), + "sradi $rA, $rS, $SH", IIC_IntRotateDI, []>, isPPC64; + defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS), "cntlzd", "$rA, $rS", IIC_IntGeneral, [(set i64:$rA, (ctlz i64:$rS))]>; @@ -674,6 +683,16 @@ def DIVDE : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), "divde $rT, $rA, $rB", IIC_IntDivD, [(set i64:$rT, (int_ppc_divde g8rc:$rA, g8rc:$rB))]>, isPPC64, Requires<[HasExtDiv]>; + +let Predicates = [IsISA3_0] in { +def MODSD : XForm_8<31, 777, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "modsd $rT, $rA, $rB", IIC_IntDivW, + [(set i64:$rT, (srem i64:$rA, i64:$rB))]>; +def MODUD : XForm_8<31, 265, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "modud $rT, $rA, $rB", IIC_IntDivW, + [(set i64:$rT, (urem i64:$rA, i64:$rB))]>; +} + let Defs = [CR0] in def DIVDEo : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), "divde. 
$rT, $rA, $rB", IIC_IntDivD, @@ -721,15 +740,26 @@ defm RLDICL : MDForm_1r<30, 0, // For fast-isel: let isCodeGenOnly = 1 in def RLDICL_32_64 : MDForm_1<30, 0, - (outs g8rc:$rA), + (outs g8rc:$rA), + (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; +// End fast-isel. +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +defm RLDICL_32 : MDForm_1r<30, 0, + (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), - "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + "rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, []>, isPPC64; -// End fast-isel. defm RLDICR : MDForm_1r<30, 1, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, []>, isPPC64; +let isCodeGenOnly = 1 in +def RLDICR_32 : MDForm_1<30, 1, + (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicr $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; defm RLDIC : MDForm_1r<30, 2, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, @@ -942,13 +972,15 @@ def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src), // Support for medium and large code model. let hasSideEffects = 0 in { +let isReMaterializable = 1 in { def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), "#ADDIStocHA", []>, isPPC64; +def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), + "#ADDItocL", []>, isPPC64; +} let mayLoad = 1 in def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), "#LDtocL", []>, isPPC64; -def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), - "#ADDItocL", []>, isPPC64; } // Support for thread-local storage. 
@@ -963,6 +995,10 @@ def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg), [(set i64:$rD, (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>, isPPC64; + +let isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in +def CFENCE8 : Pseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>; + def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g), (ADD8TLS $in, tglobaltlsaddr:$g)>; def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), @@ -977,7 +1013,9 @@ def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), isPPC64; // LR8 is a true define, while the rest of the Defs are clobbers. X3 is // explicitly defined when this op is created, so not mentioned here. -let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, +// This is lowered to BL8_NOP_TLS by the assembly printer, so the size must be +// correct because the branch select pass is relying on it. +let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Size = 8, Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), "#GETtlsADDR", @@ -1082,7 +1120,7 @@ def STDBRX: XForm_8<31, 660, (outs), (ins g8rc:$rS, memrr:$dst), } // Stores with Update (pre-inc). 
-let PPC970_Unit = 2, mayStore = 1 in { +let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in { def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst), "stbu $rS, $dst", IIC_LdStStoreUpd, []>, @@ -1232,6 +1270,10 @@ def : Pat<(srl i64:$rS, i32:$rB), def : Pat<(shl i64:$rS, i32:$rB), (SLD $rS, $rB)>; +// SUBFIC +def : Pat<(sub imm64SExt16:$imm, i64:$in), + (SUBFIC8 $in, imm:$imm)>; + // SHL/SRL def : Pat<(shl i64:$in, (i32 imm:$imm)), (RLDICR $in, imm:$imm, (SHL64 imm:$imm))>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index 5c02274..5465b5f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -407,7 +407,7 @@ def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB), "mtvscr $vB", IIC_LdStLoad, [(int_ppc_altivec_mtvscr v4i32:$vB)]>; -let PPC970_Unit = 2 in { // Loads. +let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { // Loads. def LVEBX: XForm_1<31, 7, (outs vrrc:$vD), (ins memrr:$src), "lvebx $vD, $src", IIC_LdStLoad, [(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>; @@ -434,7 +434,7 @@ def LVSR : XForm_1<31, 38, (outs vrrc:$vD), (ins memrr:$src), [(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>, PPC970_Unit_LSU; -let PPC970_Unit = 2 in { // Stores. +let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { // Stores. def STVEBX: XForm_8<31, 135, (outs), (ins vrrc:$rS, memrr:$dst), "stvebx $rS, $dst", IIC_LdStStore, [(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>; @@ -851,6 +851,10 @@ def V_SETALLONES : VXForm_3<908, (outs vrrc:$vD), (ins), // Additional Altivec Patterns // +// Extended mnemonics +def : InstAlias<"vmr $vD, $vA", (VOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>; +def : InstAlias<"vnot $vD, $vA", (VNOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>; + // Loads. 
def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>; @@ -983,6 +987,16 @@ def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)), (v8i16 (VSLH $vA, $vB))>; def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)), (v4i32 (VSLW $vA, $vB))>; +def : Pat<(v1i128 (shl v1i128:$vA, v1i128:$vB)), + (v1i128 (VSL (VSLO $vA, $vB), (VSPLTB 15, $vB)))>; +def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)), + (v16i8 (VSLB $vA, $vB))>; +def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)), + (v8i16 (VSLH $vA, $vB))>; +def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)), + (v4i32 (VSLW $vA, $vB))>; +def : Pat<(v1i128 (PPCshl v1i128:$vA, v1i128:$vB)), + (v1i128 (VSL (VSLO $vA, $vB), (VSPLTB 15, $vB)))>; def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)), (v16i8 (VSRB $vA, $vB))>; @@ -990,6 +1004,16 @@ def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)), (v8i16 (VSRH $vA, $vB))>; def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)), (v4i32 (VSRW $vA, $vB))>; +def : Pat<(v1i128 (srl v1i128:$vA, v1i128:$vB)), + (v1i128 (VSR (VSRO $vA, $vB), (VSPLTB 15, $vB)))>; +def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)), + (v16i8 (VSRB $vA, $vB))>; +def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)), + (v8i16 (VSRH $vA, $vB))>; +def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)), + (v4i32 (VSRW $vA, $vB))>; +def : Pat<(v1i128 (PPCsrl v1i128:$vA, v1i128:$vB)), + (v1i128 (VSR (VSRO $vA, $vB), (VSPLTB 15, $vB)))>; def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)), (v16i8 (VSRAB $vA, $vB))>; @@ -997,6 +1021,12 @@ def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)), (v8i16 (VSRAH $vA, $vB))>; def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)), (v4i32 (VSRAW $vA, $vB))>; +def : Pat<(v16i8 (PPCsra v16i8:$vA, v16i8:$vB)), + (v16i8 (VSRAB $vA, $vB))>; +def : Pat<(v8i16 (PPCsra v8i16:$vA, v8i16:$vB)), + (v8i16 (VSRAH $vA, $vB))>; +def : Pat<(v4i32 (PPCsra v4i32:$vA, v4i32:$vB)), + (v4i32 (VSRAW $vA, $vB))>; // Float to integer and integer to float conversions def : Pat<(v4i32 (fp_to_sint v4f32:$vA)), @@ -1068,14 +1098,24 @@ def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB), 
// Vector shifts def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>; def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vsld $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>; + "vsld $vD, $vA, $vB", IIC_VecGeneral, []>; def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vsrd $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (srl v2i64:$vA, v2i64:$vB))]>; + "vsrd $vD, $vA, $vB", IIC_VecGeneral, []>; def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vsrad $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (sra v2i64:$vA, v2i64:$vB))]>; + "vsrad $vD, $vA, $vB", IIC_VecGeneral, []>; + +def : Pat<(v2i64 (shl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSLD $vA, $vB))>; +def : Pat<(v2i64 (PPCshl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSLD $vA, $vB))>; +def : Pat<(v2i64 (srl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRD $vA, $vB))>; +def : Pat<(v2i64 (PPCsrl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRD $vA, $vB))>; +def : Pat<(v2i64 (sra v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRAD $vA, $vB))>; +def : Pat<(v2i64 (PPCsra v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRAD $vA, $vB))>; // Vector Integer Arithmetic Instructions let isCommutable = 1 in { diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 2e0b935..e74ba38 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -65,7 +65,9 @@ UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden, void PPCInstrInfo::anchor() {} PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI) - : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), + : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, + /* CatchRetOpcode */ -1, + STI.isPPC64() ? 
PPC::BLR8 : PPC::BLR), Subtarget(STI), RI(STI.getTargetMachine()) {} /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for @@ -290,6 +292,29 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, return 0; } +// For opcodes with the ReMaterializable flag set, this function is called to +// verify the instruction is really rematable. +bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, + AliasAnalysis *AA) const { + switch (MI.getOpcode()) { + default: + // This function should only be called for opcodes with the ReMaterializable + // flag set. + llvm_unreachable("Unknown rematerializable operation!"); + break; + case PPC::LI: + case PPC::LI8: + case PPC::LIS: + case PPC::LIS8: + case PPC::QVGPCI: + case PPC::ADDIStocHA: + case PPC::ADDItocL: + case PPC::LOAD_STACK_GUARD: + return true; + } + return false; +} + unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { // Note: This list must be kept consistent with StoreRegToStackSlot. @@ -438,8 +463,8 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(Opcode)); } -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void PPCInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop. +void PPCInstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(PPC::NOP); } @@ -662,12 +687,14 @@ unsigned PPCInstrInfo::insertBranch(MachineBasicBlock &MBB, (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : (isPPC64 ? 
PPC::BDZ8 : PPC::BDZ))).addMBB(TBB); else if (Cond[0].getImm() == PPC::PRED_BIT_SET) - BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB); + BuildMI(&MBB, DL, get(PPC::BC)).add(Cond[1]).addMBB(TBB); else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET) - BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB); + BuildMI(&MBB, DL, get(PPC::BCn)).add(Cond[1]).addMBB(TBB); else // Conditional branch BuildMI(&MBB, DL, get(PPC::BCC)) - .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB); + .addImm(Cond[0].getImm()) + .add(Cond[1]) + .addMBB(TBB); return 1; } @@ -677,12 +704,14 @@ unsigned PPCInstrInfo::insertBranch(MachineBasicBlock &MBB, (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB); else if (Cond[0].getImm() == PPC::PRED_BIT_SET) - BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB); + BuildMI(&MBB, DL, get(PPC::BC)).add(Cond[1]).addMBB(TBB); else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET) - BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB); + BuildMI(&MBB, DL, get(PPC::BCn)).add(Cond[1]).addMBB(TBB); else BuildMI(&MBB, DL, get(PPC::BCC)) - .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB); + .addImm(Cond[0].getImm()) + .add(Cond[1]) + .addMBB(TBB); BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB); return 2; } @@ -692,9 +721,6 @@ bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { - if (!Subtarget.hasISEL()) - return false; - if (Cond.size() != 2) return false; @@ -736,9 +762,6 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB, assert(Cond.size() == 2 && "PPC branch conditions have two components!"); - assert(Subtarget.hasISEL() && - "Cannot insert select on target without ISEL support"); - // Get the register classes. 
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *RC = @@ -1493,7 +1516,7 @@ bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI, return Found; } -bool PPCInstrInfo::isPredicable(MachineInstr &MI) const { +bool PPCInstrInfo::isPredicable(const MachineInstr &MI) const { unsigned OpC = MI.getOpcode(); switch (OpC) { default: @@ -1533,6 +1556,8 @@ bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, case PPC::FCMPUD: SrcReg = MI.getOperand(1).getReg(); SrcReg2 = MI.getOperand(2).getReg(); + Value = 0; + Mask = 0; return true; } } @@ -1591,9 +1616,12 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // We can perform this optimization, equality only, if MI is // zero-extending. + // FIXME: Other possible target instructions include ANDISo and + // RLWINM aliases, such as ROTRWI, EXTLWI, SLWI and SRWI. if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo || MIOpC == PPC::SLW || MIOpC == PPC::SLWo || MIOpC == PPC::SRW || MIOpC == PPC::SRWo || + MIOpC == PPC::ANDIo || isZeroExtendingRotate) { noSub = true; equalityOnly = true; @@ -1607,8 +1635,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (equalityOnly) { // We need to check the uses of the condition register in order to reject // non-equality comparisons. 
- for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg), - IE = MRI->use_instr_end(); I != IE; ++I) { + for (MachineRegisterInfo::use_instr_iterator + I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end(); + I != IE; ++I) { MachineInstr *UseMI = &*I; if (UseMI->getOpcode() == PPC::BCC) { unsigned Pred = UseMI->getOperand(0).getImm(); @@ -1630,8 +1659,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, for (MachineBasicBlock::iterator EL = CmpInstr.getParent()->end(); I != EL; ++I) { bool FoundUse = false; - for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg), - JE = MRI->use_instr_end(); J != JE; ++J) + for (MachineRegisterInfo::use_instr_iterator + J = MRI->use_instr_begin(CRReg), JE = MRI->use_instr_end(); + J != JE; ++J) if (&*J == &*I) { FoundUse = true; break; @@ -1641,6 +1671,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, break; } + SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate; + SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate; + // There are two possible candidates which can be changed to set CR[01]. // One is MI, the other is a SUB instruction. // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1). @@ -1652,9 +1685,37 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // same BB as the comparison. This is to allow the check below to avoid calls // (and other explicit clobbers); instead we should really check for these // more explicitly (in at least a few predecessors). - else if (MI->getParent() != CmpInstr.getParent() || Value != 0) { - // PPC does not have a record-form SUBri. + else if (MI->getParent() != CmpInstr.getParent()) return false; + else if (Value != 0) { + // The record-form instructions set CR bit based on signed comparison against 0. + // We try to convert a compare against 1 or -1 into a compare against 0. 
+ bool Success = false; + if (!equalityOnly && MRI->hasOneUse(CRReg)) { + MachineInstr *UseMI = &*MRI->use_instr_begin(CRReg); + if (UseMI->getOpcode() == PPC::BCC) { + PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm(); + int16_t Immed = (int16_t)Value; + + if (Immed == -1 && Pred == PPC::PRED_GT) { + // We convert "greater than -1" into "greater than or equal to 0", + // since we are assuming signed comparison by !equalityOnly + PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)), + PPC::PRED_GE)); + Success = true; + } + else if (Immed == 1 && Pred == PPC::PRED_LT) { + // We convert "less than 1" into "less than or equal to 0". + PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)), + PPC::PRED_LE)); + Success = true; + } + } + } + + // PPC does not have a record-form SUBri. + if (!Success) + return false; } // Search for Sub. @@ -1720,15 +1781,14 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (NewOpC == -1) return false; - SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate; - SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate; - // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP // needs to be updated to be based on SUB. Push the condition code // operands to OperandsToUpdate. If it is safe to remove CmpInstr, the // condition code of these operands will be modified. + // Here, Value == 0 means we haven't converted comparison against 1 or -1 to + // comparison against 0, which may modify predicate. bool ShouldSwap = false; - if (Sub) { + if (Sub && Value == 0) { ShouldSwap = SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && Sub->getOperand(2).getReg() == SrcReg; @@ -1765,6 +1825,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, } else // We need to abort on a user we don't understand. 
return false; } + assert(!(Value != 0 && ShouldSwap) && + "Non-zero immediate support and ShouldSwap" + "may conflict in updating predicate"); // Create a new virtual register to hold the value of the CR set by the // record-form instruction. If the instruction was not previously in @@ -1836,8 +1899,7 @@ unsigned PPCInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { PatchPointOpers Opers(&MI); return Opers.getNumPatchBytes(); } else { - const MCInstrDesc &Desc = get(Opcode); - return Desc.getSize(); + return get(Opcode).getSize(); } } @@ -1874,6 +1936,8 @@ PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { } bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + auto &MBB = *MI.getParent(); + auto DL = MI.getDebugLoc(); switch (MI.getOpcode()) { case TargetOpcode::LOAD_STACK_GUARD: { assert(Subtarget.isTargetLinux() && @@ -1892,6 +1956,8 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case PPC::DFSTOREf64: { assert(Subtarget.hasP9Vector() && "Invalid D-Form Pseudo-ops on non-P9 target."); + assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() && + "D-form op must have register and immediate operands"); unsigned UpperOpcode, LowerOpcode; switch (MI.getOpcode()) { case PPC::DFLOADf32: @@ -1921,6 +1987,17 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(Opcode)); return true; } + case PPC::CFENCE8: { + auto Val = MI.getOperand(0).getReg(); + BuildMI(MBB, MI, DL, get(PPC::CMPD), PPC::CR7).addReg(Val).addReg(Val); + BuildMI(MBB, MI, DL, get(PPC::CTRL_DEP)) + .addImm(PPC::PRED_NE_MINUS) + .addReg(PPC::CR7) + .addImm(1); + MI.setDesc(get(PPC::ISYNC)); + MI.RemoveOperand(0); + return true; + } } return false; } @@ -1931,3 +2008,7 @@ PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const { return &PPC::VSRCRegClass; return RC; } + +int PPCInstrInfo::getRecordFormOpcode(unsigned Opcode) { + return PPC::getRecordFormOpcode(Opcode); +} diff --git 
a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 32b2f00..b0629c8 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -162,6 +162,8 @@ public: unsigned &SubIdx) const override; unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; + bool isReallyTriviallyReMaterializable(const MachineInstr &MI, + AliasAnalysis *AA) const override; unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; @@ -253,7 +255,7 @@ public: bool DefinesPredicate(MachineInstr &MI, std::vector<MachineOperand> &Pred) const override; - bool isPredicable(MachineInstr &MI) const override; + bool isPredicable(const MachineInstr &MI) const override; // Comparison optimization. @@ -269,7 +271,7 @@ public: /// unsigned getInstSizeInBytes(const MachineInstr &MI) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; + void getNoop(MCInst &NopInst) const override; std::pair<unsigned, unsigned> decomposeMachineOperandsTargetFlags(unsigned TF) const override; @@ -290,6 +292,7 @@ public: return Reg >= PPC::V0 && Reg <= PPC::V31; } const TargetRegisterClass *updatedRC(const TargetRegisterClass *RC) const; + static int getRecordFormOpcode(unsigned Opcode); }; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td index f615cc7..dd7fc26 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -32,8 +32,12 @@ def SDT_PPCstxsix : SDTypeProfile<0, 3, [ def SDT_PPCVexts : SDTypeProfile<1, 2, [ SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2> ]>; +def SDT_PPCSExtVElems : SDTypeProfile<1, 1, [ + SDTCisVec<0>, SDTCisVec<1> +]>; -def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_PPCCallSeqEnd : 
SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; def SDT_PPCvperm : SDTypeProfile<1, 3, [ @@ -45,13 +49,21 @@ def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>, ]>; def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>, - SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> + SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3> ]>; def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> ]>; +def SDT_PPCVecReverse: SDTypeProfile<1, 1, [ SDTCisVec<0>, + SDTCisVec<1> +]>; + +def SDT_PPCxxpermdi: SDTypeProfile<1, 3, [ SDTCisVec<0>, + SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> +]>; + def SDT_PPCvcmp : SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32> ]>; @@ -114,14 +126,15 @@ def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>; def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx, [SDNPHasChain, SDNPMayStore]>; def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx, - [SDNPHasChain, SDNPMayLoad]>; + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx, - [SDNPHasChain, SDNPMayLoad]>; + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx, [SDNPHasChain, SDNPMayLoad]>; def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix, [SDNPHasChain, SDNPMayStore]>; def PPCVexts : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>; +def PPCSExtVElems : SDNode<"PPCISD::SExtVElems", SDT_PPCSExtVElems, []>; // Extract FPSCR (not modeled at the DAG level). 
def PPCmffs : SDNode<"PPCISD::MFFS", @@ -169,6 +182,8 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>; def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>; +def PPCxxreverse : SDNode<"PPCISD::XXREVERSE", SDT_PPCVecReverse, []>; +def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>; @@ -243,7 +258,7 @@ def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr, [SDNPHasChain, SDNPOptInGlue]>; def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx, - [SDNPHasChain, SDNPMayLoad]>; + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx, [SDNPHasChain, SDNPMayStore]>; @@ -390,6 +405,25 @@ def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ return cast<LoadSDNode>(N)->getAlignment() < 4; }]>; +// This is a somewhat weaker condition than actually checking for 16-byte +// alignment. It is simply checking that the displacement can be represented +// as an immediate that is a multiple of 16 (i.e. the requirements for DQ-Form +// instructions). +def quadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isOffsetMultipleOf(N, 16); +}]>; +def quadwOffsetStore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isOffsetMultipleOf(N, 16); +}]>; +def nonQuadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return !isOffsetMultipleOf(N, 16); +}]>; +def nonQuadwOffsetStore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return !isOffsetMultipleOf(N, 16); +}]>; + //===----------------------------------------------------------------------===// // PowerPC Flag Definitions. 
@@ -770,9 +804,10 @@ def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned. } // A single-register address. This is used with the SjLj -// pseudo-instructions. +// pseudo-instructions which tranlates to LD/LWZ. These instructions requires +// G8RC_NOX0 registers. def memr : Operand<iPTR> { - let MIOperandInfo = (ops ptr_rc:$ptrreg); + let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg); } def PPCTLSRegOperand : AsmOperandClass { let Name = "TLSReg"; let PredicateMethod = "isTLSReg"; @@ -799,7 +834,8 @@ def pred : Operand<OtherVT> { def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>; def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>; -def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" +def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" +def iqaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv" // The address in a single register. This is used with the SjLj // pseudo-instructions. @@ -1098,9 +1134,11 @@ multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, let hasCtrlDep = 1 in { let Defs = [R1], Uses = [R1] in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt", - [(callseq_start timm:$amt)]>; -def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), + "#ADJCALLSTACKDOWN $amt1 $amt2", + [(callseq_start timm:$amt1, timm:$amt2)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), + "#ADJCALLSTACKUP $amt1 $amt2", [(callseq_end timm:$amt1, timm:$amt2)]>; } @@ -1219,9 +1257,15 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { // FIXME: should be able to write a pattern for PPCcondbranch, but can't use // a two-value operand where a dag node expects two operands. 
:( let isCodeGenOnly = 1 in { - def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst), - "b${cond:cc}${cond:pm} ${cond:reg}, $dst" - /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>; + class BCC_class : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst), + "b${cond:cc}${cond:pm} ${cond:reg}, $dst" + /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>; + def BCC : BCC_class; + + // The same as BCC, except that it's not a terminator. Used for introducing + // control flow dependency without creating new blocks. + let isTerminator = 0 in def CTRL_DEP : BCC_class; + def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst), "b${cond:cc}a${cond:pm} ${cond:reg}, $dst">; @@ -1648,7 +1692,7 @@ let usesCustomInserter = 1 in { } // Instructions to support atomic operations -let mayLoad = 1, hasSideEffects = 0 in { +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in { def LBARX : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src), "lbarx $rD, $src", IIC_LdStLWARX, []>, Requires<[HasPartwordAtomics]>; @@ -1681,7 +1725,7 @@ def LWAT : X_RD5_RS5_IM5<31, 582, (outs gprc:$rD), (ins gprc:$rA, u5imm:$FC), Requires<[IsISA3_0]>; } -let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in { +let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in { def STBCX : XForm_1<31, 694, (outs), (ins gprc:$rS, memrr:$dst), "stbcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT, Requires<[HasPartwordAtomics]>; @@ -1694,7 +1738,7 @@ def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst), "stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT; } -let mayStore = 1, hasSideEffects = 0 in +let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in def STWAT : X_RD5_RS5_IM5<31, 710, (outs), (ins gprc:$rS, gprc:$rA, u5imm:$FC), "stwat $rS, $rA, $FC", IIC_LdStStore>, Requires<[IsISA3_0]>; @@ -1740,7 +1784,7 @@ def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src), // Unindexed (r+i) Loads with Update (preinc). 
-let mayLoad = 1, hasSideEffects = 0 in { +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in { def LBZU : DForm_1<35, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), "lbzu $rD, $addr", IIC_LdStLoadUpd, []>, RegConstraint<"$addr.reg = $ea_result">, @@ -1813,7 +1857,7 @@ def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), // Indexed (r+r) Loads. // -let PPC970_Unit = 2 in { +let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { def LBZX : XForm_1<31, 87, (outs gprc:$rD), (ins memrr:$src), "lbzx $rD, $src", IIC_LdStLoad, [(set i32:$rD, (zextloadi8 xaddr:$src))]>; @@ -1827,8 +1871,6 @@ def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src), def LWZX : XForm_1<31, 23, (outs gprc:$rD), (ins memrr:$src), "lwzx $rD, $src", IIC_LdStLoad, [(set i32:$rD, (load xaddr:$src))]>; - - def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src), "lhbrx $rD, $src", IIC_LdStLoad, [(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>; @@ -1860,7 +1902,7 @@ def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src), // // Unindexed (r+i) Stores. -let PPC970_Unit = 2 in { +let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { def STB : DForm_1<38, (outs), (ins gprc:$rS, memri:$src), "stb $rS, $src", IIC_LdStStore, [(truncstorei8 i32:$rS, iaddr:$src)]>; @@ -1879,7 +1921,7 @@ def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst), } // Unindexed (r+i) Stores with Update (preinc). -let PPC970_Unit = 2, mayStore = 1 in { +let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { def STBU : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst), "stbu $rS, $dst", IIC_LdStStoreUpd, []>, RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; @@ -1948,7 +1990,7 @@ def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst), } // Indexed (r+r) Stores with Update (preinc). 
-let PPC970_Unit = 2, mayStore = 1 in { +let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst), "stbux $rS, $dst", IIC_LdStStoreUpd, []>, RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, @@ -2531,6 +2573,14 @@ let Uses = [RM] in { "mffs. $rT", IIC_IntMFFS, []>, isDOT; } +let Predicates = [IsISA3_0] in { +def MODSW : XForm_8<31, 779, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "modsw $rT, $rA, $rB", IIC_IntDivW, + [(set i32:$rT, (srem i32:$rA, i32:$rB))]>; +def MODUW : XForm_8<31, 267, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "moduw $rT, $rA, $rB", IIC_IntDivW, + [(set i32:$rT, (urem i32:$rA, i32:$rB))]>; +} let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations. // XO-Form instructions. Arithmetic instructions that can set overflow bit @@ -4164,6 +4214,8 @@ def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0 def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; +def : InstAlias<"clrldi $rA, $rS, $n", + (RLDICL_32 gprc:$rA, gprc:$rS, 0, u6imm:$n)>; def : InstAlias<"clrldi. 
$rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b", @@ -4422,3 +4474,190 @@ def MSGSYNC : XForm_0<31, 886, (outs), (ins), "msgsync", IIC_SprMSGSYNC, []>; def STOP : XForm_0<19, 370, (outs), (ins), "stop", IIC_SprSTOP, []>; } // IsISA3_0 + +// Fast 32-bit reverse bits algorithm: +// Step 1: 1-bit swap (swap odd 1-bit and even 1-bit): +// n = ((n >> 1) & 0x55555555) | ((n << 1) & 0xAAAAAAAA); +// Step 2: 2-bit swap (swap odd 2-bit and even 2-bit): +// n = ((n >> 2) & 0x33333333) | ((n << 2) & 0xCCCCCCCC); +// Step 3: 4-bit swap (swap odd 4-bit and even 4-bit): +// n = ((n >> 4) & 0x0F0F0F0F) | ((n << 4) & 0xF0F0F0F0); +// Step 4: byte reverse (Suppose n = [B1,B2,B3,B4]): +// Step 4.1: Put B4,B2 in the right position (rotate left 3 bytes): +// n' = (n rotl 24); After which n' = [B4, B1, B2, B3] +// Step 4.2: Insert B3 to the right position: +// n' = rlwimi n', n, 8, 8, 15; After which n' = [B4, B3, B2, B3] +// Step 4.3: Insert B1 to the right position: +// n' = rlwimi n', n, 8, 24, 31; After which n' = [B4, B3, B2, B1] +def MaskValues { + dag Lo1 = (ORI (LIS 0x5555), 0x5555); + dag Hi1 = (ORI (LIS 0xAAAA), 0xAAAA); + dag Lo2 = (ORI (LIS 0x3333), 0x3333); + dag Hi2 = (ORI (LIS 0xCCCC), 0xCCCC); + dag Lo4 = (ORI (LIS 0x0F0F), 0x0F0F); + dag Hi4 = (ORI (LIS 0xF0F0), 0xF0F0); +} + +def Shift1 { + dag Right = (RLWINM $A, 31, 1, 31); + dag Left = (RLWINM $A, 1, 0, 30); +} + +def Swap1 { + dag Bit = (OR (AND Shift1.Right, MaskValues.Lo1), + (AND Shift1.Left, MaskValues.Hi1)); +} + +def Shift2 { + dag Right = (RLWINM Swap1.Bit, 30, 2, 31); + dag Left = (RLWINM Swap1.Bit, 2, 0, 29); +} + +def Swap2 { + dag Bits = (OR (AND Shift2.Right, MaskValues.Lo2), + (AND Shift2.Left, MaskValues.Hi2)); +} + +def Shift4 { + dag Right = (RLWINM Swap2.Bits, 28, 4, 31); + dag Left = (RLWINM Swap2.Bits, 4, 0, 27); +} + +def Swap4 { + dag Bits = (OR (AND Shift4.Right, MaskValues.Lo4), + (AND Shift4.Left, MaskValues.Hi4)); +} + +def 
Rotate { + dag Left3Bytes = (RLWINM Swap4.Bits, 24, 0, 31); +} + +def RotateInsertByte3 { + dag Left = (RLWIMI Rotate.Left3Bytes, Swap4.Bits, 8, 8, 15); +} + +def RotateInsertByte1 { + dag Left = (RLWIMI RotateInsertByte3.Left, Swap4.Bits, 8, 24, 31); +} + +def : Pat<(i32 (bitreverse i32:$A)), + (RLDICL_32 RotateInsertByte1.Left, 0, 32)>; + +// Fast 64-bit reverse bits algorithm: +// Step 1: 1-bit swap (swap odd 1-bit and even 1-bit): +// n = ((n >> 1) & 0x5555555555555555) | ((n << 1) & 0xAAAAAAAAAAAAAAAA); +// Step 2: 2-bit swap (swap odd 2-bit and even 2-bit): +// n = ((n >> 2) & 0x3333333333333333) | ((n << 2) & 0xCCCCCCCCCCCCCCCC); +// Step 3: 4-bit swap (swap odd 4-bit and even 4-bit): +// n = ((n >> 4) & 0x0F0F0F0F0F0F0F0F) | ((n << 4) & 0xF0F0F0F0F0F0F0F0); +// Step 4: byte reverse (Suppose n = [B1,B2,B3,B4,B5,B6,B7,B8]): +// Apply the same byte reverse algorithm mentioned above for the fast 32-bit +// reverse to both the high 32 bit and low 32 bit of the 64 bit value. And +// then OR them together to get the final result. 
+def MaskValues64 { + dag Lo1 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo1, sub_32)); + dag Hi1 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi1, sub_32)); + dag Lo2 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo2, sub_32)); + dag Hi2 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi2, sub_32)); + dag Lo4 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo4, sub_32)); + dag Hi4 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi4, sub_32)); +} + +def DWMaskValues { + dag Lo1 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo1, 32, 31), 0x5555), 0x5555); + dag Hi1 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi1, 32, 31), 0xAAAA), 0xAAAA); + dag Lo2 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo2, 32, 31), 0x3333), 0x3333); + dag Hi2 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi2, 32, 31), 0xCCCC), 0xCCCC); + dag Lo4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo4, 32, 31), 0x0F0F), 0x0F0F); + dag Hi4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi4, 32, 31), 0xF0F0), 0xF0F0); +} + +def DWShift1 { + dag Right = (RLDICL $A, 63, 1); + dag Left = (RLDICR $A, 1, 62); +} + +def DWSwap1 { + dag Bit = (OR8 (AND8 DWShift1.Right, DWMaskValues.Lo1), + (AND8 DWShift1.Left, DWMaskValues.Hi1)); +} + +def DWShift2 { + dag Right = (RLDICL DWSwap1.Bit, 62, 2); + dag Left = (RLDICR DWSwap1.Bit, 2, 61); +} + +def DWSwap2 { + dag Bits = (OR8 (AND8 DWShift2.Right, DWMaskValues.Lo2), + (AND8 DWShift2.Left, DWMaskValues.Hi2)); +} + +def DWShift4 { + dag Right = (RLDICL DWSwap2.Bits, 60, 4); + dag Left = (RLDICR DWSwap2.Bits, 4, 59); +} + +def DWSwap4 { + dag Bits = (OR8 (AND8 DWShift4.Right, DWMaskValues.Lo4), + (AND8 DWShift4.Left, DWMaskValues.Hi4)); +} + +// Bit swap is done, now start byte swap. 
+def DWExtractLo32 { + dag SubReg = (i32 (EXTRACT_SUBREG DWSwap4.Bits, sub_32)); +} + +def DWRotateLo32 { + dag Left24 = (RLWINM DWExtractLo32.SubReg, 24, 0, 31); +} + +def DWLo32RotateInsertByte3 { + dag Left = (RLWIMI DWRotateLo32.Left24, DWExtractLo32.SubReg, 8, 8, 15); +} + +// Lower 32 bits in the right order +def DWLo32RotateInsertByte1 { + dag Left = + (RLWIMI DWLo32RotateInsertByte3.Left, DWExtractLo32.SubReg, 8, 24, 31); +} + +def ExtendLo32 { + dag To64Bit = + (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + DWLo32RotateInsertByte1.Left, sub_32)); +} + +def DWShiftHi32 { // SRDI DWSwap4.Bits, 32) + dag ToLo32 = (RLDICL DWSwap4.Bits, 32, 32); +} + +def DWExtractHi32 { + dag SubReg = (i32 (EXTRACT_SUBREG DWShiftHi32.ToLo32, sub_32)); +} + +def DWRotateHi32 { + dag Left24 = (RLWINM DWExtractHi32.SubReg, 24, 0, 31); +} + +def DWHi32RotateInsertByte3 { + dag Left = (RLWIMI DWRotateHi32.Left24, DWExtractHi32.SubReg, 8, 8, 15); +} + +// High 32 bits in the right order, but in the low 32-bit position +def DWHi32RotateInsertByte1 { + dag Left = + (RLWIMI DWHi32RotateInsertByte3.Left, DWExtractHi32.SubReg, 8, 24, 31); +} + +def ExtendHi32 { + dag To64Bit = + (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + DWHi32RotateInsertByte1.Left, sub_32)); +} + +def DWShiftLo32 { // SLDI ExtendHi32.To64Bit, 32 + dag ToHi32 = (RLDICR ExtendHi32.To64Bit, 32, 31); +} + +def : Pat<(i64 (bitreverse i64:$A)), + (OR8 DWShiftLo32.ToHi32, ExtendLo32.To64Bit)>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 0d9e345..942e8b3 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -62,7 +62,7 @@ def SDTVecConv : SDTypeProfile<1, 2, [ ]>; def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x, - [SDNPHasChain, SDNPMayLoad]>; + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x, [SDNPHasChain, SDNPMayStore]>; def 
PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>; @@ -117,7 +117,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects. let Uses = [RM] in { // Load indexed instructions - let mayLoad = 1 in { + let mayLoad = 1, mayStore = 0 in { let CodeSize = 3 in def LXSDX : XX1Form<31, 588, (outs vsfrc:$XT), (ins memrr:$src), @@ -138,11 +138,11 @@ let Uses = [RM] in { def LXVW4X : XX1Form<31, 780, (outs vsrc:$XT), (ins memrr:$src), "lxvw4x $XT, $src", IIC_LdStLFD, - [(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>; + []>; } // mayLoad // Store indexed instructions - let mayStore = 1 in { + let mayStore = 1, mayLoad = 0 in { let CodeSize = 3 in def STXSDX : XX1Form<31, 716, (outs), (ins vsfrc:$XT, memrr:$dst), @@ -160,7 +160,7 @@ let Uses = [RM] in { def STXVW4X : XX1Form<31, 908, (outs), (ins vsrc:$XT, memrr:$dst), "stxvw4x $XT, $dst", IIC_LdStSTFD, - [(store v4i32:$XT, xoaddr:$dst)]>; + []>; } } // mayStore @@ -843,7 +843,9 @@ let Uses = [RM] in { def XXPERMDI : XX3Form_2<60, 10, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM), - "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>; + "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, + [(set v2i64:$XT, (PPCxxpermdi v2i64:$XA, v2i64:$XB, + imm32SExt16:$DM))]>; let isCodeGenOnly = 1 in def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM), "xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>; @@ -1041,8 +1043,6 @@ let Predicates = [HasVSX, HasOnlySwappingMemOps] in { // Stores. 
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), - (STXVW4X $rS, xoaddr:$dst)>; def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst), @@ -1053,8 +1053,12 @@ let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in { def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVW4X xoaddr:$src)>; def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; + def : Pat<(store v4i32:$XT, xoaddr:$dst), (STXVW4X $XT, xoaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), + (STXVW4X $rS, xoaddr:$dst)>; } // Permutes. @@ -1064,6 +1068,10 @@ def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>; def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>; def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>; +// PPCvecshl XT, XA, XA, 2 can be selected to both XXSLDWI XT,XA,XA,2 and +// XXSWAPD XT,XA (i.e. XXPERMDI XT,XA,XA,2), the later one is more profitable. +def : Pat<(v4i32 (PPCvecshl v4i32:$src, v4i32:$src, 2)), (XXPERMDI $src, $src, 2)>; + // Selects. def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)), (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>; @@ -1197,7 +1205,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. 
[(set v4i32:$XT, (or v4i32:$XA, (vnot_ppc v4i32:$XB)))]>; // VSX scalar loads introduced in ISA 2.07 - let mayLoad = 1 in { + let mayLoad = 1, mayStore = 0 in { let CodeSize = 3 in def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src), "lxsspx $XT, $src", IIC_LdStLFD, @@ -1211,7 +1219,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. } // mayLoad // VSX scalar stores introduced in ISA 2.07 - let mayStore = 1 in { + let mayStore = 1, mayLoad = 0 in { let CodeSize = 3 in def STXSSPX : XX1Form<31, 652, (outs), (ins vssrc:$XT, memrr:$dst), "stxsspx $XT, $dst", IIC_LdStSTFD, @@ -1410,6 +1418,11 @@ let Predicates = [HasDirectMove] in { "mfvsrd $rA, $XT", IIC_VecGeneral, [(set i64:$rA, (PPCmfvsr f64:$XT))]>, Requires<[In64BitMode]>; + let isCodeGenOnly = 1 in + def MFVRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vrrc:$XT), + "mfvsrd $rA, $XT", IIC_VecGeneral, + []>, + Requires<[In64BitMode]>; def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT), "mfvsrwz $rA, $XT", IIC_VecGeneral, [(set i32:$rA, (PPCmfvsr f64:$XT))]>; @@ -1429,7 +1442,7 @@ let Predicates = [IsISA3_0, HasDirectMove] in { def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA), "mtvsrws $XT, $rA", IIC_VecGeneral, []>; - def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB), + def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$rA, g8rc:$rB), "mtvsrdd $XT, $rA, $rB", IIC_VecGeneral, []>, Requires<[In64BitMode]>; @@ -1440,6 +1453,13 @@ let Predicates = [IsISA3_0, HasDirectMove] in { } // IsISA3_0, HasDirectMove } // UseVSXReg = 1 +// We want to parse this from asm, but we don't want to emit this as it would +// be emitted with a VSX reg. So leave Emit = 0 here. +def : InstAlias<"mfvrd $rA, $XT", + (MFVRD g8rc:$rA, vrrc:$XT), 0>; +def : InstAlias<"mffprd $rA, $src", + (MFVSRD g8rc:$rA, f8rc:$src)>; + /* Direct moves of various widths from GPR's into VSR's. 
Each move lines the value up into element 0 (both BE and LE). Namely, entities smaller than a doubleword are shifted left and moved for BE. For LE, they're moved, then @@ -1878,8 +1898,100 @@ let Predicates = [IsLittleEndian, HasVSX] in def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>; +def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>; +def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>; + +// Variable index unsigned vector_extract on Power9 +let Predicates = [HasP9Altivec, IsLittleEndian] in { + def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))), + (VEXTUBRX $Idx, $S)>; + + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))), + (VEXTUHRX (RLWINM8 $Idx, 1, 28, 30), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))), + (VEXTUHRX (LI8 0), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))), + (VEXTUHRX (LI8 2), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))), + (VEXTUHRX (LI8 4), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))), + (VEXTUHRX (LI8 6), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))), + (VEXTUHRX (LI8 8), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))), + (VEXTUHRX (LI8 10), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))), + (VEXTUHRX (LI8 12), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))), + (VEXTUHRX (LI8 14), $S)>; + + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))), + (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))), + (VEXTUWRX (LI8 0), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))), + (VEXTUWRX (LI8 4), $S)>; + def : 
Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))), + (VEXTUWRX (LI8 8), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))), + (VEXTUWRX (LI8 12), $S)>; + + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))), + (EXTSW (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))), + (EXTSW (VEXTUWRX (LI8 0), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))), + (EXTSW (VEXTUWRX (LI8 4), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))), + (EXTSW (VEXTUWRX (LI8 8), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))), + (EXTSW (VEXTUWRX (LI8 12), $S))>; +} +let Predicates = [HasP9Altivec, IsBigEndian] in { + def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))), + (VEXTUBLX $Idx, $S)>; + + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))), + (VEXTUHLX (RLWINM8 $Idx, 1, 28, 30), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))), + (VEXTUHLX (LI8 0), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))), + (VEXTUHLX (LI8 2), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))), + (VEXTUHLX (LI8 4), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))), + (VEXTUHLX (LI8 6), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))), + (VEXTUHLX (LI8 8), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))), + (VEXTUHLX (LI8 10), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))), + (VEXTUHLX (LI8 12), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))), + (VEXTUHLX (LI8 14), $S)>; + + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))), + (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))), + (VEXTUWLX (LI8 0), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))), + (VEXTUWLX (LI8 4), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract 
v4i32:$S, 2)))), + (VEXTUWLX (LI8 8), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))), + (VEXTUWLX (LI8 12), $S)>; + + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))), + (EXTSW (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))), + (EXTSW (VEXTUWLX (LI8 0), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))), + (EXTSW (VEXTUWLX (LI8 4), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))), + (EXTSW (VEXTUWLX (LI8 8), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))), + (EXTSW (VEXTUWLX (LI8 12), $S))>; +} let Predicates = [IsLittleEndian, HasDirectMove] in { // v16i8 scalar <-> vector conversions (LE) @@ -2186,7 +2298,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { } // UseVSXReg = 1 // Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a - // seperate pattern so that it can convert the input register class from + // separate pattern so that it can convert the input register class from // VRRC(v8i16) to VSRC. 
def : Pat<(v4f32 (int_ppc_vsx_xvcvhpsp v8i16:$A)), (v4f32 (XVCVHPSP (COPY_TO_REGCLASS $A, VSRC)))>; @@ -2320,6 +2432,16 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc, []>; def XXBRQ : XX2_XT6_XO5_XB6<60, 31, 475, "xxbrq", vsrc, []>; + // Vector Reverse + def : Pat<(v8i16 (PPCxxreverse v8i16 :$A)), + (v8i16 (COPY_TO_REGCLASS (XXBRH (COPY_TO_REGCLASS $A, VSRC)), VRRC))>; + def : Pat<(v4i32 (PPCxxreverse v4i32 :$A)), + (v4i32 (XXBRW $A))>; + def : Pat<(v2i64 (PPCxxreverse v2i64 :$A)), + (v2i64 (XXBRD $A))>; + def : Pat<(v1i128 (PPCxxreverse v1i128 :$A)), + (v1i128 (COPY_TO_REGCLASS (XXBRQ (COPY_TO_REGCLASS $A, VSRC)), VRRC))>; + // Vector Permute def XXPERM : XX3_XT5_XA5_XB5<60, 26, "xxperm" , vsrc, vsrc, vsrc, IIC_VecPerm, []>; @@ -2335,7 +2457,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging. - let mayLoad = 1 in { + let mayLoad = 1, mayStore = 0 in { // Load Vector def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src), "lxv $XT, $src", IIC_LdStLFD, []>, UseVSXReg; @@ -2365,8 +2487,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Load Vector Indexed def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc, - [(set v2f64:$XT, (load xoaddr:$src))]>; - + [(set v2f64:$XT, (load xaddr:$src))]>; // Load Vector (Left-justified) with Length def LXVL : XX1Form<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB), "lxvl $XT, $src, $rB", IIC_LdStLoad, @@ -2383,7 +2504,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging. 
- let mayStore = 1 in { + let mayStore = 1, mayLoad = 0 in { // Store Vector def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst), "stxv $XT, $dst", IIC_LdStSTFD, []>, UseVSXReg; @@ -2416,7 +2537,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Store Vector Indexed def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc, - [(store v2f64:$XT, xoaddr:$dst)]>; + [(store v2f64:$XT, xaddr:$dst)]>; // Store Vector (Left-justified) with Length def STXVL : XX1Form<31, 397, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB), @@ -2484,21 +2605,42 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>; } // IsLittleEndian, HasP9Vector - def : Pat<(v2f64 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v2i64 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v4f32 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v4i32 (load xoaddr:$src)), (LXVX xoaddr:$src)>; + // D-Form Load/Store + def : Pat<(v4i32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4f32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2i64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iqaddr:$src)), (LXV memrix16:$src)>; + + def : Pat<(quadwOffsetStore v4f32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2i64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iqaddr:$dst), + (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iqaddr:$dst), + (STXV $rS, memrix16:$dst)>; + + + def : Pat<(v2f64 (nonQuadwOffsetLoad 
xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v2i64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4f32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>; def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(store v4f32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v4f32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v4i32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), (v4i32 (LXVWSX xoaddr:$src))>; def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), @@ -2650,21 +2792,21 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { let isPseudo = 1 in { def DFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrix:$src), "#DFLOADf32", - [(set f32:$XT, (load iaddr:$src))]>; + [(set f32:$XT, (load ixaddr:$src))]>; def DFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrix:$src), "#DFLOADf64", - [(set f64:$XT, (load iaddr:$src))]>; + [(set f64:$XT, (load ixaddr:$src))]>; def DFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrix:$dst), "#DFSTOREf32", - [(store f32:$XT, iaddr:$dst)]>; + [(store f32:$XT, ixaddr:$dst)]>; def DFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrix:$dst), "#DFSTOREf64", - 
[(store f64:$XT, iaddr:$dst)]>; + [(store f64:$XT, ixaddr:$dst)]>; } - def : Pat<(f64 (extloadf32 iaddr:$src)), - (COPY_TO_REGCLASS (DFLOADf32 iaddr:$src), VSFRC)>; - def : Pat<(f32 (fpround (extloadf32 iaddr:$src))), - (f32 (DFLOADf32 iaddr:$src))>; + def : Pat<(f64 (extloadf32 ixaddr:$src)), + (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>; + def : Pat<(f32 (fpround (extloadf32 ixaddr:$src))), + (f32 (DFLOADf32 ixaddr:$src))>; } // end HasP9Vector, AddedComplexity // Integer extend helper dags 32 -> 64 @@ -2681,6 +2823,58 @@ def DblToFlt { dag B0 = (f32 (fpround (f64 (extractelt v2f64:$B, 0)))); dag B1 = (f32 (fpround (f64 (extractelt v2f64:$B, 1)))); } + +def ByteToWord { + dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8)); + dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8)); + dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8)); + dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8)); + dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 3)), i8)); + dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 7)), i8)); + dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 11)), i8)); + dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 15)), i8)); +} + +def ByteToDWord { + dag LE_A0 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8)); + dag LE_A1 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8)); + dag BE_A0 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v16i8:$A, 7)))), i8)); + dag BE_A1 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v16i8:$A, 15)))), i8)); +} + +def HWordToWord { + dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16)); + dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16)); + dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16)); + dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16)); + dag BE_A0 = (i32 
(sext_inreg (i32 (vector_extract v8i16:$A, 1)), i16)); + dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 3)), i16)); + dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 5)), i16)); + dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 7)), i16)); +} + +def HWordToDWord { + dag LE_A0 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16)); + dag LE_A1 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16)); + dag BE_A0 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v8i16:$A, 3)))), i16)); + dag BE_A1 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v8i16:$A, 7)))), i16)); +} + +def WordToDWord { + dag LE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0)))); + dag LE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2)))); + dag BE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 1)))); + dag BE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 3)))); +} + def FltToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (extloadf32 xoaddr:$A))))); } @@ -2690,9 +2884,15 @@ def FltToUIntLoad { def FltToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A))))); } +def FltToLongLoadP9 { + dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 ixaddr:$A))))); +} def FltToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A))))); } +def FltToULongLoadP9 { + dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ixaddr:$A))))); +} def FltToLong { dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A)))); } @@ -2714,9 +2914,15 @@ def DblToULong { def DblToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A))))); } +def DblToIntLoadP9 { + dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load ixaddr:$A))))); +} def DblToUIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A))))); } +def DblToUIntLoadP9 { + dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load ixaddr:$A))))); +} def DblToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A))))); } @@ 
-2884,19 +3090,19 @@ let AddedComplexity = 400 in { (v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)), (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>; - def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)), + def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; - def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)), + (XSCVDPSXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; + def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; - def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)), + (XSCVDPUXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; + def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS - (DFLOADf32 iaddr:$A), + (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; - def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)), + def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS - (DFLOADf32 iaddr:$A), + (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; } @@ -2921,4 +3127,49 @@ let AddedComplexity = 400 in { (VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC), (COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC))>; } + // P9 Altivec instructions that can be used to build vectors. + // Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td to compete + // with complexities of existing build vector patterns in this file. 
+ let Predicates = [HasP9Altivec, IsLittleEndian] in { + def : Pat<(v2i64 (build_vector WordToDWord.LE_A0, WordToDWord.LE_A1)), + (v2i64 (VEXTSW2D $A))>; + def : Pat<(v2i64 (build_vector HWordToDWord.LE_A0, HWordToDWord.LE_A1)), + (v2i64 (VEXTSH2D $A))>; + def : Pat<(v4i32 (build_vector HWordToWord.LE_A0, HWordToWord.LE_A1, + HWordToWord.LE_A2, HWordToWord.LE_A3)), + (v4i32 (VEXTSH2W $A))>; + def : Pat<(v4i32 (build_vector ByteToWord.LE_A0, ByteToWord.LE_A1, + ByteToWord.LE_A2, ByteToWord.LE_A3)), + (v4i32 (VEXTSB2W $A))>; + def : Pat<(v2i64 (build_vector ByteToDWord.LE_A0, ByteToDWord.LE_A1)), + (v2i64 (VEXTSB2D $A))>; + } + + let Predicates = [HasP9Altivec, IsBigEndian] in { + def : Pat<(v2i64 (build_vector WordToDWord.BE_A0, WordToDWord.BE_A1)), + (v2i64 (VEXTSW2D $A))>; + def : Pat<(v2i64 (build_vector HWordToDWord.BE_A0, HWordToDWord.BE_A1)), + (v2i64 (VEXTSH2D $A))>; + def : Pat<(v4i32 (build_vector HWordToWord.BE_A0, HWordToWord.BE_A1, + HWordToWord.BE_A2, HWordToWord.BE_A3)), + (v4i32 (VEXTSH2W $A))>; + def : Pat<(v4i32 (build_vector ByteToWord.BE_A0, ByteToWord.BE_A1, + ByteToWord.BE_A2, ByteToWord.BE_A3)), + (v4i32 (VEXTSB2W $A))>; + def : Pat<(v2i64 (build_vector ByteToDWord.BE_A0, ByteToDWord.BE_A1)), + (v2i64 (VEXTSB2D $A))>; + } + + let Predicates = [HasP9Altivec] in { + def: Pat<(v2i64 (PPCSExtVElems v16i8:$A)), + (v2i64 (VEXTSB2D $A))>; + def: Pat<(v2i64 (PPCSExtVElems v8i16:$A)), + (v2i64 (VEXTSH2D $A))>; + def: Pat<(v2i64 (PPCSExtVElems v4i32:$A)), + (v2i64 (VEXTSW2D $A))>; + def: Pat<(v4i32 (PPCSExtVElems v16i8:$A)), + (v4i32 (VEXTSB2W $A))>; + def: Pat<(v4i32 (PPCSExtVElems v8i16:$A)), + (v4i32 (VEXTSH2W $A))>; + } } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index 2c3e755..a349fa1 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/Instructions.h" #include 
"llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -72,9 +73,10 @@ namespace { public: static char ID; // Pass ID, replacement for typeid - PPCLoopPreIncPrep() : FunctionPass(ID), TM(nullptr) { + PPCLoopPreIncPrep() : FunctionPass(ID) { initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry()); } + PPCLoopPreIncPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry()); } @@ -93,7 +95,7 @@ namespace { bool rotateLoop(Loop *L); private: - PPCTargetMachine *TM; + PPCTargetMachine *TM = nullptr; DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index e527b01..b310493 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#include "PPC.h" #include "MCTargetDesc/PPCMCExpr.h" +#include "PPC.h" #include "PPCSubtarget.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Twine.h" @@ -148,7 +148,7 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, MCOperand MCOp; switch (MO.getType()) { default: - MI->dump(); + MI->print(errs()); llvm_unreachable("unknown operand type"); case MachineOperand::MO_Register: assert(!MO.getSubReg() && "Subregs should be eliminated!"); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index 2413af3..ff5f17c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -19,9 +19,9 @@ // //===---------------------------------------------------------------------===// -#include "PPCInstrInfo.h" #include "PPC.h" #include 
"PPCInstrBuilder.h" +#include "PPCInstrInfo.h" #include "PPCTargetMachine.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -147,9 +147,9 @@ bool PPCMIPeephole::simplifyCode(void) { << "Optimizing load-and-splat/splat " "to load-and-splat/copy: "); DEBUG(MI.dump()); - BuildMI(MBB, &MI, MI.getDebugLoc(), - TII->get(PPC::COPY), MI.getOperand(0).getReg()) - .addOperand(MI.getOperand(1)); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), + MI.getOperand(0).getReg()) + .add(MI.getOperand(1)); ToErase = &MI; Simplified = true; } @@ -169,9 +169,9 @@ bool PPCMIPeephole::simplifyCode(void) { << "Optimizing splat/swap or splat/splat " "to splat/copy: "); DEBUG(MI.dump()); - BuildMI(MBB, &MI, MI.getDebugLoc(), - TII->get(PPC::COPY), MI.getOperand(0).getReg()) - .addOperand(MI.getOperand(1)); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), + MI.getOperand(0).getReg()) + .add(MI.getOperand(1)); ToErase = &MI; Simplified = true; } @@ -194,9 +194,9 @@ bool PPCMIPeephole::simplifyCode(void) { else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) { DEBUG(dbgs() << "Optimizing swap/swap => copy: "); DEBUG(MI.dump()); - BuildMI(MBB, &MI, MI.getDebugLoc(), - TII->get(PPC::COPY), MI.getOperand(0).getReg()) - .addOperand(DefMI->getOperand(1)); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), + MI.getOperand(0).getReg()) + .add(DefMI->getOperand(1)); ToErase = &MI; Simplified = true; } @@ -251,7 +251,7 @@ bool PPCMIPeephole::simplifyCode(void) { DEBUG(MI.dump()); BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), MI.getOperand(0).getReg()) - .addOperand(MI.getOperand(OpNo)); + .add(MI.getOperand(OpNo)); ToErase = &MI; Simplified = true; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp index 9d91e31..bc2d9a0 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp +++ 
b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp @@ -8,14 +8,13 @@ //===----------------------------------------------------------------------===// #include "PPCMachineFunctionInfo.h" +#include "llvm/ADT/Twine.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCContext.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -void PPCFunctionInfo::anchor() { } +void PPCFunctionInfo::anchor() {} MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const { const DataLayout &DL = MF.getDataLayout(); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h index 4c29aa0..202e100 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" namespace llvm { @@ -26,17 +27,17 @@ class PPCFunctionInfo : public MachineFunctionInfo { /// FramePointerSaveIndex - Frame index of where the old frame pointer is /// stored. Also used as an anchor for instructions that need to be altered /// when using frame pointers (dyna_add, dyna_sub.) - int FramePointerSaveIndex; + int FramePointerSaveIndex = 0; /// ReturnAddrSaveIndex - Frame index of where the return address is stored. /// - int ReturnAddrSaveIndex; + int ReturnAddrSaveIndex = 0; /// Frame index where the old base pointer is stored. - int BasePointerSaveIndex; + int BasePointerSaveIndex = 0; /// Frame index where the old PIC base pointer is stored. - int PICBasePointerSaveIndex; + int PICBasePointerSaveIndex = 0; /// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current /// function. 
This is only valid after the initial scan of the function by @@ -44,54 +45,58 @@ class PPCFunctionInfo : public MachineFunctionInfo { bool MustSaveLR; /// Does this function have any stack spills. - bool HasSpills; + bool HasSpills = false; /// Does this function spill using instructions with only r+r (not r+i) /// forms. - bool HasNonRISpills; + bool HasNonRISpills = false; /// SpillsCR - Indicates whether CR is spilled in the current function. - bool SpillsCR; + bool SpillsCR = false; /// Indicates whether VRSAVE is spilled in the current function. - bool SpillsVRSAVE; + bool SpillsVRSAVE = false; /// LRStoreRequired - The bool indicates whether there is some explicit use of /// the LR/LR8 stack slot that is not obvious from scanning the code. This /// requires that the code generator produce a store of LR to the stack on /// entry, even though LR may otherwise apparently not be used. - bool LRStoreRequired; + bool LRStoreRequired = false; /// This function makes use of the PPC64 ELF TOC base pointer (register r2). - bool UsesTOCBasePtr; + bool UsesTOCBasePtr = false; /// MinReservedArea - This is the frame size that is at least reserved in a /// potential caller (parameter+linkage area). - unsigned MinReservedArea; + unsigned MinReservedArea = 0; /// TailCallSPDelta - Stack pointer delta used when tail calling. Maximum /// amount the stack pointer is adjusted to make the frame bigger for tail /// calls. Used for creating an area before the register spill area. - int TailCallSPDelta; + int TailCallSPDelta = 0; /// HasFastCall - Does this function contain a fast call. Used to determine /// how the caller's stack pointer should be calculated (epilog/dynamicalloc). - bool HasFastCall; + bool HasFastCall = false; /// VarArgsFrameIndex - FrameIndex for start of varargs area. - int VarArgsFrameIndex; + int VarArgsFrameIndex = 0; + /// VarArgsStackOffset - StackOffset for start of stack /// arguments. 
- int VarArgsStackOffset; + + int VarArgsStackOffset = 0; + /// VarArgsNumGPR - Index of the first unused integer /// register for parameter passing. - unsigned VarArgsNumGPR; + unsigned VarArgsNumGPR = 0; + /// VarArgsNumFPR - Index of the first unused double /// register for parameter passing. - unsigned VarArgsNumFPR; + unsigned VarArgsNumFPR = 0; /// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4. - int CRSpillFrameIndex; + int CRSpillFrameIndex = 0; /// If any of CR[2-4] need to be saved in the prologue and restored in the /// epilogue then they are added to this array. This is used for the @@ -102,35 +107,14 @@ class PPCFunctionInfo : public MachineFunctionInfo { MachineFunction &MF; /// Whether this uses the PIC Base register or not. - bool UsesPICBase; + bool UsesPICBase = false; /// True if this function has a subset of CSRs that is handled explicitly via /// copies - bool IsSplitCSR; + bool IsSplitCSR = false; public: - explicit PPCFunctionInfo(MachineFunction &MF) - : FramePointerSaveIndex(0), - ReturnAddrSaveIndex(0), - BasePointerSaveIndex(0), - PICBasePointerSaveIndex(0), - HasSpills(false), - HasNonRISpills(false), - SpillsCR(false), - SpillsVRSAVE(false), - LRStoreRequired(false), - UsesTOCBasePtr(false), - MinReservedArea(0), - TailCallSPDelta(0), - HasFastCall(false), - VarArgsFrameIndex(0), - VarArgsStackOffset(0), - VarArgsNumGPR(0), - VarArgsNumFPR(0), - CRSpillFrameIndex(0), - MF(MF), - UsesPICBase(0), - IsSplitCSR(false) {} + explicit PPCFunctionInfo(MachineFunction &MF) : MF(MF) {} int getFramePointerSaveIndex() const { return FramePointerSaveIndex; } void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; } @@ -211,7 +195,6 @@ public: MCSymbol *getTOCOffsetSymbol() const; }; -} // end of namespace llvm - +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp 
b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index e492014..9207165 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -209,89 +209,84 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // The ZERO register is not really a register, but the representation of r0 // when used in instructions that treat r0 as the constant 0. - Reserved.set(PPC::ZERO); - Reserved.set(PPC::ZERO8); + markSuperRegs(Reserved, PPC::ZERO); // The FP register is also not really a register, but is the representation // of the frame pointer register used by ISD::FRAMEADDR. - Reserved.set(PPC::FP); - Reserved.set(PPC::FP8); + markSuperRegs(Reserved, PPC::FP); // The BP register is also not really a register, but is the representation // of the base pointer register used by setjmp. - Reserved.set(PPC::BP); - Reserved.set(PPC::BP8); + markSuperRegs(Reserved, PPC::BP); // The counter registers must be reserved so that counter-based loops can // be correctly formed (and the mtctr instructions are not DCE'd). - Reserved.set(PPC::CTR); - Reserved.set(PPC::CTR8); + markSuperRegs(Reserved, PPC::CTR); + markSuperRegs(Reserved, PPC::CTR8); - Reserved.set(PPC::R1); - Reserved.set(PPC::LR); - Reserved.set(PPC::LR8); - Reserved.set(PPC::RM); + markSuperRegs(Reserved, PPC::R1); + markSuperRegs(Reserved, PPC::LR); + markSuperRegs(Reserved, PPC::LR8); + markSuperRegs(Reserved, PPC::RM); if (!Subtarget.isDarwinABI() || !Subtarget.hasAltivec()) - Reserved.set(PPC::VRSAVE); + markSuperRegs(Reserved, PPC::VRSAVE); // The SVR4 ABI reserves r2 and r13 if (Subtarget.isSVR4ABI()) { - Reserved.set(PPC::R2); // System-reserved register - Reserved.set(PPC::R13); // Small Data Area pointer register + // We only reserve r2 if we need to use the TOC pointer. If we have no + // explicit uses of the TOC pointer (meaning we're a leaf function with + // no constant-pool loads, etc.) 
and we have no potential uses inside an + // inline asm block, then we can treat r2 has an ordinary callee-saved + // register. + const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + if (!TM.isPPC64() || FuncInfo->usesTOCBasePtr() || MF.hasInlineAsm()) + markSuperRegs(Reserved, PPC::R2); // System-reserved register + markSuperRegs(Reserved, PPC::R13); // Small Data Area pointer register } // On PPC64, r13 is the thread pointer. Never allocate this register. - if (TM.isPPC64()) { - Reserved.set(PPC::R13); - - Reserved.set(PPC::X1); - Reserved.set(PPC::X13); - - if (TFI->needsFP(MF)) - Reserved.set(PPC::X31); - - if (hasBasePointer(MF)) - Reserved.set(PPC::X30); - - // The 64-bit SVR4 ABI reserves r2 for the TOC pointer. - if (Subtarget.isSVR4ABI()) { - // We only reserve r2 if we need to use the TOC pointer. If we have no - // explicit uses of the TOC pointer (meaning we're a leaf function with - // no constant-pool loads, etc.) and we have no potential uses inside an - // inline asm block, then we can treat r2 has an ordinary callee-saved - // register. - const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); - if (FuncInfo->usesTOCBasePtr() || MF.hasInlineAsm()) - Reserved.set(PPC::X2); - else - Reserved.reset(PPC::R2); - } - } + if (TM.isPPC64()) + markSuperRegs(Reserved, PPC::R13); if (TFI->needsFP(MF)) - Reserved.set(PPC::R31); + markSuperRegs(Reserved, PPC::R31); bool IsPositionIndependent = TM.isPositionIndependent(); if (hasBasePointer(MF)) { if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent) - Reserved.set(PPC::R29); + markSuperRegs(Reserved, PPC::R29); else - Reserved.set(PPC::R30); + markSuperRegs(Reserved, PPC::R30); } if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent) - Reserved.set(PPC::R30); + markSuperRegs(Reserved, PPC::R30); // Reserve Altivec registers when Altivec is unavailable. 
if (!Subtarget.hasAltivec()) for (TargetRegisterClass::iterator I = PPC::VRRCRegClass.begin(), IE = PPC::VRRCRegClass.end(); I != IE; ++I) - Reserved.set(*I); + markSuperRegs(Reserved, *I); + assert(checkAllSuperRegsMarked(Reserved)); return Reserved; } +bool PPCRegisterInfo::isCallerPreservedPhysReg(unsigned PhysReg, + const MachineFunction &MF) const { + assert(TargetRegisterInfo::isPhysicalRegister(PhysReg)); + if (TM.isELFv2ABI() && PhysReg == PPC::X2) { + // X2 is guaranteed to be preserved within a function if it is reserved. + // The reason it's reserved is that it's the TOC pointer (and the function + // uses the TOC). In functions where it isn't reserved (i.e. leaf functions + // with no TOC access), we can't claim that it is preserved. + return (getReservedRegs(MF).test(PPC::X2)); + } else { + return false; + } +} + unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const PPCFrameLowering *TFI = getFrameLowering(MF); @@ -394,9 +389,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { - BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) - .addReg(PPC::R31) - .addImm(FrameSize); + if (LP64) + BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), Reg) + .addReg(PPC::X31) + .addImm(FrameSize); + else + BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) + .addReg(PPC::R31) + .addImm(FrameSize); } else if (LP64) { BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg) .addImm(0) @@ -483,8 +483,10 @@ void PPCRegisterInfo::lowerDynamicAreaOffset( const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); unsigned maxCallFrameSize = MFI.getMaxCallFrameSize(); + bool is64Bit = TM.isPPC64(); DebugLoc dl = MI.getDebugLoc(); - BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg()) + BuildMI(MBB, II, dl, TII.get(is64Bit ? 
PPC::LI8 : PPC::LI), + MI.getOperand(0).getReg()) .addImm(maxCallFrameSize); MBB.erase(II); } @@ -752,19 +754,31 @@ bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, return false; } -// Figure out if the offset in the instruction must be a multiple of 4. -// This is true for instructions like "STD". -static bool usesIXAddr(const MachineInstr &MI) { +// If the offset must be a multiple of some value, return what that value is. +static unsigned offsetMinAlign(const MachineInstr &MI) { unsigned OpC = MI.getOpcode(); switch (OpC) { default: - return false; + return 1; case PPC::LWA: case PPC::LWA_32: case PPC::LD: + case PPC::LDU: case PPC::STD: - return true; + case PPC::STDU: + case PPC::DFLOADf32: + case PPC::DFLOADf64: + case PPC::DFSTOREf32: + case PPC::DFSTOREf64: + case PPC::LXSD: + case PPC::LXSSP: + case PPC::STXSD: + case PPC::STXSSP: + return 4; + case PPC::LXV: + case PPC::STXV: + return 16; } } @@ -850,9 +864,6 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum).ChangeToRegister( FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false); - // Figure out if the offset in the instruction is shifted right two bits. - bool isIXAddr = usesIXAddr(MI); - // If the instruction is not present in ImmToIdxMap, then it has no immediate // form (and must be r+r). bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP && @@ -881,7 +892,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // happen in invalid code. 
assert(OpC != PPC::DBG_VALUE && "This should be handled in a target-independent way"); - if (!noImmForm && ((isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) || + if (!noImmForm && ((isInt<16>(Offset) && + ((Offset % offsetMinAlign(MI)) == 0)) || OpC == TargetOpcode::STACKMAP || OpC == TargetOpcode::PATCHPOINT)) { MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); @@ -1074,5 +1086,5 @@ bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm MI->getOpcode() == TargetOpcode::STACKMAP || MI->getOpcode() == TargetOpcode::PATCHPOINT || - (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0)); + (isInt<16>(Offset) && (Offset % offsetMinAlign(*MI)) == 0); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 4a96327..0bbb71f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -83,6 +83,7 @@ public: void adjustStackMapLiveOutMask(uint32_t *Mask) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; + bool isCallerPreservedPhysReg(unsigned PhysReg, const MachineFunction &MF) const override; /// We require the register scavenger. 
bool requiresRegisterScavenging(const MachineFunction &MF) const override { diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td index 8e52da5..79963dd 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td @@ -377,7 +377,7 @@ def P8Itineraries : ProcessorItineraries< InstrStage<1, [P8_FPU1, P8_FPU2]>], [7, 1, 1]>, InstrItinData<IIC_VecPerm , [InstrStage<1, [P8_DU1, P8_DU2], 0>, - InstrStage<1, [P8_FPU2, P8_FPU2]>], + InstrStage<1, [P8_FPU1, P8_FPU2]>], [3, 1, 1]> ]>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td index a9c1bd7..a01995a 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td @@ -260,8 +260,8 @@ let SchedModel = P9Model in { // ***************** Defining Itinerary Class Resources ***************** - def : ItinRW<[P9_DFU_76C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntSimple, - IIC_IntGeneral]>; + def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], + [IIC_IntSimple, IIC_IntGeneral]>; def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], [IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index e8a87e7..ccf0f80 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -220,8 +220,8 @@ bool PPCSubtarget::enableSubRegLiveness() const { return UseSubRegLiveness; } -unsigned char PPCSubtarget::classifyGlobalReference( - const GlobalValue *GV) const { +unsigned char +PPCSubtarget::classifyGlobalReference(const GlobalValue *GV) const { // Note that currently we don't generate non-pic references. // If a caller wants that, this will have to be updated. 
@@ -229,23 +229,9 @@ unsigned char PPCSubtarget::classifyGlobalReference( if (TM.getCodeModel() == CodeModel::Large) return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG; - unsigned char flags = PPCII::MO_PIC_FLAG; - - // Only if the relocation mode is PIC do we have to worry about - // interposition. In all other cases we can use a slightly looser standard to - // decide how to access the symbol. - if (TM.getRelocationModel() == Reloc::PIC_) { - // If it's local, or it's non-default, it can't be interposed. - if (!GV->hasLocalLinkage() && - GV->hasDefaultVisibility()) { - flags |= PPCII::MO_NLP_FLAG; - } - return flags; - } - - if (GV->isStrongDefinitionForLinker()) - return flags; - return flags | PPCII::MO_NLP_FLAG; + if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) + return PPCII::MO_PIC_FLAG; + return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG; } bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h index 7fd9079..90d11f4 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -272,6 +272,13 @@ public: return 16; } + + // DarwinABI has a 224-byte red zone. PPC32 SVR4ABI(Non-DarwinABI) has no + // red zone and PPC64 SVR4ABI has a 288-byte red zone. + unsigned getRedZoneSize() const { + return isDarwinABI() ? 224 : (isPPC64() ? 288 : 0); + } + bool hasHTM() const { return HasHTM; } bool hasFusion() const { return HasFusion; } bool hasFloat128() const { return HasFloat128; } @@ -298,7 +305,9 @@ public: bool isSVR4ABI() const { return !isDarwinABI(); } bool isELFv2ABI() const; - bool enableEarlyIfConversion() const override { return hasISEL(); } + /// Originally, this function return hasISEL(). Now we always enable it, + /// but may expand the ISEL instruction later. + bool enableEarlyIfConversion() const override { return true; } // Scheduling customization. 
bool enableMachineScheduler() const override; @@ -316,6 +325,8 @@ public: /// classifyGlobalReference - Classify a global variable reference for the /// current subtarget accourding to how we should reference it. unsigned char classifyGlobalReference(const GlobalValue *GV) const; + + bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 0c1260a..5f8085f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -21,9 +21,9 @@ // //===----------------------------------------------------------------------===// -#include "PPCInstrInfo.h" #include "PPC.h" #include "PPCInstrBuilder.h" +#include "PPCInstrInfo.h" #include "PPCTargetMachine.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -52,6 +52,7 @@ namespace { protected: bool processBlock(MachineBasicBlock &MBB) { bool Changed = false; + bool NeedFence = true; bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64(); for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end(); @@ -62,6 +63,16 @@ protected: MI.getOpcode() != PPC::ADDItlsldLADDR && MI.getOpcode() != PPC::ADDItlsgdLADDR32 && MI.getOpcode() != PPC::ADDItlsldLADDR32) { + + // Although we create ADJCALLSTACKDOWN and ADJCALLSTACKUP + // as scheduling fences, we skip creating fences if we already + // have existing ADJCALLSTACKDOWN/UP to avoid nesting, + // which causes verification error with -verify-machineinstrs. 
+ if (MI.getOpcode() == PPC::ADJCALLSTACKDOWN) + NeedFence = false; + else if (MI.getOpcode() == PPC::ADJCALLSTACKUP) + NeedFence = true; + ++I; continue; } @@ -96,10 +107,15 @@ protected: break; } - // Don't really need to save data to the stack - the clobbered + // We create ADJCALLSTACKUP and ADJCALLSTACKDOWN around _tls_get_addr + // as schduling fence to avoid it is scheduled before + // mflr in the prologue and the address in LR is clobbered (PR25839). + // We don't really need to save data to the stack - the clobbered // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr) // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR). - BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0); + if (NeedFence) + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0) + .addImm(0); // Expand into two ops built prior to the existing instruction. MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3) @@ -115,7 +131,8 @@ protected: .addReg(GPR3)); Call->addOperand(MI.getOperand(3)); - BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0); + if (NeedFence) + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0); BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg) .addReg(GPR3); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp index 7c53a56..17345b6 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp @@ -61,8 +61,8 @@ // //===----------------------------------------------------------------------===// -#include "PPC.h" #include "MCTargetDesc/PPCPredicates.h" +#include "PPC.h" #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 91b1d24..fe092cc 100644 --- 
a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -12,20 +12,32 @@ //===----------------------------------------------------------------------===// #include "PPCTargetMachine.h" +#include "MCTargetDesc/PPCMCTargetDesc.h" #include "PPC.h" +#include "PPCSubtarget.h" #include "PPCTargetObjectFile.h" #include "PPCTargetTransformInfo.h" -#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/MC/MCStreamer.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" +#include <cassert> +#include <memory> +#include <string> + using namespace llvm; static cl:: @@ -74,12 +86,14 @@ EnableMachineCombinerPass("ppc-machine-combiner", extern "C" void LLVMInitializePowerPCTarget() { // Register the targets - RegisterTargetMachine<PPC32TargetMachine> A(getThePPC32Target()); - RegisterTargetMachine<PPC64TargetMachine> B(getThePPC64Target()); - RegisterTargetMachine<PPC64TargetMachine> C(getThePPC64LETarget()); + RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target()); + RegisterTargetMachine<PPCTargetMachine> B(getThePPC64Target()); + RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget()); PassRegistry &PR = *PassRegistry::getPassRegistry(); initializePPCBoolRetToIntPass(PR); + initializePPCExpandISELPass(PR); + initializePPCTLSDynamicCallPass(PR); } /// Return the 
datalayout string of a subtarget. @@ -149,9 +163,9 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { // If it isn't a Mach-O file then it's going to be a linux ELF // object file. if (TT.isOSDarwin()) - return make_unique<TargetLoweringObjectFileMachO>(); + return llvm::make_unique<TargetLoweringObjectFileMachO>(); - return make_unique<PPC64LinuxTargetObjectFile>(); + return llvm::make_unique<PPC64LinuxTargetObjectFile>(); } static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, @@ -164,32 +178,34 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, assert(Options.MCOptions.getABIName().empty() && "Unknown target-abi option!"); - if (!TT.isMacOSX()) { - switch (TT.getArch()) { - case Triple::ppc64le: - return PPCTargetMachine::PPC_ABI_ELFv2; - case Triple::ppc64: - return PPCTargetMachine::PPC_ABI_ELFv1; - default: - // Fallthrough. - ; - } + if (TT.isMacOSX()) + return PPCTargetMachine::PPC_ABI_UNKNOWN; + + switch (TT.getArch()) { + case Triple::ppc64le: + return PPCTargetMachine::PPC_ABI_ELFv2; + case Triple::ppc64: + return PPCTargetMachine::PPC_ABI_ELFv1; + default: + return PPCTargetMachine::PPC_ABI_UNKNOWN; } - return PPCTargetMachine::PPC_ABI_UNKNOWN; } static Reloc::Model getEffectiveRelocModel(const Triple &TT, Optional<Reloc::Model> RM) { - if (!RM.hasValue()) { - if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) { - if (!TT.isOSBinFormatMachO() && !TT.isMacOSX()) - return Reloc::PIC_; - } - if (TT.isOSDarwin()) - return Reloc::DynamicNoPIC; - return Reloc::Static; - } - return *RM; + if (RM.hasValue()) + return *RM; + + // Darwin defaults to dynamic-no-pic. + if (TT.isOSDarwin()) + return Reloc::DynamicNoPIC; + + // Non-darwin 64-bit platforms are PIC by default. + if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) + return Reloc::PIC_; + + // 32-bit is static by default. + return Reloc::Static; } // The FeatureString here is a little subtle. 
We are modifying the feature @@ -205,33 +221,11 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, computeFSAdditions(FS, OL, TT), Options, getEffectiveRelocModel(TT, RM), CM, OL), TLOF(createTLOF(getTargetTriple())), - TargetABI(computeTargetABI(TT, Options)), - Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) { - + TargetABI(computeTargetABI(TT, Options)) { initAsmInfo(); } -PPCTargetMachine::~PPCTargetMachine() {} - -void PPC32TargetMachine::anchor() { } - -PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional<Reloc::Model> RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) - : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} - -void PPC64TargetMachine::anchor() { } - -PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional<Reloc::Model> RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) - : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} +PPCTargetMachine::~PPCTargetMachine() = default; const PPCSubtarget * PPCTargetMachine::getSubtargetImpl(const Function &F) const { @@ -281,10 +275,11 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const { //===----------------------------------------------------------------------===// namespace { + /// PPC Code Generator Pass Configuration Options. 
class PPCPassConfig : public TargetPassConfig { public: - PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM) + PPCPassConfig(PPCTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} PPCTargetMachine &getPPCTargetMachine() const { @@ -300,16 +295,17 @@ public: void addPreSched2() override; void addPreEmitPass() override; }; -} // namespace + +} // end anonymous namespace TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { - return new PPCPassConfig(this, PM); + return new PPCPassConfig(*this, PM); } void PPCPassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOpt::None) addPass(createPPCBoolRetToIntPass()); - addPass(createAtomicExpandPass(&getPPCTargetMachine())); + addPass(createAtomicExpandPass()); // For the BG/Q (or if explicitly requested), add explicit data prefetch // intrinsics. @@ -341,7 +337,7 @@ bool PPCPassConfig::addPreISel() { addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine())); if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) - addPass(createPPCCTRLoops(getPPCTargetMachine())); + addPass(createPPCCTRLoops()); return false; } @@ -357,7 +353,7 @@ bool PPCPassConfig::addILPOpts() { bool PPCPassConfig::addInstSelector() { // Install an instruction selector. - addPass(createPPCISelDag(getPPCTargetMachine())); + addPass(createPPCISelDag(getPPCTargetMachine(), getOptLevel())); #ifndef NDEBUG if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) @@ -393,7 +389,7 @@ void PPCPassConfig::addPreRegAlloc() { // FIXME: We probably don't need to run these for -fPIE. if (getPPCTargetMachine().isPositionIndependent()) { // FIXME: LiveVariables should not be necessary here! - // PPCTLSDYnamicCallPass uses LiveIntervals which previously dependet on + // PPCTLSDynamicCallPass uses LiveIntervals which previously dependent on // LiveVariables. This (unnecessary) dependency has been removed now, // however a stage-2 clang build fails without LiveVariables computed here. 
addPass(&LiveVariablesID, false); @@ -416,6 +412,8 @@ void PPCPassConfig::addPreSched2() { } void PPCPassConfig::addPreEmitPass() { + addPass(createPPCExpandISELPass()); + if (getOptLevel() != CodeGenOpt::None) addPass(createPPCEarlyReturnPass(), false); // Must run branch selection immediately preceding the asm printer. diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h index 59b4f1e..be70550 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h @@ -23,13 +23,12 @@ namespace llvm { /// Common code between 32-bit and 64-bit PowerPC targets. /// -class PPCTargetMachine : public LLVMTargetMachine { +class PPCTargetMachine final : public LLVMTargetMachine { public: enum PPCABI { PPC_ABI_UNKNOWN, PPC_ABI_ELFv1, PPC_ABI_ELFv2 }; private: std::unique_ptr<TargetLoweringObjectFile> TLOF; PPCABI TargetABI; - PPCSubtarget Subtarget; mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap; @@ -42,6 +41,9 @@ public: ~PPCTargetMachine() override; const PPCSubtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. + const PPCSubtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; @@ -56,30 +58,11 @@ public: const Triple &TT = getTargetTriple(); return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le); }; -}; - -/// PowerPC 32-bit target machine. -/// -class PPC32TargetMachine : public PPCTargetMachine { - virtual void anchor(); -public: - PPC32TargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; -/// PowerPC 64-bit target machine. 
-/// -class PPC64TargetMachine : public PPCTargetMachine { - virtual void anchor(); -public: - PPC64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); + bool isMachineVerifierClean() const override { + return false; + } }; - } // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h index dbe7617..310fea9 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h @@ -1,4 +1,4 @@ -//===-- PPCTargetStreamer.h - PPC Target Streamer --s-----------*- C++ -*--===// +//===- PPCTargetStreamer.h - PPC Target Streamer ----------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -10,18 +10,26 @@ #ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H #define LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCStreamer.h" namespace llvm { + +class MCExpr; +class MCSymbol; +class MCSymbolELF; + class PPCTargetStreamer : public MCTargetStreamer { public: PPCTargetStreamer(MCStreamer &S); ~PPCTargetStreamer() override; + virtual void emitTCEntry(const MCSymbol &S) = 0; virtual void emitMachine(StringRef CPU) = 0; virtual void emitAbiVersion(int AbiVersion) = 0; virtual void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) = 0; }; -} -#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index f94d1ea..6110706 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -189,7 +189,7 @@ int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, return 
PPCTTIImpl::getIntImmCost(Imm, Ty); } -void PPCTTIImpl::getUnrollingPreferences(Loop *L, +void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { if (ST->getDarwinDirective() == PPC::DIR_A2) { // The A2 is in-order with a deep pipeline, and concatenation unrolling @@ -201,7 +201,7 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, UP.AllowExpensiveTripCount = true; } - BaseT::getUnrollingPreferences(L, UP); + BaseT::getUnrollingPreferences(L, SE, UP); } bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { @@ -215,6 +215,11 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { return LoopHasReductions; } +bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { + MaxLoadSize = 8; + return true; +} + bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; } @@ -225,7 +230,7 @@ unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) { return ST->hasVSX() ? 64 : 32; } -unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) { +unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const { if (Vector) { if (ST->hasQPX()) return 256; if (ST->hasAltivec()) return 128; @@ -239,9 +244,18 @@ unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) { } unsigned PPCTTIImpl::getCacheLineSize() { - // This is currently only used for the data prefetch pass which is only - // enabled for BG/Q by default. - return CacheLineSize; + // Check first if the user specified a custom line size. + if (CacheLineSize.getNumOccurrences() > 0) + return CacheLineSize; + + // On P7, P8 or P9 we have a cache line size of 128. + unsigned Directive = ST->getDarwinDirective(); + if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 || + Directive == PPC::DIR_PWR9) + return 128; + + // On other processors return a default of 64 bytes. 
+ return 64; } unsigned PPCTTIImpl::getPrefetchDistance() { @@ -302,14 +316,16 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return LT.first; } -int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); return BaseT::getCastInstrCost(Opcode, Dst, Src); } -int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); +int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { @@ -352,7 +368,7 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { } int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, const Instruction *I) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && @@ -401,6 +417,10 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, if (IsVSXType || (ST->hasVSX() && IsAltivecType)) return Cost; + // Newer PPC supports unaligned memory access. + if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0)) + return Cost; + // PPC in general does not support unaligned loads and stores. They'll need // to be decomposed based on the alignment factor. 
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 30ee281..99ca639 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -52,7 +52,8 @@ public: Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); /// @} @@ -60,9 +61,10 @@ public: /// @{ bool enableAggressiveInterleaving(bool LoopHasReductions); + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); unsigned getNumberOfRegisters(bool Vector); - unsigned getRegisterBitWidth(bool Vector); + unsigned getRegisterBitWidth(bool Vector) const; unsigned getCacheLineSize(); unsigned getPrefetchDistance(); unsigned getMaxInterleaveFactor(unsigned VF); @@ -74,11 +76,13 @@ public: TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); - int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I = nullptr); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, const Instruction *I = nullptr); int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp index 
3b5d8f0..93fe323 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#include "PPC.h" #include "MCTargetDesc/PPCPredicates.h" +#include "PPC.h" #include "PPCHazardRecognizers.h" #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" @@ -112,7 +112,7 @@ protected: TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg) .addImm(1) // add 1, not 0, because there is no implicit clearing // of the high bits. - .addOperand(SrcMO) + .add(SrcMO) .addImm(PPC::sub_64); // The source of the original copy is now the new virtual register. @@ -132,7 +132,7 @@ protected: unsigned NewVReg = MRI.createVirtualRegister(DstRC); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), NewVReg) - .addOperand(SrcMO); + .add(SrcMO); // Transform the original copy into a subregister extraction copy. SrcMO.setReg(NewVReg); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index f6d20ce..a57484e 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -12,10 +12,10 @@ // //===----------------------------------------------------------------------===// -#include "PPCInstrInfo.h" #include "MCTargetDesc/PPCPredicates.h" #include "PPC.h" #include "PPCInstrBuilder.h" +#include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" #include "PPCTargetMachine.h" #include "llvm/ADT/STLExtras.h" diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index 8197285..7d34efd 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -42,9 +42,9 @@ // //===---------------------------------------------------------------------===// -#include "PPCInstrInfo.h" #include "PPC.h" 
#include "PPCInstrBuilder.h" +#include "PPCInstrInfo.h" #include "PPCTargetMachine.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/EquivalenceClasses.h" @@ -195,8 +195,10 @@ public: return false; // If we don't have VSX on the subtarget, don't do anything. + // Also, on Power 9 the load and store ops preserve element order and so + // the swaps are not required. const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>(); - if (!STI.hasVSX()) + if (!STI.hasVSX() || !STI.needsSwapsForVSXMemOps()) return false; bool Changed = false; @@ -522,7 +524,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { if (RelevantFunction) { DEBUG(dbgs() << "Swap vector when first built\n\n"); - dumpSwapVector(); + DEBUG(dumpSwapVector()); } return RelevantFunction; @@ -731,7 +733,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { } DEBUG(dbgs() << "Swap vector after web analysis:\n\n"); - dumpSwapVector(); + DEBUG(dumpSwapVector()); } // Walk the swap vector entries looking for swaps fed by permuting loads @@ -936,9 +938,9 @@ bool PPCVSXSwapRemoval::removeSwaps() { Changed = true; MachineInstr *MI = SwapVector[EntryIdx].VSEMI; MachineBasicBlock *MBB = MI->getParent(); - BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) - .addOperand(MI->getOperand(1)); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY), + MI->getOperand(0).getReg()) + .add(MI->getOperand(1)); DEBUG(dbgs() << format("Replaced %d with copy: ", SwapVector[EntryIdx].VSEId)); @@ -951,77 +953,78 @@ bool PPCVSXSwapRemoval::removeSwaps() { return Changed; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) // For debug purposes, dump the contents of the swap vector. 
-void PPCVSXSwapRemoval::dumpSwapVector() { +LLVM_DUMP_METHOD void PPCVSXSwapRemoval::dumpSwapVector() { for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; int ID = SwapVector[EntryIdx].VSEId; - DEBUG(dbgs() << format("%6d", ID)); - DEBUG(dbgs() << format("%6d", EC->getLeaderValue(ID))); - DEBUG(dbgs() << format(" BB#%3d", MI->getParent()->getNumber())); - DEBUG(dbgs() << format(" %14s ", - TII->getName(MI->getOpcode()).str().c_str())); + dbgs() << format("%6d", ID); + dbgs() << format("%6d", EC->getLeaderValue(ID)); + dbgs() << format(" BB#%3d", MI->getParent()->getNumber()); + dbgs() << format(" %14s ", TII->getName(MI->getOpcode()).str().c_str()); if (SwapVector[EntryIdx].IsLoad) - DEBUG(dbgs() << "load "); + dbgs() << "load "; if (SwapVector[EntryIdx].IsStore) - DEBUG(dbgs() << "store "); + dbgs() << "store "; if (SwapVector[EntryIdx].IsSwap) - DEBUG(dbgs() << "swap "); + dbgs() << "swap "; if (SwapVector[EntryIdx].MentionsPhysVR) - DEBUG(dbgs() << "physreg "); + dbgs() << "physreg "; if (SwapVector[EntryIdx].MentionsPartialVR) - DEBUG(dbgs() << "partialreg "); + dbgs() << "partialreg "; if (SwapVector[EntryIdx].IsSwappable) { - DEBUG(dbgs() << "swappable "); + dbgs() << "swappable "; switch(SwapVector[EntryIdx].SpecialHandling) { default: - DEBUG(dbgs() << "special:**unknown**"); + dbgs() << "special:**unknown**"; break; case SH_NONE: break; case SH_EXTRACT: - DEBUG(dbgs() << "special:extract "); + dbgs() << "special:extract "; break; case SH_INSERT: - DEBUG(dbgs() << "special:insert "); + dbgs() << "special:insert "; break; case SH_NOSWAP_LD: - DEBUG(dbgs() << "special:load "); + dbgs() << "special:load "; break; case SH_NOSWAP_ST: - DEBUG(dbgs() << "special:store "); + dbgs() << "special:store "; break; case SH_SPLAT: - DEBUG(dbgs() << "special:splat "); + dbgs() << "special:splat "; break; case SH_XXPERMDI: - DEBUG(dbgs() << "special:xxpermdi "); + dbgs() << "special:xxpermdi "; break; case 
SH_COPYWIDEN: - DEBUG(dbgs() << "special:copywiden "); + dbgs() << "special:copywiden "; break; } } if (SwapVector[EntryIdx].WebRejected) - DEBUG(dbgs() << "rejected "); + dbgs() << "rejected "; if (SwapVector[EntryIdx].WillRemove) - DEBUG(dbgs() << "remove "); + dbgs() << "remove "; - DEBUG(dbgs() << "\n"); + dbgs() << "\n"; // For no-asserts builds. (void)MI; (void)ID; } - DEBUG(dbgs() << "\n"); + dbgs() << "\n"; } +#endif } // end default namespace |